<a href="https://colab.research.google.com/github/mehrnazeraeisi/Best-Features-Group-Results-on-KNN/blob/main/knn_auto_evaluator_fast_save.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
def knn_auto_evaluator_fast_save(X_data, y_data,
                                 test_sizes=(0.1, 0.13, 0.16, 0.19, 0.25),
                                 n_repeats=5,
                                 output_file='knn_results.csv',
                                 random_state=None,
                                 validation_size=None,
                                 patience=3):
    """Compare KNN on all features against KNN on a greedily chosen feature group.

    For every test size and every repeat the data is split once, each feature
    is scored on its own with a single-feature KNN, the features are ranked by
    that score, and the top-k prefixes of the ranking are evaluated for growing
    k until `patience` consecutive prefixes fail to beat the best score so far.
    A KNN on all features and a KNN on the best prefix are then fitted on the
    training part and scored on the test part. One row per (test size, repeat)
    is collected, written to `output_file` as CSV and returned.

    The task is treated as regression when `y_data` has a numeric dtype and
    more than 10 distinct values, otherwise as classification. Models use the
    scikit-learn defaults except for ``n_jobs=-1``.

    Caveat: with ``validation_size=None`` (the default, original behaviour)
    the feature group is chosen using scores on the same test split that is
    reported afterwards, so the ``BestGroup_*`` columns are optimistically
    biased. Pass ``validation_size`` to select on a hold-out carved from the
    training part instead.

    Parameters
    ----------
    X_data : pandas.DataFrame
        Feature matrix; columns are selected by name.
    y_data : array-like
        Target values aligned with the rows of `X_data`.
    test_sizes : iterable of float or int
        Values passed as ``test_size`` to ``train_test_split``.
    n_repeats : int
        Number of independent splits per test size.
    output_file : str or path-like
        Destination of the CSV file.
    random_state : int or None
        Seed for the splits. ``None`` keeps the unseeded original behaviour.
        A single generator is created from the seed and shared by all splits,
        so repeats differ from each other but the whole run is reproducible.
    validation_size : float, int or None
        When given, the training part is split again with this ``test_size``
        and feature selection is done on that hold-out rather than on the
        test part.
    patience : int
        Number of consecutive non-improving prefixes that stops the search.

    Returns
    -------
    pandas.DataFrame
        Columns ``Test Size``, ``Repeat`` and either
        ``All_R2, All_MSE, BestGroup_R2, BestGroup_MSE`` (regression) or
        ``All_Accuracy, BestGroup_Accuracy`` (classification). The frame is
        empty, but still has these columns, when no split was requested.
    """
    import warnings

    import numpy as np
    import pandas as pd
    from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

    # Detect the task type from the target.
    if pd.api.types.is_numeric_dtype(y_data):
        target_type = 'regression' if len(np.unique(y_data)) > 10 else 'classification'
    else:
        target_type = 'classification'
    is_regression = target_type == 'regression'

    # Decided up front so the output schema exists even when no split is run.
    if is_regression:
        column_order = ['Test Size', 'Repeat', 'All_R2', 'All_MSE', 'BestGroup_R2', 'BestGroup_MSE']
    else:
        column_order = ['Test Size', 'Repeat', 'All_Accuracy', 'BestGroup_Accuracy']

    def make_model():
        # Fresh estimator matching the detected task.
        if is_regression:
            return KNeighborsRegressor(n_jobs=-1)
        return KNeighborsClassifier(n_jobs=-1)

    def fit_and_score(columns, X_fit, y_fit, X_eval, y_eval):
        # Fit on the given columns and return R2 (regression) or accuracy.
        model = make_model()
        model.fit(X_fit[columns], y_fit)
        y_pred = model.predict(X_eval[columns])
        if is_regression:
            return r2_score(y_eval, y_pred)
        return accuracy_score(y_eval, y_pred)

    # One shared generator: successive splits differ, the run as a whole repeats.
    rng = None if random_state is None else np.random.RandomState(random_state)

    feature_names = X_data.columns.tolist()
    results = []

    # Silence warnings only for the duration of this call, not for the session.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        for ts in test_sizes:
            for repeat in range(1, n_repeats + 1):
                X_train, X_test, y_train, y_test = train_test_split(
                    X_data, y_data, test_size=ts, random_state=rng)

                # Data used to rank features and pick the group.
                if validation_size is None:
                    X_fit, y_fit, X_sel, y_sel = X_train, y_train, X_test, y_test
                else:
                    X_fit, X_sel, y_fit, y_sel = train_test_split(
                        X_train, y_train, test_size=validation_size, random_state=rng)

                # Score every feature on its own.
                scores = [
                    (feature, fit_and_score([feature], X_fit, y_fit, X_sel, y_sel))
                    for feature in feature_names
                ]
                df_scores = pd.DataFrame(scores, columns=['Feature', 'Score']).sort_values(
                    by='Score', ascending=False)
                ranked_features = df_scores['Feature'].tolist()

                # Find the best prefix of the ranking, with early stopping.
                best_score = -np.inf
                best_group = []
                no_improve_count = 0
                for k in range(1, len(ranked_features) + 1):
                    group = ranked_features[:k]
                    score = fit_and_score(group, X_fit, y_fit, X_sel, y_sel)

                    if score > best_score:
                        best_score = score
                        best_group = group
                        no_improve_count = 0
                    else:
                        no_improve_count += 1
                        if no_improve_count >= patience:
                            break

                # Train on all features and on the best group; evaluate on the test part.
                model_all = make_model()
                model_best = make_model()
                model_all.fit(X_train, y_train)
                model_best.fit(X_train[best_group], y_train)

                y_pred_all = model_all.predict(X_test)
                y_pred_best = model_best.predict(X_test[best_group])

                # Record the results of this split.
                if is_regression:
                    result_row = {
                        'Test Size': ts,
                        'Repeat': repeat,
                        'All_R2': r2_score(y_test, y_pred_all),
                        'All_MSE': mean_squared_error(y_test, y_pred_all),
                        'BestGroup_R2': r2_score(y_test, y_pred_best),
                        'BestGroup_MSE': mean_squared_error(y_test, y_pred_best),
                    }
                else:
                    result_row = {
                        'Test Size': ts,
                        'Repeat': repeat,
                        'All_Accuracy': accuracy_score(y_test, y_pred_all),
                        'BestGroup_Accuracy': accuracy_score(y_test, y_pred_best),
                    }

                results.append(result_row)

    # Save the results to a CSV file.
    final_df = pd.DataFrame(results, columns=column_order)
    final_df.to_csv(output_file, index=False)
    print(f"✅ نتایج با موفقیت در فایل «{output_file}» ذخیره شدند.")

    return final_df


In [3]:
!pip install openml

Collecting openml
  Downloading openml-0.15.1-py3-none-any.whl.metadata (10 kB)
Collecting liac-arff>=2.4.0 (from openml)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xmltodict (from openml)
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting minio (from openml)
  Downloading minio-7.2.15-py3-none-any.whl.metadata (6.7 kB)
Collecting pycryptodome (from minio->openml)
  Downloading pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading openml-0.15.1-py3-none-any.whl (160 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.4/160.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading minio-7.2.15-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xmltodict-0.14.2-py2.py3-none-any.whl (10.0 kB)
Downloading pycr

In [4]:
import openml
import pandas as pd

# Download the dataset. get_data already returns the features and the target
# separately, so there is no need to write the target back into the frame
# under a hard-coded column name and drop it again (which would also clobber
# any real feature that happened to share that name).
dataset = openml.datasets.get_dataset(43977)
X_data, y_data, _, _ = dataset.get_data(target=dataset.default_target_attribute)

print(X_data.shape)

# Run the evaluator on this dataset.
result_df = knn_auto_evaluator_fast_save(X_data, y_data)
print(result_df)



(57580, 54)
✅ نتایج با موفقیت در فایل «knn_results.csv» ذخیره شدند.
    Test Size  Repeat  All_Accuracy  BestGroup_Accuracy
0        0.10       1      0.644147            0.720563
1        0.10       2      0.642758            0.705974
2        0.10       3      0.642584            0.719173
3        0.10       4      0.636332            0.658562
4        0.10       5      0.637721            0.712226
5        0.13       1      0.643201            0.715736
6        0.13       2      0.634651            0.707454
7        0.13       3      0.633583            0.711729
8        0.13       4      0.641464            0.716270
9        0.13       5      0.646273            0.700775
10       0.16       1      0.638880            0.718333
11       0.16       2      0.639857            0.692500
12       0.16       3      0.631933            0.715619
13       0.16       4      0.639097            0.720612
14       0.16       5      0.635298            0.711712
15       0.19       1      0.632026 