<a href="https://colab.research.google.com/github/mehrnazeraeisi/Best-Features-Group-Results-on-KNN/blob/main/Brief_of_Best_Features_Group_Results_on_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
def knn_auto_evaluator_from_xy(X_data, y_data, test_sizes=[0.1, 0.13, 0.16 , 0.19, 0.25], n_repeats=20):
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
    from sklearn.metrics import r2_score, mean_squared_error, accuracy_score
    import warnings
    warnings.filterwarnings("ignore")

    # تشخیص نوع تارگت
    if pd.api.types.is_numeric_dtype(y_data):
        target_type = 'regression' if len(np.unique(y_data)) > 10 else 'classification'
    else:
        target_type = 'classification'

    feature_names = X_data.columns.tolist()
    results = []

    for ts in test_sizes:
        for repeat in range(1, n_repeats+1):
            X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=ts)

            scores = []
            for feature in feature_names:
                model = KNeighborsRegressor() if target_type == 'regression' else KNeighborsClassifier()
                model.fit(X_train[[feature]], y_train)
                y_pred = model.predict(X_train[[feature]])
                score = r2_score(y_train, y_pred) if target_type == 'regression' else accuracy_score(y_train, y_pred)
                scores.append((feature, score))

            df_scores = pd.DataFrame(scores, columns=['Feature', 'Score']).sort_values(by='Score', ascending=False)

            best_score = -np.inf
            best_group = []
            for k in range(1, len(df_scores)+1):
                group = df_scores['Feature'][:k].tolist()
                model = KNeighborsRegressor() if target_type == 'regression' else KNeighborsClassifier()
                model.fit(X_train[group], y_train)
                y_pred = model.predict(X_train[group])
                score = r2_score(y_train, y_pred) if target_type == 'regression' else accuracy_score(y_train, y_pred)
                if score > best_score:
                    best_score = score
                    best_group = group

            model_all = KNeighborsRegressor() if target_type == 'regression' else KNeighborsClassifier()
            model_best = KNeighborsRegressor() if target_type == 'regression' else KNeighborsClassifier()
            model_all.fit(X_train, y_train)
            model_best.fit(X_train[best_group], y_train)
            y_pred_all = model_all.predict(X_test)
            y_pred_best = model_best.predict(X_test[best_group])

            if target_type == 'regression':
                result_row = {
                    'Test Size': ts,
                    'Repeat': repeat,
                    'All_R2': r2_score(y_test, y_pred_all),
                    'All_MSE': mean_squared_error(y_test, y_pred_all),
                    'BestGroup_R2': r2_score(y_test, y_pred_best),
                    'BestGroup_MSE': mean_squared_error(y_test, y_pred_best),

                }
                column_order = ['Test Size', 'Repeat', 'All_R2', 'All_MSE', 'BestGroup_R2', 'BestGroup_MSE']
            else:
                result_row = {
                    'Test Size': ts,
                    'Repeat': repeat,
                    'All_Accuracy': accuracy_score(y_test, y_pred_all),
                    'BestGroup_Accuracy': accuracy_score(y_test, y_pred_best),

                }
                column_order = ['Test Size', 'Repeat', 'All_Accuracy', 'BestGroup_Accuracy']

            results.append(result_row)

    # ساخت DataFrame نهایی با ترتیب دقیق ستون‌ها
    return pd.DataFrame(results)[column_order]


In [2]:
!pip install openml

Collecting openml
  Downloading openml-0.15.1-py3-none-any.whl.metadata (10 kB)
Collecting liac-arff>=2.4.0 (from openml)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xmltodict (from openml)
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting minio (from openml)
  Downloading minio-7.2.15-py3-none-any.whl.metadata (6.7 kB)
Collecting pycryptodome (from minio->openml)
  Downloading pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading openml-0.15.1-py3-none-any.whl (160 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.4/160.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading minio-7.2.15-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xmltodict-0.14.2-py2.py3-none-any.whl (10.0 kB)
Downloading pycr

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
import os

folder_path = '/content/drive/MyDrive/my_csv_files'

# لیست تمام فایل‌های CSV و Excel
files = os.listdir(folder_path)

dataframes = []

for file in files:
    file_path = os.path.join(folder_path, file)

    if file.endswith('.csv'):
        df = pd.read_csv(file_path)
        print(f"Loaded CSV: {file}")

    elif file.endswith('.xlsx') or file.endswith('.xls'):
        df = pd.read_excel(file_path)
        print(f"Loaded Excel: {file}")

    else:
        print(f"Skipped unsupported file: {file}")
        continue

    dataframes.append(df)


Loaded CSV: advertising.csv
Loaded CSV: bodyfat.csv
Loaded CSV: cookie_recipes.csv
Loaded Excel: Pistachio.xlsx
Loaded CSV: BostonHousing.csv


In [5]:
results = []

In [6]:
import pandas as pd


# مسیر فایل در گوگل درایو
file_path = '/content/drive/MyDrive/my_csv_files/bodyfat.csv'

# بارگذاری فایل
data = pd.read_csv(file_path)

# جدا کردن ویژگی‌ها و برچسب‌ها
X = data.drop('Pct.BF', axis=1)
y = data['Pct.BF']

print(X.shape)


df_Bodyfat_results = knn_auto_evaluator_from_xy(X, y)
print(df_Bodyfat_results)

results.append({'name': 'Bodyfat', 'type': 'regression', 'table': df_Bodyfat_results})

(250, 14)
    Test Size  Repeat    All_R2    All_MSE  BestGroup_R2  BestGroup_MSE
0        0.10       1  0.522787  29.258096      0.581620      25.650976
1        0.10       2  0.572516  25.223776      0.615582      22.682640
2        0.10       3  0.640349  23.198112      0.553816      28.779600
3        0.10       4  0.640575  26.811072      0.584192      31.016912
4        0.10       5  0.631425  28.715456      0.697550      23.563712
..        ...     ...       ...        ...           ...            ...
95       0.25      16  0.412500  30.782514      0.340232      34.569060
96       0.25      17  0.527233  31.723746      0.509410      32.919670
97       0.25      18  0.632625  19.268254      0.636068      19.087676
98       0.25      19  0.269226  29.696349      0.272716      29.554508
99       0.25      20  0.541589  26.770260      0.610877      22.724013

[100 rows x 6 columns]


In [7]:
import pandas as pd

# بارگذاری فایل
data = pd.read_csv('/content/drive/MyDrive/my_csv_files/advertising.csv')

# جدا کردن ویژگی‌ها و برچسب‌ها
X = data.drop('sales', axis=1)
y= data['sales']

print(X.shape)

df_Advertising_results = knn_auto_evaluator_from_xy(X, y)
print(df_Advertising_results)

results.append({'name': 'Advertising', 'type': 'regression', 'table': df_Advertising_results})

(200, 3)
    Test Size  Repeat    All_R2   All_MSE  BestGroup_R2  BestGroup_MSE
0        0.10       1  0.944540  0.874840      0.961558       0.606400
1        0.10       2  0.816323  4.605060      0.968465       0.790640
2        0.10       3  0.826077  2.950420      0.975824       0.410120
3        0.10       4  0.905397  2.213360      0.951466       1.135520
4        0.10       5  0.940379  1.278360      0.973561       0.566900
..        ...     ...       ...       ...           ...            ...
95       0.25      16  0.967203  0.722024      0.971466       0.628168
96       0.25      17  0.928896  1.638240      0.954124       1.057000
97       0.25      18  0.922750  2.443072      0.936525       2.007456
98       0.25      19  0.914745  3.136296      0.933167       2.458592
99       0.25      20  0.912544  2.478360      0.965604       0.974720

[100 rows x 6 columns]


In [8]:
import pandas as pd

# بارگذاری فایل
data = pd.read_csv('/content/drive/MyDrive/my_csv_files/BostonHousing.csv')



# جدا کردن ویژگی‌ها و برچسب‌ها
X = data.drop('medv', axis=1)
y= data['medv']

print(X.shape)

df_BostonHousing_results = knn_auto_evaluator_from_xy(X, y)
print(df_BostonHousing_results)
results.append({'name': 'BostonHousing', 'type': 'regression', 'table': df_BostonHousing_results})

(506, 13)
    Test Size  Repeat    All_R2    All_MSE  BestGroup_R2  BestGroup_MSE
0        0.10       1  0.438163  69.407263      0.872576      15.741537
1        0.10       2  0.231464  54.747725      0.647250      25.128667
2        0.10       3  0.621308  18.552690      0.756002      11.953827
3        0.10       4  0.483358  53.291631      0.716311      29.262478
4        0.10       5  0.273781  53.921961      0.837097      12.095624
..        ...     ...       ...        ...           ...            ...
95       0.25      16  0.534659  37.824060      0.786594      17.346198
96       0.25      17  0.518758  41.483033      0.747003      21.808328
97       0.25      18  0.431339  47.894164      0.767876      19.550076
98       0.25      19  0.505033  33.985717      0.787893      14.563808
99       0.25      20  0.644680  31.895786      0.876554      11.081285

[100 rows x 6 columns]


In [9]:
import pandas as pd

# بارگذاری فایل
data = pd.read_csv('/content/drive/MyDrive/my_csv_files/cookie_recipes.csv')


# جدا کردن ویژگی‌ها و برچسب‌ها
X = data.drop('rating', axis=1)
y= data['rating']

print(X.shape)

df_Cookie_Recipes_results = knn_auto_evaluator_from_xy(X, y)
print(df_Cookie_Recipes_results)

results.append({'name': 'Cookie_Recipes', 'type': 'regression', 'table': df_Cookie_Recipes_results})

(50, 5)
    Test Size  Repeat    All_R2   All_MSE  BestGroup_R2  BestGroup_MSE
0        0.10       1 -0.132727  0.311500      0.925455       0.020500
1        0.10       2  0.103448  0.260000      0.953448       0.013500
2        0.10       3  0.077742  1.429500      0.839355       0.249000
3        0.10       4 -0.273846  0.414000      0.738462       0.085000
4        0.10       5  0.228375  1.400500      0.963361       0.066500
..        ...     ...       ...       ...           ...            ...
95       0.25      16  0.419725  0.655385      0.923209       0.086731
96       0.25      17  0.516405  0.342308      0.897304       0.072692
97       0.25      18 -0.046906  1.078654      0.874386       0.129423
98       0.25      19  0.137774  1.232115      0.950072       0.071346
99       0.25      20 -0.401383  2.203654      0.913415       0.136154

[100 rows x 6 columns]


In [10]:
from sklearn.datasets import load_wine
data = load_wine(as_frame=True)
X = data.data
y = data.target
print(X.shape)


df_wine_results = knn_auto_evaluator_from_xy(X, y)
print(df_wine_results)

results.append({'name': 'wine', 'type': 'classification', 'table': df_wine_results})

(178, 13)
    Test Size  Repeat  All_Accuracy  BestGroup_Accuracy
0        0.10       1      0.666667            0.777778
1        0.10       2      0.666667            0.777778
2        0.10       3      0.722222            0.944444
3        0.10       4      0.722222            0.777778
4        0.10       5      0.833333            0.944444
..        ...     ...           ...                 ...
95       0.25      16      0.800000            0.888889
96       0.25      17      0.755556            0.911111
97       0.25      18      0.644444            0.755556
98       0.25      19      0.688889            0.911111
99       0.25      20      0.688889            0.911111

[100 rows x 4 columns]


In [11]:
import pandas as pd

# بارگذاری فایل اکسل
data = pd.read_excel('/content/drive/MyDrive/my_csv_files/Pistachio.xlsx')
# حذف سطرهای حاوی NaN
data = data.dropna()


# جدا کردن ویژگی‌ها و برچسب‌ها
X = data.drop('Class', axis=1)
y = data['Class']

print(data.shape)


df_Pistachio_results = knn_auto_evaluator_from_xy(X, y)
print(df_Pistachio_results)

results.append({'name': 'Pistachio', 'type': 'classification', 'table': df_Pistachio_results})

(2148, 17)
    Test Size  Repeat  All_Accuracy  BestGroup_Accuracy
0        0.10       1      0.739535            0.818605
1        0.10       2      0.758140            0.841860
2        0.10       3      0.744186            0.809302
3        0.10       4      0.744186            0.781395
4        0.10       5      0.748837            0.804651
..        ...     ...           ...                 ...
95       0.25      16      0.750466            0.849162
96       0.25      17      0.763501            0.826816
97       0.25      18      0.761639            0.836127
98       0.25      19      0.769088            0.849162
99       0.25      20      0.741155            0.832402

[100 rows x 4 columns]


In [12]:
import pandas as pd

def summarize_max_diff_results(dataset_tables):
    """
    dataset_tables: لیستی از دیکشنری‌ها به شکل زیر:
    [
        {'name': 'Diabetes', 'type': 'regression', 'table': df1},
        {'name': 'Wine', 'type': 'classification', 'table': df2},
        ...
    ]

    خروجی: DataFrame خلاصه از بیشترین اختلاف برای هر دیتاست
    """
    summary_rows = []

    for item in dataset_tables:
        name = item['name']
        dtype = item['type']
        df = item['table']

        if dtype == 'regression':
            # محاسبه بیشترین اختلاف MSE
            df['MSE_Diff'] = df['All_MSE'] - df['BestGroup_MSE']
            best_row = df.loc[df['MSE_Diff'].idxmax()]
            summary_rows.append({
                'Dataset': name,
                'Type': dtype,
                'All_Accuracy': None,
                'BestGroup_Accuracy': None,
                'All_R2': best_row['All_R2'],
                'BestGroup_R2': best_row['BestGroup_R2'],
                'All_MSE': best_row['All_MSE'],
                'BestGroup_MSE': best_row['BestGroup_MSE']
            })

        elif dtype == 'classification':
            # محاسبه بیشترین اختلاف دقت
            df['Accuracy_Diff'] = df['BestGroup_Accuracy'] - df['All_Accuracy']
            best_row = df.loc[df['Accuracy_Diff'].idxmax()]
            summary_rows.append({
                'Dataset': name,
                'Type': dtype,
                'All_Accuracy': best_row['All_Accuracy'],
                'BestGroup_Accuracy': best_row['BestGroup_Accuracy'],
                'All_R2': None,
                'BestGroup_R2': None,
                'All_MSE': None,
                'BestGroup_MSE': None
            })

    return pd.DataFrame(summary_rows)


In [13]:
results = [
    {'name': 'Bodyfat', 'type': 'regression', 'table': df_Bodyfat_results},
    {'name': 'Advertising', 'type': 'regression', 'table': df_Advertising_results},
    {'name': 'BostonHousing', 'type': 'regression', 'table': df_BostonHousing_results},
    {'name': 'Cookie_Recipes', 'type': 'regression', 'table': df_Cookie_Recipes_results},
    #{'name': 'California_Housing', 'type': 'regression', 'table': df_California_Housing_results},
    {'name': 'wine', 'type': 'classification', 'table': df_wine_results},
    #{'name': 'Jannis(43977)', 'type': 'classification', 'table': df_Jannis_results},
    {'name': 'Pistachio', 'type': 'classification', 'table': df_Pistachio_results},
    #{'name': 'MagicTelescope(43971)', 'type': 'classification', 'table': df_MagicTelescope_results},
    #{'name': 'Hill_Valley_with_Noise(1046)', 'type': 'classification', 'table': df_Noise_results},
    #{'name': 'Climate Model Simulation', 'type': 'classification', 'table': df_Climate_results}

]

summary = summarize_max_diff_results(results)
print(summary)


          Dataset            Type  All_Accuracy  BestGroup_Accuracy    All_R2  \
0         Bodyfat      regression           NaN                 NaN  0.568511   
1     Advertising      regression           NaN                 NaN  0.805231   
2   BostonHousing      regression           NaN                 NaN  0.438163   
3  Cookie_Recipes      regression           NaN                 NaN -0.822476   
4            wine  classification      0.611111            1.000000       NaN   
5       Pistachio  classification      0.738386            0.858191       NaN   

   BestGroup_R2    All_MSE  BestGroup_MSE  
0      0.695741  39.695248      27.990608  
1      0.936763   6.526292       2.118938  
2      0.872576  69.407263      15.741537  
3      0.958632   2.797500       0.063500  
4           NaN        NaN            NaN  
5           NaN        NaN            NaN  
