<a href="https://colab.research.google.com/github/mehrnazeraeisi/Best-Features-Group-Results-on-KNN/blob/main/Brief_of_Best_Features_Group_Results_on_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
def knn_auto_evaluator_from_xy(X_data, y_data, test_sizes=[0.1, 0.13, 0.16 , 0.19, 0.25], n_repeats=20):
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
    from sklearn.metrics import r2_score, mean_squared_error, accuracy_score
    import warnings
    warnings.filterwarnings("ignore")

    # تشخیص نوع تارگت
    if pd.api.types.is_numeric_dtype(y_data):
        target_type = 'regression' if len(np.unique(y_data)) > 10 else 'classification'
    else:
        target_type = 'classification'

    feature_names = X_data.columns.tolist()
    results = []

    for ts in test_sizes:
        for repeat in range(1, n_repeats+1):
            X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=ts)

            scores = []
            for feature in feature_names:
                model = KNeighborsRegressor() if target_type == 'regression' else KNeighborsClassifier()
                model.fit(X_train[[feature]], y_train)
                y_pred = model.predict(X_train[[feature]])
                score = r2_score(y_train, y_pred) if target_type == 'regression' else accuracy_score(y_train, y_pred)
                scores.append((feature, score))

            df_scores = pd.DataFrame(scores, columns=['Feature', 'Score']).sort_values(by='Score', ascending=False)

            best_score = -np.inf
            best_group = []
            for k in range(1, len(df_scores)+1):
                group = df_scores['Feature'][:k].tolist()
                model = KNeighborsRegressor() if target_type == 'regression' else KNeighborsClassifier()
                model.fit(X_train[group], y_train)
                y_pred = model.predict(X_train[group])
                score = r2_score(y_train, y_pred) if target_type == 'regression' else accuracy_score(y_train, y_pred)
                if score > best_score:
                    best_score = score
                    best_group = group

            model_all = KNeighborsRegressor() if target_type == 'regression' else KNeighborsClassifier()
            model_best = KNeighborsRegressor() if target_type == 'regression' else KNeighborsClassifier()
            model_all.fit(X_train, y_train)
            model_best.fit(X_train[best_group], y_train)
            y_pred_all = model_all.predict(X_test)
            y_pred_best = model_best.predict(X_test[best_group])

            if target_type == 'regression':
                result_row = {
                    'Test Size': ts,
                    'Repeat': repeat,
                    'All_R2': r2_score(y_test, y_pred_all),
                    'All_MSE': mean_squared_error(y_test, y_pred_all),
                    'BestGroup_R2': r2_score(y_test, y_pred_best),
                    'BestGroup_MSE': mean_squared_error(y_test, y_pred_best),

                }
                column_order = ['Test Size', 'Repeat', 'All_R2', 'All_MSE', 'BestGroup_R2', 'BestGroup_MSE']
            else:
                result_row = {
                    'Test Size': ts,
                    'Repeat': repeat,
                    'All_Accuracy': accuracy_score(y_test, y_pred_all),
                    'BestGroup_Accuracy': accuracy_score(y_test, y_pred_best),

                }
                column_order = ['Test Size', 'Repeat', 'All_Accuracy', 'BestGroup_Accuracy']

            results.append(result_row)

    # ساخت DataFrame نهایی با ترتیب دقیق ستون‌ها
    return pd.DataFrame(results)[column_order]


In [2]:
!pip install openml

Collecting openml
  Downloading openml-0.15.1-py3-none-any.whl.metadata (10 kB)
Collecting liac-arff>=2.4.0 (from openml)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xmltodict (from openml)
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting minio (from openml)
  Downloading minio-7.2.15-py3-none-any.whl.metadata (6.7 kB)
Collecting pycryptodome (from minio->openml)
  Downloading pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading openml-0.15.1-py3-none-any.whl (160 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.4/160.4 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading minio-7.2.15-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xmltodict-0.14.2-py2.py3-none-any.whl (10.0 kB)
Downloading pycr

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
import os

folder_path = '/content/drive/MyDrive/my_csv_files'

# لیست تمام فایل‌های CSV و Excel
files = os.listdir(folder_path)

dataframes = []

for file in files:
    file_path = os.path.join(folder_path, file)

    if file.endswith('.csv'):
        df = pd.read_csv(file_path)
        print(f"Loaded CSV: {file}")

    elif file.endswith('.xlsx') or file.endswith('.xls'):
        df = pd.read_excel(file_path)
        print(f"Loaded Excel: {file}")

    else:
        print(f"Skipped unsupported file: {file}")
        continue

    dataframes.append(df)


Loaded CSV: advertising.csv
Loaded CSV: bodyfat.csv
Loaded CSV: cookie_recipes.csv
Loaded Excel: Pistachio.xlsx
Loaded CSV: BostonHousing.csv


In [5]:
results = []

In [19]:
import pandas as pd


# مسیر فایل در گوگل درایو
file_path = '/content/drive/MyDrive/my_csv_files/bodyfat.csv'

# بارگذاری فایل
data = pd.read_csv(file_path)

# جدا کردن ویژگی‌ها و برچسب‌ها
X = data.drop('Pct.BF', axis=1)
y = data['Pct.BF']

print(X.shape)


df_Bodyfat_results = knn_auto_evaluator_from_xy(X, y)
print(df_Bodyfat_results)

results.append({'name': 'Bodyfat', 'type': 'regression', 'table': df_Bodyfat_results})

(250, 14)
    Test Size  Repeat    All_R2    All_MSE  BestGroup_R2  BestGroup_MSE
0        0.10       1  0.672261  23.932112      0.648736      25.649936
1        0.10       2  0.656726  27.487952      0.688599      24.935712
2        0.10       3  0.453015  44.851824      0.514638      39.798848
3        0.10       4  0.620723  32.257344      0.602108      33.840560
4        0.10       5  0.665487  25.714144      0.613394      29.718544
..        ...     ...       ...        ...           ...            ...
95       0.25      16  0.577758  24.113035      0.556896      25.304425
96       0.25      17  0.546715  33.398717      0.610492      28.699524
97       0.25      18  0.567203  33.290298      0.610178      29.984667
98       0.25      19  0.581678  25.716057      0.627029      22.928190
99       0.25      20  0.543140  35.181657      0.601020      30.724502

[100 rows x 6 columns]


In [21]:
import pandas as pd

# بارگذاری فایل
data = pd.read_csv('/content/drive/MyDrive/my_csv_files/advertising.csv')

# جدا کردن ویژگی‌ها و برچسب‌ها
X = data.drop('sales', axis=1)
y= data['sales']

print(X.shape)

df_Advertising_results = knn_auto_evaluator_from_xy(X, y)
print(df_Advertising_results)

results.append({'name': 'Advertising', 'type': 'regression', 'table': df_Advertising_results})

(200, 3)
    Test Size  Repeat    All_R2   All_MSE  BestGroup_R2  BestGroup_MSE
0        0.10       1  0.932083  2.022200      0.958217       1.244060
1        0.10       2  0.942352  1.429780      0.972173       0.690160
2        0.10       3  0.903030  2.149040      0.940896       1.309860
3        0.10       4  0.848911  3.910180      0.957263       1.106040
4        0.10       5  0.906568  1.241200      0.935707       0.854100
..        ...     ...       ...       ...           ...            ...
95       0.25      16  0.912485  2.149544      0.944995       1.351032
96       0.25      17  0.932980  1.524952      0.964343       0.811328
97       0.25      18  0.915385  2.551760      0.963523       1.100056
98       0.25      19  0.909186  2.225424      0.952867       1.155016
99       0.25      20  0.915935  2.249944      0.959286       1.089680

[100 rows x 6 columns]


In [20]:
import pandas as pd

# بارگذاری فایل
data = pd.read_csv('/content/drive/MyDrive/my_csv_files/BostonHousing.csv')



# جدا کردن ویژگی‌ها و برچسب‌ها
X = data.drop('medv', axis=1)
y= data['medv']

print(X.shape)

df_BostonHousing_results = knn_auto_evaluator_from_xy(X, y)
print(df_BostonHousing_results)
results.append({'name': 'BostonHousing', 'type': 'regression', 'table': df_BostonHousing_results})

(506, 13)
    Test Size  Repeat    All_R2    All_MSE  BestGroup_R2  BestGroup_MSE
0        0.10       1  0.595214  39.740518      0.856012      14.136282
1        0.10       2  0.456855  28.770471      0.701213      15.826792
2        0.10       3  0.460511  40.062149      0.748068      18.708345
3        0.10       4  0.636539  39.552557      0.710826      31.468533
4        0.10       5  0.617703  22.359553      0.754891      14.335796
..        ...     ...       ...        ...           ...            ...
95       0.25      16  0.500734  35.536450      0.812201      13.367036
96       0.25      17  0.553430  27.674274      0.705900      18.225631
97       0.25      18  0.601322  36.390854      0.748085      22.994441
98       0.25      19  0.654872  26.228283      0.744245      19.436331
99       0.25      20  0.525003  42.503518      0.761243      21.364400

[100 rows x 6 columns]


In [22]:
import pandas as pd

# بارگذاری فایل
data = pd.read_csv('/content/drive/MyDrive/my_csv_files/cookie_recipes.csv')


# جدا کردن ویژگی‌ها و برچسب‌ها
X = data.drop('rating', axis=1)
y= data['rating']

print(X.shape)

df_Cookie_Recipes_results = knn_auto_evaluator_from_xy(X, y)
print(df_Cookie_Recipes_results)

results.append({'name': 'Cookie_Recipes', 'type': 'regression', 'table': df_Cookie_Recipes_results})

(50, 5)
    Test Size  Repeat    All_R2   All_MSE  BestGroup_R2  BestGroup_MSE
0        0.10       1  0.699148  0.883000      0.938160       0.181500
1        0.10       2 -0.645098  2.097500      0.972157       0.035500
2        0.10       3 -0.731579  1.151500      0.950376       0.033000
3        0.10       4 -0.589344  0.969500      0.800000       0.122000
4        0.10       5  0.059459  1.044000      0.977477       0.025000
..        ...     ...       ...       ...           ...            ...
95       0.25      16  0.035936  2.263269      0.916528       0.195962
96       0.25      17 -0.297138  1.045769      0.926771       0.059038
97       0.25      18  0.191768  0.946923      0.966679       0.039038
98       0.25      19  0.212676  1.653846      0.940585       0.124808
99       0.25      20 -1.098749  1.463846      0.908187       0.064038

[100 rows x 6 columns]


In [23]:
from sklearn.datasets import load_wine
data = load_wine(as_frame=True)
X = data.data
y = data.target
print(X.shape)


df_wine_results = knn_auto_evaluator_from_xy(X, y)
print(df_wine_results)

results.append({'name': 'wine', 'type': 'classification', 'table': df_wine_results})

(178, 13)
    Test Size  Repeat  All_Accuracy  BestGroup_Accuracy
0        0.10       1      0.777778            0.833333
1        0.10       2      0.888889            0.944444
2        0.10       3      0.777778            0.833333
3        0.10       4      0.722222            0.722222
4        0.10       5      0.666667            0.944444
..        ...     ...           ...                 ...
95       0.25      16      0.733333            0.866667
96       0.25      17      0.733333            0.977778
97       0.25      18      0.688889            0.777778
98       0.25      19      0.688889            0.800000
99       0.25      20      0.666667            0.866667

[100 rows x 4 columns]


In [24]:
import pandas as pd

# بارگذاری فایل اکسل
data = pd.read_excel('/content/drive/MyDrive/my_csv_files/Pistachio.xlsx')
# حذف سطرهای حاوی NaN
data = data.dropna()


# جدا کردن ویژگی‌ها و برچسب‌ها
X = data.drop('Class', axis=1)
y = data['Class']

print(data.shape)


df_Pistachio_results = knn_auto_evaluator_from_xy(X, y)
print(df_Pistachio_results)

results.append({'name': 'Pistachio', 'type': 'classification', 'table': df_Pistachio_results})

(2148, 17)
    Test Size  Repeat  All_Accuracy  BestGroup_Accuracy
0        0.10       1      0.809302            0.800000
1        0.10       2      0.730233            0.841860
2        0.10       3      0.753488            0.832558
3        0.10       4      0.772093            0.823256
4        0.10       5      0.748837            0.813953
..        ...     ...           ...                 ...
95       0.25      16      0.746741            0.806331
96       0.25      17      0.748603            0.836127
97       0.25      18      0.713222            0.817505
98       0.25      19      0.756052            0.813780
99       0.25      20      0.739292            0.826816

[100 rows x 4 columns]


In [None]:
# Climate Model Simulation
# شبیه‌سازی آب‌وهوا (دوتایی)
import openml
import pandas as pd

dataset = openml.datasets.get_dataset(40966)
data, y_data, _, _ = dataset.get_data(target=dataset.default_target_attribute)

# 🔧 افزودن target به dataframe کامل (مثل خواندن فایل CSV)
data['rating'] = y_data  # فرض می‌گیریم نام ستون هدف "rating" باشه برای یکسان بودن با مثال شما

print(data.shape)
# حذف سطرهای حاوی NaN
data = data.dropna()


print(data.shape)

# 🎯 جدا کردن ویژگی‌ها و برچسب‌ها (مطابق ساختار شما)
X = data.drop('rating', axis=1)
y = data['rating']


df_Climate_results = knn_auto_evaluator_from_xy(X, y)
print(df_Climate_results)

results.append({'name': 'Climate Model Simulation', 'type': 'classification', 'table': df_Climate_results})

(1080, 78)
(552, 78)


In [None]:
import pandas as pd

def summarize_max_diff_results(dataset_tables):
    """
    dataset_tables: لیستی از دیکشنری‌ها به شکل زیر:
    [
        {'name': 'Diabetes', 'type': 'regression', 'table': df1},
        {'name': 'Wine', 'type': 'classification', 'table': df2},
        ...
    ]

    خروجی: DataFrame خلاصه از بیشترین اختلاف برای هر دیتاست
    """
    summary_rows = []

    for item in dataset_tables:
        name = item['name']
        dtype = item['type']
        df = item['table']

        if dtype == 'regression':
            # محاسبه بیشترین اختلاف MSE
            df['MSE_Diff'] = df['All_MSE'] - df['BestGroup_MSE']
            best_row = df.loc[df['MSE_Diff'].idxmax()]
            summary_rows.append({
                'Dataset': name,
                'Type': dtype,
                'All_Accuracy': None,
                'BestGroup_Accuracy': None,
                'All_R2': best_row['All_R2'],
                'BestGroup_R2': best_row['BestGroup_R2'],
                'All_MSE': best_row['All_MSE'],
                'BestGroup_MSE': best_row['BestGroup_MSE']
            })

        elif dtype == 'classification':
            # محاسبه بیشترین اختلاف دقت
            df['Accuracy_Diff'] = df['BestGroup_Accuracy'] - df['All_Accuracy']
            best_row = df.loc[df['Accuracy_Diff'].idxmax()]
            summary_rows.append({
                'Dataset': name,
                'Type': dtype,
                'All_Accuracy': best_row['All_Accuracy'],
                'BestGroup_Accuracy': best_row['BestGroup_Accuracy'],
                'All_R2': None,
                'BestGroup_R2': None,
                'All_MSE': None,
                'BestGroup_MSE': None
            })

    return pd.DataFrame(summary_rows)


In [15]:
results = [
    {'name': 'Bodyfat', 'type': 'regression', 'table': df_Bodyfat_results},
    {'name': 'Advertising', 'type': 'regression', 'table': df_Advertising_results},
    {'name': 'BostonHousing', 'type': 'regression', 'table': df_BostonHousing_results},
    {'name': 'Cookie_Recipes', 'type': 'regression', 'table': df_Cookie_Recipes_results},
    #{'name': 'California_Housing', 'type': 'regression', 'table': df_California_Housing_results},
    {'name': 'wine', 'type': 'classification', 'table': df_wine_results},
    #{'name': 'Jannis(43977)', 'type': 'classification', 'table': df_Jannis_results},
    {'name': 'Pistachio', 'type': 'classification', 'table': df_Pistachio_results},
    #{'name': 'MagicTelescope(43971)', 'type': 'classification', 'table': df_MagicTelescope_results},
    #{'name': 'Hill_Valley_with_Noise(1046)', 'type': 'classification', 'table': df_Noise_results},
    {'name': 'Climate Model Simulation', 'type': 'classification', 'table': df_Climate_results}

]

summary = summarize_max_diff_results(results)
print(summary)


KeyError: 'All_MSE'