<a href="https://colab.research.google.com/github/mehrnazeraeisi/Best-Features-Group-Results-on-KNN/blob/main/Best_Features_Group_Results_on_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
def knn_auto_evaluator_from_xy(X_data, y_data, test_sizes=(0.1, 0.13, 0.16, 0.19, 0.25), n_repeats=20,
                               random_state=None):
    """Compare KNN on all features against KNN on a selected feature group.

    For every test size and every repeat the data is split, each feature is
    scored on its own with a default KNN model, the features are ranked by that
    score, and the top-k prefix of the ranking with the highest score is kept as
    the "best group". The all-features model and the best-group model are then
    both evaluated on the held-out test split.

    Note: the per-feature ranking and the group selection are scored on the
    training split itself (the model is fitted and asked to predict the same
    rows); only the final reported metrics use the test split.

    Parameters
    ----------
    X_data : pandas.DataFrame
        Feature matrix; feature groups are selected by column name.
    y_data : pandas.Series or array-like
        Target values. A numeric target with more than 10 distinct values is
        treated as regression; anything else is treated as classification.
    test_sizes : iterable of float
        Values passed as ``test_size`` to ``train_test_split``.
    n_repeats : int
        Number of random splits evaluated per test size.
    random_state : int or None
        ``None`` (default) keeps the splits unseeded. An integer seeds one
        random stream shared by all splits, so a run is reproducible while the
        individual splits still differ from each other.

    Returns
    -------
    pandas.DataFrame
        One row per (test size, repeat). Regression columns are
        ``Test Size, Repeat, All_R2, All_MSE, BestGroup_R2, BestGroup_MSE``;
        classification columns are
        ``Test Size, Repeat, All_Accuracy, BestGroup_Accuracy``.
        If there is nothing to evaluate (no test sizes or ``n_repeats < 1``)
        an empty frame with those columns is returned.
    """
    import warnings

    import numpy as np
    import pandas as pd

    # Detect the task type from the target.
    if pd.api.types.is_numeric_dtype(y_data):
        target_type = 'regression' if len(np.unique(y_data)) > 10 else 'classification'
    else:
        target_type = 'classification'
    is_regression = target_type == 'regression'

    # The column layout depends only on the task type, so it is fixed up front
    # instead of being (re)assigned inside the loop.
    if is_regression:
        column_order = ['Test Size', 'Repeat', 'All_R2', 'All_MSE', 'BestGroup_R2', 'BestGroup_MSE']
    else:
        column_order = ['Test Size', 'Repeat', 'All_Accuracy', 'BestGroup_Accuracy']

    test_sizes = list(test_sizes)
    if not test_sizes or n_repeats < 1:
        # Nothing to evaluate: return an empty, correctly shaped table.
        return pd.DataFrame(columns=column_order)

    # Imported only once there is actual work to do.
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
    from sklearn.metrics import r2_score, mean_squared_error, accuracy_score

    make_model = KNeighborsRegressor if is_regression else KNeighborsClassifier
    selection_metric = r2_score if is_regression else accuracy_score

    # A shared RandomState gives reproducible but distinct splits; None keeps
    # the original unseeded behaviour.
    split_rng = None if random_state is None else np.random.RandomState(random_state)

    feature_names = X_data.columns.tolist()
    results = []

    # Silence warnings only while evaluating, without touching the
    # process-wide warning filters after the function returns.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        for ts in test_sizes:
            for repeat in range(1, n_repeats + 1):
                X_train, X_test, y_train, y_test = train_test_split(
                    X_data, y_data, test_size=ts, random_state=split_rng
                )

                # Score every feature on its own (on the training split).
                scores = []
                for feature in feature_names:
                    single_feature = X_train[[feature]]
                    model = make_model()
                    model.fit(single_feature, y_train)
                    y_pred = model.predict(single_feature)
                    scores.append((feature, selection_metric(y_train, y_pred)))

                df_scores = pd.DataFrame(scores, columns=['Feature', 'Score']).sort_values(
                    by='Score', ascending=False
                )
                ranked_features = df_scores['Feature'].tolist()

                # Keep the top-k prefix of the ranking with the highest score.
                best_score = -np.inf
                best_group = []
                for k in range(1, len(ranked_features) + 1):
                    group = ranked_features[:k]
                    model = make_model()
                    model.fit(X_train[group], y_train)
                    y_pred = model.predict(X_train[group])
                    score = selection_metric(y_train, y_pred)
                    if score > best_score:
                        best_score = score
                        best_group = group

                # Final comparison on the held-out test split.
                model_all = make_model()
                model_best = make_model()
                model_all.fit(X_train, y_train)
                model_best.fit(X_train[best_group], y_train)
                y_pred_all = model_all.predict(X_test)
                y_pred_best = model_best.predict(X_test[best_group])

                result_row = {'Test Size': ts, 'Repeat': repeat}
                if is_regression:
                    result_row['All_R2'] = r2_score(y_test, y_pred_all)
                    result_row['All_MSE'] = mean_squared_error(y_test, y_pred_all)
                    result_row['BestGroup_R2'] = r2_score(y_test, y_pred_best)
                    result_row['BestGroup_MSE'] = mean_squared_error(y_test, y_pred_best)
                else:
                    result_row['All_Accuracy'] = accuracy_score(y_test, y_pred_all)
                    result_row['BestGroup_Accuracy'] = accuracy_score(y_test, y_pred_best)

                results.append(result_row)

    # Build the final DataFrame with the exact column order.
    return pd.DataFrame(results, columns=column_order)


In [2]:
!pip install openml

Collecting openml
  Downloading openml-0.15.1-py3-none-any.whl.metadata (10 kB)
Collecting liac-arff>=2.4.0 (from openml)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xmltodict (from openml)
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting minio (from openml)
  Downloading minio-7.2.15-py3-none-any.whl.metadata (6.7 kB)
Collecting pycryptodome (from minio->openml)
  Downloading pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading openml-0.15.1-py3-none-any.whl (160 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.4/160.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading minio-7.2.15-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xmltodict-0.14.2-py2.py3-none-any.whl (10.0 kB)
Downloading pycr

In [4]:
# Mount Google Drive at /content/drive (Colab only); the dataset files used by
# the cells below are read from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
import os

folder_path = '/content/drive/MyDrive/my_csv_files'

# Every entry found in the folder; only CSV and Excel files are loaded below.
files = os.listdir(folder_path)

dataframes = []

for file in files:
    file_path = os.path.join(folder_path, file)

    is_csv = file.endswith('.csv')
    is_excel = file.endswith(('.xlsx', '.xls'))

    # Anything that is neither CSV nor Excel is reported and left out.
    if not (is_csv or is_excel):
        print(f"Skipped unsupported file: {file}")
        continue

    if is_csv:
        df = pd.read_csv(file_path)
        print(f"Loaded CSV: {file}")
    else:
        df = pd.read_excel(file_path)
        print(f"Loaded Excel: {file}")

    dataframes.append(df)


Loaded CSV: advertising.csv
Loaded CSV: bodyfat.csv
Loaded CSV: cookie_recipes.csv
Loaded Excel: Pistachio.xlsx
Loaded CSV: BostonHousing.csv


In [6]:
# Collects one {'name', 'type', 'table'} record per dataset evaluated in the cells below.
results = []

In [7]:
import pandas as pd


# Location of the body-fat CSV on the mounted Drive.
file_path = '/content/drive/MyDrive/my_csv_files/bodyfat.csv'

# Read the file.
data = pd.read_csv(file_path)

# 'Pct.BF' is the target; every other column is a feature.
y = data['Pct.BF']
X = data.drop(columns='Pct.BF')

print(X.shape)

# Compare KNN on all features against KNN on the selected feature group.
df_results = knn_auto_evaluator_from_xy(X_data=X, y_data=y)
print(df_results)

results.append(dict(name='Bodyfat', type='regression', table=df_results))

(250, 14)
    Test Size  Repeat    All_R2    All_MSE  BestGroup_R2  BestGroup_MSE
0        0.10       1  0.419217  29.486432      0.408658      30.022544
1        0.10       2  0.110041  41.196352      0.281145      33.275904
2        0.10       3  0.584576  26.284272      0.609465      24.709536
3        0.10       4  0.540317  34.161632      0.708230      21.683040
4        0.10       5  0.498518  29.937648      0.478922      31.107536
..        ...     ...       ...        ...           ...            ...
95       0.25      16  0.501854  31.450203      0.559539      27.808292
96       0.25      17  0.533641  29.547454      0.607994      24.836584
97       0.25      18  0.569441  29.288083      0.711113      19.651054
98       0.25      19  0.456579  37.367765      0.599993      27.506083
99       0.25      20  0.581122  29.877549      0.606551      28.063784

[100 rows x 6 columns]


In [9]:
import pandas as pd

# Read the advertising CSV from the mounted Drive.
data = pd.read_csv('/content/drive/MyDrive/my_csv_files/advertising.csv')

# 'sales' is the target; every other column is a feature.
y = data['sales']
X = data.drop(columns='sales')

print(X.shape)

# Compare KNN on all features against KNN on the selected feature group.
df_results = knn_auto_evaluator_from_xy(X_data=X, y_data=y)
print(df_results)

results.append(dict(name='Advertising', type='regression', table=df_results))

(200, 3)
    Test Size  Repeat    All_R2   All_MSE  BestGroup_R2  BestGroup_MSE
0        0.10       1  0.963853  0.920060      0.980365       0.499780
1        0.10       2  0.911186  2.930440      0.961060       1.284820
2        0.10       3  0.888333  1.955000      0.945588       0.952620
3        0.10       4  0.918486  1.799300      0.967152       0.725080
4        0.10       5  0.857805  4.874300      0.976684       0.799240
..        ...     ...       ...       ...           ...            ...
95       0.25      16  0.927579  2.045296      0.969825       0.852192
96       0.25      17  0.890672  2.751120      0.958717       1.038840
97       0.25      18  0.941213  1.438080      0.954808       1.105512
98       0.25      19  0.902074  2.697560      0.956414       1.200672
99       0.25      20  0.901444  2.126680      0.965186       0.751232

[100 rows x 6 columns]


In [10]:
import pandas as pd

# Read the Boston housing CSV from the mounted Drive.
data = pd.read_csv('/content/drive/MyDrive/my_csv_files/BostonHousing.csv')

# 'medv' is the target; every other column is a feature.
y = data['medv']
X = data.drop(columns='medv')

print(X.shape)

# Compare KNN on all features against KNN on the selected feature group.
df_results = knn_auto_evaluator_from_xy(X_data=X, y_data=y)
print(df_results)
results.append(dict(name='BostonHousing', type='regression', table=df_results))

(506, 13)
    Test Size  Repeat    All_R2    All_MSE  BestGroup_R2  BestGroup_MSE
0        0.10       1  0.423836  46.079231      0.792009      16.634275
1        0.10       2  0.593323  26.253176      0.836560      10.550965
2        0.10       3  0.361409  60.552392      0.891857      10.254290
3        0.10       4  0.642928  31.563843      0.836270      14.473106
4        0.10       5  0.502460  57.086337      0.801282      22.800329
..        ...     ...       ...        ...           ...            ...
95       0.25      16  0.536880  32.279052      0.832627      11.665735
96       0.25      17  0.555729  41.893367      0.809430      17.970154
97       0.25      18  0.530929  38.339553      0.813719      15.225657
98       0.25      19  0.475967  49.620986      0.738532      24.758567
99       0.25      20  0.458869  44.220929      0.773410      18.516850

[100 rows x 6 columns]


In [11]:
import pandas as pd

# Read the cookie recipes CSV from the mounted Drive.
data = pd.read_csv('/content/drive/MyDrive/my_csv_files/cookie_recipes.csv')

# 'rating' is the target; every other column is a feature.
y = data['rating']
X = data.drop(columns='rating')

print(X.shape)

# Compare KNN on all features against KNN on the selected feature group.
df_results = knn_auto_evaluator_from_xy(X_data=X, y_data=y)
print(df_results)

results.append(dict(name='Cookie_Recipes', type='regression', table=df_results))

(50, 5)
    Test Size  Repeat    All_R2   All_MSE  BestGroup_R2  BestGroup_MSE
0        0.10       1 -0.721831  1.222500      0.973239       0.019000
1        0.10       2  0.424190  1.333000      0.936285       0.147500
2        0.10       3  0.497619  0.422000      0.988095       0.010000
3        0.10       4  0.241689  1.391500      0.965940       0.062500
4        0.10       5 -2.870588  0.329000     -0.088235       0.092500
..        ...     ...       ...       ...           ...            ...
95       0.25      16 -0.041073  1.305962      0.942512       0.072115
96       0.25      17 -0.456259  1.514423      0.826174       0.180769
97       0.25      18  0.104751  1.062115      0.937918       0.073654
98       0.25      19  0.074099  1.019038      0.853401       0.161346
99       0.25      20 -0.298036  1.779038      0.841025       0.217885

[100 rows x 6 columns]


In [12]:
from sklearn.datasets import fetch_california_housing
import pandas as pd


# Load the California housing data and wrap it once as a feature DataFrame
# and a target Series (the evaluator selects features by column name).
data = fetch_california_housing()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

print(X.shape)

# Compare KNN on all features against KNN on the selected feature group.
df_results = knn_auto_evaluator_from_xy(X, y)
print(df_results)

results.append({'name': 'California_Housing', 'type': 'regression', 'table': df_results})

(20640, 8)
    Test Size  Repeat    All_R2   All_MSE  BestGroup_R2  BestGroup_MSE
0        0.10       1  0.187966  1.064107      0.755844       0.319947
1        0.10       2  0.162775  1.078757      0.746426       0.326728
2        0.10       3  0.182270  1.069149      0.756413       0.318480
3        0.10       4  0.153402  1.096748      0.750532       0.323180
4        0.10       5  0.176171  1.083092      0.758227       0.317861
..        ...     ...       ...       ...           ...            ...
95       0.25      16  0.152718  1.136790      0.762162       0.319105
96       0.25      17  0.141095  1.110259      0.762341       0.307209
97       0.25      18  0.149980  1.161571      0.751551       0.339511
98       0.25      19  0.159897  1.166262      0.755512       0.339408
99       0.25      20  0.158440  1.138586      0.741514       0.349719

[100 rows x 6 columns]


In [13]:
from sklearn.datasets import load_wine
# Load the wine dataset as pandas objects and split it into features/target.
data = load_wine(as_frame=True)
X, y = data.data, data.target

# Compare KNN on all features against KNN on the selected feature group.
df_results = knn_auto_evaluator_from_xy(X_data=X, y_data=y)
print(df_results)

results.append(dict(name='wine', type='classification', table=df_results))

    Test Size  Repeat  All_Accuracy  BestGroup_Accuracy
0        0.10       1      0.777778            0.944444
1        0.10       2      0.833333            0.888889
2        0.10       3      0.666667            0.833333
3        0.10       4      0.555556            0.944444
4        0.10       5      0.611111            0.666667
..        ...     ...           ...                 ...
95       0.25      16      0.755556            0.777778
96       0.25      17      0.688889            0.866667
97       0.25      18      0.733333            0.688889
98       0.25      19      0.644444            0.777778
99       0.25      20      0.688889            0.933333

[100 rows x 4 columns]


In [None]:
import openml
import pandas as pd

# Download OpenML dataset 43977 together with its default target attribute.
dataset = openml.datasets.get_dataset(43977)
data, y_data, _, _ = dataset.get_data(target=dataset.default_target_attribute)

# Store the target in the full frame under the column name 'rating'
# (the name is just a convention shared with the other cells).
data['rating'] = y_data

# Split back into labels and features.
y = data['rating']
X = data.drop(columns='rating')

print(X.shape)

# Compare KNN on all features against KNN on the selected feature group.
df_results = knn_auto_evaluator_from_xy(X_data=X, y_data=y)
print(df_results)

results.append(dict(name='Jannis(43977)', type='classification', table=df_results))

(57580, 54)


In [None]:
import pandas as pd

# Read the Excel file from the mounted Drive and drop rows containing NaN.
data = pd.read_excel('/content/drive/MyDrive/my_csv_files/Pistachio.xlsx').dropna()

# 'Class' is the label column; every other column is a feature.
y = data['Class']
X = data.drop(columns='Class')

print(data.shape)

# Compare KNN on all features against KNN on the selected feature group.
df_results = knn_auto_evaluator_from_xy(X_data=X, y_data=y)
print(df_results)

results.append(dict(name='Pistachio', type='classification', table=df_results))

(2148, 17)
    Test Size  Repeat  All_Accuracy  BestGroup_Accuracy  BestGroup Size
0        0.10       1      0.790698            0.823256               2
1        0.10       2      0.800000            0.865116               5
2        0.10       3      0.725581            0.837209               4
3        0.10       4      0.706977            0.837209               6
4        0.10       5      0.800000            0.888372               7
..        ...     ...           ...                 ...             ...
95       0.25      16      0.739292            0.841713               5
96       0.25      17      0.756052            0.826816               3
97       0.25      18      0.728119            0.811918               2
98       0.25      19      0.729981            0.821229               4
99       0.25      20      0.765363            0.834264               4

[100 rows x 5 columns]


In [None]:
#
import openml
import pandas as pd

# Download OpenML dataset 43971 together with its default target attribute.
dataset = openml.datasets.get_dataset(43971)
data, y_data, _, _ = dataset.get_data(target=dataset.default_target_attribute)
print(data.shape)

# Store the target in the full frame under the column name 'rating'
# (the name is just a convention shared with the other cells).
data['rating'] = y_data

# Split back into labels and features.
y = data['rating']
X = data.drop(columns='rating')

print(X.shape)

# Compare KNN on all features against KNN on the selected feature group.
df_results = knn_auto_evaluator_from_xy(X_data=X, y_data=y)
print(df_results)

results.append(dict(name='MagicTelescope(43971)', type='classification', table=df_results))

(13376, 10)
قبل از حذف NaN و ستون: (13376, 11)
Index(['fLength:', 'fWidth:', 'fSize:', 'fConc:', 'fConc1:', 'fAsym:',
       'fM3Long:', 'fM3Trans:', 'fAlpha:', 'fDist:', 'rating'],
      dtype='object')
   fLength:  fWidth:  fSize:  fConc:  fConc1:   fAsym:  fM3Long:  fM3Trans:  \
0   69.2979  26.8809  3.1930  0.2065   0.1074  39.6296   44.3457   -23.0604   
1   24.5939  10.1418  2.5676  0.5007   0.2693  -8.4503   15.2452    -7.0283   
2   55.4800  27.1606  3.1826  0.2299   0.1225  43.1016   54.2556    13.7406   
3   12.6594  11.7413  2.1351  0.7033   0.3846 -15.8596    9.4522    -8.7126   
4   38.6204  20.5632  2.9770  0.2478   0.1270 -13.8229  -31.3983   -13.1337   

   fAlpha:   fDist: rating  
0   9.3234  248.750      g  
1  17.0056  173.288      g  
2  32.2220  262.181      g  
3  43.5434  227.711      g  
4   5.8671  192.467      g  
بعد از حذف: (13376, 11)
(13376, 10)
    Test Size  Repeat  All_Accuracy  BestGroup_Accuracy  BestGroup Size
0        0.10       1      0.765321    

In [None]:

import openml
import pandas as pd

# Download OpenML dataset 1046 together with its default target attribute.
dataset = openml.datasets.get_dataset(1046)
data, y_data, _, _ = dataset.get_data(target=dataset.default_target_attribute)

# Store the target in the full frame under the column name 'rating', so the
# NaN filtering below removes a row from features and target together.
data['rating'] = y_data

print(data.shape)
# Drop rows containing NaN.
data = data.dropna()

print(data.shape)

# Split into labels and features.
y = data['rating']
X = data.drop(columns='rating')
data_features = X.columns

# Compare KNN on all features against KNN on the selected feature group.
df_results = knn_auto_evaluator_from_xy(X_data=X, y_data=y)
print(df_results)

results.append(dict(name='Hill_Valley_with_Noise(1046)', type='classification', table=df_results))

In [None]:
# Climate Model Simulation
# شبیه‌سازی آب‌وهوا (دوتایی)
import openml
import pandas as pd

# Download OpenML dataset 40966 together with its default target attribute.
dataset = openml.datasets.get_dataset(40966)
data, y_data, _, _ = dataset.get_data(target=dataset.default_target_attribute)

# Store the target in the full frame under the column name 'rating', so the
# NaN filtering below removes a row from features and target together.
data['rating'] = y_data

print(data.shape)
# Drop rows containing NaN.
data = data.dropna()

print(data.shape)

# Split into labels and features.
y = data['rating']
X = data.drop(columns='rating')

# Compare KNN on all features against KNN on the selected feature group.
df_results = knn_auto_evaluator_from_xy(X_data=X, y_data=y)
print(df_results)

results.append(dict(name='Climate Model Simulation', type='classification', table=df_results))

In [None]:
import pandas as pd

def summarize_max_diff_results(dataset_tables):
    """Summarize, per dataset, the run where the best feature group helped most.

    Parameters
    ----------
    dataset_tables : list of dict
        Records of the form::

            [
                {'name': 'Diabetes', 'type': 'regression', 'table': df1},
                {'name': 'Wine', 'type': 'classification', 'table': df2},
                ...
            ]

        A regression table must provide ``All_R2``, ``BestGroup_R2``,
        ``All_MSE`` and ``BestGroup_MSE``; a classification table must provide
        ``All_Accuracy`` and ``BestGroup_Accuracy``. Records whose ``type`` is
        neither ``'regression'`` nor ``'classification'`` are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per summarized dataset, taken from the run with the largest
        ``All_MSE - BestGroup_MSE`` (regression) or the largest
        ``BestGroup_Accuracy - All_Accuracy`` (classification). Metrics that do
        not apply to the dataset's type are ``None``. The columns are always
        present, even when no dataset was summarized.

    Notes
    -----
    The input tables are left untouched; the difference is computed as a
    separate Series instead of being written back as a new column.
    """
    summary_columns = [
        'Dataset', 'Type',
        'All_Accuracy', 'BestGroup_Accuracy',
        'All_R2', 'BestGroup_R2',
        'All_MSE', 'BestGroup_MSE',
    ]
    metric_columns = summary_columns[2:]
    summary_rows = []

    for item in dataset_tables:
        name = item['name']
        task_type = item['type']
        table = item['table']

        if task_type == 'regression':
            # Largest reduction in MSE achieved by the best group.
            improvement = table['All_MSE'] - table['BestGroup_MSE']
            reported = ('All_R2', 'BestGroup_R2', 'All_MSE', 'BestGroup_MSE')
        elif task_type == 'classification':
            # Largest gain in accuracy achieved by the best group.
            improvement = table['BestGroup_Accuracy'] - table['All_Accuracy']
            reported = ('All_Accuracy', 'BestGroup_Accuracy')
        else:
            continue

        best_row = table.loc[improvement.idxmax()]

        # Start from "not applicable" for every metric, then fill in the ones
        # that exist for this task type.
        row = {'Dataset': name, 'Type': task_type}
        row.update({column: None for column in metric_columns})
        row.update({column: best_row[column] for column in reported})
        summary_rows.append(row)

    return pd.DataFrame(summary_rows, columns=summary_columns)


In [None]:
# Summarize the per-dataset tables accumulated in `results` by the evaluation
# cells above. Each record already pairs a dataset name and task type with its
# own result table, so the list is used as-is. Rebuilding it from `df_results`
# would attach the most recently computed table to every dataset, and the
# regression entries would then be summarized against a classification table.
summary = summarize_max_diff_results(results)
print(summary)
