<a href="https://colab.research.google.com/github/mehrnazeraeisi/Best-Features-Group-Results-on-KNN/blob/main/Best_Features_Group_Results_on_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
def knn_auto_evaluator_from_xy(X_data, y_data, test_sizes=(0.1, 0.13, 0.16, 0.19, 0.25), n_repeats=20, random_state=None):
    """Compare KNN on all features against KNN on a greedily selected feature group.

    For every test size and every repeat the data is split once, then:

    1. each feature is scored on its own (model fitted and scored on the
       training split);
    2. features are ranked by that score and every top-k prefix of the ranking
       (k = 1 .. number of features) is scored on the training split; the
       first prefix with the highest score becomes the "best group";
    3. one model on all features and one model on the best group are fitted on
       the training split and evaluated on the held-out test split.

    The task is treated as regression when ``y_data`` has a numeric dtype with
    more than 10 distinct values, and as classification otherwise.

    Parameters
    ----------
    X_data : pandas.DataFrame
        Feature table; features are selected by column name.
    y_data : array-like
        Target values aligned with the rows of ``X_data``.
    test_sizes : iterable of float
        Values forwarded to ``train_test_split(test_size=...)``.
    n_repeats : int
        Number of random splits evaluated per test size.
    random_state : int, numpy.random.RandomState or None, default None
        Seed for the splits. ``None`` keeps the previous unseeded behaviour
        (NumPy's global random state). With a seed, one generator is shared by
        all splits, so repeats still differ from each other but the whole run
        is reproducible.

    Returns
    -------
    pandas.DataFrame
        One row per (test size, repeat). Columns are
        ``Test Size, Repeat, All_R2, All_MSE, BestGroup_R2, BestGroup_MSE`` for
        regression and ``Test Size, Repeat, All_Accuracy, BestGroup_Accuracy``
        for classification. When there is nothing to evaluate (empty
        ``test_sizes`` or ``n_repeats < 1``) an empty frame with those columns
        is returned.
    """
    import warnings

    import numpy as np
    import pandas as pd

    # Detect the task type from the target.
    if pd.api.types.is_numeric_dtype(y_data):
        target_type = 'regression' if len(np.unique(y_data)) > 10 else 'classification'
    else:
        target_type = 'classification'
    is_regression = target_type == 'regression'

    # The column layout depends only on the task type, so it is fixed up front
    # instead of being (re)assigned inside the loop, where it stayed undefined
    # whenever the loop body never ran.
    if is_regression:
        column_order = ['Test Size', 'Repeat', 'All_R2', 'All_MSE', 'BestGroup_R2', 'BestGroup_MSE']
    else:
        column_order = ['Test Size', 'Repeat', 'All_Accuracy', 'BestGroup_Accuracy']

    test_sizes = list(test_sizes)
    if not test_sizes or n_repeats < 1:
        return pd.DataFrame(columns=column_order)

    # scikit-learn is only needed once there is actual work to do.
    from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
    from sklearn.utils import check_random_state

    make_model = KNeighborsRegressor if is_regression else KNeighborsClassifier
    score_fn = r2_score if is_regression else accuracy_score
    # For random_state=None this is NumPy's global RandomState, i.e. the same
    # source train_test_split would use on its own.
    rng = check_random_state(random_state)

    def in_sample_score(X_fit, y_fit, columns):
        # NOTE(review): the score is computed on the same rows the model was
        # fitted on, so it is an in-sample score; confirm this is the intended
        # selection criterion.
        model = make_model()
        model.fit(X_fit[columns], y_fit)
        return score_fn(y_fit, model.predict(X_fit[columns]))

    feature_names = X_data.columns.tolist()
    results = []

    # Silence warnings only while this function runs instead of installing a
    # permanent, process-wide "ignore" filter.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        for ts in test_sizes:
            for repeat in range(1, n_repeats + 1):
                X_train, X_test, y_train, y_test = train_test_split(
                    X_data, y_data, test_size=ts, random_state=rng
                )

                # Step 1: rank the features by their individual score.
                scores = [
                    (feature, in_sample_score(X_train, y_train, [feature]))
                    for feature in feature_names
                ]
                df_scores = pd.DataFrame(scores, columns=['Feature', 'Score']).sort_values(by='Score', ascending=False)
                ranked_features = df_scores['Feature'].tolist()

                # Step 2: pick the best top-k prefix of the ranking. The strict
                # ">" keeps the smallest group when several prefixes tie.
                best_score = -np.inf
                best_group = []
                for k in range(1, len(ranked_features) + 1):
                    group = ranked_features[:k]
                    score = in_sample_score(X_train, y_train, group)
                    if score > best_score:
                        best_score = score
                        best_group = group

                # Step 3: compare both models on the held-out split.
                model_all = make_model()
                model_best = make_model()
                model_all.fit(X_train, y_train)
                model_best.fit(X_train[best_group], y_train)
                y_pred_all = model_all.predict(X_test)
                y_pred_best = model_best.predict(X_test[best_group])

                if is_regression:
                    result_row = {
                        'Test Size': ts,
                        'Repeat': repeat,
                        'All_R2': r2_score(y_test, y_pred_all),
                        'All_MSE': mean_squared_error(y_test, y_pred_all),
                        'BestGroup_R2': r2_score(y_test, y_pred_best),
                        'BestGroup_MSE': mean_squared_error(y_test, y_pred_best),
                    }
                else:
                    result_row = {
                        'Test Size': ts,
                        'Repeat': repeat,
                        'All_Accuracy': accuracy_score(y_test, y_pred_all),
                        'BestGroup_Accuracy': accuracy_score(y_test, y_pred_best),
                    }

                results.append(result_row)

    # Build the final DataFrame with the exact column order.
    return pd.DataFrame(results)[column_order]


In [None]:
!pip install openml

Collecting openml
  Downloading openml-0.15.1-py3-none-any.whl.metadata (10 kB)
Collecting liac-arff>=2.4.0 (from openml)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xmltodict (from openml)
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting minio (from openml)
  Downloading minio-7.2.15-py3-none-any.whl.metadata (6.7 kB)
Collecting pycryptodome (from minio->openml)
  Downloading pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading openml-0.15.1-py3-none-any.whl (160 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.4/160.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading minio-7.2.15-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xmltodict-0.14.2-py2.py3-none-any.whl (10.0 kB)
Downloading pycr

In [None]:
import pandas as pd
import os

folder_path = '/content/drive/MyDrive/my_csv_files'

# List every entry in the folder. os.listdir returns names in arbitrary order,
# so the listing is sorted to make the order of `dataframes` reproducible.
files = sorted(os.listdir(folder_path))

dataframes = []

for file in files:
    file_path = os.path.join(folder_path, file)
    # Compare extensions case-insensitively so e.g. "DATA.CSV" is not skipped.
    lowered_name = file.lower()

    if lowered_name.endswith('.csv'):
        df = pd.read_csv(file_path)
        print(f"Loaded CSV: {file}")

    elif lowered_name.endswith(('.xlsx', '.xls')):
        df = pd.read_excel(file_path)
        print(f"Loaded Excel: {file}")

    else:
        print(f"Skipped unsupported file: {file}")
        continue

    dataframes.append(df)


In [None]:
# Collects one {'name', 'type', 'table'} record per evaluated dataset; the
# dataset cells below append to this list.
results = []

In [None]:
import pandas as pd


# Path of bodyfat.csv on Google Drive
file_path = '/content/drive/MyDrive/my_csv_files/bodyfat.csv'

# Read the file
data = pd.read_csv(file_path)

# 'Pct.BF' is the target; every other column is a feature
y = data['Pct.BF']
X = data.drop(columns='Pct.BF')

print(X.shape)

# Run the evaluation and keep the table for the final summary
df_results = knn_auto_evaluator_from_xy(X, y)
print(df_results)

results.append(dict(name='Bodyfat', type='regression', table=df_results))

(250, 14)
    Test Size  Repeat    All_R2    All_MSE  BestGroup_R2  BestGroup_MSE  \
0        0.10       1  0.681876  18.391184      0.658823      19.723904   
1        0.10       2  0.672007  30.814640      0.684761      29.616400   
2        0.10       3  0.725342  18.687792      0.777707      15.124880   
3        0.10       4  0.560290  26.714800      0.404347      36.189152   
4        0.10       5  0.632024  29.812416      0.731632      21.742496   
..        ...     ...       ...        ...           ...            ...   
95       0.25      16  0.640825  29.548102      0.622324      31.070152   
96       0.25      17  0.643078  27.752254      0.668668      25.762571   
97       0.25      18  0.501166  28.379441      0.429264      32.470070   
98       0.25      19  0.543435  34.613130      0.610796      29.506394   
99       0.25      20  0.647670  21.445308      0.625781      22.777619   

    BestGroup Size  
0                9  
1                4  
2               12  
3    

In [None]:
import pandas as pd

# Load the file
data = pd.read_csv('/content/drive/MyDrive/my_csv_files/advertising.csv')

# Separate the features from the target. The target of this dataset is
# 'sales'; the earlier split on 'Pct.BF' belonged to the body-fat cell and
# raised a KeyError here, so it has been dropped.
X = data.drop('sales', axis=1)
y = data['sales']

print(X.shape)

df_results = knn_auto_evaluator_from_xy(X, y)
print(df_results)

results.append({'name': 'Advertising', 'type': 'regression', 'table': df_results})

(200, 3)
    Test Size  Repeat    All_R2   All_MSE  BestGroup_R2  BestGroup_MSE  \
0        0.10       1  0.964943  0.954440      0.984500       0.422000   
1        0.10       2  0.969972  0.887780      0.980583       0.574060   
2        0.10       3  0.872489  2.564180      0.967207       0.659460   
3        0.10       4  0.917554  2.991300      0.931708       2.477780   
4        0.10       5  0.929257  1.491680      0.973599       0.556680   
..        ...     ...       ...       ...           ...            ...   
95       0.25      16  0.922305  2.560896      0.961670       1.263384   
96       0.25      17  0.918546  2.407288      0.953062       1.387200   
97       0.25      18  0.913628  2.247872      0.966034       0.883968   
98       0.25      19  0.922666  1.413320      0.937424       1.143608   
99       0.25      20  0.933725  1.842632      0.971975       0.779192   

    BestGroup Size  
0                2  
1                2  
2                2  
3                2

In [None]:
import pandas as pd

# Load the file
data = pd.read_csv('/content/drive/MyDrive/my_csv_files/BostonHousing.csv')

# 'medv' is the target; every other column is a feature
y = data['medv']
X = data.drop(columns='medv')

print(X.shape)

# Run the evaluation and keep the table for the final summary
df_results = knn_auto_evaluator_from_xy(X, y)
print(df_results)
results.append(dict(name='BostonHousing', type='regression', table=df_results))

(506, 13)
    Test Size  Repeat    All_R2    All_MSE  BestGroup_R2  BestGroup_MSE  \
0        0.10       1  0.461981  37.868329      0.828331      12.082847   
1        0.10       2  0.462183  34.197420      0.724891      17.492980   
2        0.10       3  0.626196  17.912149      0.827907       8.246463   
3        0.10       4  0.499698  44.915553      0.787902      19.041522   
4        0.10       5  0.594493  45.088227      0.904275      10.643647   
..        ...     ...       ...        ...           ...            ...   
95       0.25      16  0.652965  34.740602      0.826463      17.372205   
96       0.25      17  0.522003  40.931685      0.713643      24.521175   
97       0.25      18  0.399042  56.572135      0.700057      28.235657   
98       0.25      19  0.500818  39.576211      0.740000      20.613332   
99       0.25      20  0.562341  31.271373      0.776025      16.003329   

    BestGroup Size  
0                4  
1                5  
2                4  
3    

In [None]:
import pandas as pd

# Load the file
data = pd.read_csv('/content/drive/MyDrive/my_csv_files/cookie_recipes.csv')

# 'rating' is the target; every other column is a feature
y = data['rating']
X = data.drop(columns='rating')

print(X.shape)

# Run the evaluation and keep the table for the final summary
df_results = knn_auto_evaluator_from_xy(X, y)
print(df_results)

results.append(dict(name='Cookie_Recipes', type='regression', table=df_results))

(50, 5)
    Test Size  Repeat    All_R2   All_MSE  BestGroup_R2  BestGroup_MSE  \
0        0.10       1 -0.066933  2.000500      0.981867       0.034000   
1        0.10       2  0.191483  1.281500      0.981703       0.029000   
2        0.10       3 -0.517293  1.009000      0.922556       0.051500   
3        0.10       4  0.250492  1.143000      0.983607       0.025000   
4        0.10       5 -0.745062  1.413500      0.972222       0.022500   
..        ...     ...       ...       ...           ...            ...   
95       0.25      16 -0.156218  1.193846      0.903711       0.099423   
96       0.25      17  0.189206  0.936731      0.920102       0.092308   
97       0.25      18 -0.152684  1.312115      0.868057       0.150192   
98       0.25      19 -0.279788  1.607308      0.961720       0.048077   
99       0.25      20 -0.023022  1.587500      0.888837       0.172500   

    BestGroup Size  
0                2  
1                2  
2                2  
3                2 

In [None]:
from sklearn.datasets import fetch_california_housing
import pandas as pd


# Load the California housing data
data = fetch_california_housing()

# Features as a DataFrame with named columns, target as a Series
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

print(X.shape)

# Run the evaluation and keep the table for the final summary
df_results = knn_auto_evaluator_from_xy(X, y)
print(df_results)

results.append(dict(name='California_Housing', type='regression', table=df_results))

(20640, 8)
    Test Size  Repeat    All_R2   All_MSE  BestGroup_R2  BestGroup_MSE  \
0        0.10       1  0.142189  1.113761      0.752107       0.321858   
1        0.10       2  0.161395  1.119952      0.753139       0.329682   
2        0.10       3  0.158542  1.154931      0.771324       0.313866   
3        0.10       4  0.182538  1.078940      0.752998       0.326009   
4        0.10       5  0.180928  1.073027      0.766139       0.306370   
..        ...     ...       ...       ...           ...            ...   
95       0.25      16  0.137955  1.168975      0.759572       0.326032   
96       0.25      17  0.137816  1.147727      0.751359       0.330987   
97       0.25      18  0.153993  1.098401      0.758202       0.313936   
98       0.25      19  0.158825  1.133839      0.749905       0.337109   
99       0.25      20  0.160110  1.113141      0.746891       0.335456   

    BestGroup Size  
0                6  
1                6  
2                6  
3               

In [None]:
from sklearn.datasets import load_wine
# Load the wine dataset as pandas objects and split features / target
data = load_wine(as_frame=True)
X, y = data.data, data.target

# Run the evaluation and keep the table for the final summary
df_results = knn_auto_evaluator_from_xy(X, y)
print(df_results)

results.append(dict(name='wine', type='classification', table=df_results))

    Test Size  Repeat  All_Accuracy  BestGroup_Accuracy  BestGroup Size
0        0.10       1      0.611111            0.888889               2
1        0.10       2      0.833333            0.888889               4
2        0.10       3      0.611111            0.833333               2
3        0.10       4      0.666667            0.555556               8
4        0.10       5      0.666667            0.833333               1
..        ...     ...           ...                 ...             ...
95       0.25      16      0.733333            0.977778               4
96       0.25      17      0.577778            0.755556               1
97       0.25      18      0.777778            0.911111               2
98       0.25      19      0.533333            0.911111               2
99       0.25      20      0.711111            0.933333               2

[100 rows x 5 columns]


In [None]:
import openml
import pandas as pd

# Download the dataset
dataset = openml.datasets.get_dataset(43977)
data, y_data, _, _ = dataset.get_data(target=dataset.default_target_attribute)

# Attach the target to the full frame under the fixed column name 'rating'
data['rating'] = y_data

# Split features and target again
y = data['rating']
X = data.drop(columns='rating')

print(X.shape)

# Run the evaluation and keep the table for the final summary
df_results = knn_auto_evaluator_from_xy(X, y)
print(df_results)

results.append(dict(name='Jannis(43977)', type='classification', table=df_results))

(57580, 54)


In [None]:
import pandas as pd

# Load the Excel file and discard rows that contain NaN
data = pd.read_excel('/content/drive/MyDrive/my_csv_files/Pistachio.xlsx').dropna()

# 'Class' is the target; every other column is a feature
y = data['Class']
X = data.drop(columns='Class')

print(data.shape)

# Run the evaluation and keep the table for the final summary
df_results = knn_auto_evaluator_from_xy(X, y)
print(df_results)

results.append(dict(name='Pistachio', type='classification', table=df_results))

(2148, 17)
    Test Size  Repeat  All_Accuracy  BestGroup_Accuracy  BestGroup Size
0        0.10       1      0.790698            0.823256               2
1        0.10       2      0.800000            0.865116               5
2        0.10       3      0.725581            0.837209               4
3        0.10       4      0.706977            0.837209               6
4        0.10       5      0.800000            0.888372               7
..        ...     ...           ...                 ...             ...
95       0.25      16      0.739292            0.841713               5
96       0.25      17      0.756052            0.826816               3
97       0.25      18      0.728119            0.811918               2
98       0.25      19      0.729981            0.821229               4
99       0.25      20      0.765363            0.834264               4

[100 rows x 5 columns]


In [None]:
#
import openml
import pandas as pd

# Download the dataset
dataset = openml.datasets.get_dataset(43971)
data, y_data, _, _ = dataset.get_data(target=dataset.default_target_attribute)
print(data.shape)

# Attach the target to the full frame under the fixed column name 'rating'
data['rating'] = y_data

# Split features and target again
y = data['rating']
X = data.drop(columns='rating')

print(X.shape)

# Run the evaluation and keep the table for the final summary
df_results = knn_auto_evaluator_from_xy(X, y)
print(df_results)

results.append(dict(name='MagicTelescope(43971)', type='classification', table=df_results))

(13376, 10)
قبل از حذف NaN و ستون: (13376, 11)
Index(['fLength:', 'fWidth:', 'fSize:', 'fConc:', 'fConc1:', 'fAsym:',
       'fM3Long:', 'fM3Trans:', 'fAlpha:', 'fDist:', 'rating'],
      dtype='object')
   fLength:  fWidth:  fSize:  fConc:  fConc1:   fAsym:  fM3Long:  fM3Trans:  \
0   69.2979  26.8809  3.1930  0.2065   0.1074  39.6296   44.3457   -23.0604   
1   24.5939  10.1418  2.5676  0.5007   0.2693  -8.4503   15.2452    -7.0283   
2   55.4800  27.1606  3.1826  0.2299   0.1225  43.1016   54.2556    13.7406   
3   12.6594  11.7413  2.1351  0.7033   0.3846 -15.8596    9.4522    -8.7126   
4   38.6204  20.5632  2.9770  0.2478   0.1270 -13.8229  -31.3983   -13.1337   

   fAlpha:   fDist: rating  
0   9.3234  248.750      g  
1  17.0056  173.288      g  
2  32.2220  262.181      g  
3  43.5434  227.711      g  
4   5.8671  192.467      g  
بعد از حذف: (13376, 11)
(13376, 10)
    Test Size  Repeat  All_Accuracy  BestGroup_Accuracy  BestGroup Size
0        0.10       1      0.765321    

In [None]:

import openml
import pandas as pd

# Download the dataset
dataset = openml.datasets.get_dataset(1046)
data, y_data, _, _ = dataset.get_data(target=dataset.default_target_attribute)

# Attach the target to the full frame under the fixed column name 'rating'
data['rating'] = y_data

# Report the shape before and after discarding rows that contain NaN
print(data.shape)
data = data.dropna()
print(data.shape)

# Split features and target
y = data['rating']
X = data.drop(columns='rating')
data_features = X.columns

# Run the evaluation and keep the table for the final summary
df_results = knn_auto_evaluator_from_xy(X, y)
print(df_results)

results.append(dict(name='Hill_Valley_with_Noise(1046)', type='classification', table=df_results))

In [None]:
# Climate Model Simulation
# Climate simulation (binary target)
import openml
import pandas as pd

# Download the dataset
dataset = openml.datasets.get_dataset(40966)
data, y_data, _, _ = dataset.get_data(target=dataset.default_target_attribute)

# Attach the target to the full frame under the fixed column name 'rating'
data['rating'] = y_data

# Report the shape before and after discarding rows that contain NaN
print(data.shape)
data = data.dropna()
print(data.shape)

# Split features and target
y = data['rating']
X = data.drop(columns='rating')

# Run the evaluation and keep the table for the final summary
df_results = knn_auto_evaluator_from_xy(X, y)
print(df_results)

results.append(dict(name='Climate Model Simulation', type='classification', table=df_results))

In [None]:
import pandas as pd

def summarize_max_diff_results(dataset_tables):
    """Summarise, per dataset, the run where the best feature group helped most.

    Parameters
    ----------
    dataset_tables : list of dict
        Records of the form::

            [
                {'name': 'Diabetes', 'type': 'regression', 'table': df1},
                {'name': 'Wine', 'type': 'classification', 'table': df2},
                ...
            ]

        A regression table must provide the columns ``All_R2``, ``All_MSE``,
        ``BestGroup_R2`` and ``BestGroup_MSE``; a classification table must
        provide ``All_Accuracy`` and ``BestGroup_Accuracy``.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns ``Dataset, Type, All_Accuracy,
        BestGroup_Accuracy, All_R2, BestGroup_R2, All_MSE, BestGroup_MSE``.
        For regression the row is taken from the run with the largest
        ``All_MSE - BestGroup_MSE``; for classification from the run with the
        largest ``BestGroup_Accuracy - All_Accuracy``. Metrics that do not
        apply to the record's type are ``None``. A record whose table is empty
        yields a row with all metrics ``None``; records with any other type
        are skipped.

    The input tables are not modified.
    """
    metric_columns = [
        'All_Accuracy', 'BestGroup_Accuracy',
        'All_R2', 'BestGroup_R2',
        'All_MSE', 'BestGroup_MSE',
    ]
    reported_columns = {
        'regression': ['All_R2', 'BestGroup_R2', 'All_MSE', 'BestGroup_MSE'],
        'classification': ['All_Accuracy', 'BestGroup_Accuracy'],
    }

    summary_rows = []

    for item in dataset_tables:
        name = item['name']
        dtype = item['type']
        df = item['table']

        # The difference is kept in a separate Series so the caller's table
        # does not grow a helper column as a side effect.
        if dtype == 'regression':
            # Largest MSE reduction
            diff = df['All_MSE'] - df['BestGroup_MSE']
        elif dtype == 'classification':
            # Largest accuracy gain
            diff = df['BestGroup_Accuracy'] - df['All_Accuracy']
        else:
            continue

        row = {'Dataset': name, 'Type': dtype}
        row.update(dict.fromkeys(metric_columns))

        # idxmax raises on an empty Series; an empty table keeps its None metrics.
        if not diff.empty:
            best_row = df.loc[diff.idxmax()]
            for column in reported_columns[dtype]:
                row[column] = best_row[column]

        summary_rows.append(row)

    return pd.DataFrame(summary_rows)


In [None]:
# Summarise the records that the dataset cells appended to `results`.
# Each record must keep its own table: rebuilding the list with the current
# `df_results` would attach the last evaluated dataset's table to every entry.
# When a dataset cell was run more than once, only its latest record is kept.
results = list({record['name']: record for record in results}.values())

summary = summarize_max_diff_results(results)
print(summary)
