<a href="https://colab.research.google.com/github/mehrnazeraeisi/Best-Features-Group-Results-on-KNN/blob/main/Best_Features_Group_Results_on_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
def knn_auto_evaluator_from_xy(X_data, y_data, test_sizes=(0.1, 0.13, 0.16, 0.19, 0.25), n_repeats=20,
                               random_state=None):
    """Compare KNN on all features against KNN on a score-ranked feature subset.

    The target is treated as a regression target when it is numeric with more
    than 10 distinct values, and as a classification target otherwise.

    For every test size and every repeat the data is split with
    ``train_test_split`` and then:

    1. each feature is scored on its own with a default KNN model;
    2. features are ranked by that score and the top-k prefix (k = 1..n) with
       the highest score becomes the "best group" (on ties the smaller prefix
       is kept);
    3. a model on all features and a model on the best group are both
       evaluated on the held-out test split.

    NOTE: the scores used in steps 1 and 2 are computed on the training split
    itself (the model predicts the rows it was fitted on). The test split is
    only used in step 3.

    Args:
        X_data: pandas DataFrame of features; columns are selected by name.
        y_data: target values aligned with ``X_data``.
        test_sizes: iterable of values passed as ``test_size`` to
            ``train_test_split``.
        n_repeats: number of random splits evaluated per test size.
        random_state: optional seed (or ``RandomState``) for reproducible
            splits. ``None`` keeps the unseeded behaviour.

    Returns:
        A DataFrame with one row per (test size, repeat). Regression columns:
        ``Test Size, Repeat, All_R2, All_MSE, BestGroup_R2, BestGroup_MSE,
        BestGroup Size``; classification columns: ``Test Size, Repeat,
        All_Accuracy, BestGroup_Accuracy, BestGroup Size``. When ``test_sizes``
        is empty or ``n_repeats`` is below 1, an empty DataFrame with those
        columns is returned.
    """
    import warnings

    import numpy as np
    import pandas as pd

    # NOTE: this silences warnings for the whole process, not only for this call.
    warnings.filterwarnings("ignore")

    # Detect the target type.
    is_regression = bool(pd.api.types.is_numeric_dtype(y_data)) and len(np.unique(y_data)) > 10

    # The output layout depends only on the target type, so it is fixed up
    # front; this keeps it defined even when no split is ever evaluated.
    if is_regression:
        column_order = ['Test Size', 'Repeat', 'All_R2', 'All_MSE', 'BestGroup_R2', 'BestGroup_MSE', 'BestGroup Size']
    else:
        column_order = ['Test Size', 'Repeat', 'All_Accuracy', 'BestGroup_Accuracy', 'BestGroup Size']

    test_sizes = list(test_sizes)
    if not test_sizes or n_repeats < 1:
        return pd.DataFrame(columns=column_order)

    # scikit-learn is only needed once there is at least one split to evaluate.
    from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
    from sklearn.utils import check_random_state

    make_model = KNeighborsRegressor if is_regression else KNeighborsClassifier
    score_fn = r2_score if is_regression else accuracy_score
    # One shared generator: every split advances it, so repeats differ from
    # each other while the whole run is reproducible for a given seed.
    rng = check_random_state(random_state)

    def train_score(X_train, y_train, columns):
        """Fit on the given training columns and score on those same rows."""
        model = make_model().fit(X_train[columns], y_train)
        return score_fn(y_train, model.predict(X_train[columns]))

    feature_names = X_data.columns.tolist()
    results = []

    for ts in test_sizes:
        for repeat in range(1, n_repeats + 1):
            X_train, X_test, y_train, y_test = train_test_split(
                X_data, y_data, test_size=ts, random_state=rng
            )

            # Rank the features by their single-feature score, best first.
            scores = [(feature, train_score(X_train, y_train, [feature])) for feature in feature_names]
            df_scores = pd.DataFrame(scores, columns=['Feature', 'Score']).sort_values(by='Score', ascending=False)
            ranked = df_scores['Feature'].tolist()

            # Pick the best-scoring prefix of the ranking; the strict
            # comparison keeps the smaller prefix on ties.
            best_score = -np.inf
            best_group = []
            for k in range(1, len(ranked) + 1):
                group = ranked[:k]
                score = train_score(X_train, y_train, group)
                if score > best_score:
                    best_score, best_group = score, group

            # Held-out comparison: all features versus the selected group.
            y_pred_all = make_model().fit(X_train, y_train).predict(X_test)
            y_pred_best = make_model().fit(X_train[best_group], y_train).predict(X_test[best_group])

            result_row = {'Test Size': ts, 'Repeat': repeat}
            if is_regression:
                result_row['All_R2'] = r2_score(y_test, y_pred_all)
                result_row['All_MSE'] = mean_squared_error(y_test, y_pred_all)
                result_row['BestGroup_R2'] = r2_score(y_test, y_pred_best)
                result_row['BestGroup_MSE'] = mean_squared_error(y_test, y_pred_best)
            else:
                result_row['All_Accuracy'] = accuracy_score(y_test, y_pred_all)
                result_row['BestGroup_Accuracy'] = accuracy_score(y_test, y_pred_best)
            result_row['BestGroup Size'] = len(best_group)

            results.append(result_row)

    # Build the final DataFrame with the exact column order.
    return pd.DataFrame(results, columns=column_order)


In [39]:
!pip install openml



In [40]:
# Accumulates one {'name', 'type', 'table'} entry per evaluated dataset.
results = []

In [41]:
import pandas as pd

# Load the CSV file.
data = pd.read_csv('bodyfat.csv')

# Separate the label column from the feature columns.
y = data['Pct.BF']
X = data.drop(columns='Pct.BF')

print(X.shape)

# Run the evaluator and show the per-split table.
df_results = knn_auto_evaluator_from_xy(X, y)
print(df_results)

results.append(dict(name='Bodyfat', type='regression', table=df_results))

(250, 14)
    Test Size  Repeat    All_R2    All_MSE  BestGroup_R2  BestGroup_MSE  \
0        0.10       1  0.605321  29.571680      0.677308      24.178000   
1        0.10       2  0.723277  18.092944      0.789383      13.770752   
2        0.10       3  0.372797  28.006224      0.220051      34.826752   
3        0.10       4  0.275963  34.090944      0.431441      26.770336   
4        0.10       5  0.609246  26.262864      0.726025      18.414064   
..        ...     ...       ...        ...           ...            ...   
95       0.25      16  0.534107  31.918794      0.595406      27.719181   
96       0.25      17  0.507036  40.709016      0.577582      34.883321   
97       0.25      18  0.465182  35.135886      0.517647      31.689105   
98       0.25      19  0.553975  35.035689      0.553175      35.098502   
99       0.25      20  0.489560  32.558343      0.572325      27.279194   

    BestGroup Size  
0                6  
1                7  
2                4  
3    

In [42]:
import pandas as pd

# Load the CSV file.
data = pd.read_csv('advertising.csv')

# Separate the label column from the feature columns.
y = data['sales']
X = data.drop(columns='sales')

print(X.shape)

# Run the evaluator and show the per-split table.
df_results = knn_auto_evaluator_from_xy(X, y)
print(df_results)

results.append(dict(name='Advertising', type='regression', table=df_results))

(200, 3)
    Test Size  Repeat    All_R2   All_MSE  BestGroup_R2  BestGroup_MSE  \
0        0.10       1  0.853259  2.519700      0.959226       0.700140   
1        0.10       2  0.877551  2.588140      0.951972       1.015140   
2        0.10       3  0.926652  2.314680      0.959724       1.271000   
3        0.10       4  0.880115  3.078340      0.961445       0.990000   
4        0.10       5  0.971056  0.596560      0.979042       0.431960   
..        ...     ...       ...       ...           ...            ...   
95       0.25      16  0.914089  2.139632      0.950383       1.235712   
96       0.25      17  0.907451  2.740576      0.962306       1.116184   
97       0.25      18  0.946659  1.328456      0.970453       0.735864   
98       0.25      19  0.946201  1.479504      0.963709       0.998016   
99       0.25      20  0.907645  3.060128      0.941008       1.954672   

    BestGroup Size  
0                2  
1                2  
2                2  
3                2

In [43]:
import pandas as pd

# Load the CSV file.
data = pd.read_csv('BostonHousing.csv')

# Separate the label column from the feature columns.
y = data['medv']
X = data.drop(columns='medv')

print(X.shape)

# Run the evaluator and show the per-split table.
df_results = knn_auto_evaluator_from_xy(X, y)
print(df_results)

results.append(dict(name='BostonHousing', type='regression', table=df_results))

(506, 13)
    Test Size  Repeat    All_R2    All_MSE  BestGroup_R2  BestGroup_MSE  \
0        0.10       1  0.698922  26.041898      0.875070      10.805898   
1        0.10       2  0.287161  34.704722      0.685654      15.304024   
2        0.10       3  0.659594  27.636149      0.756643      19.757106   
3        0.10       4  0.670401  29.268486      0.859472      12.478918   
4        0.10       5  0.486180  48.488204      0.737875      24.736196   
..        ...     ...       ...        ...           ...            ...   
95       0.25      16  0.549483  34.474217      0.782469      16.645824   
96       0.25      17  0.295023  60.866236      0.780882      18.918142   
97       0.25      18  0.608110  37.891430      0.806545      18.704929   
98       0.25      19  0.532905  44.259616      0.826658      16.425068   
99       0.25      20  0.527611  44.348724      0.850005      14.081786   

    BestGroup Size  
0                4  
1                4  
2                6  
3    

In [44]:
import pandas as pd

# Load the CSV file.
data = pd.read_csv('cookie_recipes.csv')

# Separate the label column from the feature columns.
y = data['rating']
X = data.drop(columns='rating')

print(X.shape)

# Run the evaluator and show the per-split table.
df_results = knn_auto_evaluator_from_xy(X, y)
print(df_results)

results.append(dict(name='Cookie_Recipes', type='regression', table=df_results))

(50, 5)
    Test Size  Repeat    All_R2   All_MSE  BestGroup_R2  BestGroup_MSE  \
0        0.10       1  0.604494  0.528000      0.845318       0.206500   
1        0.10       2 -0.350952  1.418500      0.890000       0.115500   
2        0.10       3 -1.325532  0.546500      0.708511       0.068500   
3        0.10       4  0.518103  0.559000      0.976293       0.027500   
4        0.10       5  0.334824  1.041000      0.946965       0.083000   
..        ...     ...       ...       ...           ...            ...   
95       0.25      16 -0.013787  1.554423      0.972533       0.042115   
96       0.25      17 -0.215294  1.390577      0.963866       0.041346   
97       0.25      18 -0.125806  1.677885      0.964645       0.052692   
98       0.25      19  0.020483  1.230192      0.937067       0.079038   
99       0.25      20 -0.261176  1.586731      0.891476       0.136538   

    BestGroup Size  
0                2  
1                2  
2                2  
3                2 

In [45]:
from sklearn.datasets import fetch_california_housing
import pandas as pd


# Load the California housing data and build the feature frame and the
# target series once (the evaluator selects feature columns by name).
data = fetch_california_housing()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

print(X.shape)

# Run the evaluator and show the per-split table.
df_results = knn_auto_evaluator_from_xy(X, y)
print(df_results)

results.append({'name': 'California_Housing', 'type': 'regression', 'table': df_results})

(20640, 8)
    Test Size  Repeat    All_R2   All_MSE  BestGroup_R2  BestGroup_MSE  \
0        0.10       1  0.189981  1.104182      0.763497       0.322390   
1        0.10       2  0.131980  1.144015      0.755058       0.322824   
2        0.10       3  0.143528  1.118629      0.735296       0.345727   
3        0.10       4  0.153098  1.142771      0.764340       0.317989   
4        0.10       5  0.187486  1.112069      0.767564       0.318130   
..        ...     ...       ...       ...           ...            ...   
95       0.25      16  0.137947  1.123397      0.761978       0.310181   
96       0.25      17  0.149222  1.126264      0.756173       0.322780   
97       0.25      18  0.149472  1.139188      0.751869       0.332344   
98       0.25      19  0.173378  1.071361      0.754148       0.318641   
99       0.25      20  0.135865  1.175535      0.761521       0.324417   

    BestGroup Size  
0                6  
1                6  
2                6  
3               

In [33]:
from sklearn.datasets import load_wine

# Load the wine dataset as pandas objects and unpack features and labels.
data = load_wine(as_frame=True)
X, y = data.data, data.target

# Run the evaluator and show the per-split table.
df_results = knn_auto_evaluator_from_xy(X, y)
print(df_results)

results.append(dict(name='wine', type='classification', table=df_results))

    Test Size  Repeat  All_Accuracy  BestGroup_Accuracy  BestGroup Size
0        0.10       1      0.777778            0.944444               4
1        0.10       2      0.777778            0.833333               2
2        0.10       3      0.444444            0.666667               1
3        0.10       4      0.611111            0.944444               3
4        0.10       5      0.555556            0.666667               1
..        ...     ...           ...                 ...             ...
95       0.25      16      0.711111            0.844444               2
96       0.25      17      0.688889            0.933333               2
97       0.25      18      0.600000            0.800000               1
98       0.25      19      0.711111            0.755556               2
99       0.25      20      0.733333            0.955556               2

[100 rows x 5 columns]


In [34]:
import pandas as pd

# Load the Excel file and drop the rows that contain NaN.
data = pd.read_excel('Pistachio.xlsx').dropna()

# Separate the label column from the feature columns.
y = data['Class']
X = data.drop(columns='Class')

print(data.shape)

# Run the evaluator and show the per-split table.
df_results = knn_auto_evaluator_from_xy(X, y)
print(df_results)

results.append(dict(name='Pistachio', type='classification', table=df_results))

(2148, 17)
    Test Size  Repeat  All_Accuracy  BestGroup_Accuracy  BestGroup Size
0        0.10       1      0.790698            0.855814               7
1        0.10       2      0.748837            0.832558               4
2        0.10       3      0.795349            0.795349               7
3        0.10       4      0.730233            0.809302               7
4        0.10       5      0.739535            0.832558               5
..        ...     ...           ...                 ...             ...
95       0.25      16      0.743017            0.832402               5
96       0.25      17      0.733706            0.806331               4
97       0.25      18      0.770950            0.834264               3
98       0.25      19      0.748603            0.821229               6
99       0.25      20      0.765363            0.839851               3

[100 rows x 5 columns]


In [36]:
#
import openml
import pandas as pd

# Download the dataset.
dataset = openml.datasets.get_dataset(43971)
data, y_data, _, _ = dataset.get_data(target=dataset.default_target_attribute)
print(data.shape)

# Attach the target to the full frame; the target is stored under the
# column name 'rating'.
data = data.assign(rating=y_data)

# Separate the label column from the feature columns.
y = data['rating']
X = data.drop(columns='rating')

print(X.shape)

# Run the evaluator and show the per-split table.
df_results = knn_auto_evaluator_from_xy(X, y)
print(df_results)

results.append(dict(name='MagicTelescope(43971)', type='classification', table=df_results))

(13376, 10)
قبل از حذف NaN و ستون: (13376, 11)
Index(['fLength:', 'fWidth:', 'fSize:', 'fConc:', 'fConc1:', 'fAsym:',
       'fM3Long:', 'fM3Trans:', 'fAlpha:', 'fDist:', 'rating'],
      dtype='object')
   fLength:  fWidth:  fSize:  fConc:  fConc1:   fAsym:  fM3Long:  fM3Trans:  \
0   69.2979  26.8809  3.1930  0.2065   0.1074  39.6296   44.3457   -23.0604   
1   24.5939  10.1418  2.5676  0.5007   0.2693  -8.4503   15.2452    -7.0283   
2   55.4800  27.1606  3.1826  0.2299   0.1225  43.1016   54.2556    13.7406   
3   12.6594  11.7413  2.1351  0.7033   0.3846 -15.8596    9.4522    -8.7126   
4   38.6204  20.5632  2.9770  0.2478   0.1270 -13.8229  -31.3983   -13.1337   

   fAlpha:   fDist: rating  
0   9.3234  248.750      g  
1  17.0056  173.288      g  
2  32.2220  262.181      g  
3  43.5434  227.711      g  
4   5.8671  192.467      g  
بعد از حذف: (13376, 11)
(13376, 10)
    Test Size  Repeat  All_Accuracy  BestGroup_Accuracy  BestGroup Size
0        0.10       1      0.765321    

In [None]:

import openml
import pandas as pd

dataset = openml.datasets.get_dataset(1046)
data, y_data, _, _ = dataset.get_data(target=dataset.default_target_attribute)

# Attach the target to the full frame (mirrors reading a CSV file); the
# target is stored under the column name 'rating' to match the other cells.
data = data.assign(rating=y_data)
print(data.shape)

# Drop the rows that contain NaN.
data = data.dropna()
print(data.shape)

# Separate the label column from the feature columns.
y = data['rating']
X = data.drop(columns='rating')
data_features = X.columns

# Run the evaluator and show the per-split table.
df_results = knn_auto_evaluator_from_xy(X, y)
print(df_results)

results.append(dict(name='Hill_Valley_with_Noise(1046)', type='classification', table=df_results))

In [None]:
# Climate Model Simulation
# Climate simulation (binary target)
import openml
import pandas as pd

dataset = openml.datasets.get_dataset(40966)
data, y_data, _, _ = dataset.get_data(target=dataset.default_target_attribute)

# Attach the target to the full frame (mirrors reading a CSV file); the
# target is stored under the column name 'rating' to match the other cells.
data = data.assign(rating=y_data)
print(data.shape)

# Drop the rows that contain NaN.
data = data.dropna()
print(data.shape)

# Separate the label column from the feature columns.
y = data['rating']
X = data.drop(columns='rating')

# Run the evaluator and show the per-split table.
df_results = knn_auto_evaluator_from_xy(X, y)
print(df_results)

results.append(dict(name='Climate Model Simulation', type='classification', table=df_results))

In [None]:
import pandas as pd

def summarize_max_diff_results(dataset_tables):
    """
    Summarize, per dataset, the run where the best feature group helped most.

    dataset_tables: a list of dictionaries shaped like:
    [
        {'name': 'Diabetes', 'type': 'regression', 'table': df1},
        {'name': 'Wine', 'type': 'classification', 'table': df2},
        ...
    ]

    For 'regression' tables the selected run maximizes
    ``All_MSE - BestGroup_MSE``; for 'classification' tables it maximizes
    ``BestGroup_Accuracy - All_Accuracy``. Entries with any other type
    contribute no row. The input tables are not modified.

    Returns: a DataFrame with one row per dataset and the columns
    ``Dataset, Type, All_Accuracy, BestGroup_Accuracy, All_R2, BestGroup_R2,
    All_MSE, BestGroup_MSE``. Metrics that do not apply to the dataset type
    are left empty, as are all metrics of a table that has no usable rows.
    """
    summary_columns = [
        'Dataset', 'Type',
        'All_Accuracy', 'BestGroup_Accuracy',
        'All_R2', 'BestGroup_R2',
        'All_MSE', 'BestGroup_MSE',
    ]
    summary_rows = []

    for item in dataset_tables:
        name = item['name']
        task_type = item['type']
        df = item['table']

        # The gain is kept in a separate Series so the caller's table does
        # not silently grow a helper column.
        if task_type == 'regression':
            gain = df['All_MSE'] - df['BestGroup_MSE']
            reported = ['All_R2', 'BestGroup_R2', 'All_MSE', 'BestGroup_MSE']
        elif task_type == 'classification':
            gain = df['BestGroup_Accuracy'] - df['All_Accuracy']
            reported = ['All_Accuracy', 'BestGroup_Accuracy']
        else:
            continue

        row = dict.fromkeys(summary_columns)
        row['Dataset'] = name
        row['Type'] = task_type

        # idxmax cannot pick a row from an empty or all-NaN gain; such a
        # dataset still gets a row, just without metrics.
        gain = gain.dropna()
        if not gain.empty:
            best_row = df.loc[gain.idxmax()]
            for column in reported:
                row[column] = best_row[column]

        summary_rows.append(row)

    return pd.DataFrame(summary_rows, columns=summary_columns)


In [None]:
# Summarize the tables accumulated in `results` by the dataset cells; each
# entry already has the {'name', 'type', 'table'} shape the summarizer expects.
summary = summarize_max_diff_results(results)
print(summary)
