In [168]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import optuna

In [169]:
# Load the raw dataset from the CSV file located next to the notebook.
df = pd.read_csv(filepath_or_buffer='data.csv')

In [170]:
# Find every column that holds at least one value above 100; these are the
# candidates for rescaling.
exceeds_100 = df.gt(100).any(axis=0)
columns_with_large_values = df.columns[exceeds_100]
print(columns_with_large_values)

Index(['height_cm'], dtype='object')


In [171]:
from sklearn.preprocessing import RobustScaler

# Rescale `height_cm`, the only column flagged as having values above 100.
# The scaler expects a 2-D input, hence the reshape to a single-column matrix.
# NOTE(review): the scaler is fitted on the whole frame before the train/test
# split, so test rows influence the scaling statistics — consider fitting on
# the training split only.
scaler = RobustScaler()
height_matrix = df['height_cm'].to_numpy().reshape(-1, 1)
df['height_cm'] = scaler.fit_transform(height_matrix)

In [172]:
# Separate the regression target from the predictor columns.
target_column = 'overall_rating'
X = df.drop(columns=[target_column])
y = df[target_column]

In [173]:
from sklearn.model_selection import train_test_split

# Hold out one third of the rows for testing; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1 / 3,
    random_state=42,
)

In [174]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score


def objective(trial):
    """Score one KNN hyperparameter configuration proposed by Optuna.

    Samples `n_neighbors` (1-30), the Minkowski power `p` (1 or 2) and the
    neighbour weighting scheme from `trial`, then evaluates a
    `KNeighborsRegressor` with those settings using 5-fold cross-validation
    on the training split.

    Returns:
        The mean R2 score across the five folds.
    """
    params = {
        'n_neighbors': trial.suggest_int('n_neighbors', 1, 30),
        'p': trial.suggest_int('p', 1, 2),
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
    }

    model = KNeighborsRegressor(n_jobs=-1, **params)

    fold_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    return fold_scores.mean()

# Seed the sampler so that a fresh kernel (Restart & Run All) proposes the same
# sequence of trials and therefore reports the same best parameters; with the
# default unseeded sampler every run explores different configurations.
study = optuna.create_study(
    direction='maximize',  # the objective returns mean CV R2, higher is better
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)


[I 2024-12-23 01:47:36,597] A new study created in memory with name: no-name-6e40ea18-60ec-4afa-bdb2-56411e6dce37
[I 2024-12-23 01:47:36,845] Trial 0 finished with value: 0.9069756374822628 and parameters: {'n_neighbors': 24, 'p': 2, 'weights': 'distance'}. Best is trial 0 with value: 0.9069756374822628.
[I 2024-12-23 01:47:37,039] Trial 1 finished with value: 0.9089776868602151 and parameters: {'n_neighbors': 11, 'p': 2, 'weights': 'uniform'}. Best is trial 1 with value: 0.9089776868602151.
[I 2024-12-23 01:47:37,234] Trial 2 finished with value: 0.9098296903093225 and parameters: {'n_neighbors': 12, 'p': 2, 'weights': 'distance'}. Best is trial 2 with value: 0.9098296903093225.
[I 2024-12-23 01:47:37,421] Trial 3 finished with value: 0.9097520544700839 and parameters: {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}. Best is trial 2 with value: 0.9098296903093225.
[I 2024-12-23 01:47:37,657] Trial 4 finished with value: 0.9072269379204162 and parameters: {'n_neighbors': 23, 'p': 2, 

In [175]:
# The study maximises the mean 5-fold cross-validated R2 of a regressor, so the
# best value is an R2 score, not a classification accuracy.
print("Best parameters:", study.best_params)
print("Best CV R2:", study.best_value)

Best parameters: {'n_neighbors': 14, 'p': 1, 'weights': 'distance'}
Best accuracy: 0.9295409432513413


In [176]:
def print_regression_metrics(label, y_true, y_hat):
    """Print MSE, MAE and R2 of predictions `y_hat` against `y_true`.

    Args:
        label: Heading printed above the metrics (e.g. 'Train Set').
        y_true: Ground-truth target values.
        y_hat: Model predictions aligned with `y_true`.
    """
    print(f'{label}:')
    print(f'MSE: {mean_squared_error(y_true, y_hat)}')
    print(f'MAE: {mean_absolute_error(y_true, y_hat)}')
    print(f'R2: {r2_score(y_true, y_hat)}')


# Refit on the whole training split using the best hyperparameters from the
# study. Unpacking the dict keeps this call in sync with the search space.
knn = KNeighborsRegressor(**study.best_params, n_jobs=-1)
knn.fit(X_train, y_train)

y_train_pred = knn.predict(X_train)
y_pred = knn.predict(X_test)
y_data = knn.predict(X)

# NOTE: when the selected weighting is 'distance', each training row is its own
# zero-distance neighbour, so the train-set scores come out perfect and say
# nothing about fit quality.
print_regression_metrics('Train Set', y_train, y_train_pred)

print()

# The held-out test split is the only unbiased estimate reported here.
print_regression_metrics('Test Set', y_test, y_pred)

print()

# NOTE: `X` contains the training rows as well, so this figure mixes seen and
# unseen data and is optimistic compared with the test-set scores.
print_regression_metrics('Data Set', y, y_data)

Train Set:
MSE: 0.0
MAE: 0.0
R2: 1.0

Test Set:
MSE: 3.2719674619590497
MAE: 1.327023662935483
R2: 0.9294130653275741

Data Set:
MSE: 1.090591236670237
MAE: 0.44234103466128183
R2: 0.976660957085409
