In [251]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import optuna

In [252]:
df = pd.read_csv('data.csv')

In [253]:
X = df.drop('overall_rating', axis=1)
y = df['overall_rating']

In [254]:
columns_with_large_values = X.select_dtypes(exclude='object').columns[(X.select_dtypes(exclude='object') > 100).any()]
print(columns_with_large_values)

Index(['height_cm'], dtype='object')


In [255]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X['height_cm'] = scaler.fit_transform(X['height_cm'].values.reshape(-1, 1))

In [256]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=42)

In [257]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    mse = mean_squared_error(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    return mse, mae, r2

def print_metrics(mse, mae, r2):
    print(f'MSE: {mse:.3f}')
    print(f'MAE: {mae:.3f}')
    print(f'R2: {r2:.3f}')

In [258]:
from sklearn.pipeline import Pipeline
from category_encoders import TargetEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
preprocessor = Pipeline([
    ('te', TargetEncoder(cols=['nationality']))
])
knn = KNeighborsRegressor(n_neighbors=5)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', knn)
])
pipeline.fit(X_train, y_train)


In [259]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score


def objective(trial):
    n_neighbors = trial.suggest_int('n_neighbors', 1, 30)
    p = trial.suggest_int('p', 1, 2) 
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    pipeline.set_params(
    model__n_neighbors=n_neighbors,
    model__p=p,
    model__weights=weights,
    model__n_jobs = -1
    )
    score = cross_val_score(pipeline , X_train, y_train, cv=5, scoring='r2').mean()
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30) 

[I 2024-12-24 01:49:28,485] A new study created in memory with name: no-name-4c07d1a0-fa22-4b58-b044-3e49f7b66ef9
[I 2024-12-24 01:49:29,198] Trial 0 finished with value: 0.8881259010499969 and parameters: {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}. Best is trial 0 with value: 0.8881259010499969.
[I 2024-12-24 01:49:29,860] Trial 1 finished with value: 0.8858361963698501 and parameters: {'n_neighbors': 9, 'p': 1, 'weights': 'uniform'}. Best is trial 0 with value: 0.8881259010499969.
[I 2024-12-24 01:49:30,550] Trial 2 finished with value: 0.8848807467304546 and parameters: {'n_neighbors': 8, 'p': 1, 'weights': 'uniform'}. Best is trial 0 with value: 0.8881259010499969.
[I 2024-12-24 01:49:30,977] Trial 3 finished with value: 0.8671759227725673 and parameters: {'n_neighbors': 23, 'p': 2, 'weights': 'uniform'}. Best is trial 0 with value: 0.8881259010499969.
[I 2024-12-24 01:49:31,692] Trial 4 finished with value: 0.8866672019712519 and parameters: {'n_neighbors': 16, 'p': 1, 'we

In [260]:
print("Best parameters:", study.best_params)
print("Best R2:", study.best_value)

Best parameters: {'n_neighbors': 14, 'p': 1, 'weights': 'distance'}
Best R2: 0.8881505959180591


In [261]:
pipeline.set_params(
    model__n_neighbors=study.best_params['n_neighbors'],
    model__p=study.best_params['p'],
    model__weights=study.best_params['weights'],
    model__n_jobs = -1
)
pipeline.fit(X_train, y_train)

y_train_pred = pipeline.predict(X_train)

y_pred = pipeline.predict(X_test)

y_data = pipeline.predict(X)

print('Train Set:')
print(f'MSE: {mean_squared_error(y_train, y_train_pred)}')
print(f'MAE: {mean_absolute_error(y_train, y_train_pred)}')
print(f'R2: {r2_score(y_train, y_train_pred)}')

print()

print('Test Set:')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'R2: {r2_score(y_test, y_pred)}')

print()

print('Data Set:')
print(f'MSE: {mean_squared_error(y, y_data)}')
print(f'MAE: {mean_absolute_error(y, y_data)}')
print(f'R2: {r2_score(y, y_data)}')

Train Set:
MSE: 0.0
MAE: 0.0
R2: 1.0

Test Set:
MSE: 5.261709474076671
MAE: 1.532081045052238
R2: 0.8900818038558396

Data Set:
MSE: 1.7540605931672082
MAE: 0.5107431566860317
R2: 0.9636955181086035
