In [165]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import optuna

In [166]:
# Load the raw dataset; the path is resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer='data.csv')

In [167]:
# Separate the regression target from the remaining predictor columns.
y = df['overall_rating']
X = df.drop(columns=['overall_rating'])

In [168]:
from sklearn.model_selection import train_test_split
# Hold out one third of the rows for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=42,
)

In [169]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(model, X, y):
    """Score a fitted model on the given features and targets.

    Parameters
    ----------
    model : object
        Any fitted estimator exposing ``predict``.
    X : array-like
        Feature rows passed to ``model.predict``.
    y : array-like
        True target values compared against the predictions.

    Returns
    -------
    tuple
        ``(mse, mae, r2)`` computed in that order.
    """
    predictions = model.predict(X)
    return (
        mean_squared_error(y, predictions),
        mean_absolute_error(y, predictions),
        r2_score(y, predictions),
    )

def print_metrics(mse, mae, r2):
    """Print MSE, MAE and R2 on separate lines, each rounded to three decimals.

    Parameters
    ----------
    mse, mae, r2 : float
        Metric values, typically the tuple returned by ``evaluate_model``.
    """
    report_lines = (
        f'MSE: {mse:.3f}',
        f'MAE: {mae:.3f}',
        f'R2: {r2:.3f}',
    )
    print(*report_lines, sep='\n')

In [170]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold

def cv_evaluate(model, X, y):
    """Cross-validate ``model`` and print the per-fold and summary scores.

    Uses a shuffled 5-fold split with a fixed seed and reports R2, MAE and MSE.
    For every metric the raw per-fold array is printed, followed by its mean
    and standard deviation; timing entries are skipped. Nothing is returned.

    Parameters
    ----------
    model : estimator
        Estimator (or pipeline) handed to ``cross_validate``.
    X : array-like
        Feature rows.
    y : array-like
        Target values.
    """
    folds = KFold(n_splits=5, shuffle=True, random_state=42)

    metric_functions = {
        'r2': r2_score,
        'mae': mean_absolute_error,
        'mse': mean_squared_error,
    }
    scorers = {name: make_scorer(fn) for name, fn in metric_functions.items()}

    results = cross_validate(model, X, y, cv=folds, scoring=scorers, return_train_score=False)

    timing_keys = ('fit_time', 'score_time')
    for name, fold_values in results.items():
        # Only the metric arrays are of interest, not the timing bookkeeping.
        if name not in timing_keys:
            print(f"{name}: {fold_values}")
            print(f"{name} trung bình: {fold_values.mean():.3f} ± {fold_values.std():.3f}")
            print()

In [171]:
from sklearn.pipeline import Pipeline
from category_encoders import TargetEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
# Baseline model: target-encode the categorical 'nationality' column, then fit
# a 5-nearest-neighbours regressor on the encoded features.
# NOTE(review): StandardScaler is imported above but no scaling step is applied
# before the distance-based KNN - confirm this is intended.
preprocessor = Pipeline([
    ('te', TargetEncoder(cols=['nationality']))
])
knn = KNeighborsRegressor(n_neighbors=5)
# The step names are part of the parameter paths used with set_params
# (e.g. 'model__n_neighbors'), so keep them stable.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', knn)
])
pipeline.fit(X_train, y_train)
print("===== Holdout evaluation:=====")
print("Training set:")
print_metrics(*evaluate_model(pipeline, X_train, y_train))

print("\nTest set:")
print_metrics(*evaluate_model(pipeline, X_test, y_test))

# Cross-validation is run on the full dataset (X, y), not only on the
# training split.
print("\n===== Cross-validation evaluation:=====")
cv_evaluate(pipeline, X, y)

===== Holdout evaluation:=====
Trainig set:
MSE: 4.414
MAE: 1.463
R2: 0.909

Test set:
MSE: 6.718
MAE: 1.799
R2: 0.860

===== Cross-validation evaluation:=====
test_r2: [0.87709957 0.85987542 0.86709442 0.86070557 0.86544988]
test_r2 trung bình: 0.866 ± 0.006

test_mae: [1.7194429  1.76986072 1.74644748 1.75374756 1.77770967]
test_mae trung bình: 1.753 ± 0.020

test_mse: [5.97611142 6.73895265 6.43399276 6.5790248  6.60637503]
test_mse trung bình: 6.467 ± 0.264



In [172]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score


def objective(trial):
    """Optuna objective: mean 5-fold cross-validated R2 of the KNN pipeline.

    Samples ``n_neighbors`` (1-30), the distance power ``p`` (1 or 2) and the
    ``weights`` scheme, applies them to the ``model`` step of the module-level
    ``pipeline`` and scores it on the training split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyper-parameter values.

    Returns
    -------
    float
        Mean R2 over the cross-validation folds.
    """
    n_neighbors = trial.suggest_int('n_neighbors', 1, 30)
    p = trial.suggest_int('p', 1, 2)
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    # KNeighborsRegressor has no ``random_state`` parameter; passing
    # ``model__random_state`` makes set_params raise ValueError, so only
    # parameters the estimator actually exposes are set here.
    pipeline.set_params(
        model__n_neighbors=n_neighbors,
        model__p=p,
        model__weights=weights,
        model__n_jobs=-1,
    )
    fold_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
    return fold_scores.mean()

# Seed the sampler so the hyper-parameter search draws the same trials on
# every run; without a seed the best parameters can change between runs.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

[I 2024-12-24 01:01:30,957] A new study created in memory with name: no-name-4cb18187-e32d-463a-b041-adf31a5d0795
[I 2024-12-24 01:01:31,325] Trial 0 finished with value: 0.8692609652254213 and parameters: {'n_neighbors': 15, 'p': 2, 'weights': 'uniform'}. Best is trial 0 with value: 0.8692609652254213.
[I 2024-12-24 01:01:32,051] Trial 1 finished with value: 0.8868758583152309 and parameters: {'n_neighbors': 20, 'p': 1, 'weights': 'distance'}. Best is trial 1 with value: 0.8868758583152309.
[I 2024-12-24 01:01:32,771] Trial 2 finished with value: 0.8846212517948742 and parameters: {'n_neighbors': 23, 'p': 1, 'weights': 'uniform'}. Best is trial 1 with value: 0.8868758583152309.
[I 2024-12-24 01:01:33,489] Trial 3 finished with value: 0.8879327597899269 and parameters: {'n_neighbors': 14, 'p': 1, 'weights': 'distance'}. Best is trial 3 with value: 0.8879327597899269.
[I 2024-12-24 01:01:34,172] Trial 4 finished with value: 0.8870418148083455 and parameters: {'n_neighbors': 11, 'p': 1, 

In [173]:
# Report the best hyper-parameters found by the study and the score they reached.
print(f"Best parameters: {study.best_params}")
print(f"Best R2: {study.best_value}")

Best parameters: {'n_neighbors': 14, 'p': 1, 'weights': 'distance'}
Best R2: 0.8879327597899269


In [174]:
# Apply the best hyper-parameters found by the study to the KNN step.
# KNeighborsRegressor has no ``random_state`` parameter; passing
# ``model__random_state`` makes set_params raise ValueError, so it is omitted.
pipeline.set_params(
    model__n_neighbors=study.best_params['n_neighbors'],
    model__p=study.best_params['p'],
    model__weights=study.best_params['weights'],
    model__n_jobs=-1,
)

# Refit on the training split only, then predict each subset.
pipeline.fit(X_train, y_train)

y_train_pred = pipeline.predict(X_train)

y_pred = pipeline.predict(X_test)

y_data = pipeline.predict(X)

print('Train Set:')
print(f'MSE: {mean_squared_error(y_train, y_train_pred)}')
print(f'MAE: {mean_absolute_error(y_train, y_train_pred)}')
print(f'R2: {r2_score(y_train, y_train_pred)}')

print()

print('Test Set:')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'R2: {r2_score(y_test, y_pred)}')

print()

# NOTE(review): X contains the rows the pipeline was just fitted on, so these
# figures blend training and test performance; the test-set block above is
# the one that reflects generalisation.
print('Data Set:')
print(f'MSE: {mean_squared_error(y, y_data)}')
print(f'MAE: {mean_absolute_error(y, y_data)}')
print(f'R2: {r2_score(y, y_data)}')

Train Set:
MSE: 0.0
MAE: 0.0
R2: 1.0

Test Set:
MSE: 5.321782219318997
MAE: 1.5408331999437999
R2: 0.888826871817688

Data Set:
MSE: 1.7741302408827706
MAE: 0.5136719883539171
R2: 0.9632801287173279
