In [118]:
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
from sklearn.base import clone

In [119]:
# Load the dataset from 'data.csv' (relative path, resolved against the working directory).
df = pd.read_csv(filepath_or_buffer='data.csv')

In [120]:
# Separate the regression target ('overall_rating') from the remaining feature columns.
y = df['overall_rating']
X = df.drop(columns=['overall_rating'])

In [121]:
from sklearn.model_selection import train_test_split
# Hold out one third of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=1/3,
)

In [122]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(model, X, y):
    """Score ``model`` on the pair (X, y).

    Args:
        model: Object exposing ``predict(X)``.
        X: Features passed to ``model.predict``.
        y: True target values compared against the predictions.

    Returns:
        Tuple ``(mse, mae, r2)``.
    """
    predictions = model.predict(X)
    # Order matters: callers unpack the result as (mse, mae, r2).
    metric_functions = (mean_squared_error, mean_absolute_error, r2_score)
    return tuple(metric(y, predictions) for metric in metric_functions)

def print_metrics(mse, mae, r2):
    """Print MSE, MAE and R2 on separate lines, each formatted to three decimals."""
    report_lines = [
        f'MSE: {mse:.3f}',
        f'MAE: {mae:.3f}',
        f'R2: {r2:.3f}',
    ]
    print("\n".join(report_lines))

In [123]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold

def cv_evaluate(model, X, y):
    """Cross-validate ``model`` on (X, y) and print R2, MAE and MSE per fold.

    Uses a 5-fold shuffled ``KFold`` with a fixed seed. For every score entry
    returned by ``cross_validate`` (timing entries excluded) the per-fold
    values are printed, followed by their mean and standard deviation.
    Nothing is returned.
    """
    metric_functions = {
        'r2': r2_score,
        'mae': mean_absolute_error,
        'mse': mean_squared_error,
    }
    scoring = {name: make_scorer(fn) for name, fn in metric_functions.items()}
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_results = cross_validate(model, X, y, cv=splitter, scoring=scoring, return_train_score=False)

    # Timing entries are bookkeeping, not model quality, so they are not reported.
    timing_keys = {'fit_time', 'score_time'}
    for key, fold_scores in cv_results.items():
        if key not in timing_keys:
            print(f"{key}: {fold_scores}")
            print(f"{key} trung bình: {fold_scores.mean():.3f} ± {fold_scores.std():.3f}")
            print()

In [124]:
from sklearn.pipeline import Pipeline
from category_encoders import TargetEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
# Preprocessing: target-encode the 'nationality' column, then standardise the
# features so they are on a comparable scale before KNN computes distances.
preprocessor = Pipeline([
    ('te', TargetEncoder(cols=['nationality'])),
    ('scaler', StandardScaler())
])
# Baseline model: k-nearest-neighbours regressor with k=5.
knn = KNeighborsRegressor(n_neighbors=5)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', knn)
])
# Fit on the training split only; the test split stays unseen for the holdout report.
pipeline.fit(X_train, y_train)
print("===== Holdout evaluation:=====")
print("Training set:")
print_metrics(*evaluate_model(pipeline, X_train, y_train))

print("\nTest set:")
print_metrics(*evaluate_model(pipeline, X_test, y_test))

# Cross-validation is run over the full dataset (X, y), not just the training split.
print("\n===== Cross-validation evaluation:=====")
cv_evaluate(pipeline, X, y)

===== Holdout evaluation:=====
Trainig set:
MSE: 6.700
MAE: 1.968
R2: 0.862

Test set:
MSE: 10.104
MAE: 2.406
R2: 0.789

===== Cross-validation evaluation:=====
test_r2: [0.8023281  0.79517177 0.79653998 0.78845099 0.79209159]
test_r2 trung bình: 0.795 ± 0.005

test_mae: [2.35041783 2.35721448 2.3848983  2.38161048 2.40462524]
test_mae trung bình: 2.376 ± 0.020

test_mse: [ 9.61192201  9.85071866  9.84955141  9.99168571 10.20824742]
test_mse trung bình: 9.902 ± 0.196



In [125]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score


def objective(trial):
    """Optuna objective: mean 5-fold cross-validated R2 of the KNN pipeline.

    Args:
        trial: Optuna trial used to sample ``n_neighbors`` (1-30), the
            Minkowski power ``p`` (1 or 2) and the neighbour ``weights`` scheme.

    Returns:
        Mean R2 over the 5 folds of (X_train, y_train); higher is better.
    """
    n_neighbors = trial.suggest_int('n_neighbors', 1, 30)
    p = trial.suggest_int('p', 1, 2)
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])

    # Tune an unfitted copy rather than the shared, already-fitted `pipeline`:
    # calling set_params on the shared object would leave it fitted with one
    # configuration but carrying the last trial's hyper-parameters.
    candidate = clone(pipeline).set_params(
        model__n_neighbors=n_neighbors,
        model__p=p,
        model__weights=weights,
        model__n_jobs=-1,
    )
    score = cross_val_score(candidate, X_train, y_train, cv=5, scoring='r2').mean()
    return score

# Seed the (default TPE) sampler so the search, and therefore the reported best
# parameters, can be reproduced when the notebook is re-run from a fresh kernel.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

[I 2024-12-24 00:36:58,337] A new study created in memory with name: no-name-5f7ab4dc-8d92-4372-af40-02ef8faa3de6
[I 2024-12-24 00:36:58,796] Trial 0 finished with value: 0.6149758841642835 and parameters: {'n_neighbors': 1, 'p': 2, 'weights': 'distance'}. Best is trial 0 with value: 0.6149758841642835.
[I 2024-12-24 00:36:59,300] Trial 1 finished with value: 0.7968524756778862 and parameters: {'n_neighbors': 10, 'p': 2, 'weights': 'uniform'}. Best is trial 1 with value: 0.7968524756778862.
[I 2024-12-24 00:36:59,820] Trial 2 finished with value: 0.7921669158677576 and parameters: {'n_neighbors': 18, 'p': 2, 'weights': 'uniform'}. Best is trial 1 with value: 0.7968524756778862.
[I 2024-12-24 00:37:00,342] Trial 3 finished with value: 0.7886958010701619 and parameters: {'n_neighbors': 22, 'p': 2, 'weights': 'uniform'}. Best is trial 1 with value: 0.7968524756778862.
[I 2024-12-24 00:37:01,156] Trial 4 finished with value: 0.8515005212015445 and parameters: {'n_neighbors': 10, 'p': 1, 'w

In [126]:
# Report the winning trial's hyper-parameters and its objective value.
print(f"Best parameters: {study.best_params}")
print(f"Best R2: {study.best_value}")

Best parameters: {'n_neighbors': 17, 'p': 1, 'weights': 'distance'}
Best R2: 0.8545011782769718


In [127]:
# Configure the shared pipeline with the hyper-parameters of the best Optuna trial.
best_trial_params = study.best_params
pipeline.set_params(
    **{f'model__{name}': best_trial_params[name] for name in ('n_neighbors', 'p', 'weights')},
    model__n_jobs=-1,
)

# Refit on the training split only, then predict for each split of interest.
pipeline.fit(X_train, y_train)
y_train_pred, y_pred, y_data = [
    pipeline.predict(features) for features in (X_train, X_test, X)
]

# NOTE(review): with weights='distance' every training row is presumably its own
# zero-distance neighbour, so a train-set error of 0 is expected; the
# 'Data Set' figures include those rows as well. Judge the model on the test set.
split_reports = [
    ('Train Set:', y_train, y_train_pred),
    ('Test Set:', y_test, y_pred),
    ('Data Set:', y, y_data),
]
for report_index, (heading, actual, predicted) in enumerate(split_reports):
    if report_index > 0:
        print()  # blank separator between consecutive reports
    print(heading)
    print(f'MSE: {mean_squared_error(actual, predicted)}')
    print(f'MAE: {mean_absolute_error(actual, predicted)}')
    print(f'R2: {r2_score(actual, predicted)}')

Train Set:
MSE: 0.0
MAE: 0.0
R2: 1.0

Test Set:
MSE: 6.881855912001691
MAE: 1.896911083589994
R2: 0.8562366106114225

Data Set:
MSE: 2.2942076069262893
MAE: 0.6323741579717465
R2: 0.952515882948851
