In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import optuna

In [7]:
# Load the dataset from 'data.csv' (a relative path, so it resolves against the
# kernel's current working directory) into the DataFrame used by the cells below.
df = pd.read_csv('data.csv')

In [8]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Target is the 'overall_rating' column; every other column is used as a predictor.
y = df['overall_rating']
X = df.drop(columns=['overall_rating'])

# Hold out one third of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=42,
)

In [9]:
from category_encoders import TargetEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
# Two-step pipeline: target-encode the 'nationality' column, then fit a random forest.
# Keeping the encoder inside the pipeline means it is re-fitted together with the model
# whenever the pipeline itself is fitted.
pipeline = Pipeline(
    steps=[
        ('encoder', TargetEncoder(cols=['nationality'])),
        ('model', RandomForestRegressor(random_state=42)),
    ]
)

# Baseline fit on the training split with the forest's default hyperparameters.
pipeline.fit(X_train, y_train)

In [10]:
from sklearn.ensemble import RandomForestRegressor

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated R2 on the training split.

    Samples random-forest hyperparameters from ``trial``, applies them to an
    unfitted copy of the module-level ``pipeline`` and scores that copy with
    ``cross_val_score`` on ``X_train`` / ``y_train``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean R2 over the 5 cross-validation folds (the study maximizes it).
    """
    # Imported locally so this cell does not depend on an import made elsewhere.
    from sklearn.base import clone

    # The order of the suggest_* calls is kept as-is so sampling is unchanged.
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])

    # Tune a fresh, unfitted copy instead of mutating the shared `pipeline`:
    # otherwise the global object would end the study carrying the last trial's
    # hyperparameters while still holding its earlier fit.
    trial_pipeline = clone(pipeline)
    trial_pipeline.set_params(
        model__n_estimators=n_estimators,
        model__max_depth=max_depth,
        model__min_samples_split=min_samples_split,
        model__min_samples_leaf=min_samples_leaf,
        model__max_features=max_features,
        model__random_state=42,
        model__n_jobs=-1,
    )

    score = cross_val_score(trial_pipeline, X_train, y_train, n_jobs=-1, cv=5, scoring='r2')

    return score.mean()


# Seed the sampler so that re-running the notebook reproduces the same sequence of
# trials and the same best parameters. TPESampler is the sampler Optuna uses by
# default for single-objective studies, so only the seed changes here.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

[I 2024-12-24 01:19:10,102] A new study created in memory with name: no-name-3c992034-d8be-4e61-8b84-6f3ab06ad9cd
[I 2024-12-24 01:19:12,126] Trial 0 finished with value: 0.909022235918903 and parameters: {'n_estimators': 83, 'max_depth': 29, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.909022235918903.
[I 2024-12-24 01:19:13,931] Trial 1 finished with value: 0.8582769213661725 and parameters: {'n_estimators': 134, 'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.909022235918903.
[I 2024-12-24 01:19:17,840] Trial 2 finished with value: 0.9118564810499311 and parameters: {'n_estimators': 179, 'max_depth': 30, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.9118564810499311.
[I 2024-12-24 01:19:18,342] Trial 3 finished with value: 0.7913680108463951 and parameters: {'n_estimators': 84, 'max_depth': 5, 'min_samp

In [11]:
# Report the winning hyperparameters and the objective value they achieved.
best_params, best_score = study.best_params, study.best_value
print(f'Best parameters: {best_params}')
print("Best R2:", best_score)

Best parameters: {'n_estimators': 181, 'max_depth': 21, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
Best R2: 0.9147598036012414


In [12]:
def _report(heading, actual, predicted):
    """Print the heading followed by MSE, MAE and R2 of `predicted` against `actual`."""
    print(heading)
    print(f'MSE: {mean_squared_error(actual, predicted)}')
    print(f'MAE: {mean_absolute_error(actual, predicted)}')
    print(f'R2: {r2_score(actual, predicted)}')


# Apply the best hyperparameters found by the study to the forest step.
tuned = study.best_params
pipeline.set_params(
    model__n_estimators=tuned['n_estimators'],
    model__max_depth=tuned['max_depth'],
    model__min_samples_split=tuned['min_samples_split'],
    model__min_samples_leaf=tuned['min_samples_leaf'],
    model__max_features=tuned['max_features'],
    model__random_state=42,
    model__n_jobs=-1,
)

# Refit on the training split only; the test split stays unseen during fitting.
pipeline.fit(X_train, y_train)

# Predictions for the training split, the held-out split, and the full dataset.
y_train_pred = pipeline.predict(X_train)
y_pred = pipeline.predict(X_test)
y_data = pipeline.predict(X)

_report('Train Set:', y_train, y_train_pred)
print()
_report('Test Set:', y_test, y_pred)
print()
# Note: the full dataset contains the training rows as well as the test rows.
_report('Data Set:', y, y_data)

Train Set:
MSE: 0.6464329353461279
MAE: 0.5017924386149845
R2: 0.9866810354959069

Test Set:
MSE: 3.878452426491901
MAE: 1.2650044133375238
R2: 0.9189783288774717

Data Set:
MSE: 1.723892823657554
MAE: 0.7562247807760673
R2: 0.9643199122977971
