In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import optuna

In [42]:
# Load the dataset from 'data.csv' (relative path) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data.csv')

In [43]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Target is the 'overall_rating' column; every other column is a predictor.
y = df['overall_rating']
X = df.drop(columns='overall_rating')

# Hold out one third of the rows for testing; the fixed random_state keeps
# the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=42,
)

In [44]:
from sklearn.ensemble import RandomForestRegressor

def objective(trial):
    """Score one Optuna trial of random-forest hyperparameters.

    Draws a hyperparameter set from ``trial``, builds a
    ``RandomForestRegressor`` with it, and evaluates that model with 3-fold
    cross-validation on the module-level ``X_train`` / ``y_train``.

    Args:
        trial: Object providing ``suggest_int`` and ``suggest_categorical``.

    Returns:
        The mean of the per-fold R2 scores.
    """
    # A dict literal evaluates its values top to bottom, so the suggest_*
    # calls happen in this exact order.
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'max_features': trial.suggest_categorical('max_features', [ 'sqrt', 'log2']),
    }
    model = RandomForestRegressor(**sampled, random_state=42, n_jobs=-1)

    fold_scores = cross_val_score(
        model, X_train, y_train, scoring='r2', cv=3, n_jobs=-1
    )
    return fold_scores.mean()


# Search for the hyperparameters that maximise the cross-validated R2 returned
# by objective(). The sampler is seeded so that a fresh "Restart & Run All"
# proposes the same sequence of trials and therefore reports the same best
# parameters; without a seed every run explores a different sequence.
# TPESampler is the sampler Optuna uses by default for a single-objective
# study, so only the seeding changes, not the search algorithm.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

print(f'Best parameters: {study.best_params}')


# Rebuild the forest with the winning trial's hyperparameters and fit it on the
# training split. study.best_params carries the five names suggested inside
# objective() (n_estimators, max_depth, min_samples_split, min_samples_leaf,
# max_features), so it is unpacked directly into the constructor; fit()
# returns the estimator itself, so the chained call leaves best_rf fitted.
best_rf = RandomForestRegressor(
    **study.best_params,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)


# Predict on the training split, the held-out split, and the complete table
# (in that order).
y_train_pred, y_pred, y_data = (
    best_rf.predict(features) for features in (X_train, X_test, X)
)

# NOTE(review): X includes the rows the model was fitted on, so the
# 'Data Set' figures mix seen and unseen samples; the 'Test Set' figures are
# the ones computed purely on held-out rows.
sections = (
    ('Train Set:', y_train, y_train_pred),
    ('Test Set:', y_test, y_pred),
    ('Data Set:', y, y_data),
)

for idx, (heading, actual, predicted) in enumerate(sections):
    if idx > 0:
        # Blank separator line between consecutive sections.
        print()
    print(heading)
    print(f'MSE: {mean_squared_error(actual, predicted)}')
    print(f'MAE: {mean_absolute_error(actual, predicted)}')
    print(f'R2: {r2_score(actual, predicted)}')

[I 2024-12-23 00:54:47,056] A new study created in memory with name: no-name-4fe82050-c3d2-42f9-b5bc-34f1505ebbe3
[I 2024-12-23 00:54:48,934] Trial 0 finished with value: 0.951475375403367 and parameters: {'n_estimators': 53, 'max_depth': 26, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.951475375403367.
[I 2024-12-23 00:54:51,198] Trial 1 finished with value: 0.953549910319396 and parameters: {'n_estimators': 132, 'max_depth': 26, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 1 with value: 0.953549910319396.
[I 2024-12-23 00:54:53,156] Trial 2 finished with value: 0.9515787201920901 and parameters: {'n_estimators': 169, 'max_depth': 26, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 1 with value: 0.953549910319396.
[I 2024-12-23 00:54:54,612] Trial 3 finished with value: 0.9506604881959557 and parameters: {'n_estimators': 91, 'max_depth': 29, 'min_samp

Best parameters: {'n_estimators': 195, 'max_depth': 21, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
Train Set:
MSE: 0.2904880582221212
MAE: 0.3979150511231063
R2: 0.9938080252538475

Test Set:
MSE: 1.9819185635497607
MAE: 1.0423233622902468
R2: 0.9572435979887179

Data Set:
MSE: 0.8543613315019215
MAE: 0.6127418634585843
R2: 0.981716361630256
