In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import optuna

In [27]:
# Load the dataset from a CSV file given by a relative path.
df = pd.read_csv(filepath_or_buffer='data.csv')

In [28]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Features are every column except the regression target `overall_rating`.
X = df.drop(columns='overall_rating')
y = df['overall_rating']

# Hold out one third of the rows for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=42,
)

In [29]:
from sklearn.ensemble import RandomForestRegressor

def objective(trial):
    """Score one random-forest configuration proposed by an Optuna trial.

    Five hyperparameters are sampled from ``trial``, a seeded
    ``RandomForestRegressor`` is built from them, and it is evaluated with
    3-fold cross-validation on ``X_train`` / ``y_train``.

    Returns:
        The mean R^2 over the three folds.
    """
    # The suggest calls run in dict order, matching the order of the search
    # space definition.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'max_features': trial.suggest_categorical('max_features', [ 'sqrt', 'log2']),
    }
    model = RandomForestRegressor(random_state=42, n_jobs=-1, **params)

    fold_scores = cross_val_score(
        model,
        X_train,
        y_train,
        n_jobs=-1,
        cv=3,
        scoring='r2',
    )
    return fold_scores.mean()


# Seed the sampler so the hyperparameter search proposes the same sequence of
# trials (and therefore finds the same best parameters) on every
# Restart & Run All. Without a seed the search differs from run to run even
# though the forest itself uses random_state=42.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=1000)

# Read the best configuration once and reuse it below.
best_params = study.best_params
print(f'Best parameters: {best_params}')

# The keys of best_params are exactly the names suggested in `objective`
# (n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features),
# all of which are RandomForestRegressor keyword arguments, so they can be
# unpacked directly instead of being copied one by one.
best_rf = RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)
best_rf.fit(X_train, y_train)

y_train_pred = best_rf.predict(X_train)
y_pred = best_rf.predict(X_test)
# NOTE: X is the full frame and therefore contains the training rows, so the
# "Data Set" metrics below are optimistic; the "Test Set" block is the
# held-out estimate.
y_data = best_rf.predict(X)


def _print_metrics(label, y_true, y_hat):
    """Print MSE, MAE and R2 for one set of predictions under a heading."""
    print(f'{label}:')
    print(f'MSE: {mean_squared_error(y_true, y_hat)}')
    print(f'MAE: {mean_absolute_error(y_true, y_hat)}')
    print(f'R2: {r2_score(y_true, y_hat)}')


_print_metrics('Train Set', y_train, y_train_pred)
print()
_print_metrics('Test Set', y_test, y_pred)
print()
_print_metrics('Data Set', y, y_data)

[I 2024-12-18 23:23:01,759] A new study created in memory with name: no-name-d44ed664-3936-454e-a694-aeed191f3d95
[I 2024-12-18 23:23:02,719] Trial 0 finished with value: 0.9503690653909577 and parameters: {'n_estimators': 143, 'max_depth': 12, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 0 with value: 0.9503690653909577.
[I 2024-12-18 23:23:04,017] Trial 1 finished with value: 0.9523799884938228 and parameters: {'n_estimators': 140, 'max_depth': 23, 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.9523799884938228.
[I 2024-12-18 23:23:04,408] Trial 2 finished with value: 0.9433250144655747 and parameters: {'n_estimators': 52, 'max_depth': 9, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.9523799884938228.
[I 2024-12-18 23:23:05,226] Trial 3 finished with value: 0.9500534253804139 and parameters: {'n_estimators': 109, 'max_depth': 12, 'min

Best parameters: {'n_estimators': 198, 'max_depth': 29, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
Train Set:
MSE: 0.3127749459709109
MAE: 0.4056413177172998
R2: 0.9933329632256339

Test Set:
MSE: 1.9909482750859957
MAE: 1.0474439980789738
R2: 0.9570487978674669

Data Set:
MSE: 0.8722286659070327
MAE: 0.6195994892386213
R2: 0.981333994277183
