In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import optuna

In [2]:
# Read the raw dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='data.csv')

In [3]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Target is the 'overall_rating' column; every other column is a predictor.
y = df['overall_rating']
X = df.drop(columns='overall_rating')

# Hold out one third of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=42,
)

In [4]:
from sklearn.ensemble import RandomForestRegressor

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated R2 of a random forest.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_estimators``, ``max_depth``,
        ``min_samples_split``, ``min_samples_leaf`` and ``max_features``.

    Returns
    -------
    float
        Mean of the per-fold R2 scores computed on ``X_train`` / ``y_train``.
    """
    rf = RandomForestRegressor(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        max_depth=trial.suggest_int('max_depth', 5, 30),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 5),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        random_state=42,
        n_jobs=-1,  # the forest itself already fans out over every core
    )

    # Folds are evaluated one after another: stacking a second n_jobs=-1 layer
    # on top of the already-parallel forest oversubscribes the CPU. The scores
    # are unaffected because the forest is seeded.
    score = cross_val_score(rf, X_train, y_train, cv=5, scoring='r2')

    return score.mean()


# Seed the sampler so a fresh Restart & Run All replays the same sequence of
# trials; the train/test split and the forest are already pinned to seed 42.
# TPESampler is what create_study uses when no sampler is given, so only the
# seed changes here, not the search algorithm.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

[I 2024-12-23 20:08:20,650] A new study created in memory with name: no-name-2abeeb3a-5c2e-42ea-b852-f004c57cada2
[I 2024-12-23 20:08:26,510] Trial 0 finished with value: 0.9142536863672618 and parameters: {'n_estimators': 138, 'max_depth': 27, 'min_samples_split': 7, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.9142536863672618.
[I 2024-12-23 20:08:29,122] Trial 1 finished with value: 0.8664676253903322 and parameters: {'n_estimators': 147, 'max_depth': 8, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 0 with value: 0.9142536863672618.
[I 2024-12-23 20:08:32,368] Trial 2 finished with value: 0.9123058758431931 and parameters: {'n_estimators': 64, 'max_depth': 24, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.9142536863672618.
[I 2024-12-23 20:08:34,717] Trial 3 finished with value: 0.8969536016431444 and parameters: {'n_estimators': 95, 'max_depth': 10, 'min_

Best parameters: {'n_estimators': 180, 'max_depth': 30, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
Train Set:
MSE: 0.7230200040076042
MAE: 0.5296658791226523
R2: 0.9851593989308055

Test Set:
MSE: 3.812863062103083
MAE: 1.2603145449411957
R2: 0.9196872430067372

Data Set:
MSE: 1.75308246662449
MAE: 0.7732425753722952
R2: 0.9637157627783122


In [None]:
# Report the winning hyperparameters and the objective value they achieved.
best_params, best_score = study.best_params, study.best_value
print(f'Best parameters: {best_params}')
print("Best R2:", best_score)

In [None]:
# Rebuild the forest from the study's best hyperparameters and fit it on the
# training split. The keys of study.best_params are the same names the
# objective passes to the constructor, so they can be unpacked directly.
best_rf = RandomForestRegressor(
    **study.best_params,
    random_state=42,
    n_jobs=-1,
)
best_rf.fit(X_train, y_train)


# Predict, in order, on the training split, the held-out split and the full dataset.
y_train_pred, y_pred, y_data = (
    best_rf.predict(features) for features in (X_train, X_test, X)
)

# Print MSE, MAE and R2 for the training split, the held-out split and the
# full dataset, separated by blank lines. The full dataset contains the
# training rows, so its scores are not an out-of-sample estimate.
evaluation_sets = (
    ('Train Set:', y_train, y_train_pred),
    ('Test Set:', y_test, y_pred),
    ('Data Set:', y, y_data),
)

for position, (heading, actual, predicted) in enumerate(evaluation_sets):
    if position:
        print()
    print(heading)
    print(f'MSE: {mean_squared_error(actual, predicted)}')
    print(f'MAE: {mean_absolute_error(actual, predicted)}')
    print(f'R2: {r2_score(actual, predicted)}')