In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import optuna

In [22]:
# Load the raw table from a CSV in the working directory; later code expects
# it to contain an 'overall_rating' column.
df = pd.read_csv(filepath_or_buffer='data.csv')

In [23]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

# Separate the regression target from the predictor columns.
X_raw = df.drop('overall_rating', axis=1)
y = df['overall_rating']

# Split BEFORE scaling: fitting the scaler on all rows would let the held-out
# rows influence the mean/variance used to preprocess the training data.
X_train, X_test, y_train, y_test = train_test_split(
    X_raw, y, test_size=1/3, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to every other subset.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Keep `X` as the scaled full feature matrix (same name and array type as
# before) because later code predicts on the complete data set through it.
X = scaler.transform(X_raw)

In [24]:
from sklearn.ensemble import RandomForestRegressor

def objective(trial):
    """Optuna objective: mean 3-fold cross-validated R2 of a random forest.

    Parameters
    ----------
    trial : optuna trial object
        Used to draw one candidate value for each tuned hyperparameter.

    Returns
    -------
    float
        Mean of the per-fold R2 scores computed on ``X_train`` / ``y_train``.
    """
    # Draw the candidates in a fixed order; each key matches the
    # RandomForestRegressor keyword it is passed to.
    candidate = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'max_features': trial.suggest_categorical('max_features', [ 'sqrt', 'log2']),
    }
    forest = RandomForestRegressor(random_state=42, n_jobs=-1, **candidate)

    # Score on the training split only, so the test split stays untouched
    # during the hyperparameter search.
    fold_scores = cross_val_score(forest, X_train, y_train, n_jobs=-1, cv=3, scoring='r2')
    return fold_scores.mean()


# Seed the sampler explicitly: the estimators are already seeded with 42, but
# an unseeded sampler proposes different hyperparameters on every run, so the
# search (and the reported best parameters) would not be reproducible.
sampler = optuna.samplers.TPESampler(seed=42)

# Maximize the objective's return value (mean cross-validated R2) over 100 trials.
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=100)


print(f'Best parameters: {study.best_params}')


# Rebuild the forest from the best trial. The tuned parameter names
# (n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features)
# are exactly the estimator's keyword names, so they can be unpacked directly.
tuned_params = dict(study.best_params)
best_rf = RandomForestRegressor(random_state=42, n_jobs=-1, **tuned_params)

# Fit the final model on the whole training split.
best_rf.fit(X_train, y_train)


# Predictions for the training split, the held-out split, and the full matrix.
y_train_pred = best_rf.predict(X_train)
y_pred = best_rf.predict(X_test)
y_data = best_rf.predict(X)

# (header, true values, predictions) for each report, in print order.
# The 'Data Set' report covers all rows, i.e. training and test rows together.
reports = [
    ('Train Set:', y_train, y_train_pred),
    ('Test Set:', y_test, y_pred),
    ('Data Set:', y, y_data),
]

for position, (header, actual, predicted) in enumerate(reports):
    # Blank line between consecutive reports, none after the last one.
    if position > 0:
        print()
    print(header)
    print(f'MSE: {mean_squared_error(actual, predicted)}')
    print(f'MAE: {mean_absolute_error(actual, predicted)}')
    print(f'R2: {r2_score(actual, predicted)}')

[I 2024-12-22 23:37:55,121] A new study created in memory with name: no-name-096fe60b-4fa7-41d8-aab5-0792d25f1340
[I 2024-12-22 23:37:56,399] Trial 0 finished with value: 0.9484521976117715 and parameters: {'n_estimators': 120, 'max_depth': 13, 'min_samples_split': 9, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 0 with value: 0.9484521976117715.
[I 2024-12-22 23:37:57,624] Trial 1 finished with value: 0.9503585561653795 and parameters: {'n_estimators': 186, 'max_depth': 12, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 1 with value: 0.9503585561653795.
[I 2024-12-22 23:37:58,653] Trial 2 finished with value: 0.9515652687623927 and parameters: {'n_estimators': 110, 'max_depth': 22, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.9515652687623927.
[I 2024-12-22 23:37:59,738] Trial 3 finished with value: 0.9520202563471404 and parameters: {'n_estimators': 124, 'max_depth': 19, '

Best parameters: {'n_estimators': 183, 'max_depth': 22, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
Train Set:
MSE: 0.3177126615309709
MAE: 0.41053365803798914
R2: 0.9932277120485676

Test Set:
MSE: 1.99740988693958
MAE: 1.0473758489164404
R2: 0.9569093999733588

Data Set:
MSE: 0.8776744037526445
MAE: 0.6228381479961812
R2: 0.9812174535376219
