In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import optuna

In [46]:
# Load the dataset from 'data.csv' (path is relative to the current working
# directory) into a DataFrame. The modelling cells below use its
# 'overall_rating' column as the regression target and every other column as
# a feature.
df = pd.read_csv('data.csv')

In [47]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Features: every column except the target. Target: 'overall_rating'.
X = df.drop(columns='overall_rating')
y = df['overall_rating']

# Keep one third of the rows as a test set; the fixed random_state makes the
# partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=42,
)

In [48]:
from sklearn.ensemble import RandomForestRegressor

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated R2 of a random forest.

    Parameters
    ----------
    trial
        Optuna trial object; used to sample the forest hyperparameters
        (n_estimators, max_depth, min_samples_split, min_samples_leaf,
        max_features) from the ranges/choices given below.

    Returns
    -------
    The mean of the five per-fold R2 scores obtained by cross-validating the
    sampled model on the module-level ``X_train`` / ``y_train``.
    """
    # Sample the hyperparameters in a fixed order so the trial sees the same
    # sequence of suggest_* calls on every invocation.
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'max_features': trial.suggest_categorical('max_features', [ 'sqrt', 'log2']),
    }
    model = RandomForestRegressor(**sampled, random_state=42, n_jobs=-1)

    fold_scores = cross_val_score(
        model,
        X_train,
        y_train,
        scoring='r2',
        cv=5,
        n_jobs=-1,
    )
    return fold_scores.mean()


# Hyperparameter search: maximise the cross-validated R2 returned by
# `objective` over 100 trials.
#
# The sampler is seeded explicitly. Without a seed the default sampler draws
# different hyperparameter suggestions on every run, so the reported "best
# parameters" (and every metric derived from them) would change after each
# kernel restart. Optuna documents seeded sampling as reproducible for
# sequential optimisation, which is what `optimize` does here (no n_jobs given).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)


print(f'Best parameters: {study.best_params}')


# Rebuild the forest with the winning hyperparameters. `study.best_params`
# holds exactly the five values sampled in `objective` (n_estimators,
# max_depth, min_samples_split, min_samples_leaf, max_features), so it can be
# unpacked straight into the constructor. The seed and parallelism settings
# match the ones used during the search.
best_rf = RandomForestRegressor(**study.best_params, random_state=42, n_jobs=-1)

# Fit on the training split only; the test split stays held out for evaluation.
best_rf.fit(X_train, y_train)


def _print_metrics(title, y_true, y_hat):
    """Print MSE, MAE and R2 of `y_hat` against `y_true` under a `title:` heading.

    Parameters
    ----------
    title
        Label printed (followed by a colon) above the three metric lines.
    y_true
        Ground-truth target values.
    y_hat
        Model predictions aligned with `y_true`.
    """
    print(f'{title}:')
    print(f'MSE: {mean_squared_error(y_true, y_hat)}')
    print(f'MAE: {mean_absolute_error(y_true, y_hat)}')
    print(f'R2: {r2_score(y_true, y_hat)}')


# Predictions on the training split, the held-out test split, and all rows.
y_train_pred = best_rf.predict(X_train)
y_pred = best_rf.predict(X_test)
y_data = best_rf.predict(X)

_print_metrics('Train Set', y_train, y_train_pred)

print()

_print_metrics('Test Set', y_test, y_pred)

print()

# NOTE: X / y contain the training rows as well as the test rows, so these
# figures sit between the train and test numbers and are not an estimate of
# performance on unseen data. The 'Test Set' block is the held-out estimate.
_print_metrics('Data Set', y, y_data)

[I 2024-12-23 02:17:35,510] A new study created in memory with name: no-name-625ccd21-28d6-480a-9530-5e9155b4c207
[I 2024-12-23 02:17:38,602] Trial 0 finished with value: 0.9507404797855212 and parameters: {'n_estimators': 149, 'max_depth': 13, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_features': 'log2'}. Best is trial 0 with value: 0.9507404797855212.
[I 2024-12-23 02:17:41,739] Trial 1 finished with value: 0.9512689773176672 and parameters: {'n_estimators': 178, 'max_depth': 17, 'min_samples_split': 8, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 1 with value: 0.9512689773176672.
[I 2024-12-23 02:17:44,350] Trial 2 finished with value: 0.9525801346544535 and parameters: {'n_estimators': 115, 'max_depth': 14, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 2 with value: 0.9525801346544535.
[I 2024-12-23 02:17:46,427] Trial 3 finished with value: 0.9504297531330061 and parameters: {'n_estimators': 58, 'max_depth': 13, 'm

Best parameters: {'n_estimators': 167, 'max_depth': 28, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
Train Set:
MSE: 0.3144742303103134
MAE: 0.40609243062393624
R2: 0.9932967416825504

Test Set:
MSE: 1.9981260004023158
MAE: 1.0465039346278169
R2: 0.9568939510867791

Data Set:
MSE: 0.8757543016314875
MAE: 0.6195868247898726
R2: 0.9812585444104432
