In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv('median_data.csv')

In [3]:
X = df.drop(['overall_rating'], axis=1)
y = df['overall_rating']

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=42)

# Train Random Forest với các tham số mặc định

In [5]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [6]:
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

## Đánh giá model RF (tham số mặc định)

## Hold out

In [7]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

print('Training Set:')
print('MAE:', mean_absolute_error(y_train, y_train_pred))
print('MSE:', mean_squared_error(y_train, y_train_pred))
print('R2 score:', r2_score(y_train, y_train_pred))

print()

print('Test Set:')
print('MAE:', mean_absolute_error(y_test, y_test_pred))
print('MSE:', mean_squared_error(y_test, y_test_pred))
print('R2 score:', r2_score(y_test, y_test_pred))

Training Set:
MAE: 0.102755767301906
MSE: 0.03314534436643265
R2 score: 0.999317080487466

Test Set:
MAE: 0.27737422697643344
MSE: 0.24871644659869635
R2 score: 0.9948042621326386


## Cross validation

In [8]:
from sklearn.model_selection import cross_val_score

# Định nghĩa số lượng folds cho cross-validation
cv = 10

# Sử dụng cross_val_score để tính toán R^2 score
r2_scores = cross_val_score(rf, X, y, cv=cv, scoring='r2', n_jobs=-1)
print(f"R^2 scores cho {cv}-fold cross-validation: {r2_scores}")
print(f"R^2 trung bình: {r2_scores.mean():.3f} ± {r2_scores.std():.3f}")
print()

# Sử dụng cross_val_score để tính toán MSE (sử dụng scoring 'neg_mean_squared_error')
mse_scores = cross_val_score(rf, X, y, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
# Chuyển đổi MSE về giá trị dương
mse_scores = -mse_scores
print(f"MSE cho {cv}-fold cross-validation: {mse_scores}")
print(f"MSE trung bình: {mse_scores.mean():.3f}" + u"\u00B1" + f" {mse_scores.std():.3f}")
print()

# Đánh giá Mean Absolute Error (MAE)
mae_scores = cross_val_score(rf, X, y, cv=cv, scoring='neg_mean_absolute_error', n_jobs=-1)
mae_scores = -mae_scores
print(f"MAE cho {cv}-fold cross-validation: {mae_scores}")
print(f"MAE trung bình: {mae_scores.mean():.3f} ± {mae_scores.std():.3f}")

R^2 scores cho 10-fold cross-validation: [0.98903599 0.94905753 0.70096649 0.8370208  0.79427809 0.46266735
 0.45950641 0.63101632 0.81519046 0.95517968]
R^2 trung bình: 0.759 ± 0.183

MSE cho 10-fold cross-validation: [0.1744049  0.17244089 5.21817253 0.54837471 0.20174357 0.27538602
 0.25786981 0.23099744 0.19533272 0.52748484]
MSE trung bình: 0.780± 1.485

MAE cho 10-fold cross-validation: [0.20978273 0.23991086 1.47485794 0.54447911 0.31222841 0.35348747
 0.32076323 0.26149387 0.20009476 0.43877369]
MAE trung bình: 0.436 ± 0.361


# Thử tối ưu các tham số trong RF

In [15]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [5, 8, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5, scoring='r2')
grid_search.fit(X, y)
print("Best parameters found: ", grid_search.best_params_)


Best parameters found:  {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}


In [16]:
rf = RandomForestRegressor(n_estimators=100, max_depth=10, min_samples_leaf=1, min_samples_split=5, random_state=42)
rf.fit(X_train, y_train)

In [17]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

print('Training Set:')
print('MAE:', mean_absolute_error(y_train, y_train_pred))
print('MSE:', mean_squared_error(y_train, y_train_pred))
print('R2 score:', r2_score(y_train, y_train_pred))

print()

print('Test Set:')
print('MAE:', mean_absolute_error(y_test, y_test_pred))
print('MSE:', mean_squared_error(y_test, y_test_pred))
print('R2 score:', r2_score(y_test, y_test_pred))

Training Set:
MAE: 0.102755767301906
MSE: 0.03314534436643265
R2 score: 0.999317080487466

Test Set:
MAE: 0.27737422697643344
MSE: 0.24871644659869635
R2 score: 0.9948042621326386
