# **Random Forest**

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error as mae
from sklearn.model_selection import GridSearchCV, cross_val_score
from xgboost import XGBRegressor
from scipy.stats import pearsonr

In [2]:
# Read both Excel workbooks (original note: takes about 1 minute) and
# discard every row that contains a missing value.
data9 = pd.read_excel(
    'drive/MyDrive/data_match9.xlsx', engine='openpyxl'
).dropna()
data10 = pd.read_excel(
    'drive/MyDrive/data_match10.xlsx', engine='openpyxl'
).dropna()

print('Training...')

Training...


In [3]:
# For each dataset: features are the columns from position 6 up to (but not
# including) the last column; the target is the column at position 4.
X_9, y_9 = data9.iloc[:, 6:-1].values, data9.iloc[:, 4].values
X_10, y_10 = data10.iloc[:, 6:-1].values, data10.iloc[:, 4].values

# Stack the two datasets row-wise (np.concatenate joins along axis 0 by default).
X = np.concatenate([X_9, X_10])
y = np.concatenate([y_9, y_10])

# Split the data into two parts: 80% training set and 20% validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# One shape per line: train features, train target, validation features,
# validation target.
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, sep='\n')

(129548, 18)
(129548,)
(32388, 18)
(32388,)


In [4]:
# Baseline random forest with hand-picked hyper-parameters
# (original note: takes about 30 minutes).
# `fit` returns the estimator itself, so construction and training are chained.
test_model = RandomForestRegressor(
    n_estimators=500,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
).fit(X_train, y_train)

# --- Metrics on the training split ---
print('Training...')
predicted_train_test = test_model.predict(X_train)
mse = mean_squared_error(y_train, predicted_train_test)
corr_train_test, p_value_train_test = pearsonr(y_train, predicted_train_test)
print('Train R2 score:', test_model.score(X_train, y_train))
print("Train RMSE:", np.sqrt(mse))
# MAE is symmetric in its two arguments, so the (prediction, target) order
# used here does not affect the reported value.
print('Train MAE:', mae(predicted_train_test, y_train))
print('Train Pearson r:', corr_train_test, p_value_train_test)


# --- Metrics on the held-out validation split ---
print('Validating...')
predicted_val_test = test_model.predict(X_val)
mse = mean_squared_error(y_val, predicted_val_test)
corr_val_test, p_value_val_test = pearsonr(y_val, predicted_val_test)
print('Validate R2 score:', test_model.score(X_val, y_val))
print("Validate RMSE:", np.sqrt(mse))
print('Validate MAE:', mae(predicted_val_test, y_val))
print('Validate Pearson r:', corr_val_test, p_value_val_test)


Training...
Train R2 score: 0.704008843137194
Train RMSE: 1.292441657737026
Train MAE: 0.30121781698447925
Train Pearson r: 0.8743976804586243 0.0
Validating...
Validate R2 score: 0.2611384362149086
Validate RMSE: 1.9958515212932502
Validate MAE: 0.4710894378126883
Validate Pearson r: 0.5126464144148603 0.0


# **Tối ưu tham số + grid-search**

In [4]:
# Runtime: about 4 hours (original note).
# Build the base Random Forest model whose hyper-parameters are tuned below.
rf_model = RandomForestRegressor(random_state=42)

# Hyper-parameter grid: 2 values for each of 4 parameters = 16 combinations.
param_grid = {
    'n_estimators': [50, 150],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Exhaustive grid search with 5-fold cross-validation; n_jobs=-1 runs the
# candidate fits on all available cores.
grid_search = GridSearchCV(
    estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best parameters: ", grid_search.best_params_)

# With the default refit=True, GridSearchCV has already retrained a clone of
# rf_model (same random_state=42) with the best parameters on the full
# training set. Reuse that fitted estimator instead of training an identical
# forest a second time.
best_rf = grid_search.best_estimator_

# Evaluate on the training set
print('Training...')
predicted_train = best_rf.predict(X_train)
print('Train R2 score:', best_rf.score(X_train, y_train))
mse = mean_squared_error(y_train, predicted_train)
print("Train RMSE:", np.sqrt(mse))
corr_train, p_value_train = pearsonr(y_train, predicted_train)
# Arguments follow sklearn's (y_true, y_pred) convention, matching the
# mean_squared_error call above; MAE is symmetric so the value is unchanged.
print('Train MAE:', mae(y_train, predicted_train))
print('Train Pearson r:', corr_train, p_value_train)


print('Validating...')
# Evaluate on the held-out validation set
predicted_val = best_rf.predict(X_val)
print('Validate R2 score:', best_rf.score(X_val, y_val))
mse = mean_squared_error(y_val, predicted_val)
print("Validate RMSE:", np.sqrt(mse))
corr_val, p_value_val = pearsonr(y_val, predicted_val)
print('Validate MAE:', mae(y_val, predicted_val))
print('Validate Pearson r:', corr_val, p_value_val)




Best parameters:  {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 150}
Training...
Train R2 score: 0.719485716424831
Train RMSE: 1.2581982293244987
Train MAE: 0.2935161782344026
Train Pearson r: 0.8820962815534288 0.0
Validating...
Validate R2 score: 0.2620227458609248
Validate RMSE: 1.9946567919607732
Validate MAE: 0.4696412617253077
Validate Pearson r: 0.5140331212798706 0.0


In [5]:
# Show the best hyper-parameter combination again; requires `grid_search`
# to have been fitted by the grid-search cell.
print("Best parameters: ", grid_search.best_params_)

Best parameters:  {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 150}


In [6]:
# Rebind every model / search name to None -- presumably so the fitted
# forests can be garbage-collected and memory released (TODO confirm intent).
# NOTE(review): `model` is not assigned in any cell visible here; binding it
# to None is harmless but may be a leftover.
test_model = grid_search = model = rf_model = best_rf = None
print()


