# Ranom Forest

In [9]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import data_preprocess as dp
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
import joblib

# Load Data

In [10]:
# load data
x_train, y_train = dp.load_training_data() 
x_test, y_test = dp.load_test_data()  

model = RandomForestClassifier()

# Feature Extraction

In [11]:
# use pca to reduce dimension
pca_model = PCA(n_components=8).fit(x_train)  
trainpca = pca_model.transform(x_train)        
testpca = pca_model.transform(x_test)

# Parameters Grid

In [12]:
# set hyperparameters
param_grid = {
    'n_estimators': [50, 100, 200],       
    'max_depth': [10, 50, 100],          
    'min_samples_split': [2, 5, 10],     
    'min_samples_leaf': [1, 2, 4],        
    'max_features': ['sqrt', 'log2'],     
    'bootstrap': [True, False]            
}

# Grid Search

In [13]:
# use 5-fold cross validation to find the best hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=3)
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


# Best Model

In [14]:
# save the best hyperparameters
best_model = grid_search.best_estimator_

print("Best parameters:", grid_search.best_params_)

Best parameters: {'bootstrap': True, 'max_depth': 100, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


# Best Model Prediction Accuracy

In [83]:
accuracy_list = []
model_accuracy_dict = {}
classifier = best_model

# Fit the model on the training data
classifier.fit(x_train, y_train)

# Predict the labels of the test set
predictions = classifier.predict(x_test)

# calculate the accuracy and f1 score of the model
f1 = f1_score(y_test, predictions, average='weighted')
accuracy_list.append(accuracy_score(predictions, y_test))
model_accuracy_dict = {'Model': 'Best Model', 'Accuracy': accuracy_list}

# Print the model and its accuracy in a readable format
for key, value in model_accuracy_dict.items():
    print(f"{key}: {value}")

Model: Best Model
Accuracy: [0.7229166666666667]
