# Ranom Forest

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import data_preprocess as dp
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
import joblib

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Get Data

In [2]:
# read data from csv file
x_train, y_train = dp.load_training_data() 
x_test, y_test = dp.load_test_data()  

model = RandomForestClassifier()

# Feature Extraction

In [3]:
# feature encoding
pca_model = PCA(n_components=8).fit(x_train)  
trainpca = pca_model.transform(x_train)        
testpca = pca_model.transform(x_test)

# Parameters Grid

In [4]:
# grid search for random forest
param_grid = {
    'n_estimators': [50, 100, 200],       
    'max_depth': [10, 50, 100],          
    'min_samples_split': [2, 5, 10],     
    'min_samples_leaf': [1, 2, 4],        
    'max_features': ['sqrt', 'log2'],     
    'bootstrap': [True, False]            
}

# Grid Search

In [5]:
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=3)  #grid search
grid_search.fit(x_train, y_train)


Fitting 2 folds for each of 324 candidates, totalling 648 fits
Best parameters: {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


Fitting 2 folds for each of 324 candidates, totalling 648 fits
Best parameters: {'bootstrap': True, 'max_depth': 50, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}

# Best Model

In [None]:

best_model = grid_search.best_estimator_

print("Best parameters:", grid_search.best_params_)

# Best Model Prediction Accuracy

In [6]:
acc = []
d = {}
clf = best_model

clf.fit(x_train, y_train)
pred = clf.predict(x_test)
f1 = f1_score(y_test, pred, average='weighted')    #calculate f1 score
acc.append(accuracy_score(pred, y_test))
d = {'Modelling Algo': 'Best model', 'Accuracy': acc}
print(d)

{'Modelling Algo': 'Best model', 'Accuracy': [0.70625]}


Fitting 2 folds for each of 324 candidates, totalling 648 fits
Best parameters: {'bootstrap': True, 'max_depth': 100, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
{'Modelling Algo': 'Best model', 'Accuracy': [0.7166666666666667], 'F1': 0.7061601970223739}