In [13]:
import pandas as pd
data=pd.read_csv('../data/processed_data.csv')

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from sklearn.metrics import accuracy_score

In [7]:
X=data.drop('target',axis=1)
y=data['target']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

Baseline model

In [8]:
data_base = RandomForestClassifier(random_state=42)
data_base.fit(X_train, y_train)
y_pred=data_base.predict(X_test)
print("Baseline Model Performance:")
print(classification_report(y_test,y_pred))

Baseline Model Performance:
              precision    recall  f1-score   support

           0       0.79      0.92      0.85        37
           1       0.00      0.00      0.00         7
           2       0.25      0.14      0.18         7
           3       0.12      0.25      0.17         4
           4       0.00      0.00      0.00         5

    accuracy                           0.60        60
   macro avg       0.23      0.26      0.24        60
weighted avg       0.53      0.60      0.56        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


GridSearch

In [10]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print("GridSearch : ", grid_search.best_params_)

y_pred_grid=grid_search.predict(X_test)
print("GridSearch Model Performance : ")
print(classification_report(y_test, y_pred_grid))

GridSearch :  {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 100}
GridSearch Model Performance : 
              precision    recall  f1-score   support

           0       0.79      0.92      0.85        37
           1       0.00      0.00      0.00         7
           2       0.50      0.14      0.22         7
           3       0.14      0.25      0.18         4
           4       0.00      0.00      0.00         5

    accuracy                           0.60        60
   macro avg       0.29      0.26      0.25        60
weighted avg       0.56      0.60      0.56        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


RandomizedSearchCV

In [16]:
param_dist = {
    'n_estimators': np.arange(50, 300, 50),
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

random_search=RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train, y_train)

print("RandomizedSearch : ", random_search.best_params_)
y_pred_random = random_search.predict(X_test)
print("RandomizedSearch Model Performance:")
print(classification_report(y_test, y_pred_random))

RandomizedSearch :  {'n_estimators': np.int64(50), 'min_samples_split': 5, 'max_depth': None}
RandomizedSearch Model Performance:
              precision    recall  f1-score   support

           0       0.82      0.84      0.83        37
           1       0.08      0.14      0.11         7
           2       0.25      0.14      0.18         7
           3       0.17      0.25      0.20         4
           4       0.00      0.00      0.00         5

    accuracy                           0.57        60
   macro avg       0.26      0.27      0.26        60
weighted avg       0.55      0.57      0.56        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Compare optimized models with baseline performance

In [17]:
baseline_acc = accuracy_score(y_test,y_pred)
grid_acc = accuracy_score(y_test, y_pred_grid)
random_acc = accuracy_score(y_test, y_pred_random)

print(f"Baseline Accuracy: {baseline_acc:.4f}")
print(f"GridSearchCV Accuracy: {grid_acc:.4f}")
print(f"RandomizedSearchCV Accuracy: {random_acc:.4f}")

Baseline Accuracy: 0.6000
GridSearchCV Accuracy: 0.6000
RandomizedSearchCV Accuracy: 0.5667


In [18]:
import pickle
import os

temp_dir = "../temp"
os.makedirs(temp_dir, exist_ok=True)

temp_path = os.path.join(temp_dir, "grid_search_full.pkl")

with open(temp_path, "wb") as file:
    pickle.dump(grid_search, file)

print(f"{temp_path}")


../temp\grid_search_full.pkl
