In [2]:

# Random Forest Model Training, Tuning & Evaluation

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from google.colab import drive
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib


# Attach Google Drive at the Colab mount point; every path below lives under it.
drive.mount('/content/drive')

# Locations used by this cell: the input CSV and the directory for saved figures.
results_path = '/content/drive/My Drive/results/'
final_path = f"{results_path}final_preprocessed_dataset.csv"
model_vis_path = f"{results_path}model_visualizations/"

# Make sure the figure directory exists; exist_ok lets the cell be re-run safely.
os.makedirs(model_vis_path, exist_ok=True)


# Read the preprocessed dataset from the results directory.
df = pd.read_csv(final_path)
print("Dataset loaded successfully!")
print("Shape:", df.shape)

# Pull out the label column first, then keep every other column as a feature.
y = df["Target"]
X = df.drop(columns=["Target"])

# Hold out 20% of the rows for testing. The split is stratified on the label
# and seeded so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print("Training set:", X_train.shape, " | Testing set:", X_test.shape)


# Hyperparameter tuning for the random forest.

# Candidate values per hyperparameter; the search tries every combination
# (2 * 3 * 2 * 2 * 2 = 48 candidates).
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    bootstrap=[True, False],
)

# Base estimator; the seed keeps individual forests reproducible.
rf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated search scored on accuracy, using all available cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Keep the winning estimator for evaluation and saving further down.
best_rf = grid_search.best_estimator_
print("\nBest Parameters:", grid_search.best_params_)


# Score the tuned model on the held-out test split.
y_pred = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\n Random Forest Accuracy: {accuracy:.4f}\n")

print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix rendered as an annotated heatmap and written to disk.
# The figure is closed after saving, so it is not displayed inline.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_title("Confusion Matrix - Random Forest")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
fig.savefig(model_vis_path + "rf_confusion_matrix.png")
plt.close(fig)


# Feature Importance Visualization

# Pair each importance score from the fitted model with its column name,
# then keep the ten highest-scoring features (largest first).
feature_importances = pd.Series(best_rf.feature_importances_, index=X.columns)
top_features = feature_importances.sort_values(ascending=False).head(10)

plt.figure(figsize=(10, 6))
# Passing `palette` without `hue` is deprecated in seaborn and will be removed
# in v0.14.0. Mapping the y variable to `hue` and hiding the legend keeps the
# same per-bar colouring without the FutureWarning.
sns.barplot(
    x=top_features.values,
    y=top_features.index,
    hue=top_features.index,
    palette='viridis',
    legend=False,
)
plt.title("Top 10 Important Features - Random Forest")
plt.xlabel("Importance Score")
plt.ylabel("Feature")
plt.tight_layout()
plt.savefig(model_vis_path + "rf_feature_importance.png")
# Close the figure after saving so it is not displayed inline or kept in memory.
plt.close()


# Persist the tuned model next to the other results.
model_path = f"{results_path}random_forest_model.pkl"
joblib.dump(best_rf, model_path)
print("Model saved to:", model_path)

# Record the test accuracy as a one-row table.
# NOTE(review): to_csv is called with its default write mode, so an existing
# model_scores.csv (e.g. scores from other models) would presumably be
# replaced — confirm that is intended.
results_table_path = f"{results_path}model_scores.csv"
results_table = pd.DataFrame([{"Model": "Random Forest", "Accuracy": accuracy}])
results_table.to_csv(results_table_path, index=False)
print("Model accuracy saved to:", results_table_path)

print("\nAll done!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset loaded successfully!
Shape: (6295, 11)
Training set: (5036, 10)  | Testing set: (1259, 10)
Fitting 5 folds for each of 48 candidates, totalling 240 fits

Best Parameters: {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}

 Random Forest Accuracy: 0.7903

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.77      0.80       411
           1       0.73      0.75      0.74       431
           2       0.82      0.85      0.83       417

    accuracy                           0.79      1259
   macro avg       0.79      0.79      0.79      1259
weighted avg       0.79      0.79      0.79      1259




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_features.values, y=top_features.index, palette='viridis')


Model saved to: /content/drive/My Drive/results/random_forest_model.pkl
Model accuracy saved to: /content/drive/My Drive/results/model_scores.csv

All done!
