In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
df=pd.read_csv('/content/sample_data/reduced_selected_features.csv')
X=df.drop('target',axis=1)
y=df['target']


In [6]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

In [7]:
# Define the parameter space
param_dist = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Create model
rf = RandomForestClassifier(random_state=42)

# RandomizedSearch
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist,
                                   n_iter=20, cv=5, verbose=2, n_jobs=-1, scoring='accuracy')

random_search.fit(X_train, y_train)

print("Best Parameters (RandomizedSearchCV):", random_search.best_params_)
print("Best Score:", random_search.best_score_)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters (RandomizedSearchCV): {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': None, 'bootstrap': True}
Best Score: 0.8181737588652481


In [10]:
# Define grid
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']  # required for l1 penalty
}

# Create model
log_reg = LogisticRegression(random_state=42)

grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best Parameters (GridSearchCV):", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


Best Parameters (GridSearchCV): {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score: 0.8433510638297872


In [12]:
# Baseline model
baseline_rf = RandomForestClassifier(random_state=42)
baseline_rf.fit(X_train, y_train)
baseline_pred = baseline_rf.predict(X_test)
baseline_accuracy = accuracy_score(y_test, baseline_pred)

# Optimized model
best_rf = random_search.best_estimator_
optimized_pred = best_rf.predict(X_test)
optimized_accuracy = accuracy_score(y_test, optimized_pred)

print(f"Baseline Accuracy: {baseline_accuracy:.3f}")
print(f"Optimized Accuracy: {optimized_accuracy:.3f}")

print(classification_report(y_test, optimized_pred))

Baseline Accuracy: 0.783
Optimized Accuracy: 0.817
              precision    recall  f1-score   support

           0       0.78      0.91      0.84        32
           1       0.87      0.71      0.78        28

    accuracy                           0.82        60
   macro avg       0.83      0.81      0.81        60
weighted avg       0.82      0.82      0.81        60



In [13]:
import joblib
joblib.dump(best_rf, 'best_rf_model.pkl')

['best_rf_model.pkl']