In [15]:
import pandas as pd
import matplotlib.pyplot as plt

In [16]:
# Load the reduced dataset; every later cell derives its data from `df`.
# NOTE(review): the absolute "/content/..." location looks like a hosted-notebook
# (presumably Colab) path - confirm or adjust before running elsewhere.
df = pd.read_csv(filepath_or_buffer="/content/reduced_dataset.csv")

In [17]:
# Collapse the target column `num` into a binary label:
# 1 for any value greater than zero, 0 for everything else.
# The comparison is vectorized instead of calling a Python lambda per row;
# missing values compare False and therefore become 0, exactly as before.
df['num'] = (df['num'] > 0).astype(int)

In [18]:
from sklearn.model_selection import train_test_split

# Target is the binary `num` column; the features are all remaining columns.
y = df['num']
X = df.drop(columns=['num'])

# Hold out 20% of the rows for evaluation. The fixed seed makes the split repeatable.
# NOTE(review): no `stratify=` argument is passed, so the class balance of the
# train and test parts is left to chance - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Untuned reference classifiers, keyed by the label shown in the printed report.
# The tree-based models get a fixed seed; the SVC is created with probability
# estimates enabled, although only `predict` is used in this cell.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(probability=True),
}

# Fit each classifier on the training part and record its accuracy on the
# held-out part, in the same order as `models`.
baseline_results = {}
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    baseline_results[name] = accuracy_score(y_test, y_pred)

print("Baseline Accuracy:", baseline_results, sep="\n")

Baseline Accuracy:
{'Logistic Regression': 0.9166666666666666, 'Decision Tree': 0.7666666666666667, 'Random Forest': 0.8833333333333333, 'SVM': 0.9333333333333333}


In [22]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search space for the random forest:
# 3 * 4 * 3 * 3 = 108 hyperparameter combinations, each scored with 5-fold CV.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# The forest itself is seeded so that every candidate is evaluated reproducibly;
# n_jobs=-1 asks for all available cores.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy.
print("Best Parameters (GridSearch):", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Parameters (GridSearch): {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best Score: 0.8101950354609929


In [21]:
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV

# Sampling space for the SVM. `uniform(loc, scale)` draws from
# [loc, loc + scale], so C is sampled from the interval [0.1, 10.1].
param_dist = dict(
    C=uniform(loc=0.1, scale=10),
    kernel=['linear', 'rbf', 'poly'],
    gamma=['scale', 'auto'],
)

# Ten random draws from the space above, each scored with 5-fold CV accuracy.
# The seed fixes which candidates are drawn.
random_search = RandomizedSearchCV(
    SVC(probability=True),
    param_dist,
    n_iter=10,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Report the best sampled configuration and its mean cross-validated accuracy.
print("Best Parameters (RandomizedSearch):", random_search.best_params_)
print("Best Score:", random_search.best_score_)

Best Parameters (RandomizedSearch): {'C': np.float64(7.41993941811405), 'gamma': 'scale', 'kernel': 'linear'}
Best Score: 0.8271276595744681


In [23]:
# Winning estimators from the two hyperparameter searches.
# Neither search overrides `refit`, so with the default (refit=True) each
# `best_estimator_` has already been retrained on the whole of X_train.
best_rf = grid_search.best_estimator_
best_svm = random_search.best_estimator_

models_optimized = {
    'Random Forest (Tuned)': best_rf,
    'SVM (Tuned)': best_svm
}

# Score the tuned models on the held-out part.
# They are deliberately not fitted again: that would repeat the training the
# searches already performed and would overwrite the estimators stored on the
# search objects in place.
optimized_results = {}

for name, model in models_optimized.items():
    y_pred = model.predict(X_test)
    optimized_results[name] = accuracy_score(y_test, y_pred)

print("\nOptimized Accuracy:")
print(optimized_results)


Optimized Accuracy:
{'Random Forest (Tuned)': 0.8833333333333333, 'SVM (Tuned)': 0.8833333333333333}


In [25]:
import joblib

# Rebuild the tuned random forest from the hyperparameters selected by the grid
# search rather than retyping them by hand, so the exported model always
# matches the search result even if the data or the grid changes.
# NOTE(review): in the recorded outputs the baseline SVM scored higher on the
# test split than the tuned forest - confirm the forest is the model to export.
best_model = RandomForestClassifier(random_state=42, **grid_search.best_params_)
best_model.fit(X_train, y_train)

# Persist the fitted model to the current working directory.
joblib.dump(best_model, 'best_model.pkl')

print("Model saved successfully as best_model.pkl")

Model saved successfully as best_model.pkl
