In [None]:
!pip install pandas numpy scikit-learn imbalanced-learn xgboost lightgbm catboost matplotlib seaborn


In [None]:
# ==============================================
# 1. Import Required Libraries
# ==============================================
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_imb_pipeline

warnings.filterwarnings("ignore")


In [None]:
# ==== Modify These ====
file_path = 'dataset.csv'      # Your dataset file
target_column = 'target'       # Your target column name

# Load dataset
df = pd.read_csv(file_path)
print("Original shape:", df.shape)

# Fail early with a readable message if the configured target is missing.
if target_column not in df.columns:
    raise KeyError(f"target column {target_column!r} not found in {file_path!r}")

# Encode object-dtype feature columns as integer codes. The fitted encoders are
# kept (keyed by column name) so codes can be mapped back with inverse_transform.
# An object-dtype target is only normalised to str here and encoded below.
feature_encoders = {}
for col in df.select_dtypes(include='object').columns:
    if col != target_column:
        feature_encoders[col] = LabelEncoder()
        df[col] = feature_encoders[col].fit_transform(df[col].astype(str))
    else:
        df[col] = df[col].astype(str)

# Encode target if needed; target_encoder stays None when the target was not
# object-dtype and is therefore used as-is.
target_encoder = None
if df[target_column].dtype == 'object':
    target_encoder = LabelEncoder()
    df[target_column] = target_encoder.fit_transform(df[target_column])

# Features and target
X = df.drop(columns=target_column)
y = df[target_column]

# Train-test split (stratified so both parts keep the class proportions)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Scale features: `scaler` is fitted on the training split only and must stay
# that way, because X_train_scaled / X_test_scaled are expressed in its statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The full-dataset copy gets its own scaler so `scaler` is not refitted
# (refitting it here would silently replace the train-only mean/variance).
scaler_full = StandardScaler()
X_scaled = scaler_full.fit_transform(X)

# Apply SMOTE to the training split (the test split is never resampled).
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

# NOTE: X_res / y_res are scaled and resampled from the FULL dataset, so they
# include the test rows and synthetic points derived from them. Any score
# computed on them is optimistic and is not a substitute for the held-out test score.
X_res, y_res = smote.fit_resample(X_scaled, y)


In [None]:
# Shared cross-validation splitter: five stratified, shuffled folds with a
# fixed seed, so every model that receives `cv` sees the same partitions.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [None]:
# Logistic Regression
model_lr = LogisticRegression()

# Cross-validate on the training split only. Scaling and SMOTE sit inside the
# pipeline, so they are refitted on each fold's training part and every
# validation fold contains only real, unseen rows (imblearn skips the sampler
# at predict time). Resampling before splitting would leak synthetic
# neighbours of validation rows into training and inflate the score.
# cross_val_score clones the pipeline, so model_lr itself is left unfitted here.
cv_lr = cross_val_score(
    make_imb_pipeline(StandardScaler(), SMOTE(random_state=42), model_lr),
    X_train, y_train, cv=cv, scoring='accuracy',
).mean()

# Final fit on the SMOTE-balanced training data; evaluate on the untouched test split.
model_lr.fit(X_train_res, y_train_res)
y_pred_lr = model_lr.predict(X_test_scaled)

print("\n🔸 Logistic Regression")
print("CV Accuracy:", cv_lr)
print("Test Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))

In [None]:
# Decision Tree (seeded: tie-breaking between equally good splits is random,
# so an unseeded tree can give different scores on every run).
model_dt = DecisionTreeClassifier(random_state=42)

# Cross-validate on the training split only. Scaling and SMOTE sit inside the
# pipeline, so they are refitted on each fold's training part and every
# validation fold contains only real, unseen rows (imblearn skips the sampler
# at predict time). Resampling before splitting would leak synthetic
# neighbours of validation rows into training and inflate the score.
# cross_val_score clones the pipeline, so model_dt itself is left unfitted here.
cv_dt = cross_val_score(
    make_imb_pipeline(StandardScaler(), SMOTE(random_state=42), model_dt),
    X_train, y_train, cv=cv, scoring='accuracy',
).mean()

# Final fit on the SMOTE-balanced training data; evaluate on the untouched test split.
model_dt.fit(X_train_res, y_train_res)
y_pred_dt = model_dt.predict(X_test_scaled)

print("\n🔸 Decision Tree")
print("CV Accuracy:", cv_dt)
print("Test Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))
print("Classification Report:\n", classification_report(y_test, y_pred_dt))


In [None]:
# Random Forest (seeded: bootstrap sampling and feature sub-sampling are
# random, so an unseeded forest gives different scores on every run).
model_rf = RandomForestClassifier(random_state=42)

# Cross-validate on the training split only. Scaling and SMOTE sit inside the
# pipeline, so they are refitted on each fold's training part and every
# validation fold contains only real, unseen rows (imblearn skips the sampler
# at predict time). Resampling before splitting would leak synthetic
# neighbours of validation rows into training and inflate the score.
# cross_val_score clones the pipeline, so model_rf itself is left unfitted here.
cv_rf = cross_val_score(
    make_imb_pipeline(StandardScaler(), SMOTE(random_state=42), model_rf),
    X_train, y_train, cv=cv, scoring='accuracy',
).mean()

# Final fit on the SMOTE-balanced training data; evaluate on the untouched test split.
model_rf.fit(X_train_res, y_train_res)
y_pred_rf = model_rf.predict(X_test_scaled)

print("\n🔸 Random Forest")
print("CV Accuracy:", cv_rf)
print("Test Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))


In [None]:
# Gradient Boosting (seeded so repeated runs give the same trees and scores).
model_gb = GradientBoostingClassifier(random_state=42)

# Cross-validate on the training split only. Scaling and SMOTE sit inside the
# pipeline, so they are refitted on each fold's training part and every
# validation fold contains only real, unseen rows (imblearn skips the sampler
# at predict time). Resampling before splitting would leak synthetic
# neighbours of validation rows into training and inflate the score.
# cross_val_score clones the pipeline, so model_gb itself is left unfitted here.
cv_gb = cross_val_score(
    make_imb_pipeline(StandardScaler(), SMOTE(random_state=42), model_gb),
    X_train, y_train, cv=cv, scoring='accuracy',
).mean()

# Final fit on the SMOTE-balanced training data; evaluate on the untouched test split.
model_gb.fit(X_train_res, y_train_res)
y_pred_gb = model_gb.predict(X_test_scaled)

print("\n🔸 Gradient Boosting")
print("CV Accuracy:", cv_gb)
print("Test Accuracy:", accuracy_score(y_test, y_pred_gb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gb))
print("Classification Report:\n", classification_report(y_test, y_pred_gb))

In [None]:
# Support Vector Machine
model_svm = SVC()

# Cross-validate on the training split only. Scaling and SMOTE sit inside the
# pipeline, so they are refitted on each fold's training part and every
# validation fold contains only real, unseen rows (imblearn skips the sampler
# at predict time). Resampling before splitting would leak synthetic
# neighbours of validation rows into training and inflate the score.
# cross_val_score clones the pipeline, so model_svm itself is left unfitted here.
cv_svm = cross_val_score(
    make_imb_pipeline(StandardScaler(), SMOTE(random_state=42), model_svm),
    X_train, y_train, cv=cv, scoring='accuracy',
).mean()

# Final fit on the SMOTE-balanced training data; evaluate on the untouched test split.
model_svm.fit(X_train_res, y_train_res)
y_pred_svm = model_svm.predict(X_test_scaled)

print("\n🔸 SVM")
print("CV Accuracy:", cv_svm)
print("Test Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("Classification Report:\n", classification_report(y_test, y_pred_svm))


In [None]:
# Gaussian Naive Bayes
model_nb = GaussianNB()

# Cross-validate on the training split only. Scaling and SMOTE sit inside the
# pipeline, so they are refitted on each fold's training part and every
# validation fold contains only real, unseen rows (imblearn skips the sampler
# at predict time). Resampling before splitting would leak synthetic
# neighbours of validation rows into training and inflate the score.
# cross_val_score clones the pipeline, so model_nb itself is left unfitted here.
cv_nb = cross_val_score(
    make_imb_pipeline(StandardScaler(), SMOTE(random_state=42), model_nb),
    X_train, y_train, cv=cv, scoring='accuracy',
).mean()

# Final fit on the SMOTE-balanced training data; evaluate on the untouched test split.
model_nb.fit(X_train_res, y_train_res)
y_pred_nb = model_nb.predict(X_test_scaled)

print("\n🔸 Naive Bayes")
print("CV Accuracy:", cv_nb)
print("Test Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))
print("Classification Report:\n", classification_report(y_test, y_pred_nb))


In [None]:
# K-Nearest Neighbours
model_knn = KNeighborsClassifier()

# Cross-validate on the training split only. Scaling and SMOTE sit inside the
# pipeline, so they are refitted on each fold's training part and every
# validation fold contains only real, unseen rows (imblearn skips the sampler
# at predict time). Resampling before splitting would leak synthetic
# neighbours of validation rows into training and inflate the score, which
# hits a distance-based model like KNN particularly hard.
# cross_val_score clones the pipeline, so model_knn itself is left unfitted here.
cv_knn = cross_val_score(
    make_imb_pipeline(StandardScaler(), SMOTE(random_state=42), model_knn),
    X_train, y_train, cv=cv, scoring='accuracy',
).mean()

# Final fit on the SMOTE-balanced training data; evaluate on the untouched test split.
model_knn.fit(X_train_res, y_train_res)
y_pred_knn = model_knn.predict(X_test_scaled)

print("\n🔸 KNN")
print("CV Accuracy:", cv_knn)
print("Test Accuracy:", accuracy_score(y_test, y_pred_knn))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))
print("Classification Report:\n", classification_report(y_test, y_pred_knn))


In [None]:
# XGBoost
model_xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Cross-validate on the training split only. Scaling and SMOTE sit inside the
# pipeline, so they are refitted on each fold's training part and every
# validation fold contains only real, unseen rows (imblearn skips the sampler
# at predict time). Resampling before splitting would leak synthetic
# neighbours of validation rows into training and inflate the score.
# cross_val_score clones the pipeline, so model_xgb itself is left unfitted here.
cv_xgb = cross_val_score(
    make_imb_pipeline(StandardScaler(), SMOTE(random_state=42), model_xgb),
    X_train, y_train, cv=cv, scoring='accuracy',
).mean()

# Final fit on the SMOTE-balanced training data; evaluate on the untouched test split.
model_xgb.fit(X_train_res, y_train_res)
y_pred_xgb = model_xgb.predict(X_test_scaled)

print("\n🔸 XGBoost")
print("CV Accuracy:", cv_xgb)
print("Test Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))


In [None]:
# MLP (one hidden layer of 100 units, seeded for reproducible weight init)
model_mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)

# Cross-validate on the training split only. Scaling and SMOTE sit inside the
# pipeline, so they are refitted on each fold's training part and every
# validation fold contains only real, unseen rows (imblearn skips the sampler
# at predict time). Resampling before splitting would leak synthetic
# neighbours of validation rows into training and inflate the score.
# cross_val_score clones the pipeline, so model_mlp itself is left unfitted here.
cv_mlp = cross_val_score(
    make_imb_pipeline(StandardScaler(), SMOTE(random_state=42), model_mlp),
    X_train, y_train, cv=cv, scoring='accuracy',
).mean()

# Final fit on the SMOTE-balanced training data; evaluate on the untouched test split.
model_mlp.fit(X_train_res, y_train_res)
y_pred_mlp = model_mlp.predict(X_test_scaled)

print("\n🔸 MLP (Neural Network)")
print("CV Accuracy:", cv_mlp)
print("Test Accuracy:", accuracy_score(y_test, y_pred_mlp))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_mlp))
print("Classification Report:\n", classification_report(y_test, y_pred_mlp))


In [None]:
# LightGBM
model_lgb = LGBMClassifier()

# Cross-validate on the training split only. Scaling and SMOTE sit inside the
# pipeline, so they are refitted on each fold's training part and every
# validation fold contains only real, unseen rows (imblearn skips the sampler
# at predict time). Resampling before splitting would leak synthetic
# neighbours of validation rows into training and inflate the score.
# cross_val_score clones the pipeline, so model_lgb itself is left unfitted here.
cv_lgb = cross_val_score(
    make_imb_pipeline(StandardScaler(), SMOTE(random_state=42), model_lgb),
    X_train, y_train, cv=cv, scoring='accuracy',
).mean()

# Final fit on the SMOTE-balanced training data; evaluate on the untouched test split.
model_lgb.fit(X_train_res, y_train_res)
y_pred_lgb = model_lgb.predict(X_test_scaled)

print("\n🔸 LightGBM")
print("CV Accuracy:", cv_lgb)
print("Test Accuracy:", accuracy_score(y_test, y_pred_lgb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lgb))
print("Classification Report:\n", classification_report(y_test, y_pred_lgb))


In [None]:
# CatBoost (verbose=0 silences the per-iteration training log)
model_cat = CatBoostClassifier(verbose=0)

# Cross-validate on the training split only. Scaling and SMOTE sit inside the
# pipeline, so they are refitted on each fold's training part and every
# validation fold contains only real, unseen rows (imblearn skips the sampler
# at predict time). Resampling before splitting would leak synthetic
# neighbours of validation rows into training and inflate the score.
# cross_val_score clones the pipeline, so model_cat itself is left unfitted here.
cv_cat = cross_val_score(
    make_imb_pipeline(StandardScaler(), SMOTE(random_state=42), model_cat),
    X_train, y_train, cv=cv, scoring='accuracy',
).mean()

# Final fit on the SMOTE-balanced training data; evaluate on the untouched test split.
model_cat.fit(X_train_res, y_train_res)
y_pred_cat = model_cat.predict(X_test_scaled)

print("\n🔸 CatBoost")
print("CV Accuracy:", cv_cat)
print("Test Accuracy:", accuracy_score(y_test, y_pred_cat))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_cat))
print("Classification Report:\n", classification_report(y_test, y_pred_cat))


In [None]:
# Gather each model's mean cross-validation accuracy under its display name,
# in the same order the models were evaluated.
model_names = [
    "Logistic Regression",
    "Decision Tree",
    "Random Forest",
    "Gradient Boosting",
    "SVM",
    "Naive Bayes",
    "KNN",
    "XGBoost",
    "MLP",
    "LightGBM",
    "CatBoost",
]
cv_scores = [
    cv_lr, cv_dt, cv_rf, cv_gb, cv_svm, cv_nb,
    cv_knn, cv_xgb, cv_mlp, cv_lgb, cv_cat,
]
accuracy_summary = dict(zip(model_names, cv_scores))

# One bar per model; x labels are tilted so the longer names do not overlap.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x=list(accuracy_summary.keys()),
    y=list(accuracy_summary.values()),
    ax=ax,
)
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_ylabel("Cross-Validation Accuracy")
ax.set_title("5-Fold CV Accuracy Comparison (After SMOTE)")
fig.tight_layout()
plt.show()
