# Lab 3 – Improved & Cleaned Notebook

This notebook is rewritten for **better accuracy, correctness, and reproducibility**.

## 1. Imports

In [None]:

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

from imblearn.over_sampling import SMOTE

import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns


## 2. Load Dataset

In [None]:

# Read the full table from disk; the label column is expected to be 'target'.
df = pd.read_csv("data.csv")  # change filename if needed

# Features are every column except the label; the label is kept as a Series.
X = df.drop(columns='target')
y = df['target']

# Quick sanity check: (rows, columns) followed by a preview of the first rows.
print(df.shape)
df.head()


## 3. Train-Test Split

In [None]:

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    test_size=0.2,
)


## 4. Handle Class Imbalance (SMOTE)

In [None]:

# Oversample the training split only; the test split is left untouched so the
# evaluation below is done on the original class distribution.
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

# Show the per-class counts before and after resampling.
for stage_label, stage_target in (("Before SMOTE:", y_train),
                                  ("After SMOTE:", y_train_bal)):
    print(stage_label, stage_target.value_counts())


## 5. Models with Pipelines

In [None]:

# Registry of candidate classifiers, keyed by the display name used when
# reporting results. Insertion order is the order they are trained in.
models = {}

# Logistic regression, SVM and KNN are wrapped in a Pipeline so that the
# StandardScaler is fitted on the same data the classifier is fitted on.
models['Logistic Regression'] = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced'))
])

models['SVM'] = Pipeline([
    ('scaler', StandardScaler()),
    # probability=True enables predict_proba (used by the soft-voting ensemble).
    # The probability estimates are fitted with an internal, randomly shuffled
    # cross-validation, so random_state is pinned to keep runs repeatable.
    ('clf', SVC(kernel='rbf', C=10, gamma='scale',
                class_weight='balanced', probability=True,
                random_state=42))
])

models['KNN'] = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', KNeighborsClassifier(n_neighbors=7))
])

# The tree ensembles are used directly, without a scaling step.
models['Random Forest'] = RandomForestClassifier(
    n_estimators=300,
    class_weight='balanced',
    random_state=42
)

# Row and column subsampling (0.8 each) make training stochastic, hence the seed.
models['XGBoost'] = xgb.XGBClassifier(
    n_estimators=800,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss',
    random_state=42
)


## 6. Train & Evaluate

In [None]:

# Maps model name -> {'accuracy': ..., 'f1': ...}, both measured on the test split.
results = {}

for name, model in models.items():
    # Fit on the SMOTE-balanced training data, then predict the untouched test split.
    preds = model.fit(X_train_bal, y_train_bal).predict(X_test)

    # Macro averaging weights every class equally, regardless of its support.
    acc, f1 = (
        accuracy_score(y_test, preds),
        f1_score(y_test, preds, average='macro'),
    )
    results[name] = {'accuracy': acc, 'f1': f1}

    print(f"\n{name}")
    print("Accuracy:", acc)
    print("Macro F1:", f1)


## 7. Model Ranking (Macro F1)

In [None]:

# Order the (name, scores) pairs from highest to lowest macro F1.
# The sort is stable, so ties keep the order in which the models were trained.
sorted_results = sorted(
    results.items(),
    key=lambda entry: entry[1]['f1'],
    reverse=True,
)

# One line per model: its name followed by its score dict.
for model, scores in sorted_results:
    print(model, scores)


## 8. Confusion Matrix (Best Model)

In [None]:

# Look up the top-ranked model; it was already fitted in the training loop.
best_model_name = sorted_results[0][0]
best_model = models[best_model_name]

y_pred = best_model.predict(X_test)

# Pin the class order explicitly (sorted union of true and predicted labels,
# which is also what confusion_matrix uses by default) so the same labels can
# be drawn on the heatmap axes instead of bare positional indices.
class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Rows are the actual classes, columns the predicted classes.
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_title(best_model_name)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()


## 9. Soft Voting Ensemble

In [None]:

# Soft voting averages the members' predicted class probabilities, weighted
# 2 : 3 : 3 for SVM : Random Forest : XGBoost.
ensemble = VotingClassifier(
    estimators=[(alias, models[key]) for alias, key in (
        ('svm', 'SVM'),
        ('rf', 'Random Forest'),
        ('xgb', 'XGBoost'),
    )],
    voting='soft',
    weights=[2, 3, 3],
)

# Fit on the balanced training data and predict the held-out test split.
ensemble_preds = ensemble.fit(X_train_bal, y_train_bal).predict(X_test)

print("Ensemble Accuracy:", accuracy_score(y_test, ensemble_preds))
print("Ensemble Macro F1:", f1_score(y_test, ensemble_preds, average='macro'))
