# CIS4930 -- Final Project
## Developed by: Chloe Fandino (Team Leader), Ashley James, Madelyne Wirbel, Chloe Nolan, Christopher Enlow

## Data Preprocessing

### Imports

In [None]:
# imports here :)

# TODO: DELETE ---> any imports that don't end up getting used by the end of the project !!!!

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats.mstats import winsorize

from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier, plot_tree

from matplotlib.colors import ListedColormap
import matplotlib.patches as mpatches

from imblearn.over_sampling import SMOTE, SMOTENC

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from xgboost import XGBClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel

### Loading the data

In [None]:
# Read the Online News Popularity CSV (expected in the working directory) into a DataFrame.
df = pd.read_csv('OnlineNewsPopularity.csv') # import the data from the csv file --> convert to df

### Exploration of the dataset

In [None]:
# pd.set_option('display.max_rows', 100) # for purposes of looking at data --> need to see all rows

# basic initial look at the dataset: (rows, columns) first
print(df.shape)

# DataFrame.info() writes its summary (dtypes, non-null counts) itself and returns None,
# so it is called directly -- wrapping it in print() appended a stray "None" line.
df.info()

In [None]:
# Strip whitespace from column names
# (later cells address columns by bare names such as 'shares' and 'n_tokens_content')
df.columns = df.columns.str.strip()

In [None]:
# Column names after whitespace stripping, as a plain Python list.
print(df.columns.tolist()) # print out all of the available columns

In [None]:
# Per-column count of missing values (shown as the cell's output).
df.isnull().sum() # null values? --> NONE :)

In [None]:
# Number of rows that are exact duplicates of an earlier row (shown as the cell's output).
df.duplicated().sum() # duplicate values? --> NONE :)

In [None]:
# check if any infinities exist in the dataframe
# Only numeric columns can hold +/-inf, so restrict the scan to them.
numeric_df = df.select_dtypes(include=[np.number])

# Element-wise inf mask, computed once and reused for both the global flag and the per-column list.
_inf_mask = np.isinf(numeric_df.to_numpy())

has_inf = _inf_mask.any()
print(has_inf) # will need to handle in cleaning

# Reduce along rows (axis=0) to find which columns contain at least one inf.
inf_cols = numeric_df.columns[_inf_mask.any(axis=0)].tolist()
print("Columns with inf:", inf_cols)

#### Visualization of the target variable --> shares

In [None]:
# histogram of shares
# The count axis is log-scaled so the sparse long tail of very large share counts stays visible.
share_counts = df["shares"]

plt.figure(figsize=(8, 5))
plt.hist(share_counts, bins=50)
plt.yscale("log")  # long tail
plt.xlabel("Shares")
plt.ylabel("Count")
plt.title("Distribution of Shares (Raw Scale)")
plt.tight_layout()
plt.show()

In [None]:
# boxplot of raw shares --> view of the outliers
# Passing the column as x draws a horizontal box, so the share counts run along the x-axis.
sns.boxplot(x = df["shares"])
plt.title("Boxplot of Shares (Visualization of Outliers)")
# Label the value axis; the y-axis of a single horizontal boxplot carries no data.
plt.xlabel("Shares")
plt.show()

#### Visualizations of numerical features

In [None]:
# distribution of key numerical features
# One 50-bin histogram per column, each in its own figure.
key_cols = ["n_tokens_title", "n_tokens_content", "num_imgs", "num_hrefs"]

for col in key_cols:
    column_values = df[col]
    plt.figure(figsize=(8, 5))
    plt.hist(column_values, bins=50)
    plt.xlabel(col)
    plt.ylabel("Count")
    plt.title(f"Distribution of {col}")
    plt.tight_layout()
    plt.show()

### Cleaning the data

In [None]:
# Drop the non-predictive columns: 'url' identifies the article and 'timedelta' is the number of
# days between publication and dataset acquisition. errors='ignore' makes re-running the cell safe.
df = df.drop(columns=['url', 'timedelta'], errors='ignore')

# feature engineering
# The +1 keeps the denominator non-zero for articles with no content tokens.
token_count_plus_one = df['n_tokens_content'] + 1
df['rate_positive_words'] = df['global_rate_positive_words'].div(token_count_plus_one)
df['rate_negative_words'] = df['global_rate_negative_words'].div(token_count_plus_one)

# Strength of the overall sentiment regardless of its sign.
df['emotional_polarity'] = df['global_sentiment_polarity'].abs()

# The 0.01 offset keeps the denominator away from zero when the body polarity is exactly 0.
body_polarity_offset = df['global_sentiment_polarity'] + 0.01
df['title_body_sentiment_ratio'] = df['title_sentiment_polarity'] / body_polarity_offset

In [None]:
# Binarise the target: articles strictly above the median share count form the positive class.
median_shares = df['shares'].median()
print(f"Splitting data at median shares: {median_shares}")


def categorize(x):
    """Return 1 if x is strictly greater than median_shares, otherwise 0."""
    return int(x > median_shares)


df['y'] = df['shares'].apply(categorize)

# define X and y
# 'shares' is removed from the features because the label 'y' is derived directly from it.
y = df['y']
X = df.drop(columns=['shares', 'y'])

# Indicator columns are identified by substrings of their names.
_binary_markers = ("data_channel", "weekday", "is_weekend")
binary_cols = [col for col in X.columns if any(marker in col for marker in _binary_markers)]

# ensure binary cols are actually integers
for col in binary_cols:
    X[col] = X[col].astype(int)

# Everything that is not an indicator column is treated as numeric.
_binary_lookup = set(binary_cols)
numeric_cols = [col for col in X.columns if col not in _binary_lookup]

In [None]:
# --> test-train-split <-- DO NOT EDIT
# Holds out 20% of the rows as the test set. stratify=y keeps the class proportions of y the same
# in both splits, and random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
#CHLOE pearson correlation coefficient
# Computed on the training split only, so the test split does not influence which columns are dropped.
corr_matrix = X_train[numeric_cols].corr(method='pearson').abs()

# visualization of highly correlated features
_heatmap_style = dict(
    cmap='coolwarm',
    annot=False,
    linewidths=0.3,
    cbar_kws={"shrink": 0.8},
    square=True,
)
sns.heatmap(corr_matrix, **_heatmap_style)
plt.title("Pearson Correlation Heatmap of Numeric Features (absolute value)")
plt.show()

# Keep only the strict upper triangle so every feature pair is looked at exactly once.
_upper_triangle = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(_upper_triangle)

# A column is dropped when it correlates above 0.85 (absolute) with any column listed before it.
to_drop = [column for column in upper.columns if (upper[column] > 0.85).any()]

print(f"Dropping {len(to_drop)} columns due to correlation: {to_drop}")

# Apply the same removal to both splits and to the numeric-column bookkeeping list.
_dropped_lookup = set(to_drop)
X_train = X_train.drop(columns=to_drop)
X_test = X_test.drop(columns=to_drop)
numeric_cols = [c for c in numeric_cols if c not in _dropped_lookup]

#### Anomaly Detection

In [None]:
# function to visualize and count anomalies
def anomaly_detection(feature, visualize, data=None):
    """Count IQR-rule outliers in one column and optionally draw its boxplot.

    Parameters
    ----------
    feature : column label
        Column to inspect.
    visualize : bool
        When truthy, show a purple seaborn boxplot of the column before counting.
    data : pandas.DataFrame, optional
        Frame to inspect. Defaults to the notebook-level ``df``, so existing
        two-argument calls behave exactly as before.

    Returns
    -------
    int
        Number of rows whose value lies below ``Q1 - 1.5 * IQR`` or above
        ``Q3 + 1.5 * IQR``. The offending rows are also printed.
    """
    frame = df if data is None else data
    values = frame[feature]

    # first boxplot to see potential outliers
    if visualize:
        sns.boxplot(x = values, color = 'purple')
        plt.title(feature)
        plt.show()

    # second calculate outliers based on IQR
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)

    IQR = Q3 - Q1

    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    anomalies = frame[(values < lower) | (values > upper)]
    print('Anomalies: \n', anomalies) # prints a list of potential anomalies

    num_anomalies = anomalies.shape[0]
    return num_anomalies

In [None]:
# BEFORE OUTLIER HANDLING --> visualize outliers
cols_remaining = df.columns.tolist() # what columns are left in the dataset

# Draw the boxplot and collect the IQR-outlier count for every remaining column
# --> generally high rates of anomalies, so check whether any adjustments are needed.
num_anomalies_1 = [anomaly_detection(col, True) for col in cols_remaining]

#### Scaling

In [None]:
#CHLOE handling skew and outliers
# The scaler is fitted on the training split only and then applied to both splits,
# so no test-set statistics are used for scaling.
scaler = RobustScaler()
scaler.fit(X_train[numeric_cols])
X_train[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# used later
# Snapshots of the scaled data taken before imputation, SMOTE and feature selection.
X_train_original, X_test_original = X_train.copy(), X_test.copy()
y_train_original = y_train.copy()
y_train_original = y_train.copy()

#### Smote

In [None]:
#CHLOE TRAIN SMOTE NAN
# Turn +/-inf into NaN so the imputer below can fill them.
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)

# Median imputation, fitted on the training split only.
# The imputer returns plain arrays; the frames are rebuilt with their original row index so that
# X_train / X_test stay label-aligned with y_train / y_test (previously the index was reset to 0..n-1).
imputer = SimpleImputer(strategy='median')
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

#CLASS BALANCING (SMOTE)
# SMOTENC needs the positional indices of the categorical (binary indicator) columns.
# Only the training split is resampled; the test split keeps its natural class distribution.
cat_indices = [X_train.columns.get_loc(c) for c in binary_cols if c in X_train.columns]
sm = SMOTENC(categorical_features=cat_indices, random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

#### Feature selection

In [None]:
# CHLOE FEATURE SELECTION
# A random forest fitted on the resampled training data ranks the features by importance.
selector_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    n_jobs=-1,
    random_state=42,
)
selector_model.fit(X_train_res, y_train_res)

# prefit=True reuses the forest fitted above; features whose importance is at least
# 1.25x the median importance are kept.
selection = SelectFromModel(selector_model, threshold="1.25*median", prefit=True)

# Transform
# Names of the surviving columns, shared by the train and test frames.
_kept_columns = X_train_res.columns[selection.get_support()]
X_train = pd.DataFrame(selection.transform(X_train_res), columns=_kept_columns)
X_test = pd.DataFrame(selection.transform(X_test), columns=_kept_columns)

# From here on the training labels are the SMOTE-resampled ones, matching X_train row for row.
y_train = y_train_res

selected_features_names = X_train.columns
print(f"Selected {len(selected_features_names)} features.")

## Training and Testing

In [None]:
# only need to declare the variable once
# Display names for classes 0 and 1, in that order; reused by every report and confusion matrix below.
labels = ['Not Viral', 'Viral']

Note: the grid searches below have been commented out to reduce runtime. The best hyperparameters each search found are recorded in a comment at the bottom of its cell.

#### 1. Logistic Regression Model

In [None]:
# # Logistic Regression hyperparameter grid
# lr_params = {
#     'C': [0.01, 0.1, 1, 10],
#     'solver': ['lbfgs'],
#     'penalty': ['l2'],
#     'max_iter': [1000]
# }

# lr_grid = GridSearchCV(
#     LogisticRegression(),
#     lr_params,
#     cv=5,
#     scoring='accuracy',
#     n_jobs=-1
# )

# lr_grid.fit(X_train, y_train.to_numpy())

# print("Best LR Params:", lr_grid.best_params_)
# print("Best LR Score:", lr_grid.best_score_)

# # Final tuned LR model
# lr_best = lr_grid.best_estimator_
# y_pred_lr_tuned = lr_best.predict(X_test)

# # Best LR Params: {'C': 0.1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'lbfgs'}

In [None]:
# Logistic regression configured with the best hyperparameters recorded from the grid search.
best_lr_model = LogisticRegression(
    penalty='l2',
    C=0.1,
    solver='lbfgs',
    max_iter=1000,
    random_state=42,
)

# fit the model on the resampled, feature-selected training data
_lr_targets = y_train.to_numpy().ravel()
best_lr_model.fit(X_train, _lr_targets)

# make predictions for the held-out test set
y_pred_lr = best_lr_model.predict(X_test)

In [None]:
# evaluate logistic regression performance on the held-out test set
print("Classification Report:\n")
_lr_report = classification_report(y_test.values.ravel(), y_pred_lr, target_names=labels)
print(_lr_report)

# Confusion matrix: rows are the actual classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred_lr)
sns.heatmap(
    cm,
    annot = True,
    fmt = 'd',
    cmap = 'Purples',
    xticklabels = labels,
    yticklabels = labels,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

#### 2. KNN

In [None]:
# # find the optimal value of k
# k_values = range(1, 40)
# accuracy_scores = []

# for k in k_values:
#     # Build KNN classifier for each k
#     knn = KNeighborsClassifier(n_neighbors = k, weights = 'uniform')
#     knn.fit(X_train, y_train.to_numpy())
    
#     # Predict on test set
#     pred = knn.predict(X_test)
    
#     # Calculate accuracy
#     acc = accuracy_score(y_test.to_numpy(), pred)
#     accuracy_scores.append(acc)

# plt.plot(k_values, accuracy_scores, marker='o', linestyle='dashed', color='green')
# plt.xlabel("K Value")
# plt.ylabel("Accuracy")
# plt.title("Finding Optimal K for Binary Classification")
# plt.grid(True)
# plt.show()

# best_k = k_values[np.argmax(accuracy_scores)]
# print(f"Best K found: {best_k} with Accuracy: {max(accuracy_scores):.4f}")

# # Best K found: 31

In [None]:
# train knn with the k recorded from the search above (31), using unweighted votes
knn_best = KNeighborsClassifier(n_neighbors=31, weights='uniform')

# Fit the model
_knn_targets = y_train.to_numpy().ravel()
knn_best.fit(X_train, _knn_targets)

# Prediction
y_pred_knn = knn_best.predict(X_test)

# visualize success of knn (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred_knn)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Purples',
    xticklabels=labels,
    yticklabels=labels,
)
plt.title("KNN Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Evaluation
print(f"--- FINAL KNN RESULTS (K=31) ---")
print("Classification Report:\n")
_knn_report = classification_report(y_test.values.ravel(), y_pred_knn, target_names=labels)
print(_knn_report)

In [None]:
# ROC curve for the tuned KNN model, using the predicted probability of the positive class.
y_prob_knn = knn_best.predict_proba(X_test)[:, 1]

fpr_knn, tpr_knn, thresholds_knn = roc_curve(y_test.values.ravel(), y_prob_knn)

roc_auc_knn = auc(fpr_knn, tpr_knn)

plt.figure(figsize=(8, 6))
# The legend reads k from the fitted model so it cannot drift from the value actually used
# (it previously hard-coded k=37 while the model was trained with n_neighbors=31).
plt.plot(fpr_knn, tpr_knn, color='blue', lw=2,
         label=f'KNN (k={knn_best.n_neighbors}) AUC = {roc_auc_knn:.4f}')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # chance-level diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve: K-Nearest Neighbors')
plt.legend(loc="lower right")
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# PCA visualization
# Works on the scaled snapshots taken before smote + feature selection.
X_train_raw_df = pd.DataFrame(X_train_original)
y_train_raw_df = pd.Series(y_train_original)

# Subsample 3000 training rows (fixed seed) and pick their labels by index.
X_vis = X_train_raw_df.sample(3000, random_state=42)
y_vis = y_train_raw_df.loc[X_vis.index]

# PCA: fit the 2-D projection on the subsample, then project the test split with it.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_vis)
X_test_pca = pca.transform(X_test_original)

# KNN for decision boundary (1 nearest neighbour, trained in the projected 2-D space)
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X_train_pca, y_vis.values.ravel())

# Meshgrid covering the projected training points with a margin of 1 and a step of h.
h = 1
pc1_values, pc2_values = X_train_pca[:, 0], X_train_pca[:, 1]
x_min, x_max = pc1_values.min() - 1, pc1_values.max() + 1
y_min, y_max = pc2_values.min() - 1, pc2_values.max() + 1

xx, yy = np.meshgrid(
    np.arange(x_min, x_max, h),
    np.arange(y_min, y_max, h)
)

# Classify every grid node, then fold the predictions back into the grid's shape.
_grid_points = np.column_stack([xx.ravel(), yy.ravel()])
Z = clf.predict(_grid_points).reshape(xx.shape)

# Plot: shaded regions are the predicted class, dots are the test points coloured by true class.
_region_colors = ListedColormap(['#FFCCCC', '#CCCCFF'])
_point_colors = ListedColormap(['#FF0000', '#0000FF'])

plt.figure(figsize=(10, 6))
plt.contourf(xx, yy, Z, cmap=_region_colors, alpha=0.3)
plt.scatter(
    X_test_pca[:, 0],
    X_test_pca[:, 1],
    c=y_test.values.ravel(),
    cmap=_point_colors,
    edgecolor='k',
    s=30,
    alpha=0.7,
)
plt.title("KNN Decision Boundary (PCA of Original Data)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()


In [None]:
# NOTE: this cell used to contain the stray text `run() test`, which is a syntax error and
# refers to a name (`run`) that is never defined in this notebook. It intentionally holds no code.

#### 3. Random Forest Classifier

In [None]:
# # Random Forest hyperparameter grid
# rf_params = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [10, 15, 20],
#     'max_features': ['sqrt', 'log2']
# }

# rf_grid = GridSearchCV(
#     RandomForestClassifier(random_state=42, n_jobs=-1),
#     rf_params,
#     cv=3,
#     scoring='accuracy',
#     n_jobs=-1
# )

# rf_grid.fit(X_train, y_train.to_numpy())

# print("Best RF Params:", rf_grid.best_params_)
# print("Best RF Score:", rf_grid.best_score_)

# # Final tuned RF model
# rf_best = rf_grid.best_estimator_
# y_pred_rf_tuned = rf_best.predict(X_test)

# # Best RF Params: {'max_depth': 15, 'max_features': 'sqrt', 'n_estimators': 300}

In [None]:
# Random Forest with the tuned parameters recorded from the grid search
best_rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=15,
    max_features='sqrt',
    class_weight='balanced',
    random_state=42,
)
best_rf_model.fit(X_train, y_train.to_numpy().ravel())

y_pred_rf = best_rf_model.predict(X_test)

# Evaluation
_rf_truth = y_test.values.ravel()
print("\nClassification Report:\n")
print(classification_report(_rf_truth, y_pred_rf, target_names=labels))

# Confusion matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(_rf_truth, y_pred_rf)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=labels,
    yticklabels=labels,
)
plt.title("Random Forest Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Feature importance: the 15 features with the highest importance in the fitted forest
importances = best_rf_model.feature_importances_
feature_names = X_train.columns
feature_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
    .head(15)
)
print(feature_df)

In [None]:
# Get feature importance from the trained model
importances = best_rf_model.feature_importances_

# Create a DataFrame to organize them
# X_train is normally a DataFrame; if it is a numpy array (no .columns attribute),
# fall back to generic names Feature_0, Feature_1, etc.
try:
    feature_names = X_train.columns
except AttributeError:
    # Only the missing-attribute case is handled; any other error should surface.
    feature_names = [f"Feature_{i}" for i in range(X_train.shape[1])]

# One row per feature, most important first.
feature_imp_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Plot Top 20 Features
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=feature_imp_df.head(20), palette='viridis')
plt.title("Top 20 Features driving the Random Forest")
plt.xlabel("Importance Score")
plt.ylabel("Feature Name")
plt.show()

print("Top 5 Most Important Features:")
print(feature_imp_df.head(5))

#### 4. XGBoost

In [None]:
# # Define the model
# xgb = XGBClassifier(random_state=42, objective='binary:logistic', n_jobs=-1)

# # Define hyperparameter grid
# xgb_param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [3, 6, 10],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'subsample': [0.7, 0.8, 1.0],
#     'colsample_bytree': [0.7, 0.8, 1.0]
# }

# # Grid search with CV
# xgb_grid = GridSearchCV(
#     xgb,
#     xgb_param_grid,
#     cv=3,
#     scoring='accuracy',
#     n_jobs=-1
# )

# xgb_grid.fit(X_train, y_train)

# print("Best XGBoost Params:", xgb_grid.best_params_)
# print("Best XGBoost Score:", xgb_grid.best_score_)

# # Best XGBoost Params: {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 200, 'subsample': 0.8}

In [None]:
# XGBoost classifier configured with the best hyperparameters recorded from the grid search.
_xgb_settings = dict(
    n_estimators=200,
    max_depth=10,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.7,
    objective='binary:logistic',
    random_state=42,
    n_jobs=-1,
)
xgb_best_model = XGBClassifier(**_xgb_settings)

xgb_best_model.fit(X_train, y_train)

y_pred_xgb = xgb_best_model.predict(X_test)

# evaluate on the held-out test set
print("\nClassification Report:\n")
_xgb_report = classification_report(y_test, y_pred_xgb, target_names=labels)
print(_xgb_report)

# Confusion matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred_xgb)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
)
plt.title("XGBoost Binary Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

#### 5. Stacking

In [None]:
# from sklearn.model_selection import GridSearchCV

# estimators = [
#     ('rf', best_rf_model),  # from tuned Random Forest
#     ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
#     ('lr', LogisticRegression(random_state=42))
# ]

# stacking_model = StackingClassifier(
#     estimators=estimators,
#     final_estimator=LogisticRegression(),
#     n_jobs=-1
# )

# param_grid = {
#     'final_estimator__C': [0.01, 0.1, 1, 10]
# }

# grid = GridSearchCV(stacking_model, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
# grid.fit(X_train, y_train.values.ravel())

# stack_best = grid.best_estimator_
# y_pred_stack = stack_best.predict(X_test)

# print("Best C for final estimator:", grid.best_params_['final_estimator__C'])

# # Best C for final estimator: 1

In [None]:
# Base learners for the stack: the tuned random forest plus a gradient-boosting model
# and a logistic regression.
_gb_learner = GradientBoostingClassifier(n_estimators=100, random_state=42)
_lr_learner = LogisticRegression(random_state=42)
estimators = [
    ('rf', best_rf_model),  # tuned Random Forest
    ('gb', _gb_learner),
    ('lr', _lr_learner),
]

# The meta-learner is a logistic regression with C=1, the value recorded from the grid search.
_meta_learner = LogisticRegression(C=1, random_state=42)
stacking_best_model = StackingClassifier(
    estimators=estimators,
    final_estimator=_meta_learner,
    n_jobs=-1,
)

# Fit model
stacking_best_model.fit(X_train, y_train.to_numpy().ravel())

y_pred_stack = stacking_best_model.predict(X_test)

# Classification report
_stack_truth = y_test.values.ravel()
print("\nClassification Report:\n")
print(classification_report(_stack_truth, y_pred_stack, target_names=labels))

# Confusion matrix (rows: actual class, columns: predicted class)
cm_stack = confusion_matrix(_stack_truth, y_pred_stack)
plt.figure(figsize=(6, 5))
sns.heatmap(
    cm_stack,
    annot=True,
    fmt='d',
    cmap='Purples',
    xticklabels=labels,
    yticklabels=labels,
)
plt.title("Stacking Classifier Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# ROC curve for the stacking classifier, from its predicted probability of the positive class.
_stack_probabilities = stacking_best_model.predict_proba(X_test)
y_probability_stacking = _stack_probabilities[:, 1]

fpr_stack, tpr_stack, thresholds_stack = roc_curve(
    y_test.values.ravel(),
    y_probability_stacking,
)
roc_auc_stack = auc(fpr_stack, tpr_stack)

# Plot
plt.figure(figsize=(8, 6))
plt.plot(
    fpr_stack,
    tpr_stack,
    color='purple',
    lw=2,
    label=f'Stacking AUC = {roc_auc_stack:.4f}',
)
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # chance-level diagonal
plt.title('ROC Curve: Stacking Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.grid(True, alpha=0.3)
plt.show()

#### 6. Support Vector Machine

In [None]:
# # define parameter grid
# param_grid = {
#     'C': [0.01, 0.1, 1, 10, 100],
#     'loss': ['hinge', 'squared_hinge'],
#     'max_iter': [1000, 5000, 10000]
# }

# # GridSearchCV
# grid = GridSearchCV(
#     estimator=LinearSVC(random_state=42),
#     param_grid=param_grid,
#     cv=5,                  # 5-fold cross-validation
#     scoring='accuracy',    # you can also try 'f1' if dataset is imbalanced
#     n_jobs=-1
# )

# # fit
# grid.fit(X_train, y_train.values.ravel())

# # best model
# best_svm = grid.best_estimator_

# print("Best hyperparameters:", grid.best_params_)

# # Best hyperparameters: {'C': 0.1, 'loss': 'squared_hinge', 'max_iter': 1000}


In [None]:
# Linear SVM configured with the best hyperparameters recorded from the grid search.
best_svm_model = LinearSVC(
    loss='squared_hinge',
    C=0.1,
    max_iter=1000,
    random_state=42,
)
best_svm_model.fit(X_train, y_train.to_numpy().ravel())

y_pred_svm = best_svm_model.predict(X_test)

# Evaluate on the held-out test set.
_svm_truth = y_test.values.ravel()
print("\nClassification Report:\n")
print(classification_report(_svm_truth, y_pred_svm, target_names=labels))

# Confusion matrix (rows: actual class, columns: predicted class)
cm_svm = confusion_matrix(_svm_truth, y_pred_svm)
plt.figure(figsize=(6, 5))
sns.heatmap(
    cm_svm,
    annot=True,
    fmt='d',
    cmap='Oranges',
    xticklabels=labels,
    yticklabels=labels,
)
plt.title("LinearSVC Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# ROC curve for the linear SVM.
# These are decision_function scores rather than probabilities; roc_curve only needs
# a score that ranks the samples.
y_probability_svm = best_svm_model.decision_function(X_test)

fpr_svm, tpr_svm, thresholds_svm = roc_curve(
    y_test.values.ravel(),
    y_probability_svm,
)
roc_auc_svm = auc(fpr_svm, tpr_svm)

plt.figure(figsize=(8, 6))
plt.plot(
    fpr_svm,
    tpr_svm,
    color='darkorange',
    lw=2,
    label=f'SVM AUC = {roc_auc_svm:.4f}',
)
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # chance-level diagonal
plt.title('ROC Curve: Support Vector Machine (SVM)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.grid(True, alpha=0.3)
plt.show()

#### 7. Neural Network

In [None]:
# Define nn architecture
class ClassificationNN(nn.Module):
    """Feed-forward classifier with two hidden layers.

    Layer widths are input_dim -> input_dim -> input_dim // 2 -> num_classes,
    with ReLU and dropout after each hidden layer. The forward pass returns
    raw logits (no softmax is applied).
    """

    def __init__(self, input_dim, dropout, num_classes=2):
        super().__init__()
        hidden_dim = input_dim // 2
        # Layers are created in network order and stored under `self.model`.
        layers = [
            nn.Linear(input_dim, input_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the class logits for a batch x with input_dim features per row."""
        logits = self.model(x)
        return logits

In [None]:
# convert data to PyTorch tensors
# Features become float32; labels become int64 (long), the dtype CrossEntropyLoss expects for class indices.
_feature_dtype, _label_dtype = torch.float32, torch.long

X_train_tensor = torch.tensor(X_train.values, dtype=_feature_dtype)
X_test_tensor  = torch.tensor(X_test.values, dtype=_feature_dtype)
y_train_tensor = torch.tensor(y_train.values, dtype=_label_dtype)
y_test_tensor  = torch.tensor(y_test.values, dtype=_label_dtype)

# Pair each feature row with its label so a DataLoader can batch them together.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset  = TensorDataset(X_test_tensor, y_test_tensor)

In [None]:
# # Find best hyperparameters
# param_grid = {
#     "lr": [1e-2, 5e-2, 1e-3],
#     "dropout": [0.2, 0.3, 0.5],
#     "batch_size": [32, 64]
# }

# best_acc = 0
# best_params = None
# best_model = None

# criterion = nn.CrossEntropyLoss()

# for lr in param_grid["lr"]:
#     for dropout in param_grid["dropout"]:
#         for batch in param_grid["batch_size"]:
#             train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=True)

#             model = ClassificationNN(
#                 input_dim=X_train_tensor.shape[1],
#                 dropout=dropout,
#                 num_classes=2
#             )
            
#             optimizer = optim.Adam(model.parameters(), lr=lr)

#             # train for a few epochs
#             model.train()
#             for epoch in range(5):  
#                 for X_batch, y_batch in train_loader:
#                     optimizer.zero_grad()
#                     outputs = model(X_batch)
#                     loss = criterion(outputs, y_batch)
#                     loss.backward()
#                     optimizer.step()

#             # evaluate on test set
#             model.eval()
#             with torch.no_grad():
#                 outputs = model(X_test_tensor)
#                 preds = torch.argmax(outputs, dim=1)
#                 acc = (preds == y_test_tensor).float().mean().item()

#             if acc > best_acc:
#                 best_acc = acc
#                 best_params = (lr, dropout, batch)
#                 best_model = model

# print("Best Neural Network Params:")
# print(f"Learning Rate: {best_params[0]}")
# print(f"Dropout: {best_params[1]}")
# print(f"Batch Size: {best_params[2]}")
# print(f"Validation Accuracy: {best_acc:.4f}")

# # Best Neural Network Params:
# # Learning Rate: 0.001
# # Dropout: 0.2
# # Batch Size: 32
# # Validation Accuracy: 0.6519


In [None]:
# Mini-batch loaders (batch size 32). Training batches are reshuffled on every pass over the
# loader; test batches keep their original order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Seed PyTorch's global RNG before building the network so that weight initialisation (and the
# dropout masks / batch shuffling drawn later from the same generator) is repeatable from run to
# run, in line with random_state=42 used by every other model in this notebook.
torch.manual_seed(42)

# Network sized to the number of selected features, with the dropout rate (0.2) recorded
# from the hyperparameter search.
model = ClassificationNN(X_train_tensor.shape[1], 0.2)

# Adam with the recorded learning rate; cross-entropy loss over the two class logits.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [None]:
# Per-epoch average losses, collected for the learning-curve plot at the end of the cell.
train_losses = []
test_losses = []

n_epochs = 20
for epoch in range(n_epochs):
    # ---- training pass (dropout active) ----
    model.train()
    running_train_loss = 0.0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean, so weight it by the batch size before summing
        batch_rows = X_batch.size(0)
        running_train_loss += loss.item() * batch_rows

    train_loss = running_train_loss / len(train_loader.dataset)
    train_losses.append(train_loss)

    # ---- Test loss (dropout disabled, no gradient tracking) ----
    model.eval()
    running_test_loss = 0.0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            batch_rows = X_batch.size(0)
            running_test_loss += loss.item() * batch_rows

    test_loss = running_test_loss / len(test_loader.dataset)
    test_losses.append(test_loss)

    # Report progress every 10th epoch.
    epoch_number = epoch + 1
    if epoch_number % 10 == 0:
        print(f"Epoch {epoch+1}/{n_epochs} | Train Loss: {train_loss:.4f} | Test Loss: {test_loss:.4f}")

#evaluation
# Predicted class = index of the largest logit for each test row.
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    y_pred = outputs.argmax(dim=1)

_nn_truth = y_test_tensor.numpy()
_nn_predictions = y_pred.numpy()

print("Classification Report:\n")
print(classification_report(_nn_truth, _nn_predictions, target_names=labels))

# Confusion matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(_nn_truth, _nn_predictions)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Learning curves: average cross-entropy per epoch on the training and test sets.
plt.figure(figsize=(8,5))
plt.plot(train_losses, label="Train Loss")
plt.plot(test_losses, label="Test Loss")
plt.title("Training vs Test Loss")
plt.xlabel("Epoch")
plt.ylabel("Cross-Entropy Loss")
plt.legend()
plt.grid(True)
plt.show()

#### 8. Decision Tree

In [None]:
# # Hyperparameter grid for Decision Tree
# dt_params = {
#     'max_depth': [5, 10, 15, 20, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'criterion': ['gini', 'entropy']
# }

# # GridSearchCV to find best parameters
# dt_grid = GridSearchCV(
#     DecisionTreeClassifier(random_state=42),
#     dt_params,
#     cv=5,
#     scoring='accuracy',
#     n_jobs=-1
# )

# # Fit to training data
# dt_grid.fit(X_train, y_train.to_numpy())

# # Best parameters and score
# print("Best Decision Tree Params:", dt_grid.best_params_)
# print("Best Decision Tree Score:", dt_grid.best_score_)

# # Final tuned model
# dt_best = dt_grid.best_estimator_
# y_pred_dt = dt_best.predict(X_test)

# # Best Decision Tree Params: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2}

In [None]:
# Decision tree configured with the best parameters recorded from the grid search
dt_best_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    min_samples_leaf=2,
    min_samples_split=2,
    random_state=42,
)

# Fit the model
dt_best_model.fit(X_train, y_train.to_numpy().ravel())

# Make predictions
y_pred_dt = dt_best_model.predict(X_test)

# Classification report
_dt_truth = y_test.values.ravel()
print("\nClassification Report:\n")
print(classification_report(_dt_truth, y_pred_dt, target_names=labels))

# Confusion matrix (rows: actual class, columns: predicted class)
cm_dt = confusion_matrix(_dt_truth, y_pred_dt)
plt.figure(figsize=(6, 5))
sns.heatmap(
    cm_dt,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=labels,
    yticklabels=labels,
)
plt.title("Decision Tree Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# ROC curve for the tuned decision tree.
# This cell sits in the Decision Tree section but was a copy of the stacking ROC cell and
# re-plotted the stacking model; it now scores dt_best_model and uses its own variable names,
# so the stacking results computed earlier are no longer overwritten.
y_probability_dt = dt_best_model.predict_proba(X_test)[:, 1]

fpr_dt, tpr_dt, thresholds_dt = roc_curve(y_test.values.ravel(), y_probability_dt)

roc_auc_dt = auc(fpr_dt, tpr_dt)

# Plot
plt.figure(figsize=(8, 6))
plt.plot(fpr_dt, tpr_dt, color='green', lw=2, label=f'Decision Tree AUC = {roc_auc_dt:.4f}')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # chance-level diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve: Decision Tree')
plt.legend(loc="lower right")
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# feature importance of the fitted decision tree
importances = dt_best_model.feature_importances_

# Use the DataFrame's column names when available; otherwise generate Feature_0, Feature_1, ...
feature_names = getattr(X_train, 'columns', None)
if feature_names is None:
    feature_names = [f"Feature_{i}" for i in range(X_train.shape[1])]

# One row per feature, most important first.
_importance_table = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_imp_df = _importance_table.sort_values(by='Importance', ascending=False)

# Plot Top 20 Features
_top_twenty = feature_imp_df.head(20)
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=_top_twenty, palette='viridis')
plt.title("Top 20 Features driving Decision Tree")
plt.xlabel("Importance Score")
plt.ylabel("Feature Name")
plt.show()

# Print top 5 features
print("Top 5 Most Important Features:")
print(feature_imp_df.head(5))