In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, confusion_matrix, precision_score, recall_score, classification_report
)
from sklearn.decomposition import PCA

In [2]:
# -------------------------
# Load data and embeddings
# -------------------------
df = pd.read_csv("dataset/2_data_clean.csv")
loaded_embeddings = np.load("dataset/3_embeddings.npy")

# The embeddings are matched to the dataframe purely by row position, so a
# row-count mismatch means features and labels would be misaligned.
if len(df) != len(loaded_embeddings):
    raise ValueError(
        f"Row mismatch: {len(df)} rows in CSV vs "
        f"{len(loaded_embeddings)} embedding rows"
    )

# -------------------------
# Feature preparation
# -------------------------
y = df['label'].values
lengths = df[['combined_len']].to_numpy()  # 2-D column, as StandardScaler expects

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# test rows influence the mean/std applied to the training features (leakage).
# Passing all arrays to a single call keeps their rows aligned; with the same
# random_state and sample count the train/test partition is the same as when
# splitting a pre-stacked feature matrix.
emb_train, emb_test, len_train, len_test, y_train, y_test = train_test_split(
    loaded_embeddings, lengths, y, test_size=0.2, random_state=42
)

# Fit scaling statistics on the training rows only, then reuse them for test.
scaler = StandardScaler()
len_train_scaled = scaler.fit_transform(len_train)
len_test_scaled = scaler.transform(len_test)

# Combine features: embedding columns followed by the scaled length column.
X_train = np.hstack((emb_train, len_train_scaled))
X_test = np.hstack((emb_test, len_test_scaled))

In [3]:
# -------------------------
# Helper functions
# -------------------------
def train_grid_search(model, param_grid, X_train, y_train, scoring="f1_macro", cv=5):
    """Run an exhaustive cross-validated hyper-parameter search.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator to tune.
    param_grid : dict
        Mapping of parameter names to the candidate values to try.
    X_train, y_train : array-like
        Training features and labels the search is fitted on.
    scoring : str, default "f1_macro"
        Metric used to rank parameter combinations.
    cv : int, default 5
        Cross-validation setting forwarded to GridSearchCV.

    Returns
    -------
    GridSearchCV
        The fitted search object. Its best parameters and best CV score are
        printed before returning.
    """
    search = GridSearchCV(
        model,
        param_grid=param_grid,
        scoring=scoring,
        cv=cv,
        n_jobs=-1,  # use every available core
    )
    search.fit(X_train, y_train)

    best_params, best_score = search.best_params_, search.best_score_
    print(f"Best parameters: {best_params}")
    print(f"Best CV score: {best_score:.4f}")
    return search

def evaluate_model(model, X_test, y_test):
    """Print hold-out metrics for a fitted classifier.

    Predicts on ``X_test`` and prints, in order: accuracy, confusion matrix,
    precision, recall and the full classification report. Precision and
    recall are called with their default arguments. Returns nothing.
    """
    predictions = model.predict(X_test)

    # (heading, metric function) pairs, reported in this order. Each metric
    # is computed right before it is printed.
    reports = (
        ("Accuracy:", accuracy_score),
        ("Confusion Matrix:\n", confusion_matrix),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("Classification Report:\n", classification_report),
    )
    for heading, metric in reports:
        print(heading, metric(y_test, predictions))

In [4]:
# -------------------------
# Logistic Regression
# -------------------------
# Both solvers in the grid ('liblinear' and 'saga') shuffle the data while
# optimising, so a fixed seed is needed for the search to be reproducible.
# 42 matches the seed used for the split and the other models.
log = LogisticRegression(max_iter=1000, random_state=42)
param_grid_logreg = {
    "C": [0.01, 0.1, 1],             # inverse regularisation strength
    "penalty": ["l1", "l2"],
    "solver": ["liblinear", "saga"]  # both accept l1 and l2 penalties
}
log_model = train_grid_search(log, param_grid_logreg, X_train, y_train)
evaluate_model(log_model, X_test, y_test)

Best parameters: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Best CV score: 0.9550
Accuracy: 0.9556890724746526
Confusion Matrix:
 [[3801  195]
 [ 159 3834]]
Precision: 0.9516008935219658
Recall: 0.9601803155522164
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.95      0.96      3996
           1       0.95      0.96      0.96      3993

    accuracy                           0.96      7989
   macro avg       0.96      0.96      0.96      7989
weighted avg       0.96      0.96      0.96      7989



In [5]:
# -------------------------
# Linear SVM
# -------------------------
# LinearSVC's dual coordinate descent shuffles the data, so seed it to make
# the grid search repeatable. 42 matches the seed used elsewhere.
svm = LinearSVC(max_iter=2000, random_state=42)
param_grid_svm = {
    "C": [0.01, 0.1, 1, 10, 100],
    "loss": ["hinge", "squared_hinge"]
}
# NOTE(review): the recorded best C (100) sits on the upper edge of this grid;
# consider widening the range and checking for convergence warnings.
svm_model = train_grid_search(svm, param_grid_svm, X_train, y_train)
evaluate_model(svm_model, X_test, y_test)

Best parameters: {'C': 100, 'loss': 'squared_hinge'}
Best CV score: 0.9600
Accuracy: 0.9624483665039429
Confusion Matrix:
 [[3832  164]
 [ 136 3857]]
Precision: 0.9592141258393434
Recall: 0.9659403956924618
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.96      0.96      3996
           1       0.96      0.97      0.96      3993

    accuracy                           0.96      7989
   macro avg       0.96      0.96      0.96      7989
weighted avg       0.96      0.96      0.96      7989



In [6]:
# -------------------------
# Gradient Boosting with PCA
# -------------------------
# Reduce the feature matrix to 100 principal components before boosting.
# The projection is learned from the training split only and then applied
# unchanged to the test split.
pca = PCA(n_components=100, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Default hyper-parameters, seeded; fit() returns the estimator itself.
gb_model = GradientBoostingClassifier(random_state=42).fit(X_train_pca, y_train)
evaluate_model(gb_model, X_test_pca, y_test)

Accuracy: 0.8904744023031669
Confusion Matrix:
 [[3587  409]
 [ 466 3527]]
Precision: 0.8960873983739838
Recall: 0.8832957675932882
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.90      0.89      3996
           1       0.90      0.88      0.89      3993

    accuracy                           0.89      7989
   macro avg       0.89      0.89      0.89      7989
weighted avg       0.89      0.89      0.89      7989

