# Imports

In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell execution so edits to them take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import MaxAbsScaler
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
import hmmlearn.hmm
from sklearn_crfsuite import CRF

In [4]:
## Options
# Show text columns in full instead of truncating them in DataFrame output.
# The fully qualified option name is used on purpose: pandas resolves
# abbreviated names by pattern matching, which can become ambiguous when
# another option containing the same fragment exists.
pd.set_option("display.max_colwidth", None)

In [5]:
# Resolve the directory two levels above the current working directory and
# make it importable, so the `src.*` packages imported below can be found.
project_root = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))

# Guard the append: re-running this cell must not add duplicate sys.path entries.
if project_root not in sys.path:
    sys.path.append(project_root)

print(project_root)

e:\2_LEARNING_BKU\2_File_2\K22_HK242\CO3117_Machine_Learning\Main


In [6]:
from src.features.build_features_utils import *  # Assuming build_features_utils is inside build_features.py
from src.models.models_utils import *  # Assuming utils.py exists inside src/models/

  from .autonotebook import tqdm as notebook_tqdm


# Model and hyperparameter dictionaries

In [21]:
# Dictionary for models
# Maps a model key to a zero-argument callable: either an estimator class or a
# lambda that presets some constructor arguments. The training cells create a
# fresh instance with MODEL_DICT[key]() and look up the matching grid in
# MODEL_PARAMS under the same key.
MODEL_DICT = {
    "decision_tree": DecisionTreeClassifier,
    "perceptron": Perceptron,
    "bayesian": GaussianNB,
    # NOTE(review): 1e-9 appears to be GaussianNB's default var_smoothing, in
    # which case this entry builds the same estimator as "bayesian" — confirm.
    "bayesian_enhanced": lambda: GaussianNB(var_smoothing=1e-9),
    "random_forest": RandomForestClassifier,
    "xgboost": xgb.XGBClassifier,
    "svm": SVC,
    # NOTE(review): MaxAbsScaler comes from sklearn.preprocessing and is a
    # feature scaler, not a classifier; this key cannot be trained like the
    # others. Confirm which estimator was intended here.
    "max_edge_classifier": MaxAbsScaler,
    "kernel_functions_svm": lambda: SVC(kernel='rbf'),
    "soft_margin_svm": lambda: SVC(C=1.0),
    # LDA is disabled as a model; it is still registered in DIMENSIONALITY_REDUCTION_DICT.
    # "lda": LDA,
    "logistic_regression": LogisticRegression,
    # NOTE(review): GaussianHMM and CRF are sequence models; verify they are
    # compatible with the tabular training helper before using these keys.
    "hmm": lambda: hmmlearn.hmm.GaussianHMM(n_components=3),
    "crf": CRF,
}

# Dictionary for model parameters
# Maps each model key (same keys as MODEL_DICT) to a hyperparameter grid:
# parameter name -> list of candidate values. Active grids are deliberately
# small (one value per parameter for most models) to keep runs fast; the
# commented-out grids are the wider search spaces kept for reference.
# NOTE(review): validate the commented-out grids against the installed library
# versions before re-enabling them.
MODEL_PARAMS = {
    "decision_tree": {
        "criterion": ["gini"],
        "max_depth": [10],
        "min_samples_split": [2],
        "min_samples_leaf": [1],
        "max_features": ["sqrt"]
    },

    # "decision_tree": {
    #     "criterion": ["gini", "entropy"],
    #     "max_depth": [10, 20, 30, 40],
    #     "min_samples_split": [2, 5, 10],
    #     "min_samples_leaf": [1, 2, 4],
    #     "max_features": ["auto", "sqrt", "log2"]
    # },

    "perceptron": {
        "max_iter": [1000],
        "tol": [1e-3],
        "eta0": [0.001],
        "penalty": ["l2"],
        "alpha": [0.0001]
    },

    # "perceptron": {
    #     "max_iter": [1000, 2000],
    #     "tol": [1e-3, 1e-4],
    #     "eta0": [0.001, 0.01, 0.1],
    #     "penalty": [None, "l2", "l1"],
    #     "alpha": [0.0001, 0.001, 0.01]
    # },

    "bayesian": {
        # GaussianNB expects `priors` to be None (estimate from the data) or an
        # array of class probabilities; strings such as "uniform" are rejected.
        # [0.5, 0.5] is the explicit uniform prior for the two target classes.
        "priors": [None, [0.5, 0.5]],
        "var_smoothing": [1e-9, 1e-8, 1e-7]
    },

    "bayesian_enhanced": {
        "var_smoothing": [1e-9]
    },

    # "bayesian_enhanced": {
    #     "var_smoothing": [1e-9, 1e-8, 1e-7]
    # },

    "random_forest": {
        "n_estimators": [100],
        "max_depth": [10],
        "min_samples_split": [2],
        "min_samples_leaf": [1],
        "max_features": ["sqrt"]
    },

    # "random_forest": {
    #     "n_estimators": [50, 100, 200],
    #     "max_depth": [None, 10, 20, 30],
    #     "min_samples_split": [2, 5, 10],
    #     "min_samples_leaf": [1, 2, 4],
    #     "max_features": ["auto", "sqrt", "log2"],
    #     "bootstrap": [True, False]
    # },

    "xgboost": {
        "n_estimators": [100],
        "learning_rate": [0.01],
        "max_depth": [10]
    },

    # "xgboost": {
    #     "n_estimators": [100, 200, 300],
    #     "learning_rate": [0.01, 0.1, 0.2],
    #     "max_depth": [3, 6, 10],
    #     "subsample": [0.8, 1.0],
    #     "colsample_bytree": [0.8, 1.0],
    #     "gamma": [0, 0.1, 0.2]
    # },

    "svm": {
        "kernel": ["linear"],
        "C": [0.1]
    },

    # "svm": {
    #     "kernel": ["linear", "rbf", "poly"],
    #     "C": [0.1, 1, 10, 100],
    #     "gamma": [0.1, 0.01, "scale"],
    #     "degree": [2, 3, 4]
    # },

    # NOTE(review): MODEL_DICT maps this key to MaxAbsScaler, which has no
    # "scaler" parameter, so this grid cannot be searched as written. Left
    # unchanged until the intended estimator is confirmed.
    "max_edge_classifier": {
        "scaler": ["maxabs", "standard"]
    },

    "kernel_functions_svm": {
        "kernel": ["rbf", "poly"],
        "C": [1.0, 10.0, 100.0],
        "gamma": ["scale", "auto"]
    },

    "soft_margin_svm": {
        "C": [0.1, 1.0, 10.0]
    },

    "lda": {
        # `n_components` is omitted: LDA allows at most n_classes - 1 components,
        # i.e. 1 for a binary target, so values of 2..5 make every fit fail.
        # The "svd" solver is omitted because it does not support shrinkage.
        "solver": ["lsqr", "eigen"],
        "shrinkage": ["auto", None]
    },

    "logistic_regression": {
        "penalty": ["l2"],
        "max_iter": [1000]
    },

    # "logistic_regression": {
    #     "penalty": ["l1", "l2", "elasticnet", None],
    #     "C": [0.1, 1.0, 10.0],
    #     "solver": ["liblinear", "lbfgs", "saga"],
    #     "max_iter": [1000, 2000]
    # },


    "hmm": {
        "n_components": [2, 3, 4],
        "covariance_type": ["diag", "full", "tied"],
        "n_iter": [100, 200],
        "init_params": ["c", "s", "cs"],
        "params": ["c", "t", "ct"]
    },

    "crf": {
        # sklearn-crfsuite's CRF accepts the algorithms lbfgs, l2sgd, ap, pa and
        # arow, and has no `penalty`, `dual` or `tol` parameters. Regularisation
        # is controlled by c1 (L1) and c2 (L2); c1 is only used by lbfgs, so the
        # grid is restricted to that algorithm.
        "algorithm": ["lbfgs"],
        "c1": [0.0, 0.1],
        "c2": [0.1, 1.0],
        "max_iterations": [100, 200],
    }
}

# Dictionary for dimensionality reduction methods
# Maps a method key to its scikit-learn class (PCA from sklearn.decomposition,
# LDA from sklearn.discriminant_analysis).
# NOTE(review): no visible cell reads this dict; the feature extraction loop
# passes reduce_dim="pca" as a string to FeatureBuilder — confirm whether this
# mapping is still needed.
DIMENSIONALITY_REDUCTION_DICT = {
    "pca": PCA,
    "lda": LDA,
}

# Load dataset

In [8]:
# Load dataset
# Read the cleaned CSV from <project_root>/data/final into a DataFrame.
dataset_path = os.path.join(project_root, "data", "final", "final_clean_no_neutral_no_duplicates.csv")
# NOTE(review): the preview below shows an "Unnamed: 0" column, which suggests
# the CSV was written with its index; consider index_col=0 if that column is unused.
df = pd.read_csv(dataset_path)


In [9]:
# Preview the first rows to check the columns (target, text, text_clean, lengths).
df.head()

Unnamed: 0,target,text,text_clean,text_length,text_clean_length
0,0.0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",switchfoot awww thats a bummer you shoulda got david carr of third day to do it d,19,17
1,0.0,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,is upset that he cant update his facebook by texting it and might cry a a result school today also blah,21,21
2,0.0,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds,kenichan i dived many time for the ball managed to save the rest go out of bound,18,17
3,0.0,my whole body feels itchy and like its on fire,my whole body feel itchy and like it on fire,10,10
4,0.0,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",nationwideclass no it not behaving at all im mad why am i here because i cant see you all over there,21,21


In [10]:
# Replace target 4 with 1
# Rows whose target equals 4 are relabelled as 1; all other values are kept,
# so re-running this cell leaves the column unchanged.
# NOTE(review): presumably 4 is the positive-sentiment label in the raw data,
# making the target binary (0/1) — confirm against the dataset description.
df["target"] = df["target"].replace(4, 1)


In [11]:
# Work on a fixed-size random subset so feature extraction and training stay fast.
N_SAMPLES = 2000     # rows drawn from the full dataset
TEST_SIZE = 0.2      # fraction of the sample held out for testing
RANDOM_STATE = 42    # shared seed so sampling and splitting are reproducible

# Sample up to N_SAMPLES random entries from the dataset. The request is capped
# at the dataset size because DataFrame.sample (without replacement) raises
# when n exceeds the number of rows.
df_sampled = df.sample(n=min(N_SAMPLES, len(df)), random_state=RANDOM_STATE)

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    df_sampled["text_clean"],
    df_sampled["target"],
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Print lengths of splits
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Training labels size: {len(y_train)}")
print(f"Test labels size: {len(y_test)}")


Training set size: 1600
Test set size: 400
Training labels size: 1600
Test labels size: 400


In [12]:
# List of feature extraction methods
# Full list kept for reference:
# feature_methods = ["tfidf", "count", "binary_count", "word2vec", "glove", "bert"]

# Subset of methods actually run in this notebook.
feature_methods = ["count", "word2vec", "glove", "bert"]

# Filled by the feature extraction loop: method name -> DataFrame of training
# features. Re-running this cell resets it to empty.
X_train_features_dict = {}

In [13]:
print("\n🔎 Running feature extraction...\n")

# Keep every fitted FeatureBuilder, keyed by method. Without this the fitted
# object is lost when the loop variable is rebound, and other data (e.g.
# X_test) could not be transformed with the same fitted pipeline later.
feature_builders = {}

for method in tqdm(feature_methods, desc="Feature Extraction Progress"):
    print(f"\n🔍 Processing feature extraction using: {method}...")

    try:
        # Initialize FeatureBuilder; features are reduced to 50 dimensions with PCA
        feature_builder = FeatureBuilder(
            method=method,
            save_dir=os.path.join(project_root, "data", "processed"),
            reduce_dim="pca",
            n_components=50,
        )

        # Fit and transform training data
        X_train_features = feature_builder.fit_transform(X_train.tolist())
        print(f"✅ {method} - X_train_features shape: {X_train_features.shape}")

        # NOTE(review): this DataFrame gets a fresh 0..n-1 index while y_train
        # keeps the sampled rows' original index; confirm the training helper
        # pairs X and y by position rather than by index label.
        X_train_features_dict[method] = pd.DataFrame(X_train_features)
        feature_builders[method] = feature_builder

    except Exception as e:
        # Best effort: one failing method must not stop the remaining ones.
        print(f"❌ Error with {method}: {e}")


🔎 Running feature extraction...



Feature Extraction Progress:   0%|          | 0/4 [00:00<?, ?it/s]


🔍 Processing feature extraction using: count...


Feature Extraction Progress:  25%|██▌       | 1/4 [00:04<00:12,  4.25s/it]

✅ count - X_train_features shape: (1600, 50)

🔍 Processing feature extraction using: word2vec...


Processing Word2Vec: 100%|██████████| 1600/1600 [00:00<00:00, 7001.61document/s]
Feature Extraction Progress:  50%|█████     | 2/4 [02:18<02:41, 80.58s/it]

✅ word2vec - X_train_features shape: (1600, 50)

🔍 Processing feature extraction using: glove...


Processing GloVe: 100%|██████████| 1600/1600 [00:00<00:00, 7966.33document/s]
Feature Extraction Progress:  75%|███████▌  | 3/4 [03:31<01:17, 77.23s/it]

✅ glove - X_train_features shape: (1600, 50)

🔍 Processing feature extraction using: bert...


Processing BERT: 100%|██████████| 1600/1600 [00:44<00:00, 35.86document/s]
Feature Extraction Progress: 100%|██████████| 4/4 [04:25<00:00, 66.46s/it]

✅ bert - X_train_features shape: (1600, 50)





# Selected models

## Decision Tree

In [14]:
# Train one Decision Tree per feature set. Features were already extracted in
# an earlier cell, so this loop only performs model training.
model_name = "decision_tree"

# Result of the training helper per feature method; `trained_model` alone only
# ever holds the result of the most recent successful iteration.
trained_decision_tree_models = {}

print(f"\n🔎 Training {model_name} on each extracted feature set...\n")
for method in tqdm(feature_methods, desc=f"Training {model_name}"):
    print(f"\n🔍 Training {model_name} on {method} features...")

    # Feature extraction skips methods that fail, so their features may be missing.
    if method not in X_train_features_dict:
        print(f"❌ Error with {method}: no extracted features available, skipping.")
        continue

    try:
        # Retrieve a fresh Decision Tree estimator and its hyperparameter grid
        decision_tree_algorithm = MODEL_DICT[model_name]()
        decision_tree_params = MODEL_PARAMS[model_name]

        # Train or load model
        trained_model = generate_binary_classification_model(
            X=X_train_features_dict[method],
            y=y_train,
            model_algorithm=decision_tree_algorithm,
            hyperparameters=decision_tree_params,
            needs_scaled=False,
            model_save_path=f"best_{model_name}_{method}.pkl",
            img_save_path=f"best_{model_name}_{method}.png"
        )
        trained_decision_tree_models[method] = trained_model

    except Exception as e:
        # Best effort: report the failure (with its type) and continue with the next method.
        print(f"❌ Error with {method}: {type(e).__name__}: {e}")


🔎 Running feature extraction and model training loop...



Feature Extraction Progress:   0%|          | 0/4 [00:00<?, ?it/s]


🔍 Processing feature extraction using: count...
🚀 Training new model: DecisionTreeClassifier...
Best hyperparameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:00<00:00,  9.04it/s]


📊 Average Accuracy: 54%
📊 Average ROC AUC: 53%
📊 Average F1 Score: 56%
💾 Model saved to best_decision_tree_count.pkl


Feature Extraction Progress:  25%|██▌       | 1/4 [00:02<00:06,  2.10s/it]

📈 Plot saved to best_decision_tree_count.png

🔍 Processing feature extraction using: word2vec...
🚀 Training new model: DecisionTreeClassifier...
Best hyperparameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:00<00:00, 11.15it/s]


📊 Average Accuracy: 58%
📊 Average ROC AUC: 58%
📊 Average F1 Score: 60%
💾 Model saved to best_decision_tree_word2vec.pkl


Feature Extraction Progress:  50%|█████     | 2/4 [00:03<00:03,  1.74s/it]

📈 Plot saved to best_decision_tree_word2vec.png

🔍 Processing feature extraction using: glove...
🚀 Training new model: DecisionTreeClassifier...
Best hyperparameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:00<00:00, 15.23it/s]


📊 Average Accuracy: 54%
📊 Average ROC AUC: 54%
📊 Average F1 Score: 56%
💾 Model saved to best_decision_tree_glove.pkl


Feature Extraction Progress:  75%|███████▌  | 3/4 [00:05<00:01,  1.68s/it]

📈 Plot saved to best_decision_tree_glove.png

🔍 Processing feature extraction using: bert...
🚀 Training new model: DecisionTreeClassifier...
Best hyperparameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:00<00:00, 16.61it/s]


📊 Average Accuracy: 52%
📊 Average ROC AUC: 52%
📊 Average F1 Score: 54%
💾 Model saved to best_decision_tree_bert.pkl


Feature Extraction Progress: 100%|██████████| 4/4 [00:06<00:00,  1.63s/it]

📈 Plot saved to best_decision_tree_bert.png





## Logistic Regression

In [15]:
# Train one Logistic Regression per feature set. Features were already
# extracted in an earlier cell, so this loop only performs model training.
model_name_lg = "logistic_regression"

# Result of the training helper per feature method; `trained_model` alone only
# ever holds the result of the most recent successful iteration.
trained_logreg_models = {}

print(f"\n🔎 Training {model_name_lg} on each extracted feature set...\n")
for method in tqdm(feature_methods, desc=f"Training {model_name_lg}"):
    print(f"\n🔍 Training {model_name_lg} on {method} features...")

    # Feature extraction skips methods that fail, so their features may be missing.
    if method not in X_train_features_dict:
        print(f"❌ Error with {method}: no extracted features available, skipping.")
        continue

    try:
        # Retrieve a fresh Logistic Regression estimator and its hyperparameter grid
        logreg_algorithm = MODEL_DICT[model_name_lg]()
        logreg_params = MODEL_PARAMS[model_name_lg]

        # Train or load model
        # NOTE(review): needs_scaled=False presumably disables feature scaling in
        # the helper; linear models are scale-sensitive — confirm this is intended.
        trained_model = generate_binary_classification_model(
            X=X_train_features_dict[method],
            y=y_train,
            model_algorithm=logreg_algorithm,
            hyperparameters=logreg_params,
            needs_scaled=False,
            model_save_path=f"best_{model_name_lg}_{method}.pkl",
            img_save_path=f"best_{model_name_lg}_{method}.png"
        )
        trained_logreg_models[method] = trained_model

    except Exception as e:
        # Best effort: report the failure (with its type) and continue with the next method.
        print(f"❌ Error with {method}: {type(e).__name__}: {e}")


🔎 Running feature extraction and model training loop...



Feature Extraction Progress:   0%|          | 0/4 [00:00<?, ?it/s]


🔍 Processing feature extraction using: count...
🚀 Training new model: LogisticRegression...
Best hyperparameters: {'max_iter': 1000, 'penalty': 'l2'}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:00<00:00, 12.93it/s]


📊 Average Accuracy: 62%
📊 Average ROC AUC: 62%
📊 Average F1 Score: 66%
💾 Model saved to best_logistic_regression_count.pkl


Feature Extraction Progress:  25%|██▌       | 1/4 [00:02<00:08,  2.68s/it]

📈 Plot saved to best_logistic_regression_count.png

🔍 Processing feature extraction using: word2vec...
🚀 Training new model: LogisticRegression...
Best hyperparameters: {'max_iter': 1000, 'penalty': 'l2'}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:00<00:00, 14.80it/s]


📊 Average Accuracy: 69%
📊 Average ROC AUC: 69%
📊 Average F1 Score: 72%
💾 Model saved to best_logistic_regression_word2vec.pkl


Feature Extraction Progress:  50%|█████     | 2/4 [00:04<00:04,  2.04s/it]

📈 Plot saved to best_logistic_regression_word2vec.png

🔍 Processing feature extraction using: glove...
🚀 Training new model: LogisticRegression...
Best hyperparameters: {'max_iter': 1000, 'penalty': 'l2'}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:00<00:00, 19.17it/s]


📊 Average Accuracy: 66%
📊 Average ROC AUC: 66%
📊 Average F1 Score: 69%
💾 Model saved to best_logistic_regression_glove.pkl


Feature Extraction Progress:  75%|███████▌  | 3/4 [00:05<00:01,  1.70s/it]

📈 Plot saved to best_logistic_regression_glove.png

🔍 Processing feature extraction using: bert...
🚀 Training new model: LogisticRegression...
Best hyperparameters: {'max_iter': 1000, 'penalty': 'l2'}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:00<00:00, 23.82it/s]


📊 Average Accuracy: 60%
📊 Average ROC AUC: 59%
📊 Average F1 Score: 64%
💾 Model saved to best_logistic_regression_bert.pkl


Feature Extraction Progress: 100%|██████████| 4/4 [00:06<00:00,  1.67s/it]

📈 Plot saved to best_logistic_regression_bert.png





## Random Forest

In [16]:
# Train one Random Forest per feature set. Features were already extracted in
# an earlier cell, so this loop only performs model training.
model_name_rf = "random_forest"

# Result of the training helper per feature method; `trained_model` alone only
# ever holds the result of the most recent successful iteration.
trained_rf_models = {}

print(f"\n🔎 Training {model_name_rf} on each extracted feature set...\n")
for method in tqdm(feature_methods, desc=f"Training {model_name_rf}"):
    print(f"\n🔍 Training {model_name_rf} on {method} features...")

    # Feature extraction skips methods that fail, so their features may be missing.
    if method not in X_train_features_dict:
        print(f"❌ Error with {method}: no extracted features available, skipping.")
        continue

    try:
        # Retrieve a fresh Random Forest estimator and its hyperparameter grid
        rf_algorithm = MODEL_DICT[model_name_rf]()
        rf_params = MODEL_PARAMS[model_name_rf]

        # Train or load model
        trained_model = generate_binary_classification_model(
            X=X_train_features_dict[method],
            y=y_train,
            model_algorithm=rf_algorithm,
            hyperparameters=rf_params,
            needs_scaled=False,
            model_save_path=f"best_{model_name_rf}_{method}.pkl",
            img_save_path=f"best_{model_name_rf}_{method}.png"
        )
        trained_rf_models[method] = trained_model

    except Exception as e:
        # Best effort: report the failure (with its type) and continue with the next method.
        print(f"❌ Error with {method}: {type(e).__name__}: {e}")


🔎 Running feature extraction and model training loop...



Feature Extraction Progress:   0%|          | 0/4 [00:00<?, ?it/s]


🔍 Processing feature extraction using: count...
🚀 Training new model: RandomForestClassifier...
Best hyperparameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:07<00:00,  1.55s/it]


📊 Average Accuracy: 61%
📊 Average ROC AUC: 60%
📊 Average F1 Score: 64%
💾 Model saved to best_random_forest_count.pkl


Feature Extraction Progress:  25%|██▌       | 1/4 [00:18<00:55, 18.41s/it]

📈 Plot saved to best_random_forest_count.png

🔍 Processing feature extraction using: word2vec...
🚀 Training new model: RandomForestClassifier...
Best hyperparameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:06<00:00,  1.22s/it]


📊 Average Accuracy: 67%
📊 Average ROC AUC: 67%
📊 Average F1 Score: 69%
💾 Model saved to best_random_forest_word2vec.pkl


Feature Extraction Progress:  50%|█████     | 2/4 [00:34<00:34, 17.05s/it]

📈 Plot saved to best_random_forest_word2vec.png

🔍 Processing feature extraction using: glove...
🚀 Training new model: RandomForestClassifier...
Best hyperparameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:06<00:00,  1.24s/it]


📊 Average Accuracy: 64%
📊 Average ROC AUC: 64%
📊 Average F1 Score: 67%
💾 Model saved to best_random_forest_glove.pkl


Feature Extraction Progress:  75%|███████▌  | 3/4 [00:49<00:16, 16.26s/it]

📈 Plot saved to best_random_forest_glove.png

🔍 Processing feature extraction using: bert...
🚀 Training new model: RandomForestClassifier...
Best hyperparameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:06<00:00,  1.33s/it]


📊 Average Accuracy: 58%
📊 Average ROC AUC: 58%
📊 Average F1 Score: 64%
💾 Model saved to best_random_forest_bert.pkl


Feature Extraction Progress: 100%|██████████| 4/4 [01:05<00:00, 16.41s/it]

📈 Plot saved to best_random_forest_bert.png





## XGBoost

In [17]:
# Train one XGBoost classifier per feature set. Features were already extracted
# in an earlier cell, so this loop only performs model training.
model_name_xgb = "xgboost"

# Result of the training helper per feature method; `trained_model` alone only
# ever holds the result of the most recent successful iteration.
trained_xgb_models = {}

print(f"\n🔎 Training {model_name_xgb} on each extracted feature set...\n")
for method in tqdm(feature_methods, desc=f"Training {model_name_xgb}"):
    print(f"\n🔍 Training {model_name_xgb} on {method} features...")

    # Feature extraction skips methods that fail, so their features may be missing.
    if method not in X_train_features_dict:
        print(f"❌ Error with {method}: no extracted features available, skipping.")
        continue

    try:
        # Retrieve a fresh XGBoost estimator and its hyperparameter grid
        xgb_algorithm = MODEL_DICT[model_name_xgb]()
        xgb_params = MODEL_PARAMS[model_name_xgb]

        # Train or load model
        trained_model = generate_binary_classification_model(
            X=X_train_features_dict[method],
            y=y_train,
            model_algorithm=xgb_algorithm,
            hyperparameters=xgb_params,
            needs_scaled=False,
            model_save_path=f"best_{model_name_xgb}_{method}.pkl",
            img_save_path=f"best_{model_name_xgb}_{method}.png"
        )
        trained_xgb_models[method] = trained_model

    except Exception as e:
        # Best effort: report the failure (with its type) and continue with the next method.
        print(f"❌ Error with {method}: {type(e).__name__}: {e}")


🔎 Running feature extraction and model training loop...



Feature Extraction Progress:   0%|          | 0/4 [00:00<?, ?it/s]


🔍 Processing feature extraction using: count...
🚀 Training new model: XGBClassifier...
Best hyperparameters: {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 100}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:15<00:00,  3.15s/it]


📊 Average Accuracy: 58%
📊 Average ROC AUC: 57%
📊 Average F1 Score: 61%
💾 Model saved to best_xgboost_count.pkl


Feature Extraction Progress:  25%|██▌       | 1/4 [00:39<01:58, 39.57s/it]

📈 Plot saved to best_xgboost_count.png

🔍 Processing feature extraction using: word2vec...
🚀 Training new model: XGBClassifier...
Best hyperparameters: {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 100}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:21<00:00,  4.35s/it]


📊 Average Accuracy: 64%
📊 Average ROC AUC: 63%
📊 Average F1 Score: 66%
💾 Model saved to best_xgboost_word2vec.pkl


Feature Extraction Progress:  50%|█████     | 2/4 [01:25<01:26, 43.22s/it]

📈 Plot saved to best_xgboost_word2vec.png

🔍 Processing feature extraction using: glove...
🚀 Training new model: XGBClassifier...
Best hyperparameters: {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 100}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:20<00:00,  4.11s/it]


📊 Average Accuracy: 61%
📊 Average ROC AUC: 61%
📊 Average F1 Score: 65%
💾 Model saved to best_xgboost_glove.pkl


Feature Extraction Progress:  75%|███████▌  | 3/4 [02:14<00:46, 46.02s/it]

📈 Plot saved to best_xgboost_glove.png

🔍 Processing feature extraction using: bert...
🚀 Training new model: XGBClassifier...
Best hyperparameters: {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 100}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:18<00:00,  3.77s/it]


📊 Average Accuracy: 57%
📊 Average ROC AUC: 56%
📊 Average F1 Score: 62%
💾 Model saved to best_xgboost_bert.pkl


Feature Extraction Progress: 100%|██████████| 4/4 [02:58<00:00, 44.70s/it]

📈 Plot saved to best_xgboost_bert.png





## Perceptron

In [18]:
# Train one Perceptron per feature set. Features were already extracted in an
# earlier cell, so this loop only performs model training.
model_name_perceptron = "perceptron"

# Result of the training helper per feature method; `trained_model` alone only
# ever holds the result of the most recent successful iteration.
trained_perceptron_models = {}

print(f"\n🔎 Training {model_name_perceptron} on each extracted feature set...\n")
for method in tqdm(feature_methods, desc=f"Training {model_name_perceptron}"):
    print(f"\n🔍 Training {model_name_perceptron} on {method} features...")

    # Feature extraction skips methods that fail, so their features may be missing.
    if method not in X_train_features_dict:
        print(f"❌ Error with {method}: no extracted features available, skipping.")
        continue

    try:
        # Retrieve a fresh Perceptron estimator and its hyperparameter grid
        per_algorithm = MODEL_DICT[model_name_perceptron]()
        per_params = MODEL_PARAMS[model_name_perceptron]

        # Train or load model
        # NOTE(review): needs_scaled=False presumably disables feature scaling in
        # the helper; linear models are scale-sensitive — confirm this is intended.
        trained_model = generate_binary_classification_model(
            X=X_train_features_dict[method],
            y=y_train,
            model_algorithm=per_algorithm,
            hyperparameters=per_params,
            needs_scaled=False,
            model_save_path=f"best_{model_name_perceptron}_{method}.pkl",
            img_save_path=f"best_{model_name_perceptron}_{method}.png"
        )
        trained_perceptron_models[method] = trained_model

    except Exception as e:
        # Best effort: report the failure (with its type) and continue with the next method.
        print(f"❌ Error with {method}: {type(e).__name__}: {e}")


🔎 Running feature extraction and model training loop...



Feature Extraction Progress:   0%|          | 0/4 [00:00<?, ?it/s]


🔍 Processing feature extraction using: count...
🚀 Training new model: Perceptron...
Best hyperparameters: {'alpha': 0.0001, 'eta0': 0.001, 'max_iter': 1000, 'penalty': 'l2', 'tol': 0.001}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:00<00:00, 21.23it/s]


📊 Average Accuracy: 58%
📊 Average ROC AUC: 57%
📊 Average F1 Score: 63%
💾 Model saved to best_perceptron_count.pkl


Feature Extraction Progress:  25%|██▌       | 1/4 [00:01<00:03,  1.26s/it]

📈 Plot saved to best_perceptron_count.png

🔍 Processing feature extraction using: word2vec...
🚀 Training new model: Perceptron...
Best hyperparameters: {'alpha': 0.0001, 'eta0': 0.001, 'max_iter': 1000, 'penalty': 'l2', 'tol': 0.001}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:00<00:00, 17.84it/s]


📊 Average Accuracy: 59%
📊 Average ROC AUC: 60%
📊 Average F1 Score: 45%
💾 Model saved to best_perceptron_word2vec.pkl


Feature Extraction Progress:  50%|█████     | 2/4 [00:02<00:02,  1.35s/it]

📈 Plot saved to best_perceptron_word2vec.png

🔍 Processing feature extraction using: glove...
🚀 Training new model: Perceptron...
Best hyperparameters: {'alpha': 0.0001, 'eta0': 0.001, 'max_iter': 1000, 'penalty': 'l2', 'tol': 0.001}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:00<00:00, 17.85it/s]


📊 Average Accuracy: 58%
📊 Average ROC AUC: 59%
📊 Average F1 Score: 53%
💾 Model saved to best_perceptron_glove.pkl


Feature Extraction Progress:  75%|███████▌  | 3/4 [00:04<00:01,  1.44s/it]

📈 Plot saved to best_perceptron_glove.png

🔍 Processing feature extraction using: bert...
🚀 Training new model: Perceptron...
Best hyperparameters: {'alpha': 0.0001, 'eta0': 0.001, 'max_iter': 1000, 'penalty': 'l2', 'tol': 0.001}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:00<00:00, 15.88it/s]


📊 Average Accuracy: 55%
📊 Average ROC AUC: 55%
📊 Average F1 Score: 52%
💾 Model saved to best_perceptron_bert.pkl


Feature Extraction Progress: 100%|██████████| 4/4 [00:05<00:00,  1.43s/it]

📈 Plot saved to best_perceptron_bert.png





## Gaussian Naive Bayes

In [19]:
# Train one Gaussian Naive Bayes model per feature set. Features were already
# extracted in an earlier cell, so this loop only performs model training.
model_name_bayes = "bayesian_enhanced"

# Result of the training helper per feature method; `trained_model` alone only
# ever holds the result of the most recent successful iteration.
trained_bayes_models = {}

print(f"\n🔎 Training {model_name_bayes} on each extracted feature set...\n")
for method in tqdm(feature_methods, desc=f"Training {model_name_bayes}"):
    print(f"\n🔍 Training {model_name_bayes} on {method} features...")

    # Feature extraction skips methods that fail, so their features may be missing.
    if method not in X_train_features_dict:
        print(f"❌ Error with {method}: no extracted features available, skipping.")
        continue

    try:
        # Retrieve a fresh Gaussian Naive Bayes estimator and its hyperparameter grid
        bayes_algorithm = MODEL_DICT[model_name_bayes]()
        bayes_params = MODEL_PARAMS[model_name_bayes]

        # Train or load model
        trained_model = generate_binary_classification_model(
            X=X_train_features_dict[method],
            y=y_train,
            model_algorithm=bayes_algorithm,
            hyperparameters=bayes_params,
            needs_scaled=False,
            model_save_path=f"best_{model_name_bayes}_{method}.pkl",
            img_save_path=f"best_{model_name_bayes}_{method}.png"
        )
        trained_bayes_models[method] = trained_model

    except Exception as e:
        # Best effort: report the failure (with its type) and continue with the next method.
        print(f"❌ Error with {method}: {type(e).__name__}: {e}")


🔎 Running feature extraction and model training loop...



Feature Extraction Progress:   0%|          | 0/4 [00:00<?, ?it/s]


🔍 Processing feature extraction using: count...
🚀 Training new model: GaussianNB...
Best hyperparameters: {'var_smoothing': 1e-09}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:00<00:00, 21.18it/s]


📊 Average Accuracy: 57%
📊 Average ROC AUC: 57%
📊 Average F1 Score: 63%
💾 Model saved to best_bayesian_enhanced_count.pkl


Feature Extraction Progress:  25%|██▌       | 1/4 [00:01<00:03,  1.19s/it]

📈 Plot saved to best_bayesian_enhanced_count.png

🔍 Processing feature extraction using: word2vec...
🚀 Training new model: GaussianNB...
Best hyperparameters: {'var_smoothing': 1e-09}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:00<00:00, 23.02it/s]


📊 Average Accuracy: 57%
📊 Average ROC AUC: 58%
📊 Average F1 Score: 50%
💾 Model saved to best_bayesian_enhanced_word2vec.pkl


Feature Extraction Progress:  50%|█████     | 2/4 [00:02<00:02,  1.12s/it]

📈 Plot saved to best_bayesian_enhanced_word2vec.png

🔍 Processing feature extraction using: glove...
🚀 Training new model: GaussianNB...
Best hyperparameters: {'var_smoothing': 1e-09}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:00<00:00, 17.49it/s]


📊 Average Accuracy: 57%
📊 Average ROC AUC: 58%
📊 Average F1 Score: 50%
💾 Model saved to best_bayesian_enhanced_glove.pkl


Feature Extraction Progress:  75%|███████▌  | 3/4 [00:03<00:01,  1.20s/it]

📈 Plot saved to best_bayesian_enhanced_glove.png

🔍 Processing feature extraction using: bert...
🚀 Training new model: GaussianNB...
Best hyperparameters: {'var_smoothing': 1e-09}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:00<00:00, 23.10it/s]


📊 Average Accuracy: 59%
📊 Average ROC AUC: 59%
📊 Average F1 Score: 60%
💾 Model saved to best_bayesian_enhanced_bert.pkl


Feature Extraction Progress: 100%|██████████| 4/4 [00:04<00:00,  1.21s/it]

📈 Plot saved to best_bayesian_enhanced_bert.png





## SVM

In [23]:
# Train one SVM per feature set. Features were already extracted in an earlier
# cell, so this loop only performs model training.
model_name_svm = "svm"

# Result of the training helper per feature method; `trained_model` alone only
# ever holds the result of the most recent successful iteration.
trained_svm_models = {}

print(f"\n🔎 Training {model_name_svm} on each extracted feature set...\n")
for method in tqdm(feature_methods, desc=f"Training {model_name_svm}"):
    print(f"\n🔍 Training {model_name_svm} on {method} features...")

    # Feature extraction skips methods that fail, so their features may be missing.
    if method not in X_train_features_dict:
        print(f"❌ Error with {method}: no extracted features available, skipping.")
        continue

    try:
        # Retrieve a fresh SVC estimator and its hyperparameter grid
        svm_algorithm = MODEL_DICT[model_name_svm]()
        svm_params = MODEL_PARAMS[model_name_svm]

        # Train or load model
        # NOTE(review): needs_scaled=False presumably disables feature scaling in
        # the helper; SVMs are scale-sensitive — confirm this is intended.
        trained_model = generate_binary_classification_model(
            X=X_train_features_dict[method],
            y=y_train,
            model_algorithm=svm_algorithm,
            hyperparameters=svm_params,
            needs_scaled=False,
            model_save_path=f"best_{model_name_svm}_{method}.pkl",
            img_save_path=f"best_{model_name_svm}_{method}.png"
        )
        trained_svm_models[method] = trained_model

    except Exception as e:
        # Best effort: report the failure (with its type) and continue with the next method.
        print(f"❌ Error with {method}: {type(e).__name__}: {e}")


🔎 Running feature extraction and model training loop...



Feature Extraction Progress:   0%|          | 0/4 [00:00<?, ?it/s]


🔍 Processing feature extraction using: count...
🚀 Training new model: SVC...
Best hyperparameters: {'C': 0.1, 'kernel': 'linear'}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:00<00:00,  5.62it/s]


📊 Average Accuracy: 61%
📊 Average ROC AUC: 60%
📊 Average F1 Score: 67%
💾 Model saved to best_svm_count.pkl


Feature Extraction Progress:  25%|██▌       | 1/4 [00:02<00:07,  2.67s/it]

📈 Plot saved to best_svm_count.png

🔍 Processing feature extraction using: word2vec...
🚀 Training new model: SVC...
Best hyperparameters: {'C': 0.1, 'kernel': 'linear'}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:00<00:00,  6.55it/s]


📊 Average Accuracy: 65%
📊 Average ROC AUC: 64%
📊 Average F1 Score: 72%
💾 Model saved to best_svm_word2vec.pkl


Feature Extraction Progress:  50%|█████     | 2/4 [00:05<00:05,  2.60s/it]

📈 Plot saved to best_svm_word2vec.png

🔍 Processing feature extraction using: glove...
🚀 Training new model: SVC...
Best hyperparameters: {'C': 0.1, 'kernel': 'linear'}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:00<00:00,  8.81it/s]


📊 Average Accuracy: 66%
📊 Average ROC AUC: 66%
📊 Average F1 Score: 69%
💾 Model saved to best_svm_glove.pkl


Feature Extraction Progress:  75%|███████▌  | 3/4 [00:07<00:02,  2.25s/it]

📈 Plot saved to best_svm_glove.png

🔍 Processing feature extraction using: bert...
🚀 Training new model: SVC...
Best hyperparameters: {'C': 0.1, 'kernel': 'linear'}

🎯 Running K-Fold Cross-Validation...


K-Fold Progress: 100%|██████████| 5/5 [00:02<00:00,  2.49it/s]


📊 Average Accuracy: 54%
📊 Average ROC AUC: 52%
📊 Average F1 Score: 69%
💾 Model saved to best_svm_bert.pkl


Feature Extraction Progress: 100%|██████████| 4/4 [00:11<00:00,  2.79s/it]

📈 Plot saved to best_svm_bert.png



