# Start

In [None]:
!pip install xgboost gensim tqdm hmmlearn sklearn-crfsuite transformers


# Import

In [None]:
# Standard Library Imports
import os
import sys
import pickle
import joblib
import logging
import torch
import numpy as np
import pandas as pd
from statistics import mean

# Visualization Libraries
import matplotlib.pyplot as plt

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import RandomForestClassifier

# Specialized Libraries
import xgboost as xgb
import hmmlearn.hmm
from sklearn_crfsuite import CRF

# Natural Language Processing (NLP) Libraries
import gensim.downloader as api
from transformers import AutoTokenizer, AutoModel

# Progress Bar
from tqdm import tqdm


In [None]:
## Options
# Passing None removes the column-width limit so long text cells are shown untruncated.
pd.set_option("max_colwidth", None)

# Logger

In [None]:
class MyLogger:
    """Thin wrapper that routes INFO messages to a log file and the console.

    The wrapper configures the *root* logger, so any handlers previously
    attached to it are replaced each time the logger is (re)initialised.
    """

    def __init__(self, log_file='app.log'):
        """Create the logger and attach handlers writing to ``log_file``."""
        self.log_file = log_file
        self._initialize_logger()

    def _initialize_logger(self):
        """(Re)build the root logger's handlers for the current ``log_file``."""
        self.logger = logging.getLogger()
        self.logger.setLevel(logging.INFO)

        # Append mode creates the file when it does not exist yet, so no
        # explicit existence check is needed to pick between 'a' and 'w'.
        file_handler = logging.FileHandler(self.log_file, mode='a', encoding='utf-8')
        console_handler = logging.StreamHandler()

        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

        # Detach AND close the handlers being replaced; merely clearing the
        # list would leave the previous log file open (one leaked descriptor
        # per change_log_file() call or notebook cell re-run).
        for stale_handler in list(self.logger.handlers):
            self.logger.removeHandler(stale_handler)
            stale_handler.close()

        for handler in (file_handler, console_handler):
            handler.setLevel(logging.INFO)
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)

    def log_message(self, message):
        """Log ``message`` at INFO level."""
        self.logger.info(message)

    def change_log_file(self, new_log_file):
        """Redirect subsequent messages to ``new_log_file``."""
        self.log_file = new_log_file
        self._initialize_logger()

# Notebook-wide logger instance; with the default argument it writes to app.log and the console.
logger = MyLogger()

# NOTE(review): presumably enables parallel tokenization in the Hugging Face tokenizers library — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Feature Build

In [None]:
class FeatureBuilder:
    """Convert raw text documents into dense numeric feature matrices.

    Supported ``method`` values:

    * ``"tfidf"``, ``"count"``, ``"binary_count"`` -- scikit-learn vectorizers
      capped at 500 features, fitted on the texts given to :meth:`fit`.
    * ``"word2vec"``, ``"glove"`` -- mean of pretrained gensim word vectors
      over the whitespace-separated tokens of each document.
    * ``"bert"`` -- ``pooler_output`` of
      ``sentence-transformers/all-MiniLM-L6-v2``.

    ``reduce_dim`` may be ``None``, ``"pca"`` or ``"lda"``. The reducer is
    fitted once, by the first :meth:`transform` call after :meth:`fit` (or
    after construction), and is reused unchanged afterwards so that train and
    test data are projected into the same space.
    """

    _VECTORIZER_METHODS = ("tfidf", "count", "binary_count")

    def __init__(self, method="tfidf", save_dir="data/processed", reduce_dim=None, n_components=100):
        """Load/construct the feature extractor and the optional reducer.

        Args:
            method: Feature extraction method (see class docstring).
            save_dir: Directory used by ``_save_model`` / ``_load_model``;
                created if missing.
            reduce_dim: ``None``, ``"pca"`` or ``"lda"``.
            n_components: Target dimensionality for the reducer.
        """
        self.method = method
        self.save_dir = save_dir
        self.reduce_dim = reduce_dim
        self.n_components = n_components
        # Set to True once the reducer has been fitted (or loaded from disk).
        self._reducer_fitted = False
        os.makedirs(save_dir, exist_ok=True)

        if method == "tfidf":
            self.vectorizer = TfidfVectorizer(max_features=500)
        elif method == "count":
            self.vectorizer = CountVectorizer(max_features=500)
        elif method == "binary_count":
            self.vectorizer = CountVectorizer(binary=True, max_features=500)
        elif method == "word2vec":
            self.word2vec_model = api.load("word2vec-google-news-300")  # Pretrained Google News Word2Vec
        elif method == "glove":
            self.glove_model = api.load("glove-wiki-gigaword-100")  # Pretrained GloVe embeddings
        elif method == "bert":
            self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
            self.bert_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

        if self.reduce_dim == "pca":
            self.reducer = PCA(n_components=self.n_components)
        elif self.reduce_dim == "lda":
            # Capped at one component, which is the LDA maximum
            # (n_classes - 1) for a two-class problem.
            self.reducer = LDA(n_components=min(self.n_components, 1))

    @staticmethod
    def _mean_token_vector(model, doc):
        """Average the vectors of the tokens of ``doc`` known to ``model``.

        Returns a zero vector of ``model.vector_size`` when no token is known.
        """
        known_vectors = [model[token] for token in doc.split() if token in model]
        if known_vectors:
            return np.mean(known_vectors, axis=0)
        return np.zeros(model.vector_size)

    def _get_word2vec_vector(self, doc):
        """Return the mean Word2Vec vector of ``doc``."""
        return self._mean_token_vector(self.word2vec_model, doc)

    def _get_glove_vector(self, doc):
        """Return the mean GloVe vector of ``doc``."""
        return self._mean_token_vector(self.glove_model, doc)

    def _get_bert_embedding(self, doc):
        """Return the model's ``pooler_output`` for ``doc`` as a 1-D array.

        Input is truncated to 512 tokens; inference runs without gradients.
        """
        inputs = self.tokenizer(doc, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.bert_model(**inputs)
        return outputs.pooler_output.squeeze(0).numpy()

    def fit(self, texts, labels=None):
        """Fit the vectorizer on ``texts`` (no-op for pretrained embeddings).

        Also marks the reducer as stale so the next :meth:`transform` call
        refits it on features produced by the freshly fitted extractor.
        ``labels`` is accepted for API symmetry and is not used here.
        """
        if self.method in self._VECTORIZER_METHODS:
            self.vectorizer.fit(texts)
        self._reducer_fitted = False

    def _extract_features(self, texts):
        """Compute the un-reduced feature matrix for ``texts``.

        Raises:
            ValueError: If ``self.method`` is not a supported method.
        """
        if self.method in self._VECTORIZER_METHODS:
            return self.vectorizer.transform(texts).toarray()

        embedders = {
            "word2vec": (self._get_word2vec_vector, "Processing Word2Vec"),
            "glove": (self._get_glove_vector, "Processing GloVe"),
            "bert": (self._get_bert_embedding, "Processing BERT"),
        }
        if self.method not in embedders:
            raise ValueError(f"Unsupported feature extraction method: {self.method!r}")

        embed, progress_label = embedders[self.method]
        return np.array([embed(doc) for doc in tqdm(texts, desc=progress_label, unit="document")])

    def transform(self, texts, labels=None):
        """Extract features for ``texts`` and apply the reducer, if any.

        The reducer is fitted only when it has not been fitted yet; later
        calls (e.g. on the test set) reuse the fitted projection instead of
        refitting on the new data.

        Args:
            texts: Iterable of documents.
            labels: Class labels, required the first time an ``"lda"``
                reducer is used.

        Raises:
            ValueError: For an unsupported method, or when an unfitted LDA
                reducer is used without ``labels``.
        """
        features = self._extract_features(texts)
        if not self.reduce_dim:
            return features

        # getattr keeps instances created without __init__ (e.g. unpickled
        # from an older version) working.
        if not getattr(self, "_reducer_fitted", False):
            if self.reduce_dim == "lda":
                if labels is None:
                    raise ValueError("LDA reduction needs `labels` the first time features are transformed.")
                self.reducer.fit(features, labels)
            elif self.reduce_dim == "pca":
                self.reducer.fit(features)
            self._reducer_fitted = True

        return self.reducer.transform(features)

    def fit_transform(self, texts, labels=None):
        """Fit on ``texts`` and return their (optionally reduced) features.

        ``labels`` is forwarded to :meth:`transform` so an LDA reducer can be
        fitted.
        """
        self.fit(texts, labels)
        return self.transform(texts, labels)

    def _save_model(self):
        """Pickle the extractor (and reducer, if any) into the save directory."""
        save_dir = self.save_dir if self.save_dir else "data/processed"
        os.makedirs(save_dir, exist_ok=True)

        if self.method in self._VECTORIZER_METHODS:
            file_path = os.path.join(save_dir, f"{self.method}_vectorizer.pkl")
            with open(file_path, "wb") as f:
                pickle.dump(self.vectorizer, f)
        elif self.method in ["word2vec", "glove"]:
            file_path = os.path.join(save_dir, f"{self.method}_model.pkl")
            embedding_model = self.word2vec_model if self.method == "word2vec" else self.glove_model
            with open(file_path, "wb") as f:
                pickle.dump(embedding_model, f)
        elif self.method == "bert":
            tokenizer_path = os.path.join(save_dir, "bert_tokenizer.pkl")
            model_path = os.path.join(save_dir, "bert_model.pkl")
            with open(tokenizer_path, "wb") as f:
                pickle.dump(self.tokenizer, f)
            with open(model_path, "wb") as f:
                pickle.dump(self.bert_model, f)

        if self.reduce_dim:
            reducer_path = os.path.join(save_dir, f"{self.reduce_dim}_reducer.pkl")
            with open(reducer_path, "wb") as f:
                pickle.dump(self.reducer, f)

    def _load_model(self):
        """Restore the extractor (and reducer, if any) saved by ``_save_model``.

        SECURITY: ``pickle.load`` executes arbitrary code; only load files
        this notebook wrote itself.

        Raises:
            FileNotFoundError: If an expected pickle file is missing.
        """
        os.makedirs(self.save_dir, exist_ok=True)

        if self.method in self._VECTORIZER_METHODS:
            file_path = os.path.join(self.save_dir, f"{self.method}_vectorizer.pkl")
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"No saved model found at {file_path}. Run `fit_transform` first.")
            with open(file_path, "rb") as f:
                self.vectorizer = pickle.load(f)
        elif self.method in ["word2vec", "glove"]:
            file_path = os.path.join(self.save_dir, f"{self.method}_model.pkl")
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"No saved model found at {file_path}. Run `fit_transform` first.")
            with open(file_path, "rb") as f:
                embedding_model = pickle.load(f)
            # Store under the attribute the matching _get_*_vector reads.
            if self.method == "word2vec":
                self.word2vec_model = embedding_model
            else:
                self.glove_model = embedding_model
        elif self.method == "bert":
            tokenizer_path = os.path.join(self.save_dir, "bert_tokenizer.pkl")
            model_path = os.path.join(self.save_dir, "bert_model.pkl")
            if not os.path.exists(tokenizer_path) or not os.path.exists(model_path):
                raise FileNotFoundError(f"No saved BERT model found at {tokenizer_path} or {model_path}. Run `fit_transform` first.")
            with open(tokenizer_path, "rb") as f:
                self.tokenizer = pickle.load(f)
            with open(model_path, "rb") as f:
                self.bert_model = pickle.load(f)

        if self.reduce_dim:
            reducer_path = os.path.join(self.save_dir, f"{self.reduce_dim}_reducer.pkl")
            with open(reducer_path, "rb") as f:
                self.reducer = pickle.load(f)
            # A reducer restored from disk is already fitted; do not refit it.
            self._reducer_fitted = True


# Models utils

In [None]:
def generate_binary_classification_model(X, y, model_algorithm, hyperparameters, needs_scaled = False, model_save_path="best_model.pkl", img_save_path=None):
    """Tune, cross-validate, persist and return a binary classifier.

    If ``model_save_path`` already exists the stored model is loaded and
    returned without any training. Otherwise the function:

    1. runs ``GridSearchCV`` over ``hyperparameters``,
    2. reports accuracy / ROC AUC / F1 over an unshuffled 5-fold split using
       the best hyperparameters,
    3. saves and returns the grid search's ``best_estimator_``, i.e. the best
       configuration refitted on *all* of ``X``/``y`` (rather than the model
       left over from the last validation fold, which saw only 80% of the data).

    Args:
        X: Feature ``pandas.DataFrame`` (rows are selected with ``.iloc``).
        y: Target ``pandas.Series`` aligned positionally with ``X``.
        model_algorithm: Unfitted scikit-learn compatible estimator. It is
            mutated: the best parameters are set on it and it is fitted on
            the cross-validation folds.
        hyperparameters: ``param_grid`` passed to ``GridSearchCV``.
        needs_scaled: Standardise ``X`` before training.
        model_save_path: Where the fitted model is stored with ``joblib``.
        img_save_path: Optional path for a per-fold accuracy / ROC AUC plot.

    Returns:
        The fitted estimator (loaded from disk or freshly trained).
    """
    if os.path.exists(model_save_path):
        print(f"🔄 Loading existing model from {model_save_path}...")
        return joblib.load(model_save_path)

    print(f"🚀 Training new model: {model_algorithm.__class__.__name__}...")
    logger.log_message(f"🚀 Training new model: {model_algorithm.__class__.__name__}...")
    if needs_scaled:
        # NOTE(review): the fitted scaler is not persisted with the model, so
        # callers must scale prediction inputs themselves.
        scaler = StandardScaler()
        scaled_features = scaler.fit_transform(X)
        X = pd.DataFrame(scaled_features, index = X.index, columns = X.columns)

    gridsearchcv = GridSearchCV(estimator = model_algorithm,
                                param_grid = hyperparameters)
    gridsearchcv.fit(X, y)

    logger.log_message(f'Best hyperparameters: {gridsearchcv.best_params_}')

    model_algorithm.set_params(**gridsearchcv.best_params_)

    accuracy_scores, roc_auc_scores, f1_scores = [], [], []

    k_fold = KFold(n_splits = 5)

    for train_index, val_index in tqdm(k_fold.split(X), total=k_fold.get_n_splits(), desc="K-Fold Progress"):
        X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        model_algorithm.fit(X_train, y_train)

        val_preds = model_algorithm.predict(X_val)

        # ROC AUC is computed from hard predictions here, not from scores.
        accuracy_scores.append(accuracy_score(y_val, val_preds))
        roc_auc_scores.append(roc_auc_score(y_val, val_preds))
        f1_scores.append(f1_score(y_val, val_preds))

    logger.log_message(f'📊 Average Accuracy: {int(mean(accuracy_scores) * 100)}%')
    logger.log_message(f'📊 Average ROC AUC: {int(mean(roc_auc_scores) * 100)}%')
    logger.log_message(f'📊 Average F1 Score: {int(mean(f1_scores) * 100)}%')

    # GridSearchCV (refit=True by default) has already refitted the best
    # configuration on the full data set; persist that model instead of the
    # one that was last trained on a single fold's training split.
    final_model = gridsearchcv.best_estimator_

    joblib.dump(final_model, model_save_path)
    logger.log_message(f'💾 Model saved to {model_save_path}')

    if img_save_path:
        plt.figure(figsize=(10, 6))

        plt.plot(range(1, len(accuracy_scores) + 1), accuracy_scores, label="Accuracy", marker='o')
        plt.plot(range(1, len(roc_auc_scores) + 1), roc_auc_scores, label="ROC AUC", marker='o')

        plt.title("Validation Performance Across K-Folds")
        plt.xlabel("Fold Number")
        plt.ylabel("Score")
        plt.legend()

        plt.savefig(img_save_path)
        plt.close()
        print(f"📈 Plot saved to {img_save_path}")
        logger.log_message(f"📈 Plot saved to {img_save_path}")

    return final_model


# Run experiments

## Dict

In [None]:
# Dictionary for models
# Maps a model name to a zero-argument callable (a class or a lambda) that
# builds a fresh, unfitted estimator; the training loop calls MODEL_DICT[name]().
MODEL_DICT = {
    "decision_tree": DecisionTreeClassifier,
    "perceptron": Perceptron,
    "bayesian": GaussianNB,
    "bayesian_enhanced": lambda: GaussianNB(var_smoothing=1e-9),
    "random_forest": RandomForestClassifier,
    "xgboost": xgb.XGBClassifier,
    "svm": SVC,
    # NOTE(review): MaxAbsScaler is imported from sklearn.preprocessing (a scaler,
    # not a classifier) — presumably a placeholder; confirm the intended estimator.
    "max_edge_classifier": MaxAbsScaler,
    "kernel_functions_svm": lambda: SVC(kernel='rbf'),
    "soft_margin_svm": lambda: SVC(C=1.0),
    # "lda": LDA,
    "logistic_regression": LogisticRegression,
    # NOTE(review): GaussianHMM and CRF are sequence models; presumably they do not fit
    # the flat (X, y) training loop used in this notebook — verify before enabling.
    "hmm": lambda: hmmlearn.hmm.GaussianHMM(n_components=3),
    "crf": CRF,
}

# Dictionary for model parameters
# Maps a model name (same keys as MODEL_DICT) to the hyperparameter grid that
# is handed to GridSearchCV. The commented-out grids are the larger search
# spaces kept for reference; the active ones are trimmed to keep runtime down.
MODEL_PARAMS = {
    "decision_tree": {
        "criterion": ["gini"],
        "max_depth": [10, 20],
        "min_samples_split": [2],
        "min_samples_leaf": [1],
        "max_features": ["sqrt", "log2"]
    },
    
    # "decision_tree": {
    #     "criterion": ["gini", "entropy"],
    #     "max_depth": [10, 20, 30, 40],
    #     "min_samples_split": [2, 5, 10],
    #     "min_samples_leaf": [1, 2, 4],
    #     "max_features": ["auto", "sqrt", "log2"]
    # },
    
    "perceptron": {
        "max_iter": [1000, 2000],
        "tol": [1e-3],
        "eta0": [0.001],
        "penalty": ["l2"],
        "alpha": [0.0001, 0.001]
    },
    
    # "perceptron": {
    #     "max_iter": [1000, 2000],
    #     "tol": [1e-3, 1e-4],
    #     "eta0": [0.001, 0.01, 0.1],
    #     "penalty": [None, "l2", "l1"],
    #     "alpha": [0.0001, 0.001, 0.01]
    # },
    
    "bayesian": {
        # GaussianNB `priors` must be None or an array of class probabilities
        # (strings are rejected); [0.5, 0.5] is the uniform prior for the
        # two-class problem handled in this notebook.
        "priors": [None, [0.5, 0.5]],
        "var_smoothing": [1e-9, 1e-8, 1e-7]
    },
    
    "bayesian_enhanced": {
        "var_smoothing": [1e-9]
    },
    
    # "bayesian_enhanced": {
    #     "var_smoothing": [1e-9, 1e-8, 1e-7]
    # },
    
    "random_forest": {
        "n_estimators": [100],
        "max_depth": [10],
        "min_samples_split": [2],
        "min_samples_leaf": [1],
        "max_features": ["sqrt"]
    },
    
    # "random_forest": {
    #     "n_estimators": [50, 100, 200],
    #     "max_depth": [None, 10, 20, 30],
    #     "min_samples_split": [2, 5, 10],
    #     "min_samples_leaf": [1, 2, 4],
    #     "max_features": ["auto", "sqrt", "log2"],
    #     "bootstrap": [True, False]
    # },
    
    "xgboost": {
        "n_estimators": [100],
        "learning_rate": [0.01, 0.1],
        "max_depth": [6, 10]
    },
    
    # "xgboost": {
    #     "n_estimators": [100, 200, 300],
    #     "learning_rate": [0.01, 0.1, 0.2],
    #     "max_depth": [3, 6, 10],
    #     "subsample": [0.8, 1.0],
    #     "colsample_bytree": [0.8, 1.0],
    #     "gamma": [0, 0.1, 0.2]
    # },
    
    "svm": {
        "kernel": ["linear"],
        "C": [0.1, 0.001]
    },
    
    # "svm": {
    #     "kernel": ["linear", "rbf", "poly"],
    #     "C": [0.1, 1, 10, 100],
    #     "gamma": [0.1, 0.01, "scale"],
    #     "degree": [2, 3, 4]
    # },
    
    # NOTE(review): "scaler" does not look like a constructor parameter of the
    # object registered under this key in MODEL_DICT — confirm before use.
    "max_edge_classifier": {
        "scaler": ["maxabs", "standard"]
    },
    
    "kernel_functions_svm": {
        "kernel": ["rbf", "poly"],
        "C": [1.0, 10.0, 100.0],
        "gamma": ["scale", "auto"]
    },
    
    "soft_margin_svm": {
        "C": [0.1, 1.0, 10.0]
    },
    
    # NOTE(review): no "lda" entry is active in MODEL_DICT. Presumably the
    # "svd" solver cannot be combined with shrinkage and n_components > 1 is
    # invalid for two classes — verify against the scikit-learn docs.
    "lda": {
        "n_components": [2, 3, 4, 5],
        "solver": ["svd", "lsqr", "eigen"],
        "shrinkage": ["auto", None]
    },
    
    "logistic_regression": {
        "penalty": ["l2"],
        "C": [0.1, 1.0],
        "max_iter": [1000, 2000]
    },
    
    # "logistic_regression": {
    #     "penalty": ["l1", "l2", "elasticnet", None],
    #     "C": [0.1, 1.0, 10.0],
    #     "solver": ["liblinear", "lbfgs", "saga"],
    #     "max_iter": [1000, 2000]
    # },
    
    
    "hmm": {
        "n_components": [2, 3, 4],
        "covariance_type": ["diag", "full", "tied"],
        "n_iter": [100, 200],
        "init_params": ["c", "s", "cs"],
        "params": ["c", "t", "ct"]
    },
    
    # sklearn_crfsuite.CRF has no penalty/dual/tol parameters and does not
    # know the "newton-cg"/"liblinear" algorithms; L1/L2 regularisation is
    # controlled through c1/c2 and the stopping tolerance through epsilon.
    "crf": {
        "algorithm": ["lbfgs"],
        "max_iterations": [100, 200],
        "c1": [0.0, 0.1],
        "c2": [0.01, 0.1],
        "epsilon": [1e-4, 1e-3],
    }
}

# Dictionary for dimensionality reduction methods
# Maps a reducer name to its scikit-learn class (LDA is the LinearDiscriminantAnalysis alias).
DIMENSIONALITY_REDUCTION_DICT = {
    "pca": PCA,
    "lda": LDA,
}

## Load dataset

In [None]:
# Read the tweet CSV from the Kaggle input directory into a DataFrame.
# NOTE(review): the file name suggests cleaned, de-duplicated tweets without neutral labels — confirm.
dataset_path = "/kaggle/input/tweets-clean-posneg-v1"
df = pd.read_csv(f"{dataset_path}/final_clean_no_neutral_no_duplicates.csv")

In [None]:
# Print column dtypes and non-null counts as a quick sanity check.
df.info()

In [None]:
# Preview the first rows.
df.head()

In [None]:
# Replace label 4 with 1 in the target column.
# NOTE(review): presumably 4 marks the positive class and the other label is 0, giving a 0/1 target — confirm.
df["target"] = df["target"].replace(4, 1)

In [None]:
# Hold out 20% of the rows for testing (fixed seed for reproducibility).
# Uncomment the sampling line to work on a 100k-row subset instead of the full frame.
# df_sampled = df.sample(n=100000, random_state=42)
df_sampled = df
X_train, X_test, y_train, y_test = train_test_split(
    df_sampled["text_clean"], df_sampled["target"], test_size=0.2, random_state=42
)

# Record the split sizes in the log.
logger.log_message(f"Training set size: {len(X_train)}")
logger.log_message(f"Test set size: {len(X_test)}")
logger.log_message(f"Training labels size: {len(y_train)}")
logger.log_message(f"Test labels size: {len(y_test)}")


In [None]:
# Feature extraction methods to run; the commented line is the wider set.
# feature_methods = ["count", "word2vec", "glove", "bert"]
feature_methods = ["word2vec", "glove"]
# Extracted train / test features, keyed by feature method name.
X_train_features_dict = {}
X_test_features_dict = {}

In [None]:
# Base directory for generated artifacts (processed features, saved models).
output_dir = "/kaggle/working"

In [None]:
# Build train/test feature matrices for every configured feature method.
print("\n🔎 Running feature extraction...\n")
logger.log_message("\n🔎 Running feature extraction...\n")

for method in tqdm(feature_methods, desc="Feature Extraction Progress"):
    print(f"\n🔍 Processing feature extraction using: {method}...")
    logger.log_message(f"🔍 Processing feature extraction using: {method}...")

    try:
        # Initialize FeatureBuilder
        feature_builder = FeatureBuilder(method=method, save_dir=os.path.join(output_dir, "processed"), reduce_dim="pca", n_components=50)

        # Fit and transform training data
        X_train_features = feature_builder.fit_transform(X_train.tolist())
        print(f"✅ {method} - X_train_features shape: {X_train_features.shape}")
        logger.log_message(f"✅ {method} - X_train_features shape: {X_train_features.shape}")

        # Transform test data with the builder fitted above
        X_test_features = feature_builder.transform(X_test.tolist())
        print(f"✅ {method} - X_test_features shape: {X_test_features.shape}")
        logger.log_message(f"✅ {method} - X_test_features shape: {X_test_features.shape}")

    except Exception as e:
        # Keep going with the remaining methods, but record the failure in
        # the log file as well as on the console.
        print(f"❌ Error with {method}: {e}")
        logger.log_message(f"❌ Error with {method}: {e}")
    else:
        # Store both matrices only when the whole method succeeded, so the
        # train and test dictionaries never get out of sync.
        X_train_features_dict[method] = pd.DataFrame(X_train_features)
        X_test_features_dict[method] = X_test_features

In [None]:
# Models to train and evaluate; each name is looked up in MODEL_DICT and MODEL_PARAMS.
model_names = ["decision_tree", "logistic_regression", "random_forest", "xgboost", "perceptron", "bayesian_enhanced", "svm"]

In [None]:
# Re-declared so the training / prediction cells can run without the earlier cell.
output_dir = "/kaggle/working"

# Train

In [None]:
# Train (or load from disk) one model per (model, feature method) pair.
print("\n🔎 Running model training loop...\n")
logger.log_message("\n🔎 Running model training loop...\n")

for model_name in model_names:
    for method in tqdm(feature_methods, desc=f"Training {model_name}"):
        print(f"Model: {model_name}")
        print(f"🔍 Method: {method}...")
        logger.log_message(f"Model: {model_name}")
        logger.log_message(f"🔍 Method: {method}...")

        try:
            # Build a fresh estimator and fetch its hyperparameter grid
            algorithm = MODEL_DICT[model_name]()
            params = MODEL_PARAMS[model_name]

            # Artifacts are written under output_dir, which is where the
            # prediction cell looks for the saved models.
            artifact_stem = os.path.join(output_dir, f"best_{model_name}_{method}")

            # Train or load model
            trained_model = generate_binary_classification_model(
                X=X_train_features_dict[method],
                y=y_train,
                model_algorithm=algorithm,
                hyperparameters=params,
                needs_scaled=False,
                model_save_path=f"{artifact_stem}.pkl",
                img_save_path=f"{artifact_stem}.png"
            )

        except Exception as e:
            # One failing combination must not stop the remaining ones.
            print(f"❌ Error with {model_name} / {method}: {e}")
            logger.log_message(f"❌ Error with {model_name} / {method}: {e}")

# Predict

In [None]:
# Evaluate every saved (model, feature method) pair on the held-out test set
for model_name in model_names:
    for method in feature_methods:
        model_filename = os.path.join(output_dir, f"best_{model_name}_{method}.pkl")

        # A combination whose training or feature extraction failed has no
        # artifacts; skip it instead of aborting the whole evaluation.
        if not os.path.exists(model_filename) or method not in X_test_features_dict:
            skip_message = f"⚠️ Skipping {model_name} / {method}: saved model or test features not found"
            print(skip_message)
            logger.log_message(skip_message)
            continue

        # Load the saved model
        model = joblib.load(model_filename)
        X_eval = X_test_features_dict[method]

        # Make predictions
        y_pred = model.predict(X_eval)

        # Compute metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='binary')
        recall = recall_score(y_test, y_pred, average='binary')
        f1 = f1_score(y_test, y_pred, average='binary')

        # ROC AUC needs a continuous score: use positive-class probabilities
        # when available, otherwise the decision-function margin.
        if hasattr(model, "predict_proba"):
            y_score = model.predict_proba(X_eval)[:, 1]
        elif hasattr(model, "decision_function"):
            y_score = model.decision_function(X_eval)
        else:
            y_score = None
        roc_auc_text = "N/A" if y_score is None else f"{roc_auc_score(y_test, y_score):.4f}"

        report_lines = [
            f"Model: {model_name}",
            f"Method: {method}",
            "-" * 50,
            f"Accuracy: {accuracy:.4f}",
            f"Precision: {precision:.4f}",
            f"Recall: {recall:.4f}",
            f"F1 Score: {f1:.4f}",
            f"ROC AUC: {roc_auc_text}",
            "-" * 50,
        ]

        # Print the report first, then log it
        for line in report_lines:
            print(line)
        for line in report_lines:
            logger.log_message(line)

    print("%" * 50)
    logger.log_message("%" * 50)

# Remove output

In [None]:
# import shutil
# import os

# # Path to the working directory
# working_dir = '/kaggle/working/'

# # Remove all files and subdirectories in the working directory
# for filename in os.listdir(working_dir):
#     file_path = os.path.join(working_dir, filename)
#     if os.path.isfile(file_path):
#         os.remove(file_path)
#     else:
#         shutil.rmtree(file_path)

# print("All outputs cleared from Kaggle working directory.")


In [None]:
# import os

# # Path to the working directory
# working_dir = '/kaggle/working/'

# # Loop through files in the working directory
# for filename in os.listdir(working_dir):
#     if filename.endswith(".zip"):  # Check if it's a zip file
#         file_path = os.path.join(working_dir, filename)
#         os.remove(file_path)  # Remove the zip file
#         print(f"Removed: {file_path}")

# print("✅ All zip files removed from Kaggle working directory.")


# Zip all

In [None]:
# import shutil
# import os
# import zipfile

# # Define working directory and zip file path
# working_dir = "/kaggle/working/"
# zip_file_path = "/kaggle/working/output_files.zip"

# # Create a zip archive containing only specific files
# with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
#     for filename in os.listdir(working_dir):
#         if filename.endswith(".log") or filename.endswith(".png") or filename.endswith(".pkl"):
#             file_path = os.path.join(working_dir, filename)
#             zipf.write(file_path, filename)  # Save file with its name (without full path)

# print(f"✅ Selected outputs zipped into: {zip_file_path}")