# Start

In [None]:
!pip install xgboost gensim tqdm hmmlearn pgmpy sklearn-crfsuite transformers tensorflow keras keras-tuner


# Import

In [None]:
# Standard Library Imports
import os
import sys
import pickle
import joblib
import logging
import torch
import numpy as np
import pandas as pd
from statistics import mean

# Visualization Libraries
import matplotlib.pyplot as plt

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, log_loss, hinge_loss
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import RandomForestClassifier

from pgmpy.models import BayesianNetwork
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination

import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict, Counter
from hmmlearn import hmm

nltk.download('punkt')
nltk.download('stopwords')

# Specialized Libraries
import xgboost as xgb
import hmmlearn.hmm
from hmmlearn.hmm import GaussianHMM
from sklearn_crfsuite import CRF

# Natural Language Processing (NLP) Libraries
import gensim.downloader as api
from transformers import AutoTokenizer, AutoModel

# Deep Learning (TensorFlow / Keras) and Progress Bar
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import keras_tuner as kt
import matplotlib.pyplot as plt
from tensorflow.keras.models import load_model
from tqdm import tqdm

import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras_tuner import RandomSearch


In [None]:
## Options
# Display option: a max column width of None lets pandas print long text cells in full
# instead of truncating them.
pd.set_option("max_colwidth", None)

# Logger

In [None]:
class MyLogger:
    """
    Small wrapper that sends INFO-level messages to a log file and to the console.

    NOTE: this configures the *root* logger (``logging.getLogger()``), so every
    handler already attached to the root logger is detached whenever the logger
    is (re)initialized.

    :param log_file: str, path to the log file (default: 'app.log')
    """
    def __init__(self, log_file='app.log'):
        self.log_file = log_file
        # Handlers created by this instance; tracked so they can be closed on re-init.
        self._handlers = []
        self._initialize_logger()

    def _initialize_logger(self):
        """
        (Re)configures the root logger with one file handler and one console handler.

        Handlers currently attached to the root logger are removed first. Handlers
        that this instance created earlier are additionally closed so that switching
        log files does not leak an open file descriptor.
        """
        self.logger = logging.getLogger()
        self.logger.setLevel(logging.INFO)

        # Detach everything currently on the root logger (same as before) ...
        for handler in list(self.logger.handlers):
            self.logger.removeHandler(handler)
        # ... and release the resources of the handlers we own.
        for handler in getattr(self, "_handlers", []):
            handler.close()

        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

        # Append mode creates the file when it is missing, so no existence check is needed.
        file_handler = logging.FileHandler(self.log_file, mode='a', encoding='utf-8')
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)

        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(formatter)

        self.logger.addHandler(file_handler)
        self.logger.addHandler(console_handler)
        self._handlers = [file_handler, console_handler]

    def log_message(self, message):
        """
        Logs an info-level message to the current log file and console.

        :param message: str, the message to be logged
        """
        self.logger.info(message)

    def change_log_file(self, new_log_file):
        """
        Changes the log file path and reinitializes the logger with the new file.

        The handler writing to the previous file is closed as part of the switch.

        :param new_log_file: str, the new path to the log file
        """
        self.log_file = new_log_file
        self._initialize_logger()

# Module-wide logger shared by the functions below; with no argument it logs to 'app.log'.
logger = MyLogger()

# Tokenizer parallelism flag, presumably read by the Hugging Face `tokenizers` backend —
# TODO confirm it is honoured when set after `transformers` has already been imported.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Feature Building

In [None]:
class FeatureBuilder:
    """
    Converts raw text documents into numeric feature matrices.

    Methods handled here: 'tfidf', 'count', 'binary_count' (scikit-learn vectorizers
    fitted on a corpus) and 'word2vec', 'glove', 'bert' (pretrained embeddings that
    need no fitting). Optionally the features are projected with PCA or LDA.

    The dimensionality reducer is fitted once (in ``fit``/``fit_transform``, or lazily
    on the first ``transform`` call if it has not been fitted yet) and then reused,
    so later batches such as a test set are projected into the same space.
    """

    # Methods backed by a scikit-learn vectorizer that has to be fitted on a corpus.
    _VECTORIZER_METHODS = ("tfidf", "count", "binary_count")

    def __init__(self, method="tfidf", save_dir="data/processed", reduce_dim=None, n_components=100):
        """
        Initializes the FeatureBuilder with a specified feature engineering method.

        :param method: str, feature engineering method ('tfidf', 'count', 'binary_count',
                       'word2vec', 'glove', 'bert')
        :param save_dir: str, directory to save processed features
        :param reduce_dim: str or None, optional dimensionality reduction ('pca' or 'lda')
        :param n_components: int, number of components requested from the reducer
        """
        self.method = method
        self.save_dir = save_dir
        self.reduce_dim = reduce_dim
        self.n_components = n_components
        self.reducer = None
        self._reducer_fitted = False
        os.makedirs(save_dir, exist_ok=True)

        # Define models for vectorization
        if method == "tfidf":
            self.vectorizer = TfidfVectorizer(max_features=2000, stop_words="english")
        elif method == "count":
            self.vectorizer = CountVectorizer(max_features=2000)
        elif method == "binary_count":
            self.vectorizer = CountVectorizer(binary=True, max_features=2000)
        elif method == "word2vec":
            self.word2vec_model = api.load("word2vec-google-news-300")  # Pretrained Google News Word2Vec
        elif method == "glove":
            self.glove_model = api.load("glove-wiki-gigaword-100")  # Pretrained GloVe embeddings
        elif method == "bert":
            self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
            self.bert_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

        # Initialize dimensionality reduction
        if self.reduce_dim == "pca":
            self.reducer = PCA(n_components=self.n_components)
        elif self.reduce_dim == "lda":
            # LDA is supervised (needs labels); the component count is capped at 1 here,
            # presumably because the targets are binary.
            self.reducer = LDA(n_components=min(self.n_components, 1))

    @staticmethod
    def _average_embedding(doc, model):
        """
        Averages the embeddings of all whitespace-separated tokens known to ``model``.

        :param doc: str, the document text
        :param model: keyed-vector model supporting ``in``, ``[]`` and ``vector_size``
        :return: np.array, mean embedding, or a zero vector if no token is known
        """
        vectors = [model[token] for token in doc.split() if token in model]
        if vectors:
            return np.mean(vectors, axis=0)
        return np.zeros(model.vector_size)

    def _get_word2vec_vector(self, doc):
        """
        Extracts the average Word2Vec embedding for a document.

        :param doc: str, the document text
        :return: np.array, the averaged Word2Vec embedding
        """
        return self._average_embedding(doc, self.word2vec_model)

    def _get_glove_vector(self, doc):
        """
        Extracts the average GloVe embedding for a document.

        :param doc: str, the document text
        :return: np.array, the averaged GloVe embedding
        """
        return self._average_embedding(doc, self.glove_model)

    def _get_bert_embedding(self, doc):
        """
        Extracts the BERT embedding for a document.

        :param doc: str, the document text
        :return: np.array, the pooled output of the transformer for the document
        """
        inputs = self.tokenizer(doc, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():  # inference only, no gradients needed
            outputs = self.bert_model(**inputs)
        return outputs.pooler_output.squeeze(0).numpy()

    def _extract_features(self, texts):
        """
        Computes the un-reduced feature matrix for ``texts`` with the configured method.

        :param texts: list, raw text data
        :return: np.array, one row per document
        :raises ValueError: if ``self.method`` is not a supported method
        """
        if self.method in self._VECTORIZER_METHODS:
            return self.vectorizer.transform(texts).toarray()
        if self.method == "word2vec":
            return np.array([
                self._get_word2vec_vector(doc)
                for doc in tqdm(texts, desc="Processing Word2Vec", unit="document")
            ])
        if self.method == "glove":
            return np.array([
                self._get_glove_vector(doc)
                for doc in tqdm(texts, desc="Processing GloVe", unit="document")
            ])
        if self.method == "bert":
            return np.array([
                self._get_bert_embedding(doc)
                for doc in tqdm(texts, desc="Processing BERT", unit="document")
            ])
        raise ValueError(f"Unsupported feature method: {self.method!r}")

    def _fit_reducer(self, features, labels=None):
        """
        Fits the dimensionality reducer on ``features`` and marks it as fitted.

        :param features: np.array, un-reduced feature matrix
        :param labels: array-like or None, class labels (required for 'lda')
        :raises ValueError: if no reducer exists for ``reduce_dim`` or LDA gets no labels
        """
        if getattr(self, "reducer", None) is None:
            raise ValueError(f"Unsupported reduce_dim: {self.reduce_dim!r}")
        if self.reduce_dim == "lda":
            if labels is None:
                raise ValueError("LDA dimensionality reduction requires labels to be fitted.")
            self.reducer.fit(features, labels)
        else:
            self.reducer.fit(features)
        self._reducer_fitted = True

    def fit(self, texts, labels=None):
        """
        Fits the model to the text data by computing necessary statistics (e.g., vocabulary).

        Pretrained embedding methods need no fitting. When dimensionality reduction is
        enabled the reducer is fitted here as well; for 'lda' without labels the fit is
        deferred to the first ``transform`` call that supplies labels.

        :param texts: list, raw text data
        :param labels: array-like or None, class labels (only used by 'lda')
        :return: None
        """
        if self.method in self._VECTORIZER_METHODS:
            self.vectorizer.fit(texts)

        if self.reduce_dim:
            self._reducer_fitted = False
            if self.reduce_dim != "lda" or labels is not None:
                self._fit_reducer(self._extract_features(texts), labels)

    def transform(self, texts, labels=None):
        """
        Transforms new data based on the fitted model.

        The reducer is only fitted here if it has not been fitted before; it is never
        refitted on subsequent calls, so a test set is projected with the parameters
        learned from the data seen first.

        :param texts: list, raw text data
        :param labels: array-like or None, class labels, used only when an 'lda'
                       reducer still has to be fitted
        :return: transformed feature matrix
        :raises ValueError: for an unsupported method, or when an unfitted LDA reducer
                            is used without labels
        """
        features = self._extract_features(texts)

        # Apply dimensionality reduction if enabled
        if self.reduce_dim:
            if not getattr(self, "_reducer_fitted", False):
                self._fit_reducer(features, labels)
            features = self.reducer.transform(features)

        return features

    def fit_transform(self, texts, labels=None):
        """
        Fits the model on the text data and returns the transformed features.

        Features are extracted only once and used both to fit the reducer and to
        produce the result.

        :param texts: list, raw text data
        :param labels: array-like or None, class labels (required for 'lda')
        :return: transformed feature matrix
        """
        if self.method in self._VECTORIZER_METHODS:
            self.vectorizer.fit(texts)

        features = self._extract_features(texts)
        if self.reduce_dim:
            self._fit_reducer(features, labels)
            features = self.reducer.transform(features)
        return features

    def _save_model(self):
        """Saves the fitted vectorizer / embedding model and the reducer (if any) for later use."""
        # Fall back to the default directory when no save_dir is set, and use it consistently.
        save_dir = self.save_dir if self.save_dir else "data/processed"
        os.makedirs(save_dir, exist_ok=True)  # Create directory if it doesn't exist

        if self.method in self._VECTORIZER_METHODS:
            file_path = os.path.join(save_dir, f"{self.method}_vectorizer.pkl")
            with open(file_path, "wb") as f:
                pickle.dump(self.vectorizer, f)
        elif self.method in ["word2vec", "glove"]:
            # Save the Word2Vec or GloVe model
            embedding_model = self.word2vec_model if self.method == "word2vec" else self.glove_model
            file_path = os.path.join(save_dir, f"{self.method}_model.pkl")
            with open(file_path, "wb") as f:
                pickle.dump(embedding_model, f)
        elif self.method == "bert":
            # Save the BERT tokenizer and model
            tokenizer_path = os.path.join(save_dir, "bert_tokenizer.pkl")
            model_path = os.path.join(save_dir, "bert_model.pkl")
            with open(tokenizer_path, "wb") as f:
                pickle.dump(self.tokenizer, f)
            with open(model_path, "wb") as f:
                pickle.dump(self.bert_model, f)

        if self.reduce_dim:
            reducer_path = os.path.join(save_dir, f"{self.reduce_dim}_reducer.pkl")
            with open(reducer_path, "wb") as f:
                pickle.dump(self.reducer, f)

    def _load_model(self):
        """
        Loads the previously saved vectorizer / embedding model and the reducer (if any).

        SECURITY NOTE: ``pickle.load`` can execute arbitrary code; only load files that
        were written by ``_save_model`` into a trusted directory.

        :raises FileNotFoundError: if an expected file is missing in the save directory
        """
        save_dir = self.save_dir if self.save_dir else "data/processed"
        os.makedirs(save_dir, exist_ok=True)

        if self.method in self._VECTORIZER_METHODS:
            file_path = os.path.join(save_dir, f"{self.method}_vectorizer.pkl")
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"No saved model found at {file_path}. Run `fit_transform` first.")
            with open(file_path, "rb") as f:
                self.vectorizer = pickle.load(f)
        elif self.method in ["word2vec", "glove"]:
            file_path = os.path.join(save_dir, f"{self.method}_model.pkl")
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"No saved model found at {file_path}. Run `fit_transform` first.")
            with open(file_path, "rb") as f:
                embedding_model = pickle.load(f)
            # Store the model under the attribute that the matching extractor reads.
            if self.method == "word2vec":
                self.word2vec_model = embedding_model
            else:
                self.glove_model = embedding_model
        elif self.method == "bert":
            tokenizer_path = os.path.join(save_dir, "bert_tokenizer.pkl")
            model_path = os.path.join(save_dir, "bert_model.pkl")
            if not os.path.exists(tokenizer_path) or not os.path.exists(model_path):
                raise FileNotFoundError(f"No saved BERT model found at {tokenizer_path} or {model_path}. Run `fit_transform` first.")
            with open(tokenizer_path, "rb") as f:
                self.tokenizer = pickle.load(f)
            with open(model_path, "rb") as f:
                self.bert_model = pickle.load(f)

        if self.reduce_dim:
            reducer_path = os.path.join(save_dir, f"{self.reduce_dim}_reducer.pkl")
            if not os.path.exists(reducer_path):
                raise FileNotFoundError(f"No saved reducer found at {reducer_path}. Run `fit_transform` first.")
            with open(reducer_path, "rb") as f:
                self.reducer = pickle.load(f)
            # Assumes the reducer was fitted before it was saved; do not refit it on new data.
            self._reducer_fitted = True

def build_vector_for_text(df_sampled, feature_methods, project_root):
    """
    Builds train/test feature matrices for text data with each requested extraction method.

    The DataFrame is split first (stratified on 'target', 80/20, fixed seed). For each
    method a FeatureBuilder is fitted on the TRAINING texts only and then used to
    transform both splits, so no vocabulary or IDF statistics leak from the test set.
    Methods that raise are logged and skipped; they are simply absent from the
    returned dictionaries.

    :param df_sampled: pandas.DataFrame, sampled DataFrame with 'text_clean' and 'target' columns
    :param feature_methods: list, list of feature extraction methods to apply (e.g., 'tfidf', 'word2vec')
    :param project_root: str, path to the project root directory for saving processed data
    :return: tuple, (X_train_features_dict, X_test_features_dict, y_train, y_test)
             - X_train_features_dict: dict, dictionary mapping methods to training feature DataFrames
             - X_test_features_dict: dict, dictionary mapping methods to testing feature DataFrames
             - y_train: pandas.Series, target labels for the training set
             - y_test: pandas.Series, target labels for the testing set
    """
    X_train_features_dict = {}
    X_test_features_dict = {}

    # Step 1: split the DataFrame before feature extraction (to maintain X-y matching)
    df_train, df_test = train_test_split(df_sampled, test_size=0.2, random_state=42, stratify=df_sampled["target"])

    # Labels get a fresh 0..n-1 index so they line up with the rows of the feature DataFrames
    y_train = df_train["target"].reset_index(drop=True)
    y_test = df_test["target"].reset_index(drop=True)

    # The texts do not depend on the method, so materialise them once
    train_texts = df_train["text_clean"].tolist()
    test_texts = df_test["text_clean"].tolist()

    logger.log_message("\n🔎 Running feature extraction...\n")
    for method in tqdm(feature_methods, desc="Feature Extraction Progress"):
        logger.log_message(f"\n🔍 Processing feature extraction using: {method}...")

        try:
            # Initialize FeatureBuilder for the current method
            feature_builder = FeatureBuilder(
                method=method,
                save_dir=os.path.join(project_root, "processed"),
                reduce_dim=None,
                n_components=50
            )

            # Step 2: fit on the training split only, then transform each split
            feature_builder.fit(train_texts)
            X_train = feature_builder.transform(train_texts)
            X_test = feature_builder.transform(test_texts)

            # Ensure feature matrices are DataFrames
            X_train_features_dict[method] = pd.DataFrame(X_train)
            X_test_features_dict[method] = pd.DataFrame(X_test)

            logger.log_message(f"✅ {method} - Train shape: {X_train.shape}, Test shape: {X_test.shape}")

        except Exception as e:
            # Deliberate best-effort: one failing method must not abort the others
            logger.log_message(f"❌ Error with {method}: {e}. Skipping this method.")

    return X_train_features_dict, X_test_features_dict, y_train, y_test


# Model Utilities

In [None]:
def create_population(num_features, population_size):
    """
    Draws a random starting population of binary feature masks.

    :param num_features: int, number of genes (columns) per individual
    :param population_size: int, number of individuals (rows)
    :return: np.ndarray of shape (population_size, num_features) holding 0/1 values
    """
    shape = (population_size, num_features)
    # low=0 inclusive, high=2 exclusive -> every gene is 0 or 1
    return np.random.randint(0, 2, size=shape)

def fitness_function(features, X_train, y_train):
    """
    Scores one feature mask by 5-fold cross-validated Gaussian Naive Bayes accuracy.

    :param features: sequence of 0/1 flags, one per column of X_train
    :param X_train: 2-D array, training features (indexed as X_train[:, columns])
    :param y_train: array-like, training labels
    :return: mean cross-validation score, or 0 when no feature is selected or
             cross-validation raises ValueError
    """
    chosen_columns = np.flatnonzero(np.asarray(features) == 1)
    if chosen_columns.size == 0:
        # An empty feature set cannot be evaluated
        return 0

    classifier = GaussianNB(var_smoothing=1e-8)
    subset = X_train[:, chosen_columns]

    try:
        return np.mean(cross_val_score(classifier, subset, y_train, cv=5))
    except ValueError as e:
        logger.log_message(f"Error during cross-validation: {e}")
        return 0

def crossover(parent1, parent2):
    """
    Performs single-point crossover and returns two new offspring.

    The cut point is drawn uniformly from 1 .. len(parent1) - 1, so each offspring
    receives at least one gene from each parent. Chromosomes with fewer than two
    genes have no valid cut point and are returned as unchanged copies (previously
    any chromosome of length <= 2 raised ValueError from np.random.randint).

    :param parent1: 1-D array-like, first parent chromosome
    :param parent2: 1-D array-like, second parent chromosome
    :return: tuple (offspring1, offspring2) of new arrays; the parents are not modified
    """
    n_genes = len(parent1)
    if n_genes < 2:
        return np.copy(parent1), np.copy(parent2)

    # randint's upper bound is exclusive, so this yields cut points 1 .. n_genes - 1
    point = np.random.randint(1, n_genes)
    offspring1 = np.concatenate((parent1[:point], parent2[point:]))
    offspring2 = np.concatenate((parent2[:point], parent1[point:]))
    return offspring1, offspring2

def mutate(individual, mutation_rate=0.1):
    """
    Flips each gene of ``individual`` independently with probability ``mutation_rate``.

    The individual is modified in place and also returned.

    :param individual: mutable sequence of 0/1 genes
    :param mutation_rate: float, per-gene flip probability
    :return: the same ``individual`` object after mutation
    """
    # One uniform draw per gene, taken in gene order
    flip_mask = np.random.rand(len(individual)) < mutation_rate
    for position in np.flatnonzero(flip_mask):
        individual[position] = 1 - individual[position]
    return individual

def genetic_algorithm(X_train, y_train, X_test, y_test, model_save_path=None, population_size=20, num_generations=100, mutation_rate=0.1, crossover_rate=0.7):
    """
    Runs a genetic algorithm to optimize feature selection for Naive Bayes.

    Each individual is a binary mask over the columns of X_train; its fitness is the
    cross-validated Gaussian Naive Bayes score returned by ``fitness_function``.
    Parents are drawn by fitness-proportional (roulette-wheel) selection, recombined
    with ``crossover`` and perturbed with ``mutate``. The best mask seen in ANY
    evaluated generation (including the final population) is used to train the
    final model, which is evaluated on the test set and optionally saved.

    :param X_train: pandas.DataFrame, training features
    :param y_train: array-like, training labels (binary)
    :param X_test: pandas.DataFrame, test features with the same columns as X_train
    :param y_test: array-like, test labels
    :param model_save_path: str or None, where to cache the model; if the file exists
                            it is loaded and returned without training
    :param population_size: int, number of individuals per generation (odd sizes allowed)
    :param num_generations: int, number of breeding rounds
    :param mutation_rate: float, per-gene flip probability
    :param crossover_rate: float, probability that a parent pair is recombined
    :return: the trained (or loaded) GaussianNB model; a freshly trained model carries
             ``selected_features_`` (column positions) and ``feature_scaler_`` (the
             fitted MinMaxScaler) so it can be applied to new data
    :raises ValueError: if the best individual selects no feature at all
    """
    # model_save_path is optional: only look for a cached model when a path is given
    if model_save_path and os.path.exists(model_save_path):
        logger.log_message(f"🔄 Loading existing model from {model_save_path}...")
        return joblib.load(model_save_path)

    num_features = X_train.shape[1]
    train_matrix = X_train.values  # hoisted: same array for every fitness evaluation
    population = list(create_population(num_features, population_size))
    # crossover() needs room for an interior cut point; with <= 2 genes only mutation is used
    can_crossover = num_features > 2

    best_individual = None
    best_fitness = -np.inf

    # One extra pass (generation == num_generations) scores the final population, so the
    # winner is always picked from individuals whose fitness was actually measured.
    for generation in range(num_generations + 1):
        fitness_scores = np.array(
            [fitness_function(ind, train_matrix, y_train) for ind in population],
            dtype=float,
        )

        top = int(np.argmax(fitness_scores))
        if fitness_scores[top] > best_fitness:
            best_fitness = fitness_scores[top]
            # Copy: mutate() works in place and must not alter the recorded winner
            best_individual = np.copy(population[top])

        if generation == num_generations:
            break

        # Clip so that all-zero fitness still yields a valid probability vector
        weights = np.clip(fitness_scores, 1e-5, None)
        probabilities = weights / np.sum(weights)

        # Select parents based on probabilities. Each pick is copied because the same
        # individual can be drawn several times and mutate() modifies its argument.
        selected_indices = np.random.choice(np.arange(population_size), size=population_size, p=probabilities)
        selected_parents = [np.copy(population[idx]) for idx in selected_indices]

        next_generation = []
        for j in range(0, population_size - 1, 2):
            first, second = selected_parents[j], selected_parents[j + 1]
            if can_crossover and np.random.rand() < crossover_rate:
                first, second = crossover(first, second)

            next_generation.append(mutate(first, mutation_rate))
            next_generation.append(mutate(second, mutation_rate))

        # With an odd population the last parent has no partner: mutate it on its own
        if population_size % 2 == 1:
            next_generation.append(mutate(selected_parents[-1], mutation_rate))

        population = next_generation  # Move to the next generation

    selected_features = [i for i, f in enumerate(best_individual) if f == 1]
    if not selected_features:
        raise ValueError("Genetic algorithm selected no features; cannot train a model.")

    X_train_selected = X_train.iloc[:, selected_features]
    X_test_selected = X_test.iloc[:, selected_features]

    logger.log_message(f"Selected {len(selected_features)} features out of {num_features}")

    # Scaling the selected features (fitted on train only)
    scaler = MinMaxScaler()
    X_train_selected = scaler.fit_transform(X_train_selected)
    X_test_selected = scaler.transform(X_test_selected)

    # Train Naive Bayes with selected features
    nb_model = GaussianNB()
    nb_model.fit(X_train_selected, y_train)
    y_pred = nb_model.predict(X_test_selected)

    # Compute metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')
    # GaussianNB always provides probabilities; use them for ROC AUC
    y_prob = nb_model.predict_proba(X_test_selected)[:, 1]
    roc_auc = roc_auc_score(y_test, y_prob)

    # Print metrics
    logger.log_message(f"Accuracy: {accuracy:.4f}")
    logger.log_message(f"Precision: {precision:.4f}")
    logger.log_message(f"Recall: {recall:.4f}")
    logger.log_message(f"F1 Score: {f1:.4f}")
    logger.log_message(f"ROC AUC: {roc_auc:.4f}")

    # Keep what is needed to preprocess new data together with the model
    nb_model.selected_features_ = selected_features
    nb_model.feature_scaler_ = scaler

    # Save the trained model
    if model_save_path:
        joblib.dump(nb_model, model_save_path)
        logger.log_message(f'💾 Model saved to {model_save_path}')

    return nb_model

# --------------------------------------------------

def _positive_class_scores(model, X):
    """Returns continuous scores for the positive class, falling back to hard predictions."""
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X)
        if np.ndim(proba) == 2 and proba.shape[1] == 2:
            return proba[:, 1]
    if hasattr(model, "decision_function"):
        scores = model.decision_function(X)
        if np.ndim(scores) == 1:
            return scores
    return model.predict(X)


def _save_fold_plot(series, title, ylabel, path):
    """Plots one line per (label, values) pair against the fold number and saves it to ``path``."""
    fig = plt.figure(figsize=(10, 6))
    try:
        for label, values in series:
            plt.plot(range(1, len(values) + 1), values, label=label, marker='o')
        plt.title(title)
        plt.xlabel("Fold Number")
        plt.ylabel(ylabel)
        plt.legend()
        plt.savefig(path)
    finally:
        # Always release the figure, even when saving fails
        plt.close(fig)


def generate_binary_classification_model(X, y, model_algorithm, hyperparameters, needs_scaled = False, model_save_path="best_model.pkl", img_save_path=None, img_loss_path=None):
    """
    Tunes, cross-validates, trains and saves a binary classification model.

    Steps: optional standard scaling, grid search over ``hyperparameters``, 5-fold
    cross-validation with the best parameters (metrics and losses are logged and
    optionally plotted), a final fit on all of X, and saving with joblib. If
    ``model_save_path`` already exists the saved model is loaded and returned instead.

    Args:
        - X (Pandas DataFrame): Training features
        - y (Pandas Series): Target values (indexed positionally with ``.iloc``)
        - model_algorithm (object): Model algorithm to train (scikit-learn style estimator)
        - hyperparameters (dict): Hyperparameters for tuning
        - needs_scaled (Boolean): Whether to scale the dataset
        - model_save_path (str): Path to save the best model
        - img_save_path (str): Path to save validation performance plot
        - img_loss_path (str): Path to save training loss plot

    Returns:
        The fitted (or loaded) model. When ``needs_scaled`` is True the fitted
        StandardScaler is attached as ``model.scaler_``; new data must be passed
        through it before calling ``predict``.
    """
    # Check if the model already exists
    if os.path.exists(model_save_path):
        logger.log_message(f"🔄 Loading existing model from {model_save_path}...")
        model_algorithm = joblib.load(model_save_path)
        return model_algorithm

    logger.log_message(f"🚀 Training new model: {model_algorithm.__class__.__name__}...")

    # Performing a scaling on the data if required
    scaler = None
    if needs_scaled:
        scaler = StandardScaler()
        scaled_features = scaler.fit_transform(X)
        # Back to a DataFrame so the positional indexing below keeps working
        X = pd.DataFrame(scaled_features, index = X.index, columns = X.columns)

    # Grid search over the supplied hyperparameters
    gridsearchcv = GridSearchCV(estimator = model_algorithm,
                                param_grid = hyperparameters)
    gridsearchcv.fit(X, y)
    logger.log_message(f'Best hyperparameters: {gridsearchcv.best_params_}')

    # Configure the caller's estimator with the best hyperparameters
    model_algorithm.set_params(**gridsearchcv.best_params_)

    # Containers for the per-fold validation metrics and losses
    accuracy_scores, roc_auc_scores, f1_scores, precision_scores, recall_scores = [], [], [], [], []
    training_losses, validation_losses = [], []

    k_fold = KFold(n_splits = 5)

    logger.log_message("\n🎯 Running K-Fold Cross-Validation...")
    for train_index, val_index in tqdm(k_fold.split(X), total=k_fold.get_n_splits(), desc="K-Fold Progress"):

        # Splitting the training set from the validation set for this specific fold
        X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        model_algorithm.fit(X_train, y_train)

        # Compute losses on both splits
        training_losses.append(get_training_loss(model_algorithm, X_train, y_train))
        validation_losses.append(get_training_loss(model_algorithm, X_val, y_val))

        val_preds = model_algorithm.predict(X_val)

        accuracy_scores.append(accuracy_score(y_val, val_preds))
        # ROC AUC is a ranking metric: feed it scores/probabilities when the model has them
        roc_auc_scores.append(roc_auc_score(y_val, _positive_class_scores(model_algorithm, X_val)))
        f1_scores.append(f1_score(y_val, val_preds))
        precision_scores.append(precision_score(y_val, val_preds))
        recall_scores.append(recall_score(y_val, val_preds))

    # Print average validation scores
    logger.log_message(f'📊 Average Accuracy: {int(mean(accuracy_scores) * 100)}%')
    logger.log_message(f'📊 Average ROC AUC: {int(mean(roc_auc_scores) * 100)}%')
    logger.log_message(f'📊 Average F1 Score: {int(mean(f1_scores) * 100)}%')
    logger.log_message(f'📊 Average Precision: {int(mean(precision_scores) * 100)}%')
    logger.log_message(f'📊 Average Recall: {int(mean(recall_scores) * 100)}%')

    # Final fit on the full data set: this is the model that gets saved
    model_algorithm.fit(X, y)

    # Keep the scaler with the model, otherwise the saved model cannot be applied correctly
    if scaler is not None:
        model_algorithm.scaler_ = scaler

    # Save the trained model
    joblib.dump(model_algorithm, model_save_path)
    logger.log_message(f'💾 Model saved to {model_save_path}')

    # If img_save_path is provided, plot the validation scores
    if img_save_path:
        _save_fold_plot(
            [("Accuracy", accuracy_scores), ("ROC AUC", roc_auc_scores)],
            "Validation Performance Across K-Folds",
            "Score",
            img_save_path,
        )
        logger.log_message(f"📈 Plot saved to {img_save_path}")

    # Plot loss curves
    if img_loss_path:
        _save_fold_plot(
            [("Training Loss", training_losses), ("Validation Loss", validation_losses)],
            "Training & Validation Loss Across K-Folds",
            "Loss",
            img_loss_path,
        )
        logger.log_message(f"📉 Loss plot saved to {img_loss_path}")

    return model_algorithm

def get_training_loss(model, X_train, y_train):
    """
    Compute a loss for ``model`` on the given data, based on what the model can output.

    Order of preference (data-dependent losses first, so that the value really
    reflects the data passed in, e.g. a validation fold):
      1. ``predict_proba`` available  -> log loss (logistic regression, trees, forests,
         naive Bayes, MLP, XGBoost, ...)
      2. ``decision_function`` available -> hinge loss (SVC without probabilities, Perceptron)
      3. ``loss_`` / ``best_score_`` attribute -> value recorded by the model during training
      4. ``score`` method -> negated score (e.g. negative log-likelihood for models
         whose ``score`` returns a log-likelihood)

    :param model: fitted estimator
    :param X_train: features to evaluate on
    :param y_train: true labels for ``X_train``
    :return: float loss, or None when no loss can be derived
    """
    # Passing the known classes keeps the losses defined when a fold misses a class
    labels = getattr(model, "classes_", None)

    if hasattr(model, "predict_proba"):
        return log_loss(y_train, model.predict_proba(X_train), labels=labels)

    if hasattr(model, "decision_function"):
        # sklearn's hinge_loss maps the labels to -1/+1 itself
        return hinge_loss(y_train, model.decision_function(X_train), labels=labels)

    # Training-time statistics: independent of the data passed in
    if hasattr(model, "best_score_"):
        return -model.best_score_

    if hasattr(model, "loss_"):
        return model.loss_

    # Generic fallback for models that only expose a score
    if hasattr(model, "score"):
        return -model.score(X_train, y_train)

    return None  # Loss not available

# --------------------------------------------------
def train_bayes_net(df, model_save_path):
    """
    Train and evaluate a Bayesian Network for binary sentiment classification.

    The text is turned into binary presence features for at most 100 words
    (``CountVectorizer(binary=True, max_features=100)``). The network has one edge
    from ``'target'`` to every word feature, its conditional probability tables
    are fitted by maximum likelihood on an 80% training split, and the remaining
    20% is scored with accuracy, precision, recall, F1 and ROC AUC, all of which
    are written to the logger.

    The trained model is not persisted. If a file already exists at
    ``model_save_path`` the function only logs that fact and returns, because no
    loading code is implemented.

    :param df: pandas.DataFrame with 'text_clean' (cleaned text) and 'target' (0/1 labels) columns
    :param model_save_path: str, path checked for an existing model file
    :return: None, metrics are reported through the logger
    """
    if os.path.exists(model_save_path):
        # Nothing can be loaded here, so say so instead of reporting a successful
        # load. As before, no training or evaluation happens in this case.
        logger.log_message(
            f"⚠️ Model file found at {model_save_path}, but loading is not implemented; "
            "skipping training and evaluation."
        )
        return

    df_sampled = df

    vectorizer = CountVectorizer(binary=True, max_features=100)
    X = vectorizer.fit_transform(df_sampled['text_clean']).toarray()
    y = df_sampled['target'].values

    # Split the data into training and test sets.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # pgmpy fits from a DataFrame: one column per word plus the label column.
    feature_names = vectorizer.get_feature_names_out()
    train_df = pd.DataFrame(X_train, columns=feature_names)
    train_df['target'] = y_train

    # Network structure: every word depends on the sentiment label only.
    edges = [('target', word) for word in feature_names]
    model = BayesianNetwork(edges)

    # Learn the conditional probability tables (CPTs) from the training data.
    model.fit(train_df, estimator=MaximumLikelihoodEstimator)

    # Inference engine used for both hard predictions and class probabilities.
    inference = VariableElimination(model)

    # NOTE: the fitted model is not written to model_save_path.

    def row_evidence(row):
        """Pair every feature name with the value observed in one feature row."""
        return dict(zip(feature_names, row))

    def predict_sentiment(X):
        """Return the most probable 'target' state for each row of X."""
        predictions = []
        for row in X:
            result = inference.map_query(variables=['target'], evidence=row_evidence(row))
            predictions.append(result['target'])
        return np.array(predictions)

    def predict_sentiment_proba(X):
        """Return the posterior probability stored at index 1 of 'target' for each row of X."""
        proba_predictions = []
        for row in X:
            result = inference.query(variables=['target'], evidence=row_evidence(row))
            # Index 1 is taken as class 1 - assumes the states of 'target' are
            # ordered [0, 1]; TODO confirm against pgmpy's state ordering.
            proba_predictions.append(result.values[1])
        return np.array(proba_predictions)

    # Predict on the test set and compute the classification metrics.
    y_pred = predict_sentiment(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')

    # The network object offers neither predict_proba nor decision_function, so
    # the ROC AUC is always computed from the inferred posterior probabilities.
    # The log line is kept so existing log output stays comparable.
    logger.log_message("Does not have predict_proba or decision_function")
    y_proba = predict_sentiment_proba(X_test)
    roc_auc = roc_auc_score(y_test, y_proba)

    # Report metrics.
    logger.log_message("Model: Bayesian Network")
    logger.log_message("-" * 50)
    logger.log_message(f"Accuracy: {accuracy:.4f}")
    logger.log_message(f"Precision: {precision:.4f}")
    logger.log_message(f"Recall: {recall:.4f}")
    logger.log_message(f"F1 Score: {f1:.4f}")
    logger.log_message(f"ROC AUC: {roc_auc:.4f}")

def extract_features(text, word_features):
    """
    Map the words of a text to their positions in a vocabulary list.

    The text is split on whitespace; words that are not in ``word_features`` are
    dropped. If a word occurs more than once in ``word_features``, its first
    position is used.

    :param text: str, input text to process
    :param word_features: list, vocabulary words; a word's position is its feature index
    :return: numpy.ndarray of integer indices, one per in-vocabulary word, in text order
    """
    # One dict per call gives O(1) lookups; testing ``word in list`` and then
    # calling ``list.index`` scans the whole vocabulary twice for every word.
    index_of = {}
    for position, word in enumerate(word_features):
        index_of.setdefault(word, position)  # first occurrence wins, like list.index

    indices = [index_of[word] for word in text.split() if word in index_of]
    # Explicit dtype: np.array([]) would otherwise be float64 for texts with no known word.
    return np.array(indices, dtype=int)

def pad_sequence(seq, max_len):
    """
    Force a sequence to exactly ``max_len`` entries.

    Input that is already long enough is cut with ``seq[:max_len]``; shorter
    input is right-padded with zeros.

    :param seq: numpy.ndarray, sequence to resize
    :param max_len: int, target length
    :return: numpy.ndarray, the truncated or zero-padded sequence
    """
    shortfall = max_len - len(seq)
    if shortfall > 0:
        # Too short: append `shortfall` zeros on the right-hand side only.
        return np.pad(seq, (0, shortfall), mode='constant', constant_values=0)
    return seq[:max_len]

def train_hmm(df, model_save_path, vocab_size=5000, max_len=50):
    """
    Train a Gaussian HMM on padded word-index sequences and evaluate it as a classifier.

    Each text is converted to the vocabulary indices of its words, padded or
    truncated to ``max_len`` entries, and the resulting rows are used to fit a
    two-state ``GaussianHMM``. The model is saved with joblib and the predicted
    hidden states on a 20% test split are compared with the labels (accuracy,
    precision, recall, F1, ROC AUC when available).

    NOTE(review): the HMM is fitted without labels (``fit`` only receives the
    features), so nothing ties hidden state 1 to the positive class; the metrics
    are only meaningful if the states happen to line up with the labels.
    NOTE(review): the padding value 0 is also the index of the first vocabulary word.

    :param df: pandas.DataFrame with 'text_clean' (cleaned text) and 'target' (0/1 labels) columns
    :param model_save_path: str, path the fitted HMM is written to
    :param vocab_size: int, number of most frequent words kept as the vocabulary
    :param max_len: int, fixed length every index sequence is padded or truncated to
    :return: None, metrics are reported through the logger
    """
    df_sampled = df

    # Vocabulary: the `vocab_size` most frequent lower-cased words. most_common()
    # orders by frequency, whereas slicing the keys would simply take the first
    # distinct words encountered.
    all_words = nltk.FreqDist(word.lower() for text in df_sampled["text_clean"] for word in text.split())
    word_features = [word for word, _count in all_words.most_common(vocab_size)]

    # Convert each text into a sequence of vocabulary indices.
    X = [extract_features(text, word_features) for text in df_sampled["text_clean"]]
    y = df_sampled["target"].values  # labels (0: negative, 1: positive)

    X_train, X_test, _y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # The model needs rows of equal length, so pad/truncate every sequence.
    X_train = np.array([pad_sequence(seq, max_len) for seq in X_train])
    X_test = np.array([pad_sequence(seq, max_len) for seq in X_test])

    # A single two-state HMM is fitted on all training rows. The local name avoids
    # shadowing the imported `hmm` module.
    hmm_model = hmmlearn.hmm.GaussianHMM(n_components=2, covariance_type="diag", n_iter=100)
    hmm_model.fit(X_train)

    joblib.dump(hmm_model, model_save_path)
    logger.log_message(f'💾 Model saved to {model_save_path}')

    y_pred = hmm_model.predict(X_test)

    # Compute metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')

    # ROC AUC needs a continuous score; use whichever scoring method the model offers.
    if hasattr(hmm_model, "predict_proba"):
        logger.log_message("Has predict_proba")
        y_prob = hmm_model.predict_proba(X_test)[:, 1]  # probability of state 1
        roc_auc = roc_auc_score(y_test, y_prob)
    elif hasattr(hmm_model, "decision_function"):
        logger.log_message("Has decision_function")
        y_prob = hmm_model.decision_function(X_test)
        roc_auc = roc_auc_score(y_test, y_prob)
    else:
        logger.log_message("Does not have predict_proba or decision_function")
        roc_auc = None

    # Report metrics
    logger.log_message("Model: HMM")
    logger.log_message("-" * 50)
    logger.log_message(f"Accuracy: {accuracy:.4f}")
    logger.log_message(f"Precision: {precision:.4f}")
    logger.log_message(f"Recall: {recall:.4f}")
    logger.log_message(f"F1 Score: {f1:.4f}")
    if roc_auc is not None:
        logger.log_message(f"ROC AUC: {roc_auc:.4f}")
    else:
        logger.log_message("ROC AUC: N/A")
    
def train_graphical_model(df, model_name, model_save_path):
    """
    Dispatch training to the graphical model selected by name.

    ``'hmm'`` is handled by ``train_hmm`` and ``'bayesnet'`` by ``train_bayes_net``;
    any other name results in no training at all.

    :param df: pandas.DataFrame, DataFrame with 'text_clean' and 'target' columns
    :param model_name: str, name of the model to train ('hmm' or 'bayesnet')
    :param model_save_path: str, path handed to the selected training function
    :return: None
    """
    if model_name == "bayesnet":
        train_bayes_net(df, model_save_path)
        return

    if model_name == "hmm":
        train_hmm(df, model_save_path)
    # Unrecognised names fall through without doing anything.

# --------------------------------------------------

def train_cnn_lstm(texts, labels, vocab_size=10000, max_length=500, embedding_dim=300, num_trials=10, epochs=20,
                   model_save_path="best_cnn_lstm.keras"):
    """
    Tune, train and evaluate a CNN + bidirectional-LSTM sentiment classifier.

    The texts are tokenised and padded, split 80/20 into train and test sets, the
    architecture is tuned with Keras Tuner random search, the best configuration
    is retrained, evaluated on the test set and saved.

    Validation during tuning and final training uses a 20% hold-out taken from the
    training set (``validation_split``). The test set is only used for the final
    predictions, so it does not influence the hyperparameter choice.

    Parameters:
        texts (list): List of sentences (raw text).
        labels (list): List of binary sentiment labels (0 for negative, 1 for positive).
        vocab_size (int): Size of vocabulary for tokenization.
        max_length (int): Maximum sequence length for padding.
        embedding_dim (int): Dimension of the word embedding layer.
        num_trials (int): Number of hyperparameter tuning trials.
        epochs (int): Number of epochs for the final training run.
        model_save_path (str): File the final model is saved to.

    Returns:
        tuple: ``(best_model, results)`` - the trained Keras model and a dict with
        the training history curves ("loss", "val_loss", "accuracy", "val_accuracy")
        and the test-set "precision", "recall", "f1_score" and "roc_auc".
    """
    # Imported locally: the notebook's main import cell does not bring these two
    # names into scope, and the function should not depend on an earlier cell.
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    # Step 1: text preprocessing
    tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    X_data = pad_sequences(sequences, maxlen=max_length, padding="pre")
    y_data = np.array(labels)  # Convert labels to NumPy array

    # Step 2: split data for training and testing
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=42)

    # Step 3: model-building function handed to the tuner
    def build_model(hp):
        """
        Build and compile the CNN-LSTM model for one hyperparameter configuration.

        Layers: embedding, two Conv1D + BatchNormalization + MaxPooling1D blocks,
        a bidirectional LSTM, a dense layer with dropout and a sigmoid output.
        Filter counts, kernel sizes, LSTM units, dense units, dropout rate and
        learning rate are drawn from ``hp``.

        :param hp: Keras Tuner HyperParameters object supplying the tunable values
        :return: compiled keras model
        """
        model = keras.Sequential()

        # Embedding layer
        model.add(layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            input_length=max_length
        ))

        # CNN block 1
        model.add(layers.Conv1D(
            filters=hp.Int('filters_1', min_value=64, max_value=256, step=64),
            kernel_size=hp.Choice('kernel_size_1', values=[3, 5, 7]),
            activation="relu",
            padding="same"
        ))
        model.add(layers.BatchNormalization())
        model.add(layers.MaxPooling1D(pool_size=2))

        # CNN block 2
        model.add(layers.Conv1D(
            filters=hp.Int('filters_2', min_value=128, max_value=512, step=128),
            kernel_size=hp.Choice('kernel_size_2', values=[3, 5]),
            activation="relu",
            padding="same"
        ))
        model.add(layers.BatchNormalization())
        model.add(layers.MaxPooling1D(pool_size=2))

        # Bidirectional LSTM layer
        model.add(layers.Bidirectional(layers.LSTM(
            units=hp.Int('lstm_units', min_value=64, max_value=256, step=64),
            activation="tanh",
            return_sequences=False
        )))

        # Fully connected layer
        model.add(layers.Dense(
            units=hp.Int('dense_units', min_value=128, max_value=512, step=128),
            activation="relu"
        ))
        model.add(layers.Dropout(rate=hp.Float('dropout', min_value=0.3, max_value=0.6, step=0.1)))

        # Output layer
        model.add(layers.Dense(1, activation="sigmoid"))

        # Compile model
        model.compile(
            optimizer=keras.optimizers.Adam(hp.Choice('learning_rate', values=[5e-4, 1e-4, 5e-5, 1e-5])),
            loss="binary_crossentropy",
            metrics=["accuracy"]
        )

        return model

    # Step 4: initialise Keras Tuner
    tuner = kt.RandomSearch(
        build_model,
        objective="val_accuracy",
        max_trials=num_trials,
        executions_per_trial=1,
        directory="tuner_results",
        project_name="cnn_lstm_tuning"
    )

    logger.log_message("\n🔍 Running Hyperparameter Tuning...")
    # Only validation_split is passed: an explicit validation_data would take
    # precedence over it, and using the test set there would select the
    # hyperparameters on the same data the final metrics are computed on.
    tuner.search(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2, verbose=1)

    # Step 5: rebuild a fresh model from the best hyperparameters
    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
    best_model = tuner.hypermodel.build(best_hps)

    # Step 6: final training with the best configuration
    logger.log_message("\n🚀 Training Final Model...")
    history = best_model.fit(X_train, y_train, epochs=epochs, batch_size=32, validation_split=0.2, verbose=1)

    # Step 7: predict on the held-out test set. The model outputs shape (n, 1);
    # flatten it so the metric functions receive 1-D arrays.
    y_pred_prob = best_model.predict(X_test).ravel()
    y_pred = (y_pred_prob > 0.5).astype(int)

    # Step 8: compute test metrics
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_prob)

    # Step 9: collect results
    results = {
        "loss": history.history["loss"],
        "val_loss": history.history["val_loss"],
        "accuracy": history.history["accuracy"],
        "val_accuracy": history.history["val_accuracy"],
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "roc_auc": roc_auc
    }

    logger.log_message(f'🔹 loss: {history.history["loss"][-1]}')
    logger.log_message(f'🔹 val_loss: {history.history["val_loss"][-1]}')
    logger.log_message(f'🔹 accuracy: {history.history["accuracy"][-1]}')
    logger.log_message(f'🔹 val_accuracy: {history.history["val_accuracy"][-1]}')
    logger.log_message(f'🔹 precision: {precision}')
    logger.log_message(f'🔹 recall: {recall}')
    logger.log_message(f'🔹 f1_score: {f1}')
    logger.log_message(f'🔹 roc_auc: {roc_auc}')

    # Step 10: save the final model
    best_model.save(model_save_path)

    logger.log_message("\n✅ Model Training and Save Complete!")

    return best_model, results

# --------------------------------------------------

def train_general_model(df, doc_lst, label_lst, model_name_lst, feature_methods, model_dict, param_dict, X_train_features_dict, X_test_features_dict, y_train, y_test):
    """
    Train every requested model and log progress and failures.

    Model names are handled in three ways:

    * ``'cnn'`` / ``'lstm'``: ``train_cnn_lstm`` on the raw documents and labels.
    * ``'hmm'`` / ``'bayesnet'``: ``train_graphical_model`` on the DataFrame.
    * anything else: once per feature method, either ``genetic_algorithm``
      (name ``'GA'``) or ``generate_binary_classification_model`` with a fresh
      estimator from ``model_dict`` and the grid from ``param_dict``.

    Failures are caught and logged per training run, so an error for one model or
    one feature method does not stop the remaining runs.

    :param df: pandas.DataFrame, DataFrame containing text data (used for graphical models)
    :param doc_lst: list, list of text documents (used for CNN/LSTM models)
    :param label_lst: list, list of corresponding labels (used for CNN/LSTM models)
    :param model_name_lst: list, list of model names to train (e.g., 'cnn', 'lstm', 'hmm', 'bayesnet', 'GA', etc.)
    :param feature_methods: list, list of feature extraction methods (e.g., 'tfidf', 'word2vec')
    :param model_dict: dict, dictionary mapping model names to their corresponding class/function
    :param param_dict: dict, dictionary mapping model names to their hyperparameter settings
    :param X_train_features_dict: dict, dictionary of training feature matrices for each method
    :param X_test_features_dict: dict, dictionary of testing feature matrices for each method
    :param y_train: numpy.ndarray or pandas.Series, training labels
    :param y_test: numpy.ndarray or pandas.Series, testing labels
    :return: None, logs training progress and results
    """
    logger.log_message("\n🔎 Running feature extraction and model training loop...\n")

    for model_name in model_name_lst:
        logger.log_message(f"\n🚀 Training {model_name} models...\n")

        # Models that build their own features: trained once, not per feature method.
        if model_name in ("cnn", "lstm", "hmm", "bayesnet"):
            try:
                if model_name in ("cnn", "lstm"):
                    train_cnn_lstm(doc_lst, label_lst)
                else:
                    train_graphical_model(
                        df,
                        model_name,
                        model_save_path=f"best_{model_name}.pkl"
                    )
            except Exception as e:
                # No feature method exists on this path, so report the model name.
                logger.log_message(f"❌ Error with {model_name}: {e}")
            continue

        # Feature-based models: one independent training run per feature method.
        for method in feature_methods:
            logger.log_message(f"🔎 Training with Method: {method}...")

            try:
                if model_name == "GA":
                    genetic_algorithm(
                        X_train_features_dict[method],
                        y_train,
                        X_test_features_dict[method],
                        y_test,
                        model_save_path=f"best_{model_name}_{method}.pkl"
                    )
                else:
                    model_api = model_dict[model_name]()
                    model_params = param_dict[model_name]

                    generate_binary_classification_model(
                        X=X_train_features_dict[method],
                        y=y_train,
                        model_algorithm=model_api,
                        hyperparameters=model_params,
                        needs_scaled=False,
                        model_save_path=f"best_{model_name}_{method}.pkl",
                        img_save_path=f"best_{model_name}_{method}.png",
                        img_loss_path=f"best_{model_name}_{method}_loss.png"
                    )
            except Exception as e:
                logger.log_message(f"❌ Error with {model_name} ({method}): {e}")

# --------------------------------------------------  

def predict_general_model(model_names, feature_methods, X_test_features_dict, y_test, output_dir):
    """
    Load saved models, predict on the test features and log evaluation metrics.

    For every model name (except those already evaluated during training: 'GA',
    'hmm', 'bayesnet', 'lstm') and every feature method, the matching model file
    is loaded from ``output_dir``, predictions are made on that method's test
    features, and accuracy, precision, recall, F1 and ROC AUC (when a continuous
    score is available) are logged. Errors are logged per model/method pair and
    do not stop the loop.

    NOTE(review): the 'cnn' branch loads ``best_cnn.keras``, while train_cnn_lstm
    saves to ``best_cnn_lstm.keras`` by default - confirm which file is intended.

    :param model_names: list, list of model names to predict with (e.g., 'cnn', 'GA', 'hmm', etc.)
    :param feature_methods: list, list of feature extraction methods (e.g., 'tfidf', 'word2vec')
    :param X_test_features_dict: dict, dictionary of test feature matrices for each method
    :param y_test: numpy.ndarray or pandas.Series, true labels for the test set
    :param output_dir: str, directory path where pre-trained models are stored
    :return: None, logs prediction results and performance metrics
    """
    for model_name in model_names:
        if model_name in ["GA", "hmm", "bayesnet", "lstm"]:
            logger.log_message(f"Already trained and tested model: {model_name}")
            continue

        for method in feature_methods:
            logger.log_message(f"🔎 Predicting with Model: {model_name}, Method: {method}...")

            try:
                if model_name == "cnn":
                    # Load the saved deep learning model
                    model_filename = os.path.join(output_dir, f"best_{model_name}.keras")
                    model = tf.keras.models.load_model(model_filename)

                    # Add a trailing channel axis: (samples, features) -> (samples, features, 1)
                    X_test_features = np.array(X_test_features_dict[method])
                    X_test_features = X_test_features.reshape(-1, X_test_features.shape[1], 1)

                    # Sigmoid outputs thresholded at 0.5
                    y_prob = model.predict(X_test_features).flatten()
                    y_pred = (y_prob > 0.5).astype(int)

                else:  # Classical machine learning models
                    model_filename = os.path.join(output_dir, f"best_{model_name}_{method}.pkl")
                    with open(model_filename, 'rb') as model_file:
                        model = joblib.load(model_file)

                    X_eval = X_test_features_dict[method]
                    y_pred = model.predict(X_eval)

                    # ROC AUC needs a continuous score: prefer class probabilities,
                    # fall back to decision scores, otherwise skip it.
                    if hasattr(model, "predict_proba"):
                        y_prob = model.predict_proba(X_eval)[:, 1]  # positive-class probabilities
                    elif hasattr(model, "decision_function"):
                        y_prob = model.decision_function(X_eval)
                    else:
                        y_prob = None

                # Compute metrics
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, average='binary')
                recall = recall_score(y_test, y_pred, average='binary')
                f1 = f1_score(y_test, y_pred, average='binary')
                # Same 4-decimal formatting as the other metrics; "N/A" without a score.
                roc_auc_text = f"{roc_auc_score(y_test, y_prob):.4f}" if y_prob is not None else "N/A"

                logger.log_message(f"Model: {model_name}")
                logger.log_message(f"Method: {method}")
                logger.log_message("-" * 50)
                logger.log_message(f"Accuracy: {accuracy:.4f}")
                logger.log_message(f"Precision: {precision:.4f}")
                logger.log_message(f"Recall: {recall:.4f}")
                logger.log_message(f"F1 Score: {f1:.4f}")
                logger.log_message(f"ROC AUC: {roc_auc_text}")

            except Exception as e:
                logger.log_message(f"❌ Error while predicting for {model_name} with {method}: {e}")

        # Visual separator between models in the log
        logger.log_message("%" * 50)
        logger.log_message("%" * 50)


# Do here

## Dict

In [None]:
# Maps each model name to its estimator class. train_general_model looks the
# name up here and instantiates the class with no arguments, so every estimator
# starts from its library defaults before hyperparameters are applied.
MODEL_DICT = {
    "decision_tree": DecisionTreeClassifier,
    "perceptron": Perceptron,
    "mlp": MLPClassifier,
    "bayesian": GaussianNB,  # Gaussian naive Bayes (the Bayesian network is the separate "bayesnet" model)
    "random_forest": RandomForestClassifier,
    "xgboost": xgb.XGBClassifier,
    "svm": SVC,
    "logistic_regression": LogisticRegression
} 

# Candidate hyperparameter values per model name (same keys as MODEL_DICT).
# train_general_model passes the entry for a model as `hyperparameters` to
# generate_binary_classification_model; each key is a constructor parameter of
# the corresponding estimator and each list holds the values to try.
MODEL_PARAMS = {
    "decision_tree": {
        "criterion": ["gini", "entropy"],
        "max_depth": [10, 20, 30, 40],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "max_features": ["sqrt", "log2"]
    },
    
    "perceptron": {
        "max_iter": [1000, 2000],
        "tol": [1e-3, 1e-4],
        "eta0": [0.001, 0.01, 0.1],
        "penalty": [None, "l2", "l1"],
        "alpha": [0.0001, 0.001, 0.01]
    },
    
    "mlp": {
        "hidden_layer_sizes": [(100,)],
        "activation": ["tanh", "logistic"],
        "solver": ["sgd"],
        "alpha": [0.001, 0.01],
        "batch_size": [32],
        "max_iter": [1000],
    },
    
    "bayesian": {
        # Each priors entry is [P(class 0), P(class 1)]; None lets the estimator derive them from the data.
        "priors": [None, [0.5, 0.5], [0.4, 0.6], [0.3, 0.7], [0.2, 0.8], [0.1, 0.9], [0.05, 0.95]],
        "var_smoothing": [1e-9, 1e-8, 1e-7]
    },
    
    "random_forest": {
        "n_estimators": [100],
        "max_depth": [10],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2],
        "max_features": ["sqrt", "log2"]
    },
    
    "xgboost": {
        "n_estimators": [100, 150],
        "learning_rate": [0.001, 0.01, 0.1],
        "max_depth": [10, 15]
    },
    
    "svm": {
        # NOTE(review): only the linear kernel is listed and gamma is a parameter of
        # the non-linear kernels, so the two gamma values presumably just double
        # the number of candidates - confirm and trim if so.
        "kernel": ["linear"],
        "C": [0.1, 1],
        "gamma": ["scale", "auto"]
    },
    
    "logistic_regression": {
        # NOTE(review): no "solver" is listed, so the estimator's default solver is
        # used; "l1" and "elasticnet" are not supported by every solver (and
        # "elasticnet" also needs an l1_ratio) - verify these candidates can fit.
        "penalty": ["l1", "l2", "elasticnet", None],
        "C": [0.1, 1.0, 10.0],
        "max_iter": [1000, 2000]
    }
}

# Maps a dimensionality-reduction method name to its class:
# PCA (principal component analysis) and LDA (LinearDiscriminantAnalysis, imported under that alias).
DIMENSIONALITY_REDUCTION_DICT = {
    "pca": PCA,
    "lda": LDA,
}

## Load dataset

In [None]:
# Root folder of the Kaggle input datasets.
dataset_path = "/kaggle/input"
# Earlier dataset file, kept for reference:
# df = pd.read_csv(f"{dataset_path}/final_clean_no_neutral_no_duplicates.csv")
# Load the v1 cleaned dataset into a DataFrame.
df = pd.read_csv(f"{dataset_path}/final_clean_no_neutral_no_duplicates_v1.csv")

In [None]:
# Print a summary of the loaded DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Recode label value 4 as 1 so the target is binary 0/1.
# NOTE(review): presumably the raw data marks positive sentiment with 4 - verify against the dataset.
df["target"] = df["target"].replace(4, 1)

In [None]:
# Directory passed to build_vector_for_text and predict_general_model below
# (predict_general_model loads the saved model files from it).
output_dir = "/kaggle/working"

In [None]:
# Feature-extraction methods to build and train on.
feature_methods = ["count", "tfidf", "word2vec", "glove"]
# Work on a reproducible random sample of 1000 rows (fixed seed) to keep runs short.
df_sampled = df.sample(n=1000, random_state=42)
# Use the full dataset instead:
# df_sampled = df

In [None]:
# Plain Python lists of texts and labels; train_general_model hands these to train_cnn_lstm.
doc_lst = df_sampled["text_clean"].tolist()
label_lst = df_sampled["target"].tolist()

In [None]:
# Build the train/test features for the sampled data. The result is unpacked into
# two dicts that are indexed by feature-method name later on, plus the train/test labels.
# NOTE(review): build_vector_for_text is defined outside this section; presumably it
# also uses output_dir to store its artefacts - confirm there.
X_train_features_dict, X_test_features_dict, y_train, y_test = build_vector_for_text(df_sampled, feature_methods, output_dir)

In [None]:
# Models to train and evaluate, in execution order. Names other than "lstm", "GA",
# "hmm" and "bayesnet" must be keys of MODEL_DICT / MODEL_PARAMS; those four are
# routed to dedicated training functions by train_general_model.
model_name_lst = [
    "decision_tree", # ok
    "random_forest", # ok
    "xgboost", 
    "perceptron", # ok
    "mlp", # slow but ok
    "lstm",
    "bayesian",
    "GA",
    "hmm",
    "bayesnet",
    "logistic_regression",
    "svm"
]

# Train - Test

In [None]:
# Train every model in model_name_lst on the sampled data; progress and errors go to the logger.
train_general_model(df_sampled, doc_lst, label_lst, model_name_lst, feature_methods, MODEL_DICT, MODEL_PARAMS, X_train_features_dict, X_test_features_dict, y_train, y_test)

In [None]:
# Reload the saved models from output_dir and log their test-set metrics
# (models already evaluated during training are skipped inside the function).
predict_general_model(model_name_lst, feature_methods, X_test_features_dict, y_test, output_dir)

# Remove output

In [None]:
# import shutil
# import os

# # Path to the working directory
# working_dir = '/kaggle/working/'

# # Remove all files and subdirectories in the working directory
# for filename in os.listdir(working_dir):
#     file_path = os.path.join(working_dir, filename)
#     if os.path.isfile(file_path):
#         os.remove(file_path)
#     else:
#         shutil.rmtree(file_path)

# logger.log_message("All outputs cleared from Kaggle working directory.")


In [None]:
# import os

# # Path to the working directory
# working_dir = '/kaggle/working/'

# # Loop through files in the working directory
# for filename in os.listdir(working_dir):
#     if filename.endswith(".zip"):  # Check if it's a zip file
#         file_path = os.path.join(working_dir, filename)
#         os.remove(file_path)  # Remove the zip file
#         logger.log_message(f"Removed: {file_path}")

# logger.log_message("✅ All zip files removed from Kaggle working directory.")


# Zip all

In [None]:
# import shutil
# import os
# import zipfile

# # Define working directory and zip file path
# working_dir = "/kaggle/working/"
# zip_file_path = "/kaggle/working/output_files.zip"

# # Create a zip archive containing only specific files
# with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
#     for filename in os.listdir(working_dir):
#         if filename.endswith(".log") or filename.endswith(".png") or filename.endswith(".pkl"):
#             file_path = os.path.join(working_dir, filename)
#             zipf.write(file_path, filename)  # Save file with its name (without full path)

# logger.log_message(f"✅ Selected outputs zipped into: {zip_file_path}")