# Imports and setup

In [None]:
%pip install tensorflow keras


In [None]:
%pip install keras-tuner


In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%pip install hmmlearn
%pip install pgmpy

In [None]:
%pip install keras-nlp --upgrade

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tqdm import tqdm
import keras_tuner as kt
from tensorflow.keras.models import load_model
import keras_nlp

In [None]:
import os
import sys
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
import hmmlearn.hmm
from hmmlearn.hmm import GaussianHMM
from sklearn_crfsuite import CRF
from sklearn.metrics import log_loss, hinge_loss, precision_score, recall_score, f1_score, roc_auc_score

In [None]:
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination

In [None]:
## Display options
# Show full cell contents when rendering DataFrames (None disables truncation).
pd.options.display.max_colwidth = None

In [None]:
# Resolve the project root (two directory levels above the current working
# directory) and make it importable so the `src.*` packages can be found.
project_root = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
# Guard against duplicates: re-running this cell would otherwise append the
# same entry to sys.path every time.
if project_root not in sys.path:
    sys.path.append(project_root)
print(project_root)

In [None]:
# Project-local helpers; these resolve because project_root was appended to sys.path.
from src.features.build_features_utils import *  # NOTE(review): presumably provides build_vector_for_text — confirm
from src.models.models_utils import *  # NOTE(review): presumably provides train_general_model, predict_general_model and BayesianNetworkClassifier — confirm

# Model and hyperparameter dictionaries

In [None]:
# Registry mapping a model name to a factory for that estimator.
# Every value is callable; calling it returns a newly constructed model instance.
MODEL_DICT = {
    "decision_tree": DecisionTreeClassifier,
    "perceptron": Perceptron,
    "mlp": MLPClassifier,
    "bayesian": GaussianNB,
    "random_forest": RandomForestClassifier,
    "xgboost": xgb.XGBClassifier,
    "svm": SVC,
    "logistic_regression": LogisticRegression,
    # The two factories below pin project defaults but still accept keyword
    # overrides, so they can be called the same way as the plain classes above
    # (e.g. MODEL_DICT["hmm"](n_components=2)). Calling them with no arguments
    # yields the pinned defaults.
    "hmm": lambda **overrides: hmmlearn.hmm.GaussianHMM(
        **{"n_components": 3, **overrides}
    ),
    "crf": lambda **overrides: CRF(
        **{
            "algorithm": "lbfgs",
            "max_iterations": 100,
            "all_possible_transitions": True,  # not tested yet
            **overrides,
        }
    ),
    # NOTE(review): BayesianNetworkClassifier is not imported by name in this
    # notebook — presumably supplied by the src.* star imports; confirm.
    "bayes_network": BayesianNetworkClassifier,
}

# Hyperparameter search grids, keyed by the model names used in MODEL_DICT
# (the "crf" grid is currently disabled). Each grid maps a parameter name to
# the list of candidate values to try.
# Commented-out entries are alternative grids kept for reference.
MODEL_PARAMS = {
    # Alternative (smaller) grid:
    # "decision_tree": {
    #     "criterion": ["gini", "entropy"],
    #     "max_depth": [10, 20],
    #     "min_samples_split": [2, 5],
    #     "min_samples_leaf": [1, 2],
    #     "max_features": ["sqrt", "log2"]
    # },

    "decision_tree": {
        "criterion": ["gini", "entropy"],
        "max_depth": [10, 20, 30, 40],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "max_features": ["sqrt", "log2"]
    },

    # Alternative (smaller) grid:
    # "perceptron": {
    #     "max_iter": [1000, 2000],
    #     "tol": [1e-3],
    #     "eta0": [0.001],
    #     "penalty": ["l2"],
    #     "alpha": [0.0001, 0.001]
    # },

    "perceptron": {
        "max_iter": [1000, 2000],
        "tol": [1e-3, 1e-4],
        "eta0": [0.001, 0.01, 0.1],
        "penalty": [None, "l2", "l1"],
        "alpha": [0.0001, 0.001, 0.01]
    },

    "mlp": {
        "hidden_layer_sizes": [(100,)],
        "activation": ["tanh", "logistic"],
        "solver": ["sgd"],
        "alpha": [0.01],
        "batch_size": [32],
        "max_iter": [2000],
    },

    # Alternative (larger) grid:
    # "mlp": {
    #     "hidden_layer_sizes": [(50,), (100,), (50, 50), (100, 100)],
    #     "activation": ["relu", "tanh", "logistic"],
    #     "solver": ["adam", "sgd"],
    #     "alpha": [0.0001, 0.001, 0.01],
    #     "batch_size": [32, 64, 128],
    #     "max_iter": [500, 1000],
    #     "learning_rate": ["constant", "invscaling", "adaptive"]
    # },

    "bayesian": {
        # Each explicit prior is a two-element list, i.e. one entry per class
        # of a binary target.
        "priors": [None, [0.5, 0.5], [0.4, 0.6], [0.3, 0.7], [0.2, 0.8], [0.1, 0.9], [0.05, 0.95]],
        "var_smoothing": [1e-9, 1e-8, 1e-7]
    },

    "random_forest": {
        "n_estimators": [100, 200],
        "max_depth": [10],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2],
        "max_features": ["sqrt", "log2"],
        "bootstrap": [True, False]
    },

    # Alternative (larger) grid:
    # "random_forest": {
    #     "n_estimators": [50, 100, 200],
    #     "max_depth": [None, 10, 20, 30],
    #     "min_samples_split": [2, 5, 10],
    #     "min_samples_leaf": [1, 2, 4],
    #     "max_features": ["auto", "sqrt", "log2"],
    #     "bootstrap": [True, False]
    # },

    "xgboost": {
        "n_estimators": [100],
        "learning_rate": [0.01, 0.1],
        "max_depth": [6, 10]
    },

    # Alternative (larger) grid:
    # "xgboost": {
    #     "n_estimators": [100, 200, 300],
    #     "learning_rate": [0.01, 0.1, 0.2],
    #     "max_depth": [3, 6, 10],
    #     "subsample": [0.8, 1.0],
    #     "colsample_bytree": [0.8, 1.0],
    #     "gamma": [0, 0.1, 0.2]
    # },

    "svm": {
        "kernel": ["linear"],
        "C": [0.001, 0.01, 0.1, 1],
        # gamma is only used by the "rbf", "poly" and "sigmoid" kernels, so a
        # single value avoids refitting identical linear models several times.
        # Restore a list of gamma candidates when enabling a non-linear kernel.
        "gamma": ["scale"]
    },

    # Alternative (larger) grid:
    # "svm": {
    #     "kernel": ["linear", "rbf", "poly"],
    #     "C": [0.1, 1, 10, 100],
    #     "gamma": [0.1, 0.01, "scale", "auto"],
    #     "degree": [2, 3, 4]
    # },

    # Alternative (smaller) grid:
    # "logistic_regression": {
    #     "penalty": ["l2"],
    #     "C": [0.1, 1.0],
    #     "max_iter": [1000, 2000]
    # },

    "logistic_regression": {
        "penalty": ["l1", "l2", "elasticnet", None],
        # "saga" is the scikit-learn solver that supports every penalty listed
        # above; the default "lbfgs" rejects "l1" and "elasticnet".
        "solver": ["saga"],
        # Required when penalty="elasticnet"; not used by the other penalties.
        "l1_ratio": [0.5],
        "C": [0.1, 1.0, 10.0],
        "max_iter": [1000, 2000]
    },

    # Alternative (smaller) grid:
    # "hmm": {
    #     "n_components": [2],  # Keep it small
    #     "covariance_type": ["diag"],  # Simpler covariance type
    #     "n_iter": [500],  # Reduce iterations
    #     "init_params": ["stmc"],  # Initialize start probabilities, transition matrix, and means/covariance
    #     "params": ["stmc"]
    # },

    "hmm": {
        "n_components": [2, 3, 4],
        "covariance_type": ["diag", "full", "tied"],
        "n_iter": [100, 200],
        "init_params": ["c", "s", "cs"],
        "params": ["c", "t", "ct"]
    },

    # NOTE(review): these names presumably match BayesianNetworkClassifier's
    # constructor arguments — confirm against its definition.
    "bayes_network": {
        "structure": [None],
        "n_bins": [2],
        "strategy": ["kmeans"],
        "min_unique_values": [2],
        "max_features": [10]
    },

    # Disabled grid:
    # "crf": {
    #     "c1": [0.1, 0.01],  # L1 Regularization
    #     "c2": [0.1, 0.01],  # L2 Regularization
    #     "max_iterations": [50, 100]  # Limit iterations
    # }
}

# Registry of dimensionality-reduction classes, looked up by short name.
# LDA is the alias under which LinearDiscriminantAnalysis is imported.
DIMENSIONALITY_REDUCTION_DICT = dict(pca=PCA, lda=LDA)

# Load dataset

In [None]:
# Read the dataset CSV stored under <project_root>/data/final/ into a DataFrame.
dataset_path = os.path.join(
    project_root,
    *("data", "final", "final_clean_no_neutral_no_duplicates_v1.csv"),
)
df = pd.read_csv(dataset_path)


In [None]:
# Preview the first rows of the loaded DataFrame (rendered as the cell output).
df.head()

In [None]:
# Map the label value 4 to 1 in the "target" column; all other values are kept.
# NOTE(review): presumably this turns a {0, 4} labelling into {0, 1} — confirm against the dataset.
df["target"] = df["target"].replace({4: 1})


# New API Call

In [None]:
# Text-vectorization methods to build features with.
feature_methods = ["tfidf", "count", "word2vec", "glove"]
# Work on a reproducible random subset (fixed seed) of at most 1000 rows.
# The size is capped at len(df) because DataFrame.sample raises ValueError when
# asked for more rows than exist (sampling is without replacement by default).
df_sampled = df.sample(n=min(1000, len(df)), random_state=42)

In [None]:
# Plain Python lists of the sampled texts and their labels, in df_sampled row order.
doc_lst, label_lst = (
    df_sampled[column].tolist() for column in ("text_clean", "target")
)

In [None]:
# Build train/test feature sets and labels for the sampled data using the
# project's build_vector_for_text helper.
# NOTE(review): the two *_features_dict results are presumably keyed by the
# names in feature_methods — confirm against the helper's definition.
(
    X_train_features_dict,
    X_test_features_dict,
    y_train,
    y_test,
) = build_vector_for_text(df_sampled, feature_methods, project_root)

In [None]:
# Models to run in this session; uncomment an entry to enable it.
# NOTE(review): "lstm", "distilbert" and "GA" have no entry in MODEL_DICT —
# presumably they are handled inside train_general_model; confirm.
model_name_lst = [
    "distilbert",
    # --- currently disabled ---
    # "decision_tree",  # ok
    # "random_forest",  # ok
    # "xgboost",
    # "perceptron",  # ok
    # "mlp",  # slow but ok
    # "lstm",
    # "bayesian",
    # "GA",
    # "hmm",
    # "bayesnet",  # NOTE(review): MODEL_DICT registers this model as "bayes_network" — confirm which name is expected
    # "logistic_regression",
    # "svm",
]

In [None]:
# Path <project_root>/src/models, later handed to predict_general_model.
trained_model = os.path.join(project_root, os.path.join("src", "models"))

In [None]:
%pip uninstall tf-nightly
%pip install tensorflow


In [None]:
%pip show keras-nlp


In [None]:
# Invoke the project's training entry point with the sampled data, the selected
# model names, the model registry and search grids, and the precomputed
# train/test features and labels.
train_general_model(
    df_sampled,
    doc_lst,
    label_lst,
    model_name_lst,
    feature_methods,
    MODEL_DICT,
    MODEL_PARAMS,
    X_train_features_dict,
    X_test_features_dict,
    y_train,
    y_test,
)

In [None]:
# Invoke the project's prediction entry point for the selected models on the
# test features/labels, pointing it at the src/models directory.
predict_general_model(
    model_name_lst,
    feature_methods,
    X_test_features_dict,
    y_test,
    trained_model,
)