In [1]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True) # Mount google drive to load training and testing data


# This where all the Packages are cached instead or reinstalling them every new runtime
PACKAGES_DIR = '/content/drive/My Drive/Bachelor/pip_cache'

# This is where MERGED & PREPROCESSED & CLEANED Dataset is
PREPROCESSED_MERGED_DATASET_DIR = '/content/drive/MyDrive/Bachelor/Sentiment/PREPROCESSED_DATASET/'


# MACHINE LEARNING MODELS DIRECTORY
ML_MODEL_DIR = '/content/drive/MyDrive/Bachelor/Sentiment/ML_MODELS/'


# This is where the figures are saved
FIGURES_DIR = '/content/drive/MyDrive/Bachelor/Sentiment/FIGURES/'


# Number of Figures
FIGURE_COUNTS = 1



Mounted at /content/drive


## Packages

In [2]:
!pip config set global.cache-dir /content/drive/My\ Drive/Bachelor/pip_cache


from IPython.display import clear_output
clear_output() # clear output window

In [3]:
# Packages 
# types in python
from typing import List, Tuple ,Dict ,Any , Union
import time
import string
import os
import random




from collections import Counter

# -- Arabic text dependencies
# from arabic_reshaper import reshape      # pip install arabic_reshaper
# from bidi.algorithm import get_display   # pip install python-bidi
# from ar_wordcloud import ArabicWordCloud




# Data Manipulating  & Preprocessing packages
import numpy as np
from sklearn.preprocessing import LabelEncoder 
import unicodedata # normlization of arabic letters encoding to be unicoded 
import pandas as pd
pd.set_option('display.max_colwidth', None) # Setting the display option to show the full width of columns in pandas dataframe.




# Model
from sklearn.linear_model import LogisticRegression
import tensorflow as tf
import torch
#from autokeras import StructuredDataClassifier

# to save the model
import joblib

 
#import warnings
#warnings.simplefilter(action='ignore')
#warnings.filterwarnings("ignore")

# visualization library
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
from wordcloud import WordCloud
from pylab import rcParams
rcParams['figure.figsize'] = 14, 6

# Random Seed
RANDOM_SEED=42
RANDOM_STATE=42



def set_seed(seed=42):
  RANDOM_SEED=seed
  RANDOM_STATE=seed
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  torch.backends.cudnn.deterministic=True
  torch.backends.cudnn.benchmark = False



# Set Matplotlib defaults
plt.style.use('ggplot')
#plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

## My Preprocessed Labeled Dataset
> Consists of Egyptian and Modern Standard Arabic (MSA) .

> 
```
{'LABEL': Value(dtype='int64', id=None),
 'TWEET': Value(dtype='string', id=None)}
```
> Label explaination :

```
SENTIMENT_TO_ID = {
   "positive": 0 ,
   "negative": 1,
   "neutral": 2,
}

ID_TO_SENTIMENT = {
   0:"positive" ,
   1:"negative" ,
   2:"neutral",
   } 
```

In [4]:
SENTIMENT_TO_ID = {
   "positive": 0 ,
   "negative": 1,
   "neutral": 2,
}

ID_TO_SENTIMENT = {
   0:"positive" ,
   1:"negative" ,
   2:"neutral",
}

In [5]:
# Dataset names 
WHOLE_DATASET_NAME='DATASET'
TRAIN_DATASET_NAME='TRAIN_DATASET'
VALIDDATION_DATASET_NAME='VALIDATION_DATASET'
TEST_DATASET_NAME='TEST_DATASET'


stemmed_data_files = {
              "dataset": f"{PREPROCESSED_MERGED_DATASET_DIR +WHOLE_DATASET_NAME}.csv",
              "train": f"{PREPROCESSED_MERGED_DATASET_DIR + TRAIN_DATASET_NAME}.csv",
              'validation':f'{PREPROCESSED_MERGED_DATASET_DIR+VALIDDATION_DATASET_NAME}.csv',
              "test": f"{PREPROCESSED_MERGED_DATASET_DIR +TEST_DATASET_NAME}.csv",
              }

In [6]:
df = pd.read_csv(stemmed_data_files['dataset'])
df.head()

Unnamed: 0,LABEL,TWEET
0,2,اوليمبياد جاي هكون لسه كلي
1,1,عجز مواز وصل ناتج محل عن لسه رقم وحش لسه تابع اوليمبياد
2,1,تنا وحش حظ هباب
3,0,جميع نريد تحقيق اهداف تونس حلو وحش مرم
4,2,اوليمبياد نظام حلو مواعيد مونديال مكانتش وحش مش حاج معقول


In [7]:
from sklearn.model_selection import train_test_split

# X and y are your data and labels, respectively
X_train, X_test, y_train, y_test = train_test_split(df['TWEET'], df['LABEL'], test_size=0.2, random_state=RANDOM_STATE,stratify=df['LABEL'],shuffle=True)

## ML Classifiers

### Constants

In [8]:
SENTIMENT_TO_ID = {
   "positive": 0 ,
   "negative": 1,
   "neutral": 2,
}

tfidf_param_dist = {
    'tfidf__max_features': [1000, 5000, 10000, 20000],
    'tfidf__ngram_range': [(1,1), (1,2)],
      }

CLASSIFIERS_METRICS = []

In [9]:
from typing import List, Dict

def find_name_in_list(name: str, dict_list: List[Dict]) -> bool:
    """
    Checks if a given classifier name exists in a list of dictionaries.
    
    Parameters:
    name (str): The name to search for.
    dict_list (List[Dict]): The list of dictionaries to search in.
    
    Returns:
    bool: True if the name exists in the list, False otherwise.
    """
    for dictionary in dict_list:
        if name == dictionary['name']:
            return True
    return False


### Visualization

In [10]:
from typing import List, Tuple
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix


def plot_confusion_matrix(y_true: List[int], y_pred: List[int]) -> None:
    """
    Plot a confusion matrix using Seaborn heatmap.

    Parameters:
    y_true (List[int]): A list of true labels encoded as integers.
    y_pred (List[int]): A list of predicted labels encoded as integers.

    Returns:
    None
    """

    # Define the class labels and their corresponding IDs
    CLASSES = ["positive", "negative", "neutral"]
    SENTIMENT_TO_ID = {"positive": 0, "negative": 1, "neutral": 2}

    # Convert the integer labels to their corresponding class labels
    y_true = [CLASSES[i] for i in y_true]
    y_pred = [CLASSES[i] for i in y_pred]

    # Compute the confusion matrix
    cm = confusion_matrix(y_true, y_pred, labels=CLASSES)

    # Normalize the confusion matrix
    cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    # Create a pandas DataFrame from the normalized confusion matrix
    df_cm = pd.DataFrame(cm_norm, index=CLASSES, columns=CLASSES)

    # Plot the heatmap using Seaborn
    sns.heatmap(df_cm, annot=True, cmap='Blues', fmt='.2f')

    # Add labels and title to the plot
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix")
    plt.show()


### Pipeline

In [11]:
# Helper functions 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV


def search_params(model, X_train:pd.Series, y_train:pd.Series , param_dist: Dict[str, Any],name:str):
    MODEL_PATH=os.path.join(ML_MODEL_DIR, name+'.joblib'.replace(" ", "_"))
    PARAMS_PATH=os.path.join(ML_MODEL_DIR, name+'.pkl'.replace(" ", "_"))

    if os.path.exists(MODEL_PATH) and os.path.exists(PARAMS_PATH):
        try:
            with open(MODEL_PATH, 'rb') as f:
                text_classifier = joblib.load(f)
            # read the DataFrame from the pickle file
            with open(PARAMS_PATH, 'rb') as f:
                df = pd.read_pickle(f)
            text_classifier.fit(X_train, y_train)
            return (text_classifier,df)
        except Exception as e:
            print(f'Error loading saved model: {e}')
            os.remove(MODEL_PATH)
            os.remove(PARAMS_PATH)


    text_classifier = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', model),
    ])

    # Create a RandomizedSearchCV object with the pipeline, hyperparameters, and cross-validation parameters
    random_search = RandomizedSearchCV(text_classifier, param_distributions=param_dist, n_iter=5, cv=5, verbose=3)
   
    # Fit the RandomSearchCV object to the data
    random_search.fit(X_train, y_train)



    best_params = {'name': name, **random_search.best_params_, 'best_score_': random_search.best_score_}


    if not os.path.exists(MODEL_PATH):
        # Save the model to a file
        with open(MODEL_PATH, 'wb') as f:
            joblib.dump(text_classifier, f)
        # Save the best params
        with open(PARAMS_PATH, 'wb') as f:
            pd.to_pickle(pd.DataFrame([best_params]), f)

    return (random_search.best_estimator_, pd.DataFrame([best_params]).set_index('name'))


### Metrics

In [12]:
from sklearn.metrics import f1_score, precision_score, recall_score,accuracy_score



def compute_metrics(trained_model , X:pd.Series , labels:pd.Series , name:str) -> Dict[str, Any]:
  """  called each time the model is evaluated on the validation dataset
        @returns a dictionary of metrics such as accuracy , acc_score , F1 score, precision,recall,roc_score and auc_score."""
  global CLASSIFIERS_METRICS
  predictions = trained_model.predict(X)
  # f1-score
  f1_macro = f1_score(labels, predictions, average='macro')
  f1_weighted = f1_score(labels, predictions, average='weighted')
  # precision
  precision_macro = precision_score(labels, predictions, average='macro')
  precision_weighted = precision_score(labels, predictions, average='weighted')
  # recall
  recall_macro = recall_score(labels, predictions, average='macro')
  recall_weighted = recall_score(labels, predictions, average='weighted')

  acc_score=accuracy_score(labels,predictions)
  
  metrics={
        "f1_macro": f1_macro,
        "f1_weighted": f1_weighted,
        "precision_macro": precision_macro,
        "precision_weighted": precision_weighted,
        "recall_macro": recall_macro,
        "recall_weighted": recall_weighted,
       'accuracy':acc_score,
       'name':name
        }

  if not find_name_in_list(name,CLASSIFIERS_METRICS):
    CLASSIFIERS_METRICS = CLASSIFIERS_METRICS+[metrics]


  return metrics


### Decision Tree Classifier
> In simple terms, a decision tree for sentiment analysis works by breaking down a piece of text, such as a tweet or a review, into smaller and smaller pieces based on different features, such as the presence of certain words or phrases. At each step, the decision tree evaluates whether a certain condition is true or false based on these features, and then moves to the next branch of the tree accordingly.

> For example, if the decision tree is trying to determine whether a tweet is positive or negative, it might start by looking at whether the tweet contains certain positive or negative words. If the tweet contains the word "happy", it might follow the branch of the tree that leads to a positive sentiment. If it contains the word "angry", it might follow the branch of the tree that leads to a negative sentiment.


##### Hyperparameters :
> `classifier__criterion`: This hyperparameter specifies the function to measure the quality of a split in the decision tree. The two options are gini (for the Gini impurity) and entropy (for the information gain).

> `classifier__max_depth`: This hyperparameter controls the maximum depth of the decision tree. A larger value can lead to overfitting, while a smaller value can lead to underfitting.

> `classifier__min_samples_split`: This hyperparameter specifies the minimum number of samples required to split an internal node. A smaller value can lead to overfitting, while a larger value can lead to underfitting.

> `classifier__min_samples_leaf`: This hyperparameter specifies the minimum number of samples required to be at a leaf node. A smaller value can lead to overfitting, while a larger value can lead to underfitting.


In [13]:
from sklearn.tree import DecisionTreeClassifier


name='Decesion Tree'
param_dist = {
          **tfidf_param_dist,
        'classifier__criterion': ['gini', 'entropy'],
        'classifier__max_depth': [None, 5, 10, 20],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4]
}

Decesion_Tree_Classifier,Decesion_Tree_df = search_params(DecisionTreeClassifier(), X_train, y_train,param_dist=param_dist,name=name)

Decesion_Tree_metric = compute_metrics(Decesion_Tree_Classifier,X_test, y_test,name=name)

Decesion_Tree_df.head()

Unnamed: 0,name,tfidf__ngram_range,tfidf__max_features,classifier__min_samples_split,classifier__min_samples_leaf,classifier__max_depth,classifier__criterion,best_score_
0,Decesion Tree,"(1, 1)",10000,5,2,,entropy,0.613309


In [14]:
classifiers_df = pd.DataFrame(CLASSIFIERS_METRICS).set_index('name')
classifiers_df.head()

Unnamed: 0_level_0,f1_macro,f1_weighted,precision_macro,precision_weighted,recall_macro,recall_weighted,accuracy
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Decesion Tree,0.653812,0.653736,0.654026,0.654189,0.653961,0.653657,0.653657


### Bernoulli Naive Bayes

> is a discrete probability distribution that describes the probability of a single binary outcome (success or failure) in a single trial of an experiment. It has a single parameter, usually denoted by p, which represents the probability of success.

> we start by representing each document (such as a movie review) as a set of binary features, where each feature represents whether a particular word appears in the document or not. We then calculate the likelihood of each feature given the class labels (in this case, positive or negative sentiment) based on a training set of labeled data.

> When classifying a new document, we use Bayes' theorem to calculate the probability of each class label given the features of the document. We then choose the class label with the highest probability as the predicted sentiment of the document.
##### Hyperparameters :
> `clf__alpha`: The smoothing parameter for the BernoulliNB classifier. This parameter controls the strength of the regularization. We test three values: 0.01, 0.1, and 1.0.

In [15]:
from sklearn.naive_bayes import  BernoulliNB

name='Bernoulli NB'
param_dist = {
          **tfidf_param_dist,
        'classifier__alpha': [0.01, 0.1, 1.0]
}



bernoulli_nb_Classifier,bernoulli_nb_df = search_params(BernoulliNB(), X_train, y_train, param_dist = param_dist , name = name)


bernoulli_nb_df_metrics = compute_metrics(bernoulli_nb_Classifier,X_test, y_test,name = name)

bernoulli_nb_df.head()

Unnamed: 0,name,tfidf__ngram_range,tfidf__max_features,classifier__alpha,best_score_
0,Bernoulli NB,"(1, 2)",20000,0.1,0.690861


In [16]:
classifiers_df = pd.DataFrame(CLASSIFIERS_METRICS).set_index('name')
classifiers_df.head()

Unnamed: 0_level_0,f1_macro,f1_weighted,precision_macro,precision_weighted,recall_macro,recall_weighted,accuracy
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Decesion Tree,0.653812,0.653736,0.654026,0.654189,0.653961,0.653657,0.653657
Bernoulli NB,0.690205,0.690309,0.692879,0.6924,0.690027,0.690775,0.690775


### Multinomial Naive Bayes

> (Probabilistic Algorithm ) The classifier learns to predict the sentiment of a piece of text based on its bag-of-words representation. During training, the classifier estimates the probabilities of each word occurring in a positive, negative, or neutral document. These probabilities are used to compute the likelihood of a document belonging to each sentiment class.

##### Hyperparameters :
> `clf__alpha`: The smoothing parameter for the MultinomialNB classifier. This parameter controls the strength of the regularization. We test three values: 0.01, 0.1, and 1.0.

In [17]:
from sklearn.naive_bayes import MultinomialNB

name='Multinomial NB'
param_dist = {
          **tfidf_param_dist,
        'classifier__alpha': [0.01, 0.1, 1.0]
}



multinomial_nb_Classifier,multinomial_nb_df = search_params(MultinomialNB(), X_train, y_train, param_dist = param_dist , name = name)


multinomial_nb_df_metrics = compute_metrics(multinomial_nb_Classifier,X_test, y_test,name = name)

multinomial_nb_df.head()

Unnamed: 0,name,tfidf__ngram_range,tfidf__max_features,classifier__alpha,best_score_
0,Multinomial NB,"(1, 1)",20000,1.0,0.699066


In [18]:
classifiers_df = pd.DataFrame(CLASSIFIERS_METRICS).set_index('name')
classifiers_df.head()

Unnamed: 0_level_0,f1_macro,f1_weighted,precision_macro,precision_weighted,recall_macro,recall_weighted,accuracy
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Decesion Tree,0.653812,0.653736,0.654026,0.654189,0.653961,0.653657,0.653657
Bernoulli NB,0.690205,0.690309,0.692879,0.6924,0.690027,0.690775,0.690775
Multinomial NB,0.699448,0.699483,0.702486,0.701679,0.698606,0.699418,0.699418


### Random Forest

> It is a type of ensemble learning method that builds multiple decision trees and combines their predictions to improve accuracy and reduce overfitting.

> In Random Forest for sentiment analysis, each decision tree is built using a random subset of features and a random subset of the training data. The goal of each tree is to predict the sentiment of a text (positive, negative, or neutral). The final prediction of the Random Forest model is based on the majority vote of the predictions made by the individual trees.

##### Hyperparameters :
> `classifier__n_estimators`: This hyperparameter specifies the number of decision trees to be built in the random forest. In the given example, we are searching over a range of values [50, 100, 200] to find the optimal number of trees.

> `classifier__max_depth`: This hyperparameter specifies the maximum depth of each decision tree in the random forest. In the given example, we are searching over a range of values [10, 20, None] to find the optimal maximum depth.

>`classifier__min_samples_split`: This hyperparameter specifies the minimum number of samples required to split an internal node in the decision tree. In the given example, we are searching over a range of values [2, 5, 10] to find the optimal minimum number of samples.

> `classifier__min_samples_leaf`: This hyperparameter specifies the minimum number of samples required to be at a leaf node in the decision tree. In the given example, we are searching over a range of values [1, 2, 4] to find the optimal minimum number of samples.

In [19]:
from sklearn.ensemble import RandomForestClassifier
name='RandomForest'
param_dist = {
          **tfidf_param_dist,
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]}



RandomForest_Classifier,RandomForest_df = search_params(RandomForestClassifier(), X_train, y_train,param_dist=param_dist,name=name)

RandomForest_df_metrics = compute_metrics(RandomForest_Classifier,X_test, y_test,name = name)

RandomForest_df.head()

Unnamed: 0,name,tfidf__ngram_range,tfidf__max_features,classifier__n_estimators,classifier__min_samples_split,classifier__min_samples_leaf,classifier__max_depth,best_score_
0,RandomForest,"(1, 2)",20000,100,10,4,,0.687042


In [20]:
classifiers_df = pd.DataFrame(CLASSIFIERS_METRICS).set_index('name')
classifiers_df.head()

Unnamed: 0_level_0,f1_macro,f1_weighted,precision_macro,precision_weighted,recall_macro,recall_weighted,accuracy
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Decesion Tree,0.653812,0.653736,0.654026,0.654189,0.653961,0.653657,0.653657
Bernoulli NB,0.690205,0.690309,0.692879,0.6924,0.690027,0.690775,0.690775
Multinomial NB,0.699448,0.699483,0.702486,0.701679,0.698606,0.699418,0.699418
RandomForest,0.730649,0.730543,0.731292,0.731174,0.730529,0.73044,0.73044


### Support Vector Classifier (SVC)

>  the process of identifying and extracting the  of a piece of text.

>  During training, the classifier learns to separate the different sentiment categories based on their numerical features.

##### Hyperparameters :
> `C`: This hyperparameter represents the penalty parameter of the error term and controls the trade-off between achieving a low training error and a low testing error. It is represented as a positive float and can be tuned over a range of values.

>`kernel`: This hyperparameter represents the kernel function used to transform the input data into a higher-dimensional space. The kernel function can be linear, polynomial, or radial basis function (RBF). It can be tuned by specifying the kernel function to use.

> `gamma`: This hyperparameter represents the kernel coefficient for RBF kernel and controls the shape of the decision boundary. A higher gamma value leads to more complex decision boundaries. It is represented as a float and can be tuned over a range of values.

> `degree`: This hyperparameter represents the degree of the polynomial kernel function and is only relevant for polynomial kernels. It is represented as an integer and can be tuned over a range of values.

> `class_weight`: This hyperparameter represents the weighting factor for each class in the SVC Classifier. It can be set to 'balanced' to automatically adjust the weights based on the number of samples in each class.

In [None]:
from sklearn.svm import SVC
name='SVC Classifier'

param_dist = {
    **tfidf_param_dist,
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'poly', 'rbf'],
    'classifier__gamma': [0.1, 1, 10],
    'classifier__degree': [2, 3, 4],
    'classifier__class_weight': [None, 'balanced']
         
}
SVC_Classifier,SVC_df = search_params(SVC(), X_train, y_train,param_dist=param_dist,name=name)
SVC_df_metrics = compute_metrics(SVC_Classifier,X_test, y_test,name = name)

SVC_df.head()

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [None]:
classifiers_df = pd.DataFrame(CLASSIFIERS_METRICS).set_index('name')
classifiers_df.head()

### LinearSVC

> The algorithm works by mapping input features (in our case, text data) to a high-dimensional feature space, where it tries to find a hyperplane that best separates the positive and negative instances. The hyperplane is defined by the support vectors, which are the points closest to the boundary. SVC aims to maximize the margin between the hyperplane and the support vectors, which helps it generalize well on new unseen data.


##### Hyperparameters :
> `classifier__C`: This parameter controls the penalty for misclassifying data points. Smaller values of C lead to a wider margin and more misclassified points, while larger values of C lead to a narrower margin and fewer misclassified points. In general, you should try different values of C to see which works best for your data.

>`classifier__loss`: This parameter specifies the loss function to be used. The two options here are 'hinge' and 'squared_hinge'. 'Hinge' is the standard loss function used for linear SVMs, while 'squared_hinge' is a smoothed version that is more robust to outliers. You can experiment with both options to see which one works best for your data.

In [None]:
from sklearn.svm import LinearSVC
name='LinearSVC'
param_dist = {
          **tfidf_param_dist,
          'classifier__C': [0.1, 1, 10],
          'classifier__loss': ['hinge', 'squared_hinge']
}
LinearSVC_Classifier,LinearSVC_df = search_params(LinearSVC(), X_train, y_train,param_dist=param_dist,name=name)
LinearSVC_df_metrics = compute_metrics(LinearSVC_Classifier,X_test, y_test,name = name)

LinearSVC_df.head()

In [None]:
classifiers_df = pd.DataFrame(CLASSIFIERS_METRICS).set_index('name')
classifiers_df.head()

### NuSVC

> is a variant of SVM that uses a parameter nu to control the number of support vectors and the training error. The value of nu is a hyperparameter that must be tuned to achieve optimal performance. NuSVC is similar to C-SVC, another SVM variant, but has the advantage of automatically adjusting the number of support vectors based on the value of nu.


##### Hyperparameters :
> `classifier__C`: This parameter controls the penalty for misclassifying data points. Smaller values of C lead to a wider margin and more misclassified points, while larger values of C lead to a narrower margin and fewer misclassified points. In general, you should try different values of C to see which works best for your data.

>`classifier__loss`: This parameter specifies the loss function to be used. The two options here are 'hinge' and 'squared_hinge'. 'Hinge' is the standard loss function used for linear SVMs, while 'squared_hinge' is a smoothed version that is more robust to outliers. You can experiment with both options to see which one works best for your data.

In [None]:
from sklearn.svm import NuSVC
name = 'NuSVC'
param_dist = {
    **tfidf_param_dist,
    'classifier__kernel': ['linear', 'rbf', 'poly'],
    'classifier__gamma': ['scale', 'auto'] 
}
NuSVC_Classifier, NuSVC_df = search_params(NuSVC(), X_train, y_train, param_dist=param_dist, name=name)
NuSVC_df_metrics = compute_metrics(NuSVC_Classifier, X_test, y_test, name=name)

NuSVC_df.head()


In [None]:
classifiers_df = pd.DataFrame(CLASSIFIERS_METRICS).set_index('name')
classifiers_df.head()

### Stochastic Gradient Descent (SGD)

>  type of machine learning algorithm that is commonly used for classification tasks. It is a linear model that learns to classify input data by iteratively updating the weights of the model to minimize the error between the predicted labels and the actual labels of the data.

> During training, the SGD Classifier learns to identify patterns in the text data that are associated with each sentiment label. It does this by iteratively updating the weights of the model using a gradient descent algorithm, which minimizes the error between the predicted sentiment labels and the actual labels in the training data.

##### Hyperparameters :
> `alpha`: This is the regularization parameter that controls the strength of the penalty for high weight values. A higher value of alpha will result in a more conservative model with smaller weights, while a lower value of alpha will result in a more aggressive model with larger weights.

> `loss`: This is the loss function that is optimized during training. The most commonly used loss functions for binary sentiment analysis are 'hinge', 'log', and 'modified_huber'. The 'hinge' loss is used for linear SVMs, while the 'log' loss is used for logistic regression models.

> `penalty`: This is the type of regularization penalty to apply to the model weights. The most commonly used penalties are 'l1' and 'l2', which correspond to L1 and L2 regularization, respectively.

> `learning_rate`: This is the learning rate used by the SGD algorithm during training. A higher learning rate will result in larger weight updates and faster convergence, while a lower learning rate will result in smaller weight updates and slower convergence.

> `n_iter`: This is the number of iterations (epochs) to run during training. A higher number of iterations will result in a more accurate model, but will also take longer to train.

> `early_stopping`: This is a boolean flag that determines whether or not to use early stopping during training. Early stopping can help prevent overfitting by stopping the training process when the validation loss stops improving.

In [None]:
from sklearn.linear_model import SGDClassifier
name='SGD Classifier'
param_dist = {
          **tfidf_param_dist,
           'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__loss': ['hinge', 'log', 'modified_huber'],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__learning_rate': ['constant', 'optimal', 'invscaling'],
    'classifier__n_iter': [10, 50, 100],
    'classifier__early_stopping': [True, False]
         
}
SGD_Classifier,SGD_df = search_params(SGDClassifier(), X_train, y_train,param_dist=param_dist,name=name)
SGD_df_metrics = compute_metrics(SGD_Classifier,X_test, y_test,name = name)

SGD_df.head()

In [None]:
classifiers_df = pd.DataFrame(CLASSIFIERS_METRICS).set_index('name')
classifiers_df.head()

In [None]:
classifiers_df = pd.DataFrame(CLASSIFIERS_METRICS).set_index('name')
classifiers_df.head()

### XGBoost

> he name "XGBoost" stands for "eXtreme Gradient Boosting", which refers to the fact that it uses an ensemble of decision trees to make predictions.

> The basic idea behind XGBoost is to combine the predictions of multiple decision trees in order to achieve better accuracy. Each decision tree in the ensemble is trained on a random subset of the training data, and is optimized to minimize a loss function that measures the difference between the predicted values and the true values. The algorithm then combines the predictions of all the trees in the ensemble to produce a final prediction.


##### Hyperparameters :
> `clf__max_depth`: The maximum depth of each tree in the forest.

> `clf__n_estimators`: The number of trees in the forest.

> `clf__learning_rate`: The learning rate of the boosting process.

In [None]:
from xgboost import XGBClassifier
name='xgboost'
param_dist = {
          **tfidf_param_dist,
          'classifier__max_depth': [5, 10, 20],           # Maximum depth of each tree
          'classifier__n_estimators': [50, 100, 200],     # Number of trees in the forest
          'classifier__learning_rate': [0.01, 0.1, 0.5]   # Learning rate of boosting process

}
xgboost_Classifier,xgboost_df = search_params(XGBClassifier(), X_train, y_train,param_dist=param_dist,name=name)
xgboost_df_metrics = compute_metrics(xgboost_Classifier,X_test, y_test,name = name)

xgboost_df.head()

In [None]:
classifiers_df = pd.DataFrame(CLASSIFIERS_METRICS).set_index('name')
classifiers_df.head()

### Logistic Regression

> Logistic regression works by modeling the probability of each class given the input features, and then predicting the class with the highest probability.


##### Hyperparameters :
> The `clf__C` parameter in the parameter grid refers to the regularization strength

> The `clf__penalty` parameter in the parameter grid refers to the type of regularization 

In [None]:
from sklearn.linear_model import LogisticRegression
name='LogisticRegression'
param_dist = {
          **tfidf_param_dist,
             'classifier__C': [0.1, 1, 10],
            'classifier__penalty': ['l1', 'l2'],
            'classifier__max_iter':[400,600,800]
}

LogisticRegression_Classifier,LogisticRegression_df = search_params(LogisticRegression(), X_train, y_train,param_dist=param_dist,name=name)
LogisticRegression_df_metrics = compute_metrics(LogisticRegression_Classifier,X_test, y_test,name = name)

LogisticRegression_df.head()

In [None]:
classifiers_df = pd.DataFrame(CLASSIFIERS_METRICS).set_index('name')
classifiers_df.head()

### KNeighborsClassifier

> KNN is a type of instance-based learning, where the algorithm memorizes the training set and uses it to classify new instances based on their similarity to the existing instances.


##### Hyperparameters :
> `clf__n_neighbors`: depends on the nature of the dataset, the number of features, and the number of samples. Typically, a larger number of neighbors results in smoother decision boundaries and fewer misclassifications, but may result in overfitting. A smaller number of neighbors may lead to more complex decision boundaries that better fit the training data but may not generalize well to new data. It is generally recommended to choose an odd number of neighbors to avoid ties.

In [None]:
from sklearn.neighbors import KNeighborsClassifier
name='KNeighborsClassifier'

# Define the hyperparameters to tune
param_dist = {
          **tfidf_param_dist,
           'classifier__n_neighbors': [5, 10, 15]
}
KNeighbors_Classifier,KNeighbors_df = search_params(KNeighborsClassifier(), X_train, y_train,param_dist=param_dist,name=name)
KNeighbors_df_metrics = compute_metrics(KNeighbors_Classifier,X_test, y_test,name = name)

KNeighbors_df.head()

In [None]:
classifiers_df = pd.DataFrame(CLASSIFIERS_METRICS).set_index('name')
classifiers_df.head(len(CLASSIFIERS_METRICS))

### AdaBoost (Adaptive Boosting)

> is an ensemble learning method that combines multiple weak classifiers to form a stronger classifier. In AdaBoost, each weak classifier is trained on a different subset of the training data, and the algorithm iteratively assigns higher weights to the samples that are misclassified by the previous weak classifiers. The final prediction is a weighted combination of the predictions of all the weak classifiers.


##### Hyperparameters :
> `classifier__n_estimators`: The number of estimators, or weak learners, to be used in the AdaBoost algorithm. We try with 50, 100, and 200 estimators.

> `classifier__learning_rate`: The learning rate shrinks the contribution of each classifier by the given amount, and it can have an effect on the performance of the algorithm. We try with 0.1, 0.5, and 1.0.

> `classifier__algorithm`: The boosting algorithm to use, either SAMME or SAMME.R.

> `classifier__base_estimator`: The base estimator is the algorithm used for building the weak learners. We try with two different base estimators: Logistic Regression and LinearSVC.

In [None]:
# let us sort the table based on the best f1-score
classifiers_df.sort_values('f1_weighted', ascending=False)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
name='AdaBoost'

# Define the hyperparameters to tune
param_dist = {
          **tfidf_param_dist,
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.1, 0.5, 1.0],
    'classifier__algorithm': ['SAMME', 'SAMME.R'],
    'classifier__estimator': [LogisticRegression(penalty='l2',max_iter=600,C=1), LinearSVC(loss='squared_hinge'	,C=0.1)],
    }

AdaBoost_Classifier,AdaBoost_df = search_params(AdaBoostClassifier(), X_train, y_train,param_dist=param_dist,name=name)
AdaBoost_df_metrics = compute_metrics(AdaBoost_Classifier,X_test, y_test,name = name)

AdaBoost_df.head()

In [None]:
classifiers_df = pd.DataFrame(CLASSIFIERS_METRICS).set_index('name')
classifiers_df = classifiers_df.sort_values('f1_weighted', ascending=False)
classifiers_df.to_csv(os.path.join(ML_MODEL_DIR,'comparison.csv'))
classifiers_df

In [None]:
# combining best params into 1 dataframe

import glob
import pandas as pd

# Find all files with the extension .pkl
file_list = glob.glob(ML_MODEL_DIR+'*.pkl')

# Read each file as a separate dataframe
dfs = []
for file in file_list:
    df = pd.read_pickle(file)
    dfs.append(df)

# Concatenate all the dataframes into a single dataframe
combined_df = pd.concat(dfs, ignore_index=True)

# save combined best parameters of each model dataframe
combined_df.to_csv(os.path.join(ML_MODEL_DIR,'best_params.csv'))

In [None]:
combined_df