<a href="https://colab.research.google.com/github/myidjayesh/express/blob/master/Hybrid_Optimization_with_Algorithm_Accuracies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#  Install Required Packages
!pip install nltk scikit-learn pandas indic-nlp-library

#  Imports
import numpy as np  # math function
import pandas as pd  # table format
from sklearn.feature_extraction.text import TfidfVectorizer  # Text to binary number
from sklearn.model_selection import StratifiedShuffleSplit  # split into training and testing
from sklearn.metrics import accuracy_score, classification_report  #
from sklearn.naive_bayes import MultinomialNB  # EMotion Guessing algo
from sklearn.utils import resample
import re

import nltk
nltk.download('punkt')
from indicnlp.tokenize import indic_tokenize

#  Load Dataset
data = pd.read_csv('hindi_emotion_dataset261.csv')

#  Filter classes with <2 samples
emotion_counts = data['emotion'].value_counts()  # Count emotion in dataset
valid_classes = emotion_counts[emotion_counts >= 2].index  #
data = data[data['emotion'].isin(valid_classes)]  # filters the dataset to keep only those emotion classes that appear at least twice, removing rare classes that occur only once.

#  Hindi Stopwords
stop_words = set([
    'के', 'का', 'की', 'से', 'को', 'में', 'पर', 'और', 'भी', 'है', 'यह', 'थे', 'था', 'हूं', 'हो', 'गया', 'रहा', 'एक', 'तो', 'नीचे', 'मैं', 'मुझे', 'मेरा', 'स्वयं', 'हम', 'हमारा', 'हमारा ', 'वह', 'उसका', 'उसका', 'स्वयं', 'वह', 'उसकी', 'उसका', 'स्वयं ', 'यह', "यह है", 'इसका', 'स्वयं', 'वे', 'उन्हें', 'उनका', 'उनका',
    'स्वयं', 'क्या', 'जो', 'कौन', 'जिसे', 'यह', 'वह', 'ये', 'वे', 'हैं', 'हैं', 'लिया', 'था', 'थे', 'होना', 'किया जाना', 'है', 'है', 'था', 'होने', 'करना', 'करता है', 'किया', 'कर रहा है', '!',
    'एक', 'एक', 'वह', 'और', 'लेकिन', 'अगर', 'या', 'क्योंकि', 'जैसा', 'जब तक', 'जबकि', 'का', 'पर', 'द्वारा', 'के लिए', 'के साथ', 'के बारे में', 'खिलाफ', 'बीच में', 'में', 'के माध्यम से', 'दौरान', 'पहले', 'बाद में', 'ऊपर', 'नीचे', 'को',
    'से', 'ऊपर', 'नीचे', 'अंदर', 'बाहर', 'चालू', 'बंद', 'ऊपर', 'नीचे', 'फिर', 'आगे', 'फिर', 'एक बार', 'यहाँ', 'वहाँ', 'कब', 'कहाँ', 'क्यों', 'कैसे', 'सभी', 'कोई भी', 'दोनों', 'प्रत्येक', 'कुछ', 'अधिक', 'अधिकतर',
    'अन्य', 'कुछ', 'ऐसा', 'नहीं', 'न ही', 'नहीं', 'केवल', 'स्वयं का', 'वही', 'इसलिए', 'से', 'भी', 'बहुत', 'हूं', 'एस', 'टी', 'कर सकते हैं', 'करेंगे', 'बस', 'रहा', 'नहीं करें', 'करना चाहिए', 'होना चाहिए था', 'अब', 'डी', 'करूंगा', 'एम', 'ओ', 'री', 'वे', 'वाई', 'ऐन', 'नहीं हैं', 'नहीं कर सके', 'नहीं कर सके', 'नहीं किया', 'नहीं किया', 'नहीं करता', 'नहीं करता', 'नहीं था', "नहीं है", 'माँ', 'हो सकता है',
    "हो सकता है नहीं", 'चाहिए', "नहीं चाहिए", 'ज़रूरत नहीं', "ज़रूरत नहीं", 'शान', "नहीं करना चाहिए", 'नहीं करना चाहिए', 'नहीं था', 'नहीं था', 'नहीं थे', 'नहीं थे', 'जीता', "नहीं करेगा", 'होगा', "नहीं होगा", 'और', 'का', 'के', 'की', 'है', 'हूँ', 'तो', 'यह', 'जो', 'भी', 'में', 'पर', '।',
    'था', 'थे', 'वह', 'वो', 'इस', 'नहीं', 'कि', 'जो', 'के', 'लिए', 'को', 'कर', 'करने', 'तक', "के", "की", "से", "है", "में", "हो", "को", "इस", "कर", "या", "हैं"
])


#  Preprocessing
def clean_text(text):  # defines a function called clean_text that takes a parameter text--(function and parameter)
    text = str(text).lower()  # Converts the input to a string (in case it’s not), and then changes all letters to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Removes all punctuation or special characters using a regular expression, keeping only letters, numbers, and whitespace
    return text  # It return the cleaned text


def preprocess_text(text):  # Defines a function named preprocess_text that takes one input: text
    text = clean_text(text)  # it cleans the text by converting it to lowercase and removing special characters (using the clean_text function you saw earlier)
    tokens = indic_tokenize.trivial_tokenize(text,
                                             lang='hi')  # it tokenizes the text into words (splits it) using a simple tokenizer for Hindi (trivial_tokenize from indic_tokenize).
    tokens = [t for t in tokens if
              t not in stop_words]  # Filters out stop words (common words like "है", "मैं", etc., which don’t carry much meaning) from the token list.
    return ' '.join(tokens)  # Joins the remaining tokens (words) back into a single string with spaces and returns it


data['text'] = data['text'].apply(preprocess_text)  # Cleans and processes all text data in the 'text' column of the dataset.


#  Class Balancing
# This function balances the dataset by upsampling smaller emotion classes so that all classes have the same number of samples
def balance_data(df):  # Defines a function named balance_data that takes a DataFrame df as input
    classes = df['emotion'].value_counts().index  # Gets a list of all unique emotion classes in the dataset
    max_size = df['emotion'].value_counts().max()  # Finds the count of the most frequent emotion class (the largest class size).
    lst = [df]  # Creates a list with the original DataFrame in it, which will later be added to
    for cls in classes:  # Starts a loop through each emotion class.
        df_class = df[df['emotion'] == cls]  # Selects all rows from the DataFrame that belong to the current emotion class
        if len(df_class) < max_size:  # Checks if the current class has fewer samples than the most frequent one
            lst.append(resample(df_class, replace=True, n_samples=max_size - len(df_class),
                               random_state=42))  # If yes, it upsamples this class (randomly duplicates samples) to match the size of the largest class
    return pd.concat(lst)  # Combines the original data with the newly upsampled data and returns the balanced dataset


data = balance_data(data)  # Calls the function on your data and replaces it with the balanced version, where all emotion classes now have equal size.


#  Train/Test Split
# This code splits the dataset into training and testing sets (80%-20%) while keeping the emotion class distribution balanced in both sets.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2,
                              random_state=42)  # Creates a Stratified Shuffle Split object that will split the data once (n_splits=1) into 80% training and 20% testing, while keeping the proportion of each emotion class the same in both sets
for train_index, test_index in sss.split(data['text'],
                                        data['emotion']):  # Performs the split. it returns the indices for training and testing, ensuring that the emotion labels are evenly distributed
    X_train, X_test = data['text'].iloc[
        train_index], data['text'].iloc[
        test_index]  # Uses the indices to get the training and testing text data
    y_train, y_test = data['emotion'].iloc[
        train_index], data['emotion'].iloc[
        test_index]  # Uses the same indices to get the corresponding emotion labels for training and testing


#  Objective Function (alpha, max_feat, ngram_upper) This function trains a Naive Bayes model with given parameters and returns the negative accuracy, so that an optimizer can find the best parameters by minimizing the output
def objective_function(params):  # Defines a function called objective_function that takes a list or tuple of parameters: params
    alpha, max_feat, ngram_upper = params  # Unpacks the parameters into three variables: alpha (smoothing for Naive Bayes), max_feat (number of max features for TF-IDF), ngram_upper (upper limit for n-grams).
    vectorizer = TfidfVectorizer(max_features=int(max_feat),
                                 ngram_range=(1, int(ngram_upper)))  # Creates a TF-IDF vectorizer using the given max_feat and ngram_upper
    X_train_tfidf = vectorizer.fit_transform(
        X_train)  # Fits the TF-IDF vectorizer on the training data and transforms it into a TF-IDF matrix
    X_test_tfidf = vectorizer.transform(
        X_test)  # Transforms the test data using the same TF-IDF vectorizer (without refitting)
    clf = MultinomialNB(
        alpha=alpha)  # Creates a Multinomial Naive Bayes classifier with the given alpha value
    clf.fit(X_train_tfidf, y_train)  # trains the classifier on the training TF-IDF data
    y_pred = clf.predict(
        X_test_tfidf)  # Predicts emotion labels for the test TF-IDF data
    return -accuracy_score(y_test,
                             y_pred)  # Returns the negative accuracy (so that optimization algorithms can minimize this value — minimizing negative accuracy is the same as maximizing accuracy)


#  PSO
class PSO:
    def __init__(self, num_particles, num_iterations,
                 bounds):  # Initializes the PSO with the number of particles, number of iterations, and search space bounds
        self.num_particles = num_particles  # Stores how many particles (solutions) will be used in the swarm
        self.num_iterations = num_iterations  # Stores how many iterations the algorithm will run
        self.bounds = bounds  # Stores the bounds for each parameter (as a list of tuples, e.g., [(0.1, 1), (100, 1000)])
        self.particles = np.random.uniform([b[0] for b in bounds], [b[1] for b in bounds],
                                           size=(num_particles,
                                                 len(bounds)))  # Initializes each particle's position randomly within the given bounds. Each particle represents a potential solution
        self.velocities = np.zeros_like(
            self.particles)  # Initializes the velocity of each particle as a zero vector (same shape as particles)
        self.best_positions = self.particles.copy()  # Stores the best position found so far by each particle (initially, it's just the starting position)
        self.best_fitness = np.inf * np.ones(
            num_particles)  # Initializes the best fitness (error) of each particle to infinity, meaning no good solution has been found yet
        self.accuracy = 0.0  # Initialize accuracy

    def optimize(self):
        for _ in range(self.num_iterations):  # Repeats the optimization process for a fixed number of iterations
            for i in range(self.num_particles):  # Loops through each particle in the swarm
                fitness = objective_function(
                    self.particles[i])  # Evaluates the current particle’s position (solution) using the objective_function
                if fitness < self.best_fitness[i]:  # If the current fitness is better than the particle's best so far
                    self.best_fitness[i] = fitness
                    self.best_positions[i] = self.particles[
                        i]  # Update that particle’s best score and best-known position.
                g_best = self.best_positions[
                    np.argmin(self.best_fitness)]  # Finds the global best position from all particles (the one with the lowest fitness score)
                self.velocities[i] += 0.5 * (self.best_positions[i] - self.particles[i]) + 0.5 * (
                            g_best - self.particles[i])  # Updates the particle’s velocity based on: Its own best-known position, The swarm's global best position with weights of 0.5 (you can think of these as how much each influence the movement).
                self.particles[i] += self.velocities[
                    i]  # Moves the particle based on the new velocity
                self.particles[i] = np.clip(self.particles[i], [b[0] for b in self.bounds],
                                           [b[1] for b in
                                            self.bounds])  # Ensures the new position stays within the allowed bounds
        best_params = self.best_positions[np.argmin(self.best_fitness)]
        vectorizer = TfidfVectorizer(max_features=int(best_params[1]),
                                     ngram_range=(1, int(best_params[2])))
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)
        clf = MultinomialNB(alpha=best_params[0])
        clf.fit(X_train_tfidf, y_train)
        y_pred = clf.predict(X_test_tfidf)
        self.accuracy = accuracy_score(y_test, y_pred)
        return best_params  # After all iterations, returns the best solution found by the swarm..


#  Firefly
class Firefly:
    def __init__(self, num_fireflies, num_iterations,
                 bounds):  # Initializes the Firefly algorithm with : num_fireflies- how many fireflies (solutions) to use, num_iterations: how many times the algorithm will update, bounds: the allowed range for each parameter.
        self.num_fireflies = num_fireflies
        self.num_iterations = num_iterations
        self.bounds = bounds  # Stores the inputs as attributes of the class
        self.fireflies = np.random.uniform([b[0] for b in bounds], [b[1] for b in bounds],
                                           size=(num_fireflies,
                                                 len(bounds)))  # Initializes the positions of all fireflies randomly within the given bounds. Each firefly is a candidate solution
        self.light_intensities = np.inf * np.ones(
            num_fireflies)  # Initializes each firefly’s light intensity (fitness value) to infinity, meaning no good solution has been found yet
        self.accuracy = 0.0  # Initialize accuracy

    def optimize(self):
        for _ in range(self.num_iterations):  # Loops for a fixed number of iterations to update fireflies’ positions
            for i in range(self.num_fireflies):  # Loops over each firefly i
                fitness = objective_function(
                    self.fireflies[i])  # Calculates the fitness (or brightness) of firefly i
                if fitness < self.light_intensities[i]:
                    self.light_intensities[i] = fitness  # If this new fitness is better (lower), update the firefly’s brightness
                for j in range(self.num_fireflies):  # Now compare this firefly i to all others j
                    if self.light_intensities[j] < self.light_intensities[i]:  # If firefly j is brighter (better solution) than firefly i
                        self.fireflies[i] += 0.2 * (self.fireflies[j] - self.fireflies[i]) + 0.2 * np.random.uniform(-1, 1,
                                                                                                                   size=len(
                                                                                                                       self.bounds))  # Firefly i moves toward firefly j, plus a little random movement (to explore new areas)
                        self.fireflies[i] = np.clip(self.fireflies[i], [b[0] for b in self.bounds],
                                                   [b[1] for b in
                                                    self.bounds])  # Keeps firefly i's new position within the allowed bounds
        best_params = self.fireflies[np.argmin(self.light_intensities)]
        vectorizer = TfidfVectorizer(max_features=int(best_params[1]),
                                     ngram_range=(1, int(best_params[2])))
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)
        clf = MultinomialNB(alpha=best_params[0])
        clf.fit(X_train_tfidf, y_train)
        y_pred = clf.predict(X_test_tfidf)
        self.accuracy = accuracy_score(y_test, y_pred)
        return best_params  # After all iterations, returns the best firefly (the one with the lowest fitness value)


# 🥚 Cuckoo Search
class Cuckoo:
    def __init__(self, num_cuckoos, num_iterations,
                 bounds):  # Initializes the Cuckoo Search with: num_cuckoos - number of cuckoo birds (candidate solutions),num_iterations: how many times the algorithm will run, bounds: the allowed range for each parameter.
        self.num_cuckoos = num_cuckoos
        self.num_iterations = num_iterations
        self.bounds = bounds  # Stores the given inputs as class attributes
        self.cuckoos = np.random.uniform([b[0] for b in bounds], [b[1] for b in bounds],
                                         size=(num_cuckoos,
                                               len(bounds)))  # Randomly initializes the position of each cuckoo within the specified bounds. Each cuckoo represents a potential solution
        self.fitness = np.inf * np.ones(
            num_cuckoos)  # Initializes the fitness (error score) for each cuckoo as infinity, meaning no good solution is known yet
        self.accuracy = 0.0  # Initialize accuracy

    def optimize(self):
        for _ in range(self.num_iterations):  # Repeats the search process for a set number of iterations
            for i in range(self.num_cuckoos):  # Loops over each cuckoo (candidate solution)
                fitness = objective_function(
                    self.cuckoos[i])  # Calculates how good the current cuckoo’s solution is (lower is better)
                if fitness < self.fitness[i]:
                    self.fitness[i] = fitness  # If this new fitness is better than what was recorded before, update it
                j = np.random.randint(0, self.num_cuckoos)  # Randomly pick another cuckoo j
                if self.fitness[j] < self.fitness[i]:  # If cuckoo j has a better solution (lower fitness) than cuckoo i
                    self.cuckoos[i] += 0.2 * (self.cuckoos[j] - self.cuckoos[i]) + 0.2 * np.random.uniform(-1, 1,
                                                                                                               size=len(
                                                                                                                   self.bounds))  # Move cuckoo i a bit toward cuckoo j, and add a little randomness to explore
                    self.cuckoos[i] = np.clip(self.cuckoos[i], [b[0] for b in self.bounds],
                                             [b[1] for b in
                                              self.bounds])  # Make sure the new position stays within the allowed parameter range
        best_params = self.cuckoos[np.argmin(self.fitness)]
        vectorizer = TfidfVectorizer(max_features=int(best_params[1]),
                                     ngram_range=(1, int(best_params[2])))
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)
        clf = MultinomialNB(alpha=best_params[0])
        clf.fit(X_train_tfidf, y_train)
        y_pred = clf.predict(X_test_tfidf)
        self.accuracy = accuracy_score(y_test, y_pred)
        return best_params  # After all iterations, return the best solution found (the cuckoo with the lowest fitness)


#  Hybrid Optimization
class HybridOptimization:
    def __init__(self, num_particles, num_fireflies, num_cuckoos, num_iterations, bounds):
        self.pso = PSO(num_particles, num_iterations, bounds)
        self.fa = Firefly(num_fireflies, num_iterations, bounds)
        self.cs = Cuckoo(num_cuckoos, num_iterations,
                         bounds)  # Initializes each algorithm with the same number of iterations and parameter bounds

    def optimize(self):  # Starts the hybrid optimization process
        pso_best = self.pso.optimize()
        fa_best = self.fa.optimize()
        cs_best = self.cs.optimize()  # Runs each algorithms separately and saves their best result.
        best_fitness = np.inf
        best_position = None  # Initialize variable to track the overall best solution
        best_accuracy = 0.0
        pso_accuracy = self.pso.accuracy
        fa_accuracy = self.fa.accuracy
        cs_accuracy = self.cs.accuracy

        for i, position in enumerate([pso_best, fa_best, cs_best]):
            fitness = objective_function(position)
            if fitness < best_fitness:
                best_fitness = fitness
                best_position = position
                if i == 0:
                    best_accuracy = pso_accuracy
                elif i == 1:
                    best_accuracy = fa_accuracy
                else:
                    best_accuracy = cs_accuracy
                    # Compares the three result and keep the one with hte lowest fitness (best performance)
        print(f"PSO Accuracy: {pso_accuracy:.4f}")
        print(f"Firefly Accuracy: {fa_accuracy:.4f}")
        print(f"Cuckoo Accuracy: {cs_accuracy:.4f}")
        return best_position  # Returns the best solution found among the three algorithms


#  Run Optimization
bounds = [(0.001, 1.0), (1000, 8000),
          (1, 2)]  # alpha  from 0.001 to 1.0 (used in MultinomialNB), max_features from 1000 to 8000 (for TF-IDF vectorizer), ngram_uppe reither 1 or 2 (for unigrams or bigrams)
"""Creates an object of the HybridOptimization class with:
5 particles for PSO,
5 fireflies for Firefly Algorithm,
5 cuckoos for Cuckoo Search,
30 iterations,
The parameter bounds defined above."""
optimizer = HybridOptimization(5, 5, 5, 30, bounds)
best_position = optimizer.optimize()  # Runs the hybrid optimization process and gets the best set of parameters found
best_alpha, best_max_feat, best_ngram_upper = best_position  # Splits the best solution into individual variables
print("🔧 Best Parameters:")
print(f"  Alpha: {best_alpha:.4f}, Max Features: {int(best_max_feat)}, N-gram Upper: {int(best_ngram_upper)}")

#  Final Model with Best Params
# This code builds and trains the final emotion classifier using the best hyperparameters found by your hybrid optimize
vectorizer = TfidfVectorizer(max_features=int(best_max_feat),
                             ngram_range=(1, int(best_ngram_upper)))
X_train_tfidf = vectorizer.fit_transform(
    X_train)  # Learns the vocabulary from the training text and converts X_train into a TF-IDF matrix
X_test_tfidf = vectorizer.transform(
    X_test)  # Converts X_test into a TF-IDF matrix using the same vocabulary as training

final_clf = MultinomialNB(alpha=best_alpha)  # Creates a Multinomial Naive Bayes classifier using the optimized alpha
final_clf.fit(X_train_tfidf, y_train)  # Trains the classifier on the TF-IDF vectors and their corresponding emotions
y_pred = final_clf.predict(
    X_test_tfidf)  # Uses the trained classifier to predict emotions for the test data

#  Evaluation
print(" Final Accuracy:", accuracy_score(y_test, y_pred))  # Prints the overall accuracy of your final model — how many predictions were correct out of all test samples
"""Prints a detailed classification report, which includes:
Precision (how many predicted emotions were correct),
Recall (how many actual emotions were correctly found),
F1-score (harmonic mean of precision and recall),
Support (number of true samples for each class)."""
print("\n Classification Report:\n", classification_report(y_test, y_pred))



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


PSO Accuracy: 0.7684
Firefly Accuracy: 0.7368
Cuckoo Accuracy: 0.7895
🔧 Best Parameters:
  Alpha: 0.2985, Max Features: 4579, N-gram Upper: 2
 Final Accuracy: 0.7894736842105263

 Classification Report:
               precision    recall  f1-score   support

       angry       0.50      0.20      0.29         5
  confidence       1.00      1.00      1.00         5
    disguest       1.00      1.00      1.00         5
        envy       0.80      0.80      0.80         5
        fear       0.60      0.60      0.60         5
   gratitude       1.00      1.00      1.00         5
       happy       0.43      0.60      0.50         5
        hope       0.83      1.00      0.91         5
    jealousy       0.67      0.80      0.73         5
  loneliness       1.00      0.80      0.89         5
        love       1.00      1.00      1.00         5
 nervousness       0.80      0.80      0.80         5
       pride       0.67      0.40      0.50         5
      regret       1.00      0.80      