In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import issparse

# Load data
df = pd.read_csv('text_preprocessing/final_dataset.csv')

In [3]:
# Convert sentiment labels to binary
def convert_label_sentimen(label_sentimen):
    return 1 if label_sentimen == "positive" else 0

df['label_sentimen'] = df['Sentiment'].apply(convert_label_sentimen)
df.drop(df.columns[[0]], axis=1, inplace=True)

In [4]:
X= df['tweet_tokens_stemmed'] 
y= df['label_sentimen']
# k-fold cross-validation
kfold = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

In [5]:
class NaiveBayes:
    def __init__(self, model_class):
        self.model_class = model_class
        self.clf = None
    
    def fit(self, X_train, y_train, selected_features):
        self.clf = self.model_class()
        if issparse(X_train):
            X_train = X_train.toarray()
        self.clf.fit(X_train[:, selected_features], y_train)
    
    def predict(self, X_test, selected_features):
        if self.clf is None:
            raise ValueError("Classifier not fitted. Please call 'fit' method first.")
        if issparse(X_test):
            X_test = X_test.toarray()
        return self.clf.predict(X_test[:, selected_features])

In [6]:
# Function to evaluate features using selected features and classifier
def evaluate_features(X_train, y_train, selected_features, classifier):
    clf = classifier  # Initialize classifier
    cv_results = cross_val_score(clf.clf, X_train[:, selected_features], y_train, cv=kfold, scoring='accuracy')
    return cv_results.mean()

# Function to perform PSO feature selection
def pso_feature_selection(X_train, y_train, n_particles, max_iter, inertia, global_weight, local_weight, model_class):
    num_samples, num_features = X_train.shape
    bounds = [0, 1]  # Assuming the feature selection is binary (selected or not)
    # Initialize PSO
    num_particles = n_particles
    dimensions = num_features
    particles = np.random.rand(num_particles, dimensions)
    velocities = np.random.rand(num_particles, dimensions)
    best_positions = particles.copy()
    best_scores = np.zeros(num_particles)

    global_best_position = np.zeros(dimensions)
    global_best_score = 0

    for iteration in range(max_iter):
        for particle in range(num_particles):
            # Update particle's velocity
            r1 = np.random.rand(dimensions)
            r2 = np.random.rand(dimensions)
            velocities[particle] = (inertia * velocities[particle] +
                                   global_weight * r1 * (best_positions[particle] - particles[particle]) +
                                   local_weight * r2 * (global_best_position - particles[particle]))

            # Update particle's position
            particles[particle] += velocities[particle]
            particles[particle] = np.clip(particles[particle], bounds[0], bounds[1])

            # Evaluate fitness (accuracy) of current particle
            selected_features = particles[particle] > 0.5
            nb = NaiveBayes(model_class)
            nb.fit(X_train, y_train, selected_features)
            accuracy = evaluate_features(X_train, y_train, selected_features, nb)

            # Update particle's best position and score
            if accuracy > best_scores[particle]:
                best_scores[particle] = accuracy
                best_positions[particle] = particles[particle].copy()

            # Update global best position and score
            if accuracy > global_best_score:
                global_best_score = accuracy
                global_best_position = particles[particle].copy()

    return global_best_position > 0.5

In [6]:
# Define parameters for PSO optimization
population_size_range = [5 * (2 ** i) for i in range(5)]  
inertia = 0.6
global_weight = 0.3
local_weight = 0.6
max_iter = 100

results = []

for n_particles in population_size_range:
    for model_class in [MultinomialNB, BernoulliNB, GaussianNB]:
        accuracy_scores = []
        precision_scores = []
        recall_scores = []

        for train_idx, test_idx in kfold.split(X, y):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            tfidf_model = TfidfVectorizer(smooth_idf=False)
            X_train_tfidf = tfidf_model.fit_transform(X_train)
            X_test_tfidf = tfidf_model.transform(X_test)

            X_train_dense = X_train_tfidf.toarray()
            selected_features = pso_feature_selection(X_train_dense, y_train, n_particles, max_iter, inertia, global_weight, local_weight, model_class)

            if not np.any(selected_features):
                selected_features[0] = True

            nb = NaiveBayes(model_class)
            nb.fit(X_train_tfidf, y_train, selected_features)
            y_pred = nb.predict(X_test_tfidf.toarray(), selected_features)

            accuracy_scores.append(accuracy_score(y_test, y_pred))
            precision_scores.append(precision_score(y_test, y_pred))
            recall_scores.append(recall_score(y_test, y_pred))

        fold_results = {
            "Model": model_class.__name__,
            "Population Size": n_particles,
            "Average Accuracy": np.mean(accuracy_scores),
            "Average Precision": np.mean(precision_scores),
            "Average Recall": np.mean(recall_scores)
        }

        results.append(fold_results)

results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by=["Population Size", "Average Accuracy"], ascending=[True, False])

print(results_df.to_string(index=False))

        Model  Population Size  Average Accuracy  Average Precision  Average Recall
MultinomialNB                5            0.7825           0.747522           0.860
  BernoulliNB                5            0.7475           0.691261           0.920
   GaussianNB                5            0.7225           0.681554           0.850
MultinomialNB               10            0.7750           0.742389           0.850
  BernoulliNB               10            0.7450           0.689966           0.915
   GaussianNB               10            0.7200           0.678980           0.840
MultinomialNB               20            0.7800           0.748100           0.850
  BernoulliNB               20            0.7525           0.693773           0.925
   GaussianNB               20            0.7225           0.684991           0.835
MultinomialNB               40            0.7750           0.748459           0.835
  BernoulliNB               40            0.7375           0.692717         

In [7]:
# Define parameters for PSO optimization
n_particles = 20
inertia = 0.6
global_best_weight_range = np.linspace(0.1, 1, num=10)
local_weight = 0.6
max_iter = 100

results = []  # Initialize results list

# Iterate over parameter ranges
for global_weight in global_best_weight_range:
    for model_class in [MultinomialNB, BernoulliNB, GaussianNB]:
        accuracy_scores = []
        precision_scores = []
        recall_scores = []

        for train_idx, test_idx in kfold.split(X, y):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            tfidf_model = TfidfVectorizer(smooth_idf=False)
            X_train_tfidf = tfidf_model.fit_transform(X_train)
            X_test_tfidf = tfidf_model.transform(X_test)

            X_train_dense = X_train_tfidf.toarray()
            selected_features = pso_feature_selection(X_train_dense, y_train, n_particles, max_iter, inertia, global_weight, local_weight, model_class)

            if not np.any(selected_features):
                selected_features[0] = True

            nb = NaiveBayes(model_class)
            nb.fit(X_train_tfidf, y_train, selected_features)
            y_pred = nb.predict(X_test_tfidf.toarray(), selected_features)

            accuracy_scores.append(accuracy_score(y_test, y_pred))
            precision_scores.append(precision_score(y_test, y_pred))
            recall_scores.append(recall_score(y_test, y_pred))

        fold_results = {
            "Model": model_class.__name__,
            "Global Best Weight": global_weight,
            "Average Accuracy": np.mean(accuracy_scores),
            "Average Precision": np.mean(precision_scores),
            "Average Recall": np.mean(recall_scores)
        }

        results.append(fold_results)

results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by=["Global Best Weight", "Average Accuracy"], ascending=[True, False])

print(results_df.to_string(index=False))

        Model  Global Best Weight  Average Accuracy  Average Precision  Average Recall
MultinomialNB                 0.1            0.7725           0.743049           0.835
  BernoulliNB                 0.1            0.7475           0.690869           0.915
   GaussianNB                 0.1            0.7300           0.686553           0.860
MultinomialNB                 0.2            0.7875           0.755436           0.855
  BernoulliNB                 0.2            0.7475           0.693439           0.910
   GaussianNB                 0.2            0.7250           0.688005           0.835
MultinomialNB                 0.3            0.7875           0.758650           0.855
  BernoulliNB                 0.3            0.7450           0.691994           0.910
   GaussianNB                 0.3            0.7225           0.684326           0.840
MultinomialNB                 0.4            0.7775           0.747767           0.845
  BernoulliNB                 0.4          

In [8]:
# Define parameters for PSO optimization
n_particles = 20
inertia_weight_range = np.linspace(0.1, 1, num=10)
global_weight = 0.3
local_weight = 0.6
max_iter = 100

results = []  # Initialize results list

# Iterate over parameter ranges
for inertia in inertia_weight_range:
    for model_class in [MultinomialNB, BernoulliNB, GaussianNB]:
        accuracy_scores = []
        precision_scores = []
        recall_scores = []

        for train_idx, test_idx in kfold.split(X, y):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            tfidf_model = TfidfVectorizer(smooth_idf=False)
            X_train_tfidf = tfidf_model.fit_transform(X_train)
            X_test_tfidf = tfidf_model.transform(X_test)

            X_train_dense = X_train_tfidf.toarray()
            selected_features = pso_feature_selection(X_train_dense, y_train, n_particles, max_iter, inertia, global_weight, local_weight, model_class)

            if not np.any(selected_features):
                selected_features[0] = True

            nb = NaiveBayes(model_class)
            nb.fit(X_train_tfidf, y_train, selected_features)
            y_pred = nb.predict(X_test_tfidf.toarray(), selected_features)

            accuracy_scores.append(accuracy_score(y_test, y_pred))
            precision_scores.append(precision_score(y_test, y_pred))
            recall_scores.append(recall_score(y_test, y_pred))

        fold_results = {
            "Model": model_class.__name__,
            "Inertia Weight": inertia,
            "Average Accuracy": np.mean(accuracy_scores),
            "Average Precision": np.mean(precision_scores),
            "Average Recall": np.mean(recall_scores)
        }

        results.append(fold_results)

results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by=["Inertia Weight", "Average Accuracy"], ascending=[True, False])

print(results_df.to_string(index=False))

        Model  Inertia Weight  Average Accuracy  Average Precision  Average Recall
MultinomialNB             0.1            0.8025           0.794523           0.825
  BernoulliNB             0.1            0.7800           0.791242           0.820
   GaussianNB             0.1            0.7000           0.650731           0.875
MultinomialNB             0.2            0.7875           0.759354           0.845
  BernoulliNB             0.2            0.7550           0.703039           0.910
   GaussianNB             0.2            0.7200           0.679857           0.855
MultinomialNB             0.3            0.7900           0.757187           0.860
  BernoulliNB             0.3            0.7750           0.735842           0.900
   GaussianNB             0.3            0.7200           0.679091           0.850
MultinomialNB             0.4            0.7850           0.750946           0.860
  BernoulliNB             0.4            0.7625           0.714111           0.915
   G

In [9]:
# Define parameters for PSO optimization
n_particles = 20
inertia_weight = 0.6
global_weight = 0.3
local_best_weight_range = np.linspace(0.1, 1, num=10)
max_iter = 100

results = []  # Initialize results list

# Iterate over parameter ranges
for local_weight in local_best_weight_range:
    for model_class in [MultinomialNB, BernoulliNB, GaussianNB]:
        accuracy_scores = []
        precision_scores = []
        recall_scores = []

        for train_idx, test_idx in kfold.split(X, y):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            tfidf_model = TfidfVectorizer(smooth_idf=False)
            X_train_tfidf = tfidf_model.fit_transform(X_train)
            X_test_tfidf = tfidf_model.transform(X_test)

            X_train_dense = X_train_tfidf.toarray()
            selected_features = pso_feature_selection(X_train_dense, y_train, n_particles, max_iter, inertia, global_weight, local_weight, model_class)

            if not np.any(selected_features):
                selected_features[0] = True

            nb = NaiveBayes(model_class)
            nb.fit(X_train_tfidf, y_train, selected_features)
            y_pred = nb.predict(X_test_tfidf.toarray(), selected_features)

            accuracy_scores.append(accuracy_score(y_test, y_pred))
            precision_scores.append(precision_score(y_test, y_pred))
            recall_scores.append(recall_score(y_test, y_pred))

        fold_results = {
            "Model": model_class.__name__,
            "Local Best Weight": local_weight,
            "Average Accuracy": np.mean(accuracy_scores),
            "Average Precision": np.mean(precision_scores),
            "Average Recall": np.mean(recall_scores)
        }

        results.append(fold_results)

results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by=["Local Best Weight", "Average Accuracy"], ascending=[True, False])

print(results_df.to_string(index=False))

        Model  Local Best Weight  Average Accuracy  Average Precision  Average Recall
MultinomialNB                0.1            0.7800           0.743313           0.865
  BernoulliNB                0.1            0.7450           0.688284           0.915
   GaussianNB                0.1            0.7225           0.684843           0.835
MultinomialNB                0.2            0.7950           0.764087           0.855
  BernoulliNB                0.2            0.7500           0.691984           0.920
   GaussianNB                0.2            0.7125           0.676587           0.830
MultinomialNB                0.3            0.7875           0.758206           0.850
  BernoulliNB                0.3            0.7550           0.697442           0.920
   GaussianNB                0.3            0.7150           0.675710           0.840
MultinomialNB                0.4            0.7800           0.747763           0.850
  BernoulliNB                0.4            0.7500    