In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import issparse

# Load data
df = pd.read_csv('text_preprocessing/final_dataset.csv')

In [3]:
# Convert sentiment labels to binary
def convert_label_sentimen(label_sentimen):
    return 1 if label_sentimen == "positive" else 0

df['label_sentimen'] = df['Sentiment'].apply(convert_label_sentimen)
df.drop(df.columns[[0]], axis=1, inplace=True)

In [4]:
X= df['tweet_tokens_stemmed'] 
y= df['label_sentimen']
# k-fold cross-validation
kfold = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

In [5]:
class NaiveBayes:
    def __init__(self, model_class):
        self.model_class = model_class
        self.clf = None
    
    def fit(self, X_train, y_train, selected_features):
        self.clf = self.model_class()
        if issparse(X_train):
            X_train = X_train.toarray()
        self.clf.fit(X_train[:, selected_features], y_train)
    
    def predict(self, X_test, selected_features):
        if self.clf is None:
            raise ValueError("Classifier not fitted. Please call 'fit' method first.")
        if issparse(X_test):
            X_test = X_test.toarray()
        return self.clf.predict(X_test[:, selected_features])

In [6]:
def evaluate_features(X_train, y_train, selected_features, classifier):
    clf = classifier
    cv_results = cross_val_score(clf.clf, X_train[:, selected_features], y_train, cv=kfold, scoring='accuracy')
    return cv_results.mean()

def pso_feature_selection(X_train, y_train, n_particles, inertia, global_weight, local_weight, model_class, tol=1e-5, patience=10):
    num_samples, num_features = X_train.shape
    bounds = [0, 1]
    
    num_particles = n_particles
    dimensions = num_features
    particles = np.random.rand(num_particles, dimensions)
    velocities = np.random.rand(num_particles, dimensions) * 0.1
    best_positions = particles.copy()
    best_scores = np.zeros(num_particles)

    global_best_position = np.zeros(dimensions)
    global_best_score = 0

    no_improvement_count = 0
    previous_global_best_score = 0

    while no_improvement_count < patience:
        for particle in range(num_particles):
            r1 = np.random.rand(dimensions)
            r2 = np.random.rand(dimensions)
            velocities[particle] = (inertia * velocities[particle] +
                                    global_weight * r1 * (best_positions[particle] - particles[particle]) +
                                    local_weight * r2 * (global_best_position - particles[particle]))

            particles[particle] += velocities[particle]
            particles[particle] = np.clip(particles[particle], bounds[0], bounds[1])

            selected_features = particles[particle] > 0.5
            nb = NaiveBayes(model_class)
            nb.fit(X_train, y_train, selected_features)
            accuracy = evaluate_features(X_train, y_train, selected_features, nb)

            if accuracy > best_scores[particle]:
                best_scores[particle] = accuracy
                best_positions[particle] = particles[particle].copy()

            if accuracy > global_best_score:
                global_best_score = accuracy
                global_best_position = particles[particle].copy()

        if abs(global_best_score - previous_global_best_score) < tol:
            no_improvement_count += 1
        else:
            no_improvement_count = 0
        
        previous_global_best_score = global_best_score

    return global_best_position > 0.5

In [7]:
# Define parameters for PSO optimization
population_size_range = [5 * (2 ** i) for i in range(5)]  # 5, 10, 20
inertia = 0.6
global_weight = 0.3
local_weight = 0.6

results = []

for n_particles in population_size_range:
    accuracy_scores = []
    precision_scores = []
    recall_scores = []

    for train_idx, test_idx in kfold.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        tfidf_model = TfidfVectorizer(smooth_idf=False)
        X_train_tfidf = tfidf_model.fit_transform(X_train)
        X_test_tfidf = tfidf_model.transform(X_test)

        X_train_dense = X_train_tfidf.toarray()
        selected_features = pso_feature_selection(X_train_dense, y_train, n_particles, inertia, global_weight, local_weight, MultinomialNB)

        if not np.any(selected_features):
            selected_features[0] = True

        nb = NaiveBayes(MultinomialNB)
        nb.fit(X_train_tfidf, y_train, selected_features)
        y_pred = nb.predict(X_test_tfidf.toarray(), selected_features)

        accuracy_scores.append(accuracy_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred))

    fold_results = {
        "Population Size": n_particles,
        "Average Accuracy": np.mean(accuracy_scores),
        "Average Precision": np.mean(precision_scores),
        "Average Recall": np.mean(recall_scores)
    }

    results.append(fold_results)

results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by=["Population Size", "Average Accuracy"], ascending=[True, False])

print(results_df.to_string(index=False))
results_df.to_csv("population_size.csv")

 Population Size  Average Accuracy  Average Precision  Average Recall
               5            0.7750           0.770596           0.795
              10            0.7650           0.762314           0.785
              20            0.7675           0.750758           0.810
              40            0.7750           0.762815           0.825
              80            0.7975           0.780576           0.835


In [8]:
# Define parameters for PSO optimization
n_particles = 20
inertia_weight_range = np.linspace(0.1, 1, num=10)
global_weight = 0.3
local_weight = 0.6

results = []

for inertia in inertia_weight_range:
    accuracy_scores = []
    precision_scores = []
    recall_scores = []

    for train_idx, test_idx in kfold.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        tfidf_model = TfidfVectorizer(smooth_idf=False)
        X_train_tfidf = tfidf_model.fit_transform(X_train)
        X_test_tfidf = tfidf_model.transform(X_test)

        X_train_dense = X_train_tfidf.toarray()
        selected_features = pso_feature_selection(X_train_dense, y_train, n_particles, inertia, global_weight, local_weight, MultinomialNB)

        if not np.any(selected_features):
            selected_features[0] = True

        nb = NaiveBayes(MultinomialNB)
        nb.fit(X_train_tfidf, y_train, selected_features)
        y_pred = nb.predict(X_test_tfidf.toarray(), selected_features)

        accuracy_scores.append(accuracy_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred))

    fold_results = {
        "Inertia Weight": inertia,
        "Average Accuracy": np.mean(accuracy_scores),
        "Average Precision": np.mean(precision_scores),
        "Average Recall": np.mean(recall_scores)
    }

    results.append(fold_results)

results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by=["Inertia Weight", "Average Accuracy"], ascending=[True, False])

print(results_df.to_string(index=False))
results_df.to_csv("inertia.csv")

 Inertia Weight  Average Accuracy  Average Precision  Average Recall
            0.1            0.7650           0.784097           0.755
            0.2            0.7725           0.772463           0.780
            0.3            0.7675           0.771691           0.775
            0.4            0.7675           0.765741           0.785
            0.5            0.7775           0.760949           0.825
            0.6            0.7625           0.747992           0.800
            0.7            0.7675           0.753645           0.805
            0.8            0.7800           0.774077           0.800
            0.9            0.7875           0.775102           0.810
            1.0            0.7700           0.752978           0.805


In [9]:
# Define parameters for PSO optimization
n_particles = 20
inertia = 0.8
global_best_weight_range = np.linspace(0.1, 1, num=10)
local_weight = 0.6

results = []

for global_weight in global_best_weight_range:
    accuracy_scores = []
    precision_scores = []
    recall_scores = []

    for train_idx, test_idx in kfold.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        tfidf_model = TfidfVectorizer(smooth_idf=False)
        X_train_tfidf = tfidf_model.fit_transform(X_train)
        X_test_tfidf = tfidf_model.transform(X_test)

        X_train_dense = X_train_tfidf.toarray()
        selected_features = pso_feature_selection(X_train_dense, y_train, n_particles, inertia, global_weight, local_weight, MultinomialNB)

        if not np.any(selected_features):
            selected_features[0] = True

        nb = NaiveBayes(MultinomialNB)
        nb.fit(X_train_tfidf, y_train, selected_features)
        y_pred = nb.predict(X_test_tfidf.toarray(), selected_features)

        accuracy_scores.append(accuracy_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred))

    fold_results = {
        "Global Best Weight": global_weight,
        "Average Accuracy": np.mean(accuracy_scores),
        "Average Precision": np.mean(precision_scores),
        "Average Recall": np.mean(recall_scores)
    }

    results.append(fold_results)

results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by=["Global Best Weight", "Average Accuracy"], ascending=[True, False])

print(results_df.to_string(index=False))
results_df.to_csv("global_weight.csv")

 Global Best Weight  Average Accuracy  Average Precision  Average Recall
                0.1            0.7900           0.760655           0.850
                0.2            0.7925           0.783207           0.825
                0.3            0.7875           0.762616           0.840
                0.4            0.8150           0.796601           0.855
                0.5            0.7850           0.781556           0.805
                0.6            0.8000           0.797363           0.815
                0.7            0.7900           0.793668           0.790
                0.8            0.7800           0.774543           0.800
                0.9            0.7875           0.777814           0.810
                1.0            0.7775           0.767485           0.805


In [10]:
# Define parameters for PSO optimization
n_particles = 20
inertia = 0.8
global_weight = 1.0
local_best_weight_range = np.linspace(0.1, 1, num=10)

results = []

for local_weight in local_best_weight_range:
    accuracy_scores = []
    precision_scores = []
    recall_scores = []

    for train_idx, test_idx in kfold.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        tfidf_model = TfidfVectorizer(smooth_idf=False)
        X_train_tfidf = tfidf_model.fit_transform(X_train)
        X_test_tfidf = tfidf_model.transform(X_test)

        X_train_dense = X_train_tfidf.toarray()
        selected_features = pso_feature_selection(X_train_dense, y_train, n_particles, inertia, global_weight, local_weight, MultinomialNB)

        if not np.any(selected_features):
            selected_features[0] = True

        nb = NaiveBayes(MultinomialNB)
        nb.fit(X_train_tfidf, y_train, selected_features)
        y_pred = nb.predict(X_test_tfidf.toarray(), selected_features)

        accuracy_scores.append(accuracy_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred))

    fold_results = {
        "Local Best Weight": local_weight,
        "Average Accuracy": np.mean(accuracy_scores),
        "Average Precision": np.mean(precision_scores),
        "Average Recall": np.mean(recall_scores)
    }

    results.append(fold_results)

results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by=["Local Best Weight", "Average Accuracy"], ascending=[True, False])

print(results_df.to_string(index=False))
results_df.to_csv("local_weight.csv")

 Local Best Weight  Average Accuracy  Average Precision  Average Recall
               0.1            0.7600           0.757244           0.795
               0.2            0.7650           0.744899           0.810
               0.3            0.7800           0.769116           0.805
               0.4            0.7950           0.779661           0.830
               0.5            0.7875           0.756655           0.850
               0.6            0.7875           0.774852           0.820
               0.7            0.7975           0.792216           0.815
               0.8            0.7750           0.777189           0.785
               0.9            0.7800           0.767367           0.805
               1.0            0.7975           0.803686           0.795
