In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import issparse

# Read the prepared dataset from disk into a DataFrame.
df = pd.read_csv(filepath_or_buffer='text_preprocessing/final_dataset.csv')

# Convert sentiment labels to binary
def convert_label_sentimen(label_sentimen):
    """Return 1 when the label equals "positive", and 0 for any other value."""
    if label_sentimen == "positive":
        return 1
    return 0

# Add the 0/1 label column, then remove the frame's first column.
df['label_sentimen'] = df['Sentiment'].apply(convert_label_sentimen)
df.drop(columns=df.columns[[0]], inplace=True)

# Model input is the 'tweet_tokens_stemmed' column; the target is the binary label.
X, y = df['tweet_tokens_stemmed'], df['label_sentimen']

# k-fold cross validation (splitting data): 10 shuffled, stratified folds with a fixed seed
kfold = StratifiedKFold(shuffle=True, random_state=42, n_splits=10)

class NaiveBayes:
    """Wrapper that trains and queries a classifier on a subset of feature columns.

    ``model_class`` is called with no arguments on every :meth:`fit` to build a
    fresh estimator, which is stored in ``self.clf``. The estimator only needs
    ``fit(X, y)`` and ``predict(X)`` methods.
    """

    def __init__(self, model_class):
        self.clf = None  # stays None until fit() has been called
        self.model_class = model_class

    @staticmethod
    def _take_columns(matrix, selected_features):
        """Densify *matrix* if it is a SciPy sparse matrix, then pick the selected columns."""
        dense = matrix.toarray() if issparse(matrix) else matrix
        return dense[:, selected_features]

    def fit(self, X_train, y_train, selected_features):
        """Create a new estimator and fit it on the selected columns of ``X_train``."""
        self.clf = self.model_class()
        self.clf.fit(self._take_columns(X_train, selected_features), y_train)

    def predict(self, X_test, selected_features):
        """Predict with the fitted estimator using the selected columns of ``X_test``.

        Raises:
            ValueError: If :meth:`fit` has not been called yet.
        """
        if self.clf is None:
            raise ValueError("Classifier not fitted. Please call 'fit' method first.")
        return self.clf.predict(self._take_columns(X_test, selected_features))

def evaluate_features(X_train, y_train, selected_features, classifier):
    """Return the mean cross-validated accuracy for a subset of feature columns.

    Args:
        X_train: 2-D feature matrix; columns are taken with
            ``X_train[:, selected_features]``.
        y_train: Target labels passed to ``cross_val_score``.
        selected_features: Boolean mask or integer index array choosing columns.
        classifier: Wrapper object exposing ``clf`` (an estimator instance or
            ``None``) and ``model_class`` (a zero-argument estimator factory).

    Returns:
        The mean of the accuracy scores produced by ``cross_val_score`` using the
        module-level ``kfold`` splitter, or ``0.0`` when no column is selected.
    """
    selection = np.asarray(selected_features)
    # A boolean mask selects its True entries; an index array selects every entry.
    if selection.dtype == bool:
        n_selected = np.count_nonzero(selection)
    else:
        n_selected = selection.size
    if n_selected == 0:
        # A zero-column matrix cannot be fitted; score the empty subset as the
        # worst possible result instead of letting the estimator raise.
        return 0.0

    estimator = classifier.clf
    if estimator is None:
        # cross_val_score works on clones of the estimator, so an unfitted
        # instance is enough when the wrapper has not been fitted yet.
        estimator = classifier.model_class()

    cv_results = cross_val_score(
        estimator,
        X_train[:, selected_features],
        y_train,
        cv=kfold,
        scoring='accuracy',
    )
    return cv_results.mean()

def display_selected_features(tfidf_model, selected_features):
    """Print the names of the selected features and return them.

    The names come from ``tfidf_model.get_feature_names_out()`` and are indexed
    with ``selected_features``. A header line is printed first, followed by one
    name per line.
    """
    vocabulary = np.array(tfidf_model.get_feature_names_out())
    selected_feature_names = vocabulary[selected_features]
    report = ["Selected Features:"]
    report.extend(str(name) for name in selected_feature_names)
    print("\n".join(report))
    return selected_feature_names

def pso_feature_selection(X_train, y_train, n_particles, inertia, global_weight, local_weight, model_class, tol=1e-5, patience=10, max_iter=None):
    """Select feature columns with a particle swarm search scored by cross-validated accuracy.

    Every particle is a vector with one coordinate per column of ``X_train``,
    kept inside [0, 1]; a column counts as selected when its coordinate is
    greater than 0.5. Each candidate subset is scored with
    ``evaluate_features``.

    Args:
        X_train: 2-D feature matrix; only ``shape[1]`` is read here, the matrix
            itself is passed on to ``evaluate_features``.
        y_train: Target labels, passed on to ``evaluate_features``.
        n_particles: Number of particles in the swarm.
        inertia: Factor applied to a particle's previous velocity.
        global_weight: Factor for the pull toward the best position found by
            the whole swarm.
        local_weight: Factor for the pull toward the particle's own best
            position.
        model_class: Zero-argument estimator factory used for scoring.
        tol: Minimum change of the best score that counts as an improvement.
        patience: Number of consecutive iterations without improvement after
            which the search stops.
        max_iter: Optional hard cap on the number of iterations; ``None``
            (default) leaves the loop bounded by ``patience`` only.

    Returns:
        A tuple ``(mask, iteration_results)``: ``mask`` is the boolean array
        ``global_best_position > 0.5`` and ``iteration_results`` holds one
        ``(global_best_score, global_best_position copy)`` pair per iteration.
    """
    num_features = X_train.shape[1]
    lower, upper = 0, 1
    threshold = 0.5

    particles = np.random.rand(n_particles, num_features)
    velocities = np.random.rand(n_particles, num_features) * 0.1
    best_positions = particles.copy()
    best_scores = np.zeros(n_particles)

    global_best_position = np.zeros(num_features)
    global_best_score = 0

    no_improvement_count = 0
    previous_global_best_score = 0

    iteration_results = []

    while no_improvement_count < patience:
        if max_iter is not None and len(iteration_results) >= max_iter:
            break

        for particle in range(n_particles):
            r1 = np.random.rand(num_features)
            r2 = np.random.rand(num_features)
            # local_weight drives the personal-best term and global_weight the
            # swarm-best term, so each coefficient acts on the term it is named for.
            velocities[particle] = (
                inertia * velocities[particle]
                + local_weight * r1 * (best_positions[particle] - particles[particle])
                + global_weight * r2 * (global_best_position - particles[particle])
            )

            particles[particle] = np.clip(
                particles[particle] + velocities[particle], lower, upper
            )

            selected_features = particles[particle] > threshold
            if selected_features.any():
                # Scoring relies on cross_val_score, which fits clones of the
                # estimator; an unfitted instance is enough, so no separate
                # full fit is performed here.
                nb = NaiveBayes(model_class)
                nb.clf = model_class()
                accuracy = evaluate_features(X_train, y_train, selected_features, nb)
            else:
                # No column selected: nothing to fit, treat as the worst score.
                accuracy = 0.0

            if accuracy > best_scores[particle]:
                best_scores[particle] = accuracy
                best_positions[particle] = particles[particle].copy()

            if accuracy > global_best_score:
                global_best_score = accuracy
                global_best_position = particles[particle].copy()

        iteration_results.append((global_best_score, global_best_position.copy()))

        if abs(global_best_score - previous_global_best_score) < tol:
            no_improvement_count += 1
        else:
            no_improvement_count = 0

        previous_global_best_score = global_best_score

    return global_best_position > threshold, iteration_results

# Define parameters for PSO optimization
n_particles, inertia = 20, 0.8
global_weight, local_weight = 1, 0.9

# One summary dict per model class is collected here.
results = []

for model_class in (MultinomialNB, BernoulliNB, GaussianNB):
    # Per-fold metric collectors; only the *Test lists are filled below.
    accuracy_scoresTest, accuracy_scoresTrain = [], []
    precision_scoresTest, precision_scoresTrain = [], []
    recall_scoresTest, recall_scoresTrain = [], []

    for train_idx, test_idx in kfold.split(X, y):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

        # The vectorizer is fitted on the training fold only and reused for the test fold.
        tfidf_model = TfidfVectorizer(smooth_idf=False)
        X_train_tfidf = tfidf_model.fit_transform(X_train)
        X_test_tfidf = tfidf_model.transform(X_test)

        X_train_dense = X_train_tfidf.toarray()
        selected_features, iteration_results = pso_feature_selection(
            X_train_dense,
            y_train,
            n_particles,
            inertia,
            global_weight,
            local_weight,
            model_class,
        )

        # Fall back to the first feature when the search selected nothing.
        if not selected_features.any():
            selected_features[0] = True

        nb = NaiveBayes(model_class)
        nb.fit(X_train_tfidf, y_train, selected_features)

        y_predTest = nb.predict(X_test_tfidf.toarray(), selected_features)

        for fold_scores, metric in (
            (accuracy_scoresTest, accuracy_score),
            (precision_scoresTest, precision_score),
            (recall_scoresTest, recall_score),
        ):
            fold_scores.append(metric(y_test, y_predTest))

    fold_results = {
        "Model": model_class.__name__,
        "Average AccuracyTest": np.mean(accuracy_scoresTest),
        "Average PrecisionTest": np.mean(precision_scoresTest),
        "Average RecallTest": np.mean(recall_scoresTest),
    }
    results.append(fold_results)

    # The feature list and PSO trace printed here come from the last fold only.
    print(f"Results for {model_class.__name__}:")
    selected_feature_names = display_selected_features(tfidf_model, selected_features)

    print("PSO Iteration Results:")
    for i, (score, position) in enumerate(iteration_results):
        print(f"Iteration {i+1}: Best Score = {score}")

results_df = pd.DataFrame(results)

print(results_df.to_string(index=False))


Results for MultinomialNB:
Selected Features:
abang
abas
abi
abis
abraham
acara
acung
adem
adil
adu
agung
ahoax
ahok
ajah
ajar
ajarin
aksi
aku
ala
alam
alas
allah
alu
amal
aman
amat
amir
anak
anaktapi
analta
anastasya
aneh
anggota
angkat
aniaya
anis
anjay
antem
apa
apartemen
argonya
argumentasi
arti
ary
asa
asiknya
asli
atur
ayo
bagi
bagus
bahas
bahasa
baik
bajak
bakarin
bambang
banding
banget
bangga
bangkit
bangkrut
bangqomar
bangsa
bantai
bantu
banyak
barakallahu
bareng
basri
batas
batik
bawa
bayang
bayar
bebas
becanda
beda
bego
bela
belah
belanda
benar
benci
bentak
bentar
berani
berbelitbelit
berfikir
bermusyawara
bersih
besar
besok
biasa
biaya
bicara
bijak
bijaksana
bintang
biru
bj
bjh
bkin
bnn
bocor
bodoh
bohong
bom
bos
bosan
botak
bp
bpk
bravo
btp
buang
buk
buka
bukti
bully
busuk
buta
butuh
cagub
cak
campak
cantik
cawagub
cek
cewe
cinta
citra
club
comal
corbuzier
cowo
cuman
curhatanmu
daerah
dahlan
daki
dalam
dangdut
darmawan
dasar
deddy
dedy
dengar
dengerin
deportasi
desa
detik
