In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Load data: read the dataset CSV (the path is relative to the working
# directory). The 'Sentiment' and 'tweet_tokens_stemmed' columns of this frame
# are the ones read further down in this script.
df = pd.read_csv('text_preprocessing/final_dataset.csv')

# Convert sentiment labels to integers: positive = 1, everything else = 0
def convert_label_sentimen(label_sentimen):
    """Return 1 if *label_sentimen* equals the string "positive", else 0.

    The comparison is an exact equality test, so any other value (a different
    label, different casing, an empty string, ``None``) maps to 0.
    """
    return 1 if label_sentimen == "positive" else 0

# Encode the 'Sentiment' column as integers (see convert_label_sentimen)
df['label_sentimen'] = df['Sentiment'].apply(convert_label_sentimen)
# Remove the frame's first column, addressed by position rather than by name
df.drop(columns=df.columns[[0]], inplace=True)

# Model input (text column) and integer target
X, y = df['tweet_tokens_stemmed'], df['label_sentimen']

# K-fold cross validation (splitting data): 10 stratified folds, shuffled with
# a fixed seed so every call to kfold.split yields the same partition
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Function to perform PSO feature selection
def pso_feature_selection(X_train, y_train, n_particles, n_iterations,
                          num_features=None, random_state=None):
    """Select a feature subset with particle swarm optimisation (PSO).

    Every particle is a vector in ``[0, 1]^d`` where ``d`` is the number of
    columns of ``X_train``; a feature counts as selected when its coordinate
    is strictly above 0.5.  A particle's fitness is the value returned by
    ``evaluate_features`` for the corresponding boolean column mask.  The
    personal and global bests are updated particle by particle, so a new
    global best immediately influences the particles processed after it.

    Args:
        X_train: 2-D array of shape ``(n_samples, n_features)``.
        y_train: Targets, passed through unchanged to ``evaluate_features``.
        n_particles: Number of particles in the swarm.
        n_iterations: Number of sweeps over the whole swarm.
        num_features: Ignored.  Kept (now optional) only so existing calls
            keep working; the dimensionality is always ``X_train.shape[1]``.
        random_state: Optional integer seed for a private
            ``numpy.random.RandomState``.  With the default ``None`` the
            global NumPy random state is used, exactly as before.

    Returns:
        Boolean NumPy array of length ``n_features`` marking the selected
        columns.  If no particle ever achieved a score above zero (for
        example when ``n_iterations`` is 0) an all-True mask is returned, so
        the caller never ends up indexing zero columns.
    """
    # The search-space size always comes from the data, never from the
    # `num_features` argument.
    _, dimensions = X_train.shape
    lower_bound, upper_bound = 0, 1  # positions are clipped to this interval
    inertia = 0.8
    cognitive_weight = 1.5
    social_weight = 1.5

    # Global RNG by default (unchanged behaviour); a private generator when a
    # seed is supplied so a run can be reproduced.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Initialise the swarm
    particles = rng.rand(n_particles, dimensions)
    velocities = rng.rand(n_particles, dimensions)
    best_positions = particles.copy()
    best_scores = np.zeros(n_particles)

    global_best_position = np.zeros(dimensions)
    global_best_score = 0

    for _ in range(n_iterations):
        for particle in range(n_particles):
            # Evaluate fitness (accuracy) of the current particle.  An empty
            # selection would hand the classifier a zero-column matrix, which
            # scikit-learn estimators reject, so score it as 0 directly.
            selected_features = particles[particle] > 0.5
            if selected_features.any():
                accuracy = evaluate_features(X_train, y_train, selected_features)
            else:
                accuracy = 0.0

            # Update the particle's personal best
            if accuracy > best_scores[particle]:
                best_scores[particle] = accuracy
                best_positions[particle] = particles[particle].copy()

            # Update the swarm-wide best
            if accuracy > global_best_score:
                global_best_score = accuracy
                global_best_position = particles[particle].copy()

            # Velocity update: inertia + pull towards personal best + pull
            # towards global best, each pull scaled by fresh random factors
            r1 = rng.rand(dimensions)
            r2 = rng.rand(dimensions)
            velocities[particle] = (
                inertia * velocities[particle]
                + cognitive_weight * r1 * (best_positions[particle] - particles[particle])
                + social_weight * r2 * (global_best_position - particles[particle])
            )

            # Move the particle and keep it inside the unit hypercube
            particles[particle] += velocities[particle]
            particles[particle] = np.clip(particles[particle], lower_bound, upper_bound)

    # No particle ever beat a score of 0: `global_best_position` is still the
    # all-zero start vector, whose mask would select nothing.  Fall back to
    # "keep every feature" instead of returning an unusable empty mask.
    if global_best_score <= 0:
        return np.ones(dimensions, dtype=bool)

    return global_best_position > 0.5

# Function to evaluate features using selected features and classifier
def evaluate_features(X_train, y_train, selected_features):
    """Score a feature subset by cross-validated accuracy.

    The columns of ``X_train`` picked out by ``selected_features`` are fed to
    a fresh ``MultinomialNB`` and scored with ``cross_val_score`` using the
    module-level ``kfold`` splitter.

    Args:
        X_train: 2-D array; indexed as ``X_train[:, selected_features]``.
        y_train: Targets aligned with the rows of ``X_train``.
        selected_features: Column selector (the PSO passes a boolean mask).

    Returns:
        Mean of the per-fold accuracy scores.
    """
    fold_scores = cross_val_score(
        MultinomialNB(),  # swap in another estimator here if needed
        X_train[:, selected_features],
        y_train,
        scoring='accuracy',
        cv=kfold,
    )
    return fold_scores.mean()

# Experiment driver: for every combination of Naive Bayes variant, swarm size
# and iteration count, run 10-fold cross validation with PSO feature selection
# inside each fold, then tabulate the mean accuracy per configuration.

# Classifiers to compare. The values are classes, not instances, so a fresh
# estimator is created for every fold below.
classifiers = {
    "Multinomial": MultinomialNB,
    "Gaussian": GaussianNB,
    "Bernoulli": BernoulliNB
}

particle_counts = [10, 20, 30]  # Number of particles
iteration_counts = [60, 80, 100]  # Number of iterations

# One dict per (classifier, particle count, iteration count) configuration
results = []

for classifier_name, classifier in classifiers.items():
    for n_particles in particle_counts:
        for n_iterations in iteration_counts:
            accuracy_scores = []  # Per-fold accuracy scores for this configuration
            for train_idx, test_idx in kfold.split(X, y):
                X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
                y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

                # TF-IDF Vectorization: the vectorizer is fitted on the
                # training fold only and then applied to both folds; the
                # sparse results are converted to dense NumPy arrays.
                tfidf_model = TfidfVectorizer(smooth_idf=False).fit(X_train)
                X_train_tfidf = tfidf_model.transform(X_train).toarray()
                X_test_tfidf = tfidf_model.transform(X_test).toarray()
                
                # Feature selection using PSO (sees the training fold only).
                # NOTE(review): the PSO fitness function, evaluate_features,
                # always scores subsets with MultinomialNB, independent of the
                # `classifier` being benchmarked here - confirm this is intended.
                selected_features = pso_feature_selection(X_train_tfidf, y_train, n_particles, n_iterations, num_features=X_train_tfidf.shape[1])

                # Classification: train on the selected training columns and
                # predict the held-out fold using the same column mask
                nb_classifier = classifier()
                nb_classifier.fit(X_train_tfidf[:, selected_features], y_train)
                predict_val_fold_selected = nb_classifier.predict(X_test_tfidf[:, selected_features])

                # Model accuracy score on the held-out fold
                accuracy_fold = accuracy_score(y_test, predict_val_fold_selected)
                accuracy_scores.append(accuracy_fold)

            # Calculate average accuracy from k-fold cross validation
            average_accuracy = np.mean(accuracy_scores)

            # Save results (accuracy is stored as a percentage)
            results.append({
                'Classifier': classifier_name,
                'Particle count': n_particles,
                'Iteration count': n_iterations,
                'Average accuracy': average_accuracy * 100
            })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Print results
print(results_df)

# Print best accuracy result: every row whose average accuracy equals the
# maximum is kept, so tied configurations are all shown
best_accuracy = results_df[results_df['Average accuracy'] == results_df['Average accuracy'].max()]
print("Best accuracy result:")
print(best_accuracy)


     Classifier  Particle count  Iteration count  Average accuracy
0   Multinomial              10               60             79.00
1   Multinomial              10               80             78.75
2   Multinomial              10              100             77.50
3   Multinomial              20               60             78.75
4   Multinomial              20               80             76.75
5   Multinomial              20              100             80.25
6   Multinomial              30               60             79.00
7   Multinomial              30               80             78.50
8   Multinomial              30              100             80.25
9      Gaussian              10               60             70.75
10     Gaussian              10               80             72.50
11     Gaussian              10              100             70.25
12     Gaussian              20               60             70.50
13     Gaussian              20               80             7