In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the dataset from text_preprocessing/final_dataset.csv into a DataFrame
df = pd.read_csv(filepath_or_buffer='text_preprocessing/final_dataset.csv')

# Convert a sentiment label to a binary class: positive = 1, anything else = 0
def convert_label_sentimen(label_sentimen):
    """Return 1 when the label is exactly "positive", otherwise 0."""
    return 1 if label_sentimen == "positive" else 0

# Encode the sentiment column as 0/1, then drop the first column of the frame
df['label_sentimen'] = df['Sentiment'].apply(convert_label_sentimen)
df.drop(df.columns[[0]], axis=1, inplace=True)

X = df['tweet_tokens_stemmed']
y = df['label_sentimen']

# Stratified 10-fold splitter (shuffled, fixed seed so the split is repeatable)
kfold = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

# Only ONE train/test split is kept for the rest of the script. The earlier
# version looped over all ten folds and re-fitted TF-IDF each time, but every
# iteration overwrote X_train/X_test/y_train/y_test, so only the final fold
# survived. Taking that final fold directly yields the same data without the
# nine discarded vectorizer fits and dense conversions.
# NOTE(review): if a full 10-fold evaluation on X_test was intended, the
# downstream code must be moved inside a fold loop - confirm with the author.
train_idx, val_idx = list(kfold.split(X, y))[-1]
X_train, X_test = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[val_idx]

# TF-IDF vectorisation: fit on the training text only, then transform both
# parts and densify them
tfidf_model = TfidfVectorizer(smooth_idf=False).fit(X_train)
X_train = tfidf_model.transform(X_train).toarray()
X_test = tfidf_model.transform(X_test).toarray()

# Reset the index of y_train so its labels are 0..n-1 and line up with the
# row positions of the X_train array
y_train = y_train.reset_index(drop=True)

# Feature selection using Particle Swarm Optimization over boolean masks
def pso_feature_selection(X_train_data, y_train_data, n_particles, n_iterations,
                          classifier_name=None, inertia=0.5,
                          cognitive_coefficient=1, social_coefficient=2):
    """Search for a boolean feature mask that maximises ``evaluate_features``.

    Each particle is a boolean vector with one entry per column of
    ``X_train_data``. Particles are scored one at a time and the personal and
    global bests are updated immediately after each evaluation.

    Args:
        X_train_data: 2-D array; columns are the candidate features.
        y_train_data: Targets forwarded unchanged to ``evaluate_features``.
        n_particles: Number of particles in the swarm.
        n_iterations: Number of passes over the whole swarm.
        classifier_name: Key forwarded to ``evaluate_features``. When omitted,
            the module-level variable ``classifier_name`` is used, which is
            how existing four-argument call sites supply it.
        inertia: Weight applied to the previous velocity.
        cognitive_coefficient: Weight of the pull towards the particle's own
            best position.
        social_coefficient: Weight of the pull towards the global best.

    Returns:
        Boolean array of length ``X_train_data.shape[1]``: the best mask
        found. It is all ``False`` if no particle ever scored above 0.

    Raises:
        ValueError: If ``classifier_name`` is neither passed nor defined at
            module level.
    """
    if classifier_name is None:
        # Backward compatibility: callers used to rely on a global loop
        # variable of this name instead of passing it explicitly.
        try:
            classifier_name = globals()["classifier_name"]
        except KeyError:
            raise ValueError(
                "classifier_name must be passed or defined at module level"
            ) from None

    num_features = X_train_data.shape[1]
    swarm_position = np.random.rand(n_particles, num_features) > 0.5
    swarm_velocity = np.random.rand(n_particles, num_features)
    swarm_best_position = swarm_position.copy()
    swarm_best_score = np.zeros(n_particles)

    global_best_position = np.zeros(num_features, dtype=bool)
    global_best_score = 0

    for _ in range(n_iterations):
        for particle in range(n_particles):
            # Score the current particle; a mask that selects no feature
            # cannot be fitted, so it gets the worst possible score.
            selected_features = swarm_position[particle]
            if selected_features.any():
                score = evaluate_features(X_train_data, y_train_data,
                                          selected_features, classifier_name)
            else:
                score = 0.0

            # Update the particle's personal best if needed
            if score > swarm_best_score[particle]:
                swarm_best_score[particle] = score
                swarm_best_position[particle] = swarm_position[particle].copy()

            # Update the global best if needed
            if score > global_best_score:
                global_best_score = score
                global_best_position = swarm_position[particle].copy()

            # Update velocity: the XOR terms are 1 exactly where the particle
            # disagrees with its personal / the global best.
            r1 = np.random.rand(num_features)
            r2 = np.random.rand(num_features)

            swarm_velocity[particle] = (
                inertia * swarm_velocity[particle]
                + cognitive_coefficient * r1 * np.logical_xor(
                    swarm_best_position[particle], swarm_position[particle])
                + social_coefficient * r2 * np.logical_xor(
                    global_best_position, swarm_position[particle])
            )

            # NOTE(review): this is a logical OR, so features are only ever
            # switched on, never off - confirm this is the intended update.
            swarm_position[particle] |= swarm_velocity[particle] > 0.5

    return global_best_position.astype(bool)

# Feature-evaluation function: fitness of a feature subset (classifier score)
def evaluate_features(X_train, y_train, selected_features, classifier_name):
    """Score a feature subset by fitting a classifier on it.

    A fresh estimator is built from ``classifiers[classifier_name]``, fitted
    on the selected columns of ``X_train`` and scored on that same data, so
    the value is a training-set score, not a held-out one.

    Args:
        X_train: 2-D array of features.
        y_train: Targets for the rows of ``X_train``.
        selected_features: Boolean mask or index array used to pick columns.
        classifier_name: Key into the module-level ``classifiers`` mapping.

    Returns:
        The estimator's ``score`` on the selected training columns, or
        ``0.0`` when the selection contains no column at all.
    """
    # Slice out the chosen columns
    selected_X_train = X_train[:, selected_features]

    # An estimator cannot be fitted on zero columns; treat an empty selection
    # as the worst possible fitness instead of raising inside the search.
    if selected_X_train.shape[1] == 0:
        return 0.0

    classifier = classifiers[classifier_name]()
    classifier.fit(selected_X_train, y_train)
    return classifier.score(selected_X_train, y_train)

# Naive Bayes variants to compare, keyed by the name shown in the results
classifiers = dict(
    Multinomial=MultinomialNB,
    Gaussian=GaussianNB,
    Bernoulli=BernoulliNB,
)

# PSO hyper-parameter grid
particle_counts = list(range(10, 50, 10))   # number of particles: 10..40
iteration_counts = list(range(20, 60, 10))  # number of iterations: 20..50

# Collected result rows, one dict per evaluated configuration
results = list()

# Grid search: for every classifier / particle count / iteration count, run a
# stratified 10-fold cross validation on the training data with PSO feature
# selection performed inside each fold.

# With a fixed integer random_state the splitter yields the same folds on
# every split() call, so one instance is shared by all configurations.
kfold = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

# NOTE: the loop variable must stay named `classifier_name` -
# pso_feature_selection picks it up from module scope when it is not passed.
for classifier_name, classifier in classifiers.items():
    for n_particles in particle_counts:
        for n_iterations in iteration_counts:
            accuracy_scores = []

            for train_index, test_index in kfold.split(X_train, y_train):
                X_train_fold, X_val_fold = X_train[train_index], X_train[test_index]
                # split() yields row positions, so index the Series by
                # position rather than by label; this no longer depends on
                # y_train carrying a freshly reset 0..n-1 index.
                y_train_fold = y_train.iloc[train_index]
                y_val_fold = y_train.iloc[test_index]

                # Feature selection using PSO (fitted on the fold's training part only)
                selected_indices = pso_feature_selection(X_train_fold, y_train_fold, n_particles, n_iterations)

                # Classification on the selected features
                nb_classifier = classifier()
                nb_classifier.fit(X_train_fold[:, selected_indices], y_train_fold)
                predict_val_fold_selected = nb_classifier.predict(X_val_fold[:, selected_indices])

                # Accuracy on the fold's validation part
                accuracy_fold = accuracy_score(y_val_fold, predict_val_fold_selected)
                accuracy_scores.append(accuracy_fold)

            # Average accuracy over the folds
            average_accuracy = np.mean(accuracy_scores)

            # Save the result for this configuration (accuracy as a percentage)
            results.append({
                'Classifier': classifier_name,
                'Particle count': n_particles,
                'Iteration count': n_iterations,
                'Average accuracy': average_accuracy * 100
            })

# Tabulate every evaluated configuration and show the full table
results_df = pd.DataFrame(results)
print(results_df)

# Keep the row(s) whose average accuracy equals the maximum (ties are all kept)
best_accuracy = results_df.loc[
    results_df['Average accuracy'].eq(results_df['Average accuracy'].max())
]
print("Best accuracy result:")
print(best_accuracy)


     Classifier  Particle count  Iteration count  Average accuracy
0   Multinomial              10               20         74.722222
1   Multinomial              10               30         74.444444
2   Multinomial              10               40         74.444444
3   Multinomial              10               50         75.000000
4   Multinomial              20               20         75.555556
5   Multinomial              20               30         75.833333
6   Multinomial              20               40         74.722222
7   Multinomial              20               50         75.277778
8   Multinomial              30               20         73.333333
9   Multinomial              30               30         75.833333
10  Multinomial              30               40         73.055556
11  Multinomial              30               50         75.277778
12  Multinomial              40               20         75.277778
13  Multinomial              40               30         75.00