In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix, accuracy_score

# Load dataset
file_path = 'text_preprocessing/final_dataset.csv'
data = pd.read_csv(file_path)

# Extract features and labels
features = data['tweet_tokens_stemmed'].apply(lambda x: ' '.join(eval(x)))  
labels = data['Sentiment']

# Split dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Convert text data to matrix of token counts
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)

# Naive Bayes Bernouilli Classifier with K-Fold Cross-Validation and Confusion Matrix
classifier = BernoulliNB()
kf = KFold(n_splits=10, shuffle=True, random_state=42)
predictions = cross_val_predict(classifier, X_train_counts, y_train, cv=kf)
confusion_matrix_result = confusion_matrix(y_train, predictions)
print("Confusion Matrix:")
print(confusion_matrix_result)

# Print accuracy
accuracy = accuracy_score(y_train, predictions)
print("Accuracy:", accuracy)

Confusion Matrix:
[[ 77  80]
 [ 10 153]]
Accuracy: 0.71875


In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix, accuracy_score
from pyswarms.single.global_best import GlobalBestPSO
from sklearn.model_selection import KFold, cross_val_predict

# Load dataset
file_path = 'text_preprocessing/final_dataset.csv'
data = pd.read_csv(file_path)

# Extract features and labels
features = data['tweet_tokens_stemmed'].apply(lambda x: ' '.join(eval(x)))
labels = data['Sentiment']

# Convert text data to matrix of token counts
vectorizer = CountVectorizer()
X_counts = vectorizer.fit_transform(features)

# Define fitness function for PSO
def fitness_function(selected_features, X, y):
    clf = BernoulliNB()
    # Convert the input array to 2 dimensions
    X_2d = X.toarray()[:, selected_features.astype(int)].reshape(X.shape[0], -1)
    # Fit the classifier
    clf.fit(X_2d, y)
    y_pred = clf.predict(X_2d)
    return -accuracy_score(y, y_pred)  # PSO minimizes the objective function, so we use negative accuracy

# Perform PSO feature selection
optimizer = GlobalBestPSO(n_particles=10, dimensions=X_counts.shape[1], options={'c1': 0.5, 'c2': 0.3, 'w': 0.9})
cost, pos = optimizer.optimize(fitness_function, iters=100, X=X_counts, y=labels)

# Get selected features
selected_features = np.where(pos > 0.5)[0]

# Naive Bayes Bernoulli Classifier with K-Fold Cross-Validation and Confusion Matrix
classifier = BernoulliNB()
kf = KFold(n_splits=10, shuffle=True, random_state=42)
predictions = cross_val_predict(classifier, X_counts.toarray()[:, selected_features.astype(int)], labels, cv=kf)
confusion_matrix_result = confusion_matrix(labels, predictions)
print("Confusion Matrix:")
print(confusion_matrix_result)

# Print accuracy
accuracy = accuracy_score(labels, predictions)
print("Accuracy:", accuracy)


2024-03-12 21:57:17,954 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.5, 'c2': 0.3, 'w': 0.9}
pyswarms.single.global_best: 100%|██████████|100/100, best_cost=-.505
2024-03-12 21:57:51,200 - pyswarms.single.global_best - INFO - Optimization finished | best cost: -0.505, best pos: [1.97866082 1.00967799 1.60002618 ... 0.78182437 1.58544769 1.89645821]


Confusion Matrix:
[[107  93]
 [ 14 186]]
Accuracy: 0.7325
