In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_predict, train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer

# Load dataset
file_path = 'text_preprocessing/final_dataset.csv'
data = pd.read_csv(file_path)

# Extract features and labels
features = data['tweet_tokens_stemmed'].apply(lambda x: ' '.join(eval(x)))
labels = data['Sentiment']

# Split dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Convert text data to matrix of token counts
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)

# Naive Bayes Gaussian Classifier with K-Fold Cross-Validation and Confusion Matrix
classifier = GaussianNB()
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
predictions = cross_val_predict(classifier, X_train_counts.toarray(), y_train, cv=kf)
confusion_matrix_result = confusion_matrix(y_train, predictions)
print("Confusion Matrix:")
print(confusion_matrix_result)

# Print accuracy
accuracy = accuracy_score(y_train, predictions)
print("Accuracy:", accuracy)

Confusion Matrix:
[[ 86  71]
 [ 12 151]]
Accuracy: 0.740625


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score
from pyswarm import pso

# Load dataset
file_path = 'text_preprocessing/final_dataset.csv'
data = pd.read_csv(file_path)

# Extract features and labels
X = data['tweet_tokens_stemmed'].apply(lambda x: ' '.join(eval(x)))
y = data['Sentiment']
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X)

# Define kf (StratifiedKFold) before using it in cross_val_predict
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Naive Bayes Gaussian Classifier with K-Fold Cross-Validation
def objective_function(selected_features, X_train_counts=X_train_counts):
    classifier = GaussianNB()
    selected_features = np.where(selected_features)[0]
    predictions = cross_val_predict(classifier, X_train_counts[:, selected_features].toarray(), y, cv=kf)
    conf_matrix = confusion_matrix(y, predictions)
    accuracy = accuracy_score(y, predictions)
    return -accuracy  # Minimize negative accuracy

# Convert boolean arrays lb and ub to integer arrays
lb = np.zeros(X_train_counts.shape[1], dtype=int)
ub = np.ones(X_train_counts.shape[1], dtype=int)

best_features, _ = pso(objective_function, lb, ub, swarmsize=10, maxiter=10)
print("Best features:", np.where(best_features)[0])

# Final evaluation after feature selection optimization
final_selected_features = np.where(best_features)[0]
final_classifier = GaussianNB()
final_predictions = cross_val_predict(final_classifier, X_train_counts[:, final_selected_features].toarray(), y, cv=kf)
final_accuracy = accuracy_score(y, final_predictions)
print("Final accuracy after feature selection optimization:", final_accuracy)


Stopping search: maximum iterations reached --> 10
Best features: [   0    1    2 ... 1254 1255 1256]
Final accuracy after feature selection optimization: 0.7525
