In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB
import pandas as pd

# Load dataset
file_path = 'text_preprocessing/final_dataset.csv'
data = pd.read_csv(file_path)

# Extract features and labels
X = data['tweet_tokens_stemmed'].apply(lambda x: ' '.join(eval(x)))
y = data['Sentiment']

# Split dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to matrix of token counts
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)

# Naive Bayes Multinomial Classifier with K-Fold Cross-Validation
classifier = MultinomialNB()
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
predictions = cross_val_predict(classifier, X_train_counts, y_train, cv=kf)
conf_matrix = confusion_matrix(y_train, predictions)
print("Confusion Matrix:")
print(conf_matrix)

# Print accuracy
accuracy = accuracy_score(y_train, predictions)
print("Accuracy:", accuracy)


Confusion Matrix:
[[ 83  74]
 [ 13 150]]
Accuracy: 0.728125


In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score
from pyswarm import pso

# Load dataset
file_path = 'text_preprocessing/final_dataset.csv'
data = pd.read_csv(file_path)

# Extract features and labels
X = data['tweet_tokens_stemmed'].apply(lambda x: ' '.join(eval(x)))  # Convert lists of tokens to space-separated strings
y = data['Sentiment']

# Convert text data to matrix of token counts
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X)

# Define kf (StratifiedKFold) before using it in cross_val_predict
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Naive Bayes Multinomial Classifier with K-Fold Cross-Validation
def objective_function(selected_features):
    classifier = MultinomialNB()
    predictions = cross_val_predict(classifier, X_train_counts[:, selected_features], y, cv=kf)
    conf_matrix = confusion_matrix(y, predictions)
    accuracy = accuracy_score(y, predictions)
    print("Accuracy:", accuracy)
    return -accuracy  # Minimize negative accuracy

# Lower bounds for features (all False)
lb = np.zeros(X_train_counts.shape[1], dtype=int)
# Upper bounds for features (all True)
ub = np.ones(X_train_counts.shape[1], dtype=int)

best_features, _ = pso(objective_function, lb, ub, swarmsize=10, maxiter=10)
print("Best features:", np.where(best_features)[0])

# Evaluate final accuracy
final_selected_features = np.where(best_features)[0]
final_classifier = MultinomialNB()
final_predictions = cross_val_predict(final_classifier, X_train_counts[:, final_selected_features], y, cv=kf)
final_accuracy = accuracy_score(y, final_predictions)
print("Final accuracy after feature selection optimization:", final_accuracy)


Accuracy: 0.5
Accuracy: 0.5
Accuracy: 0.5
Accuracy: 0.5
Accuracy: 0.5
Accuracy: 0.5
Accuracy: 0.5
Accuracy: 0.5
Accuracy: 0.5
Accuracy: 0.5
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 0.5025
Accuracy: 