In [7]:
# Import all dependencies
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Import the necessary libraries for Naive Bayes, Count Vectorization (BoW) and TfidfVectorizer (TD-IDF)
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

nltk.download('stopwords')
nltk.download('punkt')

# Data introduction
df = pd.read_csv('data.csv', delimiter=',', encoding='latin-1', header=None)
df = df.rename(columns={0: 'Sentiment', 1: 'Text'})
df = df[['Text', 'Sentiment']]
df = df[df['Sentiment'] != 'neutral']

# Data preprocessing
def preprocess_text(text):
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    cleaned_text = ' '.join(words)
    return cleaned_text

df['Text'] = df['Text'].apply(preprocess_text)

X = df['Text']
y = df['Sentiment']

# Feature engineering
count_vectorizer = CountVectorizer(max_features=10000)

X_train_bow = count_vectorizer.fit_transform(X)

X_test_bow = count_vectorizer.transform(X)

tfidf_vectorizer = TfidfVectorizer(max_features=10000)

X_train_tfidf = tfidf_vectorizer.fit_transform(X)

X_test_tfidf = tfidf_vectorizer.transform(X)

# Define the number of folds (k)
n_splits = 10

# Initialize the StratifiedKFold cross-validator
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Define the classifiers and vectorizers
classifiers = [MultinomialNB(), LogisticRegression()]
vectorizers = [count_vectorizer, tfidf_vectorizer]

# Iterate over classifiers and vectorizers
for classifier in classifiers:
    for vectorizer in vectorizers:
        print(f"Classifier: {classifier.__class__.__name__}, Vectorizer: {vectorizer.__class__.__name__}")

        # Initialize lists to store evaluation metrics for each fold
        accuracy_scores = []
        precision_scores = []
        recall_scores = []
        f1_scores = []

        for train_index, test_index in kf.split(X, y):
            X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
            y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

            X_train_vectorized = vectorizer.transform(X_train_fold)
            X_test_vectorized = vectorizer.transform(X_test_fold)

            model = classifier
            model.fit(X_train_vectorized, y_train_fold)

            y_test_pred = model.predict(X_test_vectorized)

            accuracy = accuracy_score(y_test_fold, y_test_pred)
            precision, recall, f1, _ = precision_recall_fscore_support(y_test_fold, y_test_pred, average='weighted')

            accuracy_scores.append(accuracy)
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

        # Calculate and print the mean and standard deviation of evaluation metrics across folds
        print("Mean Accuracy:", sum(accuracy_scores) / len(accuracy_scores))
        print("Mean Precision:", sum(precision_scores) / len(precision_scores))
        print("Mean Recall:", sum(recall_scores) / len(recall_scores))
        print("Mean F1 Score:", sum(f1_scores) / len(f1_scores))


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/suonieo1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/suonieo1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Classifier: MultinomialNB, Vectorizer: CountVectorizer
Mean Accuracy: 0.8200300424738423
Mean Precision: 0.8310983730138647
Mean Recall: 0.8200300424738423
Mean F1 Score: 0.8228867956818174
Classifier: MultinomialNB, Vectorizer: TfidfVectorizer
Mean Accuracy: 0.7788433647570704
Mean Precision: 0.8180054118325544
Mean Recall: 0.7788433647570704
Mean F1 Score: 0.7351499834803418
Classifier: LogisticRegression, Vectorizer: CountVectorizer
Mean Accuracy: 0.8454729099761732
Mean Precision: 0.8433085437615547
Mean Recall: 0.8454729099761732
Mean F1 Score: 0.8399839218166084
Classifier: LogisticRegression, Vectorizer: TfidfVectorizer
Mean Accuracy: 0.8093701439966849
Mean Precision: 0.8176857955586021
Mean Recall: 0.8093701439966849
Mean F1 Score: 0.7888340276438868
