In [2]:
import nltk
import pandas as pd
from sklearn.svm import SVC
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
# Fetch the NLTK resources used below: the stop-word corpus and the
# 'punkt' tokenizer models
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Read the dataset and discard every row that has a missing value in
# any column (dropna is called without a subset)
data = pd.read_csv('model-dataset.csv').dropna()


# English stop words, held in a set so the per-token membership test
# done during preprocessing is a constant-time lookup
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Return a normalised, space-joined version of *text*.

    The value is converted to ``str``, lower-cased and tokenised with
    ``word_tokenize``. A token is kept only if it is purely alphanumeric
    and is not in the module-level ``stop_words`` set, so punctuation and
    any token containing a non-alphanumeric character (for example an
    apostrophe) is discarded.

    Missing values (anything ``pd.isna`` reports as missing) yield an
    empty string.
    """
    # Guard clause: nothing to tokenise for a missing value
    if pd.isna(text):
        return ''

    tokens = word_tokenize(str(text).lower())
    kept_tokens = (
        token
        for token in tokens
        if token.isalnum() and token not in stop_words
    )
    return ' '.join(kept_tokens)


# Clean every comment in place so later steps only see normalised text
data['comment'] = data['comment'].apply(preprocess_text)

# Features are the cleaned comments; targets are the sentiment labels
X, y = data['comment'], data['sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# split identical from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features, capped at 5000 terms. The vectoriser is fitted on the
# training split only and then merely applied to the test split, so no
# test-set statistics enter the learned vocabulary or weights.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train three baseline sentiment classifiers on the TF-IDF features and
# report their performance on the held-out test split.

# Naive Bayes
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)
y_pred_nb = nb_classifier.predict(X_test_tfidf)

# Random Forest
# Seeded with the same value as the train/test split: an unseeded forest
# gives different metrics on every run, which makes the comparison with
# the other models non-reproducible.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_tfidf, y_train)
y_pred_rf = rf_classifier.predict(X_test_tfidf)

# Support Vector Machine (SVM)
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_tfidf, y_train)
y_pred_svm = svm_classifier.predict(X_test_tfidf)

# Evaluate and compare performance: the same accuracy figure and
# per-class report for each model, printed in training order
for model_name, predictions in (
    ("Naive Bayes", y_pred_nb),
    ("Random Forest", y_pred_rf),
    ("Support Vector Machine (SVM)", y_pred_svm),
):
    print(f"{model_name}:")
    print("Accuracy:", accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/eranda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/eranda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Naive Bayes:
Accuracy: 0.7262993188930478
              precision    recall  f1-score   support

    negative       0.88      0.43      0.58      7152
     neutral       0.84      0.66      0.74     11067
    positive       0.65      0.93      0.77     14375

    accuracy                           0.73     32594
   macro avg       0.79      0.67      0.69     32594
weighted avg       0.77      0.73      0.71     32594

Random Forest:
Accuracy: 0.865650119653924
              precision    recall  f1-score   support

    negative       0.87      0.70      0.78      7152
     neutral       0.84      0.96      0.90     11067
    positive       0.89      0.87      0.88     14375

    accuracy                           0.87     32594
   macro avg       0.86      0.85      0.85     32594
weighted avg       0.87      0.87      0.86     32594

Support Vector Machine (SVM):
Accuracy: 0.9018531017978769
              precision    recall  f1-score   support

    negative       0.88      0.81      