In [8]:
import nltk
import pandas as pd
from sklearn.svm import SVC
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
nltk.download('stopwords')
nltk.download('punkt')  # Download the 'punkt' tokenizer models

# Load the dataset
data = pd.read_csv('model-dataset.csv')

# Preprocess the text data
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    words = word_tokenize(text.lower())
    filtered_words = [word for word in words if word.isalnum() and word not in stop_words]
    return ' '.join(filtered_words)

data['comment'] = data['comment'].apply(preprocess_text)  # Use 'comment' as the column name

# Split the dataset into training and testing sets
X = data['comment']  # Use 'comment' as the column name
y = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Implement and evaluate existing sentiment analysis algorithms

# Naive Bayes
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)
y_pred_nb = nb_classifier.predict(X_test_tfidf)

# Random Forest
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_tfidf, y_train)
y_pred_rf = rf_classifier.predict(X_test_tfidf)

# Support Vector Machine (SVM)
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_tfidf, y_train)
y_pred_svm = svm_classifier.predict(X_test_tfidf)

# Evaluate and compare performance
print("Naive Bayes:")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))

print("Random Forest:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

print("Support Vector Machine (SVM):")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/eranda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/eranda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Naive Bayes:
Accuracy: 0.625
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         3
     neutral       0.62      1.00      0.77        15
    positive       0.00      0.00      0.00         6

    accuracy                           0.62        24
   macro avg       0.21      0.33      0.26        24
weighted avg       0.39      0.62      0.48        24

Random Forest:
Accuracy: 0.6666666666666666
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         3
     neutral       0.65      1.00      0.79        15
    positive       1.00      0.17      0.29         6

    accuracy                           0.67        24
   macro avg       0.55      0.39      0.36        24
weighted avg       0.66      0.67      0.56        24

Support Vector Machine (SVM):
Accuracy: 0.6666666666666666
              precision    recall  f1-score   support

    negative       1.00      0.33      0.50        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
