In [1]:
!pip install vaderSentiment




In [2]:
# Importing all the required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [3]:
# Loading the dataset into dataframe
reviews = pd.read_csv(r"/content/sample_data/restaurant_reviews_az.csv")

In [4]:
# Removing 3 star reviews from the input data and creating a new column 'Sentiment'
reviews = reviews[reviews['stars'] != 3]
reviews['Sentiment'] = reviews['stars'].apply(lambda x: 1 if x >= 4 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews['Sentiment'] = reviews['stars'].apply(lambda x: 1 if x >= 4 else 0)


In [5]:
# Preparing training and test sets
X_train, X_test, y_train, y_test = train_test_split(reviews['text'], reviews['Sentiment'], test_size=0.2, random_state=5)

In [6]:
# Conducting necessary data processing and use Count Vectorizer
count_vectorizer = CountVectorizer(max_features=1000)
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

In [7]:
# Training Naive Bayes model with Count Vectorizer
nb_model_count = MultinomialNB()
nb_model_count.fit(X_train_count, y_train)
predictions_count = nb_model_count.predict(X_test_count)
accuracy_count = accuracy_score(y_test, predictions_count)
report_count = classification_report(y_test, predictions_count)
print(accuracy_count)
print(report_count)

0.9147295611747364
              precision    recall  f1-score   support

           0       0.84      0.84      0.84      2409
           1       0.94      0.94      0.94      6410

    accuracy                           0.91      8819
   macro avg       0.89      0.89      0.89      8819
weighted avg       0.91      0.91      0.91      8819



In [8]:
# Training SVM model with Count Vectorizer
svm_model_count = SVC()
svm_model_count.fit(X_train_count, y_train)
svm_pred_count = svm_model_count.predict(X_test_count)
svm_accuracy_count = accuracy_score(y_test, svm_pred_count)
svm_report_count = classification_report(y_test, svm_pred_count)
print(svm_accuracy_count)
print(svm_report_count)

0.9425104887175416
              precision    recall  f1-score   support

           0       0.90      0.89      0.89      2409
           1       0.96      0.96      0.96      6410

    accuracy                           0.94      8819
   macro avg       0.93      0.92      0.93      8819
weighted avg       0.94      0.94      0.94      8819



In [9]:
# TF-IDF vectorizer to represent the documents
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [10]:
# Training a naive bayes classifcation model with TF-IDF feature values
nb_model_tfidf = MultinomialNB()
nb_model_tfidf.fit(X_train_tfidf, y_train)
predictions_tfidf = nb_model_tfidf.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_test, predictions_tfidf)
report_tfidf = classification_report(y_test, predictions_tfidf)
print(accuracy_tfidf)
print(report_tfidf)

0.9072457194693276
              precision    recall  f1-score   support

           0       0.92      0.72      0.81      2409
           1       0.90      0.98      0.94      6410

    accuracy                           0.91      8819
   macro avg       0.91      0.85      0.87      8819
weighted avg       0.91      0.91      0.90      8819



In [11]:
# Training and evaluating the performance of SVM model with TF-IDF
svm_model_tfidf = SVC()
svm_model_tfidf.fit(X_train_tfidf, y_train)
predictions_svm_tfidf = svm_model_tfidf.predict(X_test_tfidf)
accuracy_svm_tfidf = accuracy_score(y_test, predictions_svm_tfidf)
report_svm_tfidf = classification_report(y_test, predictions_svm_tfidf)
print(accuracy_svm_tfidf)
print(report_svm_tfidf)

0.9524889443247534
              precision    recall  f1-score   support

           0       0.92      0.91      0.91      2409
           1       0.97      0.97      0.97      6410

    accuracy                           0.95      8819
   macro avg       0.94      0.94      0.94      8819
weighted avg       0.95      0.95      0.95      8819



In [12]:
# Using VaderSentiment to predict review sentiment and evaluating its performance
analyzer = SentimentIntensityAnalyzer()
reviews['VaderSentiment'] = reviews['text'].apply(lambda x: 1 if analyzer.polarity_scores(x)['compound'] >= 0 else 0)
accuracy_vader = accuracy_score(reviews['Sentiment'], reviews['VaderSentiment'])
report_vader = classification_report(reviews['Sentiment'], reviews['VaderSentiment'])
print(accuracy_vader)
print(report_vader)

0.86605583652734
              precision    recall  f1-score   support

           0       0.94      0.56      0.70     12312
           1       0.85      0.99      0.91     31781

    accuracy                           0.87     44093
   macro avg       0.89      0.77      0.81     44093
weighted avg       0.88      0.87      0.85     44093

