Megan Sisson
DSC 680 Applied Data Science
Project 3


Customer review analysis using VADER to understand the sentiment of review text.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

import nltk
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

import collections


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report



In [None]:
# One-time setup: uncomment and run if the VADER lexicon has not been downloaded on this machine.
#nltk.download('vader_lexicon')

In [None]:
# Load the review data from the local CSV file and preview the first rows.
# NOTE: the path is absolute and specific to the author's machine.
reviews_csv = '/Users/mksis/Documents/Data Science/680 Applied DS/Reviews.csv'
df = pd.read_csv(reviews_csv)
df.head()

In [None]:
# List the column names of the loaded review data
df.columns

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Show the rows whose reviewerName is missing
df.loc[df['reviewerName'].isna()]

In [None]:
# Parse reviewTime into datetimes, then count the non-null entries of every
# column per review date to get the frequency of reviews each day.
df['reviewTime'] = pd.to_datetime(df['reviewTime'])
reviews_by_day = df.groupby(by='reviewTime')
df_group = reviews_by_day.count()

df_group.head(5)

In [None]:
# Line chart of the daily review count (reviewerID count per date, dates on the x-axis)
df_group.plot.line(
    y='reviewerID',
    use_index=True,
    ylabel='Count',
    title='Count of Reviews Over Time',
)

In [None]:
# Tally how many reviews received each rating value in the 'overall' column
overall_scores = df['overall']
rating = overall_scores.value_counts()
rating

In [None]:
# Bar chart of the rating tallies
rating.plot(kind='bar', title='Rating Counts')
plt.show()

In [None]:
# Build the VADER sentiment analyzer. It needs the 'vader_lexicon' NLTK
# resource; where that resource has never been downloaded the constructor
# raises LookupError, so fetch the lexicon once and retry.
try:
    sia = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    sia = SentimentIntensityAnalyzer()

In [None]:
# Make sure every reviewText entry is a string. Missing reviews become empty
# strings rather than the literal text 'nan', which would otherwise be
# cleaned and sentiment-scored as if it were a real review.
df['reviewText'] = df['reviewText'].fillna('').astype(str)

In [None]:
#Clean the reviewText into a new reviewText2 column
#lowercase text
df['reviewText2'] = df['reviewText'].str.lower()
#remove punctuation (anything that is not a word character or whitespace).
#regex=True is required: pandas >= 2.0 treats the pattern as a literal string
#by default, which would remove nothing. The raw string keeps \w and \s from
#being read as invalid string escapes.
df['reviewText2'] = df['reviewText2'].str.replace(r'[^\w\s]', '', regex=True)

df.head()

In [None]:
# Score every cleaned review with the sentiment analyzer; each entry of the
# new 'polarity' column holds the value returned by sia.polarity_scores.
cleaned_reviews = df['reviewText2']
df['polarity'] = cleaned_reviews.apply(sia.polarity_scores)

df

In [None]:
#transform polarity column into columns for negative, neutral, positive, and compound

# Building the frame straight from the list of dicts is far faster than
# .apply(pd.Series), which constructs one Series per row. Passing columns=
# picks each score by its dict key, so values cannot land in the wrong column
# if the key order ever differs.
score_cols = ['neg', 'neu', 'pos', 'compound']
df[score_cols] = pd.DataFrame(df['polarity'].tolist(), index=df.index, columns=score_cols)
df

In [None]:
# Bar plot of the compound sentiment score for each overall review score
ax = sb.barplot(x='overall', y='compound', data=df)
ax.set(title='Compound Score by Amazon Overall Score')
plt.show()
# Expected pattern: a low compound score should go with a low overall score,
# and a higher compound score with a higher overall score.

In [None]:
# Compare the overall score with each sentiment component, one panel per component
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
panels = [('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative')]
for panel_ax, (score_col, panel_title) in zip(axs, panels):
    sb.barplot(data=df, x='overall', y=score_col, ax=panel_ax)
    panel_ax.set_title(panel_title)
# What we hope to see:
# Positive polarity increases as overall score increases
# Negative polarity decreases as overall score increases

In [None]:
#Scatter plot to see relationship between overall score and compound score
# (the y column is 'compound' so the plot matches this stated purpose; the
# cell previously plotted 'neg')
df.plot.scatter(x = 'overall', y = 'compound')

In [None]:
# Build the modelling frame by removing the identifier, text, date and vote
# columns that the model does not use
unused_cols = [
    'reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText', 'summary',
    'unixReviewTime', 'reviewTime', 'day_diff', 'helpful_yes', 'total_vote',
    'reviewText2', 'polarity',
]
df2 = df.drop(columns=unused_cols)
df2.head()

In [None]:
# Cast the 'overall' rating to string so it can serve as a class label
df2 = df2.astype({'overall': str})

In [None]:
# Separate the target y (the 'overall' rating) from the predictors X (every other column)
y = df2['overall']
X = df2.drop(columns=['overall'])
print("X shape: ", X.shape)
print('y shape: ', y.shape)

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts
print('Training shape: ', X_train.shape)
print('Testing shape: ', X_test.shape)


In [None]:
# Train a random forest classifier: 200 trees, entropy split criterion, fixed seed
rf_settings = {'n_estimators': 200, 'criterion': 'entropy', 'random_state': 42}
classifier = RandomForestClassifier(**rf_settings)
classifier.fit(X_train, y_train)

In [None]:
# Predict the overall-score label for every row of the test set
y_pred = classifier.predict(X=X_test)
y_pred

In [None]:
# Scatter the actual test labels against the predicted labels
plt.scatter(x=y_test, y=y_pred)
plt.gca().set(
    xlabel='Actual overall score',
    ylabel='Predicted overall score',
    title='Actual vs Predicted overall score',
)
plt.show()

In [None]:
# Accuracy of the predictions on the test set
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy score: ", acc)

In [None]:
# Confusion matrix of actual test labels versus predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [None]:
#Confusion Matrix Heatmap
# Take the tick labels from the data instead of hard-coding '1'-'5'.
# confusion_matrix (called without labels=) orders its rows and columns by the
# sorted labels that occur in y_test or y_pred, so this sorted set always
# lines up with the matrix, even when a rating is absent from the test split
# or the string labels are not exactly '1'..'5' (e.g. '5.0' if the ratings
# were stored as floats).
class_labels = sorted(set(y_test) | set(y_pred))
sb.heatmap(cm, annot = True, fmt = 'd', xticklabels = class_labels,
           yticklabels = class_labels, cmap = plt.cm.Greens)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Random Forest Confusion Matrix')
plt.show()

In [None]:
# Text report of the classification metrics computed from the test labels and predictions
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)