In [34]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import re
import numpy as np
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
import spacy
import pickle

In [35]:
# Load the tab-separated Alexa review export into the working frame.
df = pd.read_csv('amazon_alexa.tsv', delimiter='\t')

In [36]:
# Preview the first five rows to check that the columns loaded as expected.
df.head(n=5)

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [37]:
# Summary statistics (count, mean, std, min, quartiles, max) for the frame's
# numeric columns.
df.describe()

Unnamed: 0,rating,feedback
count,3150.0,3150.0
mean,4.463175,0.918413
std,1.068506,0.273778
min,1.0,0.0
25%,4.0,1.0
50%,5.0,1.0
75%,5.0,1.0
max,5.0,1.0


In [38]:
# Count the missing values in each column.
df.isna().sum()

rating              0
date                0
variation           0
verified_reviews    0
feedback            0
dtype: int64

In [39]:
# Character count of each review. The vectorised .str accessor yields NaN for
# a missing review instead of raising TypeError the way apply(len) does.
df['review_length'] = df['verified_reviews'].str.len()
df.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback,review_length
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1,13
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1,9
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1,195
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1,172
4,5,31-Jul-18,Charcoal Fabric,Music,1,5


In [40]:
# Per-rating summary statistics of the numeric columns.
df.groupby(by='rating').describe()


Unnamed: 0_level_0,feedback,feedback,feedback,feedback,feedback,feedback,feedback,feedback,review_length,review_length,review_length,review_length,review_length,review_length,review_length,review_length
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
rating,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
1,161.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,161.0,195.0,212.371226,1.0,36.0,120.0,284.0,1124.0
2,96.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,96.0,249.1875,269.595303,1.0,78.75,163.0,309.25,1686.0
3,152.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,152.0,207.296053,272.194706,1.0,54.0,130.0,284.0,1954.0
4,455.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,455.0,178.520879,215.927849,1.0,34.0,99.0,241.0,1360.0
5,2286.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,2286.0,108.444007,152.116387,1.0,26.0,64.0,135.0,2851.0


In [41]:
# Tally how many reviews fall under each star rating.
ratings = df['rating'].value_counts()

# Funnel-area chart: one slice per rating, sized by its review count.
fig = px.funnel_area(
    title='Distribution of Alexa Ratings',
    names=ratings.index,
    values=ratings.values,
)
fig.show()

In [42]:
# Count reviews per feedback flag.
feedback = df['feedback'].value_counts()

# Only the aggregated counts are plotted. The full `df` supplies neither
# `values` nor `names`, so it is no longer passed as the data frame.
fig = px.pie(
    names=feedback.index,
    values=feedback.values,
    title='Distribution of Feedback',
)
fig.show()

In [43]:
# Histogram of review lengths across all reviews.
reviews = px.histogram(
    df,
    x="review_length",
    labels={"review_length": "Review Length"},
    title='Distribution of Length of Reviews',
)
reviews.show()


In [44]:
# Same review-length histogram, with bars coloured by the feedback flag.
reviews = px.histogram(
    df,
    x="review_length",
    color='feedback',
    labels={"review_length": "Review Length"},
    title='Distribution of Length of Reviews by Feedback',
)
reviews.show()


In [92]:
# Bar chart of review length against star rating.
# NOTE(review): px.bar draws one mark per row, so each bar here looks like the
# stacked total of review lengths for that rating rather than an average.
# Confirm that is the intended reading; the chart also has no title.
fig = px.bar(data_frame=df, x="rating", y="review_length")
fig.show()

# Data Cleaning

In [46]:
# INITIALIZE SPACY
# Load the small English pipeline by its package name. The 'en' shortcut link
# was removed in spaCy v3 and fails to load there, while 'en_core_web_sm'
# resolves on both v2 and v3 once the model package is installed
# (python -m spacy download en_core_web_sm).
# The parser and NER components stay disabled, as in the original call.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [47]:
# LOWERCASE EVERY WORD (splitting and re-joining also collapses whitespace runs)
df['new_reviews'] = df['verified_reviews'].apply(
    lambda review: " ".join(word.lower() for word in review.split())
)
df['new_reviews'].head()

0                                        love my echo!
1                                            loved it!
2    sometimes while playing a game, you can answer...
3    i have had a lot of fun with this thing. my 4 ...
4                                                music
Name: new_reviews, dtype: object

In [48]:
# REMOVE PUNCTUATION
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave every punctuation mark in place. The raw
# string keeps \w and \s from being parsed as (invalid) Python escapes.
df['new_reviews'] = df['new_reviews'].str.replace(r'[^\w\s]', '', regex=True)
df['new_reviews'].head()

0                                         love my echo
1                                             loved it
2    sometimes while playing a game you can answer ...
3    i have had a lot of fun with this thing my 4 y...
4                                                music
Name: new_reviews, dtype: object

In [49]:
# REMOVE EMOJI
#REFERENCE : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
# Compiled once at definition time instead of on every call.
# The referenced gist used the single range U+24C2-U+1F251, which also covers
# CJK ideographs, kana and Hangul and therefore deleted ordinary text in those
# scripts. The emoji-bearing parts of that span are listed explicitly instead.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed characters, flags
    "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 left behind by emoji
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return `text` with emoji and pictographic symbols removed.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        `text` with every run of characters matched by the emoji pattern
        deleted. Letters of any script, digits, whitespace and ordinary
        punctuation are left untouched.
    """
    return _EMOJI_PATTERN.sub('', text)

In [50]:
# Strip emoji from every review; the function is passed directly, no wrapper lambda.
df['new_reviews'] = df['new_reviews'].apply(remove_emoji)

In [51]:
# REMOVE STOP WORDS
stop = stopwords.words('english')
# Every token of every review is tested for membership, so look it up in a
# set rather than scanning the list NLTK returns. `stop` itself stays a list.
stop_lookup = set(stop)
# NOTE(review): punctuation was stripped earlier, so contractions such as
# "im" no longer match NLTK's apostrophised entries and survive this step.
# Confirm whether that is acceptable.
df['new_reviews'] = df['new_reviews'].apply(
    lambda review: " ".join(word for word in review.split() if word not in stop_lookup)
)
# Fixed seed so the spot-check shows the same rows on every run.
df['new_reviews'].sample(30, random_state=0)

85                                                       
2081                              good product like thank
799     love love new little gadget made lives much ea...
39      first digital assistant im giving good review ...
3115                                   loud thought going
2134                      great product listen music time
1008    love still learning makes lot things easier li...
1565                            great sound video quality
94      love thinking getting one prime day discount o...
878                                                      
1002                   impressed look clarity sound color
2056                                             personal
1036                                    alexa hardly came
2437                                   good wifi tv stick
2670    love got boyfriend birthday kids blast asking ...
1129    love device great great alarm clock surprising...
406                                                      
1672          

In [52]:
# LEMMATIZATION
def space(comment):
    """Return `comment` with each spaCy token replaced by its lemma, joined by spaces."""
    lemmas = (token.lemma_ for token in nlp(comment))
    return " ".join(lemmas)

df['new_reviews'] = df['new_reviews'].apply(space)

In [53]:
# ADD A "SENTIMENT" COLUMN: RATINGS OF 4 OR HIGHER ARE 'positive', 3 AND BELOW ARE 'negative'
is_positive = df['rating'] >= 4
df['sentiment'] = np.where(is_positive, 'positive', 'negative')


In [54]:
# Persist the cleaned frame. The context manager closes the file handle, and
# so flushes the pickle to disk, even if dump raises. The bare open() call
# left closing to the garbage collector.
with open("alexa_reviews_clean.pkl", "wb") as pkl_file:
    pickle.dump(df, pkl_file)