In [1]:
from wordcloud import WordCloud, STOPWORDS
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, auc, log_loss
import plotly.express as px
import plotly.graph_objects as go


In [2]:
# Load the pickled review frame produced upstream (presumably by a cleaning
# notebook -- TODO confirm). pickle.load executes arbitrary code embedded in
# the file, so only open pickles that come from a trusted source.
with open('alexa_reviews_clean.pkl', mode='rb') as pkl_in:
    df = pickle.load(pkl_in)

In [3]:
# Preview the first five rows to check the columns that came out of the pickle.
df.head(n=5)

Unnamed: 0,rating,date,variation,verified_reviews,feedback,review_length,new_reviews,sentiment
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1,13,love echo,positive
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1,9,love,positive
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1,195,sometimes play game answer question correctly ...,positive
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1,172,lot fun thing 4 yr old learn dinosaur control ...,positive
4,5,31-Jul-18,Charcoal Fabric,Music,1,5,music,positive


# Variations

In [4]:
# Number of reviews per product variation, most frequent first.
df['variation'].value_counts(sort=True)

Black  Dot                      516
Charcoal Fabric                 430
Configuration: Fire TV Stick    350
Black  Plus                     270
Black  Show                     265
Black                           261
Black  Spot                     241
White  Dot                      184
Heather Gray Fabric             157
White  Spot                     109
White                            91
Sandstone Fabric                 90
White  Show                      85
White  Plus                      78
Oak Finish                       14
Walnut Finish                     9
Name: variation, dtype: int64

In [5]:
# Remove Fire TV Stick reviews: the analysis is restricted to Echo devices.
# .copy() makes the filtered result an independent frame, so adding columns
# to it later does not raise pandas' SettingWithCopyWarning.
df = df[df['variation'] != 'Configuration: Fire TV Stick'].copy()

# Re-check the remaining variations after the filter.
df['variation'].value_counts()

Black  Dot              516
Charcoal Fabric         430
Black  Plus             270
Black  Show             265
Black                   261
Black  Spot             241
White  Dot              184
Heather Gray Fabric     157
White  Spot             109
White                    91
Sandstone Fabric         90
White  Show              85
White  Plus              78
Oak Finish               14
Walnut Finish             9
Name: variation, dtype: int64

In [6]:
# Show full review text instead of truncating long cells. None is the
# supported "no limit" value; a negative width has been deprecated since
# pandas 1.0 and emits a FutureWarning (newer releases may reject it).
pd.set_option('display.max_colwidth', None)
# Cap the number of rows rendered so full-width text stays readable.
pd.set_option('display.max_rows', 20)


  """Entry point for launching an IPython kernel.


In [7]:
# Variations named just "White" (and "Black") carry no model name; reading a
# few of the reviews suggests they likely belong to the Echo Dot.
# random_state pins the draw so re-running the cell shows the same ten rows.
df[df['variation'] == 'White'].sample(10, random_state=42)

Unnamed: 0,rating,date,variation,verified_reviews,feedback,review_length,new_reviews,sentiment
645,5,27-May-18,White,Works great!,1,12,work great,positive
521,1,21-Jun-18,White,"Two weeks after setting it up it no longer works. No connection, no light, no sound. I guess I shouldn't have purchased a refurbished Dot.",0,140,two week set long work connection light sound guess should not purchase refurbish dot,negative
425,5,10-Jul-18,White,We have a few echos already. Went for this one in refurbished to save a few bucks. It works great. I love this new generation one because it has a rubber bottom - nice.,1,168,echo already go one refurbish save buck work great love new generation one rubber bottom nice,positive
519,5,21-Jun-18,White,Love the echo dot,1,17,love echo dot,positive
577,5,10-Jun-18,White,Works well and has no obvious issues from being refurbished.,1,60,work well obvious issue refurbish,positive
411,5,14-Jul-18,White,everything perfect,1,18,everything perfect,positive
655,5,25-May-18,White,WORKS GREAT,1,11,work great,positive
501,5,24-Jun-18,White,"I am happy with the refurbished echo dot. I’ve had it for about two weeks, and so far it is working perfectly! Even though it did not come in its original box, it looks like new! I’m happy with this purchase!",1,208,happy refurbish echo dot -PRON- have two week far work perfectly even though come original box look like new -PRON- be happy purchase,positive
652,5,26-May-18,White,I love it,1,9,love,positive
625,5,31-May-18,White,Works great and simple to set up. Alexa is way more accurate than I imagined so Im using it more than I thought. Wish alarms could be set a bit further out but thats my only complaint.No she's not listening when you don't say the wake word so if you're worried you can lose the conspiracy theories. The speaker is great too so don't hold off on getting one if you were turned off by gen 1 like me.,1,397,work great simple set alexa way accurate imagine -PRON- be use thought wish alarm could set bit that s complaintno -PRON- s listen do not say wake word -PRON- be worry lose conspiracy theory speaker great do not hold get one turn gen 1 like,positive


In [8]:
# CHANGE VARIATION NAMES TO NAME OF ECHO MODELS
#
# Each variation is looked up exactly on a whitespace-normalised key.
# Substring tests are avoided on purpose: 'Black' is also a substring of
# 'Black  Plus' / 'Black  Show' / 'Black  Spot', and patterns that embed a
# trailing space silently stop matching once the labels are stripped.
VARIATION_TO_MODEL = {
    # ECHO 2nd Gen - fabric and wood finishes
    'Charcoal Fabric': 'echo',
    'Heather Gray Fabric': 'echo',
    'Sandstone Fabric': 'echo',
    'Oak Finish': 'echo',
    'Walnut Finish': 'echo',
    # ECHO DOT - includes the bare colour names
    'Black Dot': 'echo dot',
    'White Dot': 'echo dot',
    'Black': 'echo dot',
    'White': 'echo dot',
    # ECHO SHOW
    'Black Show': 'echo show',
    'White Show': 'echo show',
    # ECHO PLUS
    'Black Plus': 'echo plus',
    'White Plus': 'echo plus',
    # ECHO SPOT
    'Black Spot': 'echo spot',
    'White Spot': 'echo spot',
}

# Collapse runs of spaces and trim both ends, e.g. 'Black  Dot' -> 'Black Dot'.
variation_key = df['variation'].str.split().str.join(' ')

# Variations missing from the table keep their raw label as the model.
# assign() returns a new frame, so nothing is written into a filtered slice
# and re-running the cell gives the same result.
df = df.assign(model=variation_key.map(VARIATION_TO_MODEL).fillna(df['variation']))


In [9]:
# Number of reviews per Echo model, most frequent first.
df['model'].value_counts(sort=True)

echo dot     1052
echo         700 
echo spot    350 
echo show    350 
echo plus    348 
Name: model, dtype: int64

In [10]:
# Persist the frame that now carries the `model` column. The context manager
# closes the file handle (flushing the write) even if pickling raises.
with open("alexa_models.pkl", "wb") as write_file:
    pickle.dump(df, write_file)

# Pickle Each Echo Model

In [22]:
# Reviews for the base Echo model only.
echo = df[df['model'] == 'echo']

# Save the subset; `with` guarantees the file is flushed and closed.
with open("echo.pkl", "wb") as write_file:
    pickle.dump(echo, write_file)

In [27]:
# Star-rating frequencies for the Echo subset, most common rating first.
echo['rating'].value_counts(sort=True)

5    551
4    99 
3    30 
2    14 
1    6  
Name: rating, dtype: int64

In [23]:
# Reviews for the Echo Dot only.
echo_dot = df[df['model'] == 'echo dot']

# Save the subset; `with` guarantees the file is flushed and closed.
with open("echodot.pkl", "wb") as write_file:
    pickle.dump(echo_dot, write_file)

In [28]:
# Star-rating frequencies for the Echo Dot subset, most common rating first.
echo_dot['rating'].value_counts(sort=True)

5    723
4    167
1    75 
3    62 
2    25 
Name: rating, dtype: int64

In [24]:
# Reviews for the Echo Spot only.
echo_spot = df[df['model'] == 'echo spot']

# Save the subset; `with` guarantees the file is flushed and closed.
with open("echospot.pkl", "wb") as write_file:
    pickle.dump(echo_spot, write_file)

In [30]:
# Star-rating frequencies for the Echo Spot subset, most common rating first.
echo_spot['rating'].value_counts(sort=True)

5    241
4    48 
1    27 
3    17 
2    17 
Name: rating, dtype: int64

In [25]:
# Reviews for the Echo Show only.
echo_show = df[df['model'] == 'echo show']

# Save the subset; `with` guarantees the file is flushed and closed.
with open("echoshow.pkl", "wb") as write_file:
    pickle.dump(echo_show, write_file)

In [31]:
# Star-rating frequencies for the Echo Show subset, most common rating first.
echo_show['rating'].value_counts(sort=True)

5    247
4    57 
1    18 
3    17 
2    11 
Name: rating, dtype: int64

In [26]:
# Reviews for the Echo Plus only.
echo_plus = df[df['model'] == 'echo plus']

# Save the subset; `with` guarantees the file is flushed and closed.
with open("echoplus.pkl", "wb") as write_file:
    pickle.dump(echo_plus, write_file)

In [32]:
# Star-rating frequencies for the Echo Plus subset, most common rating first.
echo_plus['rating'].value_counts(sort=True)

5    242
4    50 
1    22 
3    20 
2    14 
Name: rating, dtype: int64

# Visualizations

In [39]:
# Bar chart of how many reviews each Echo model received.
values = df['model'].value_counts()
fig = go.Figure(data=[go.Bar(x=values.index, y=values, text=values, textposition='auto')])
fig.update_xaxes(title_text='Echo Models')
# Each row of df is one review, so the bar height is a review count,
# not a count of models.
fig.update_yaxes(title_text='Number of Reviews')
fig.update_layout(title_text='Distribution of Echo Models')

fig.show()

In [41]:
# Bar chart of star-rating counts for the Echo subset; each bar is labelled
# with its own count.
values = echo['rating'].value_counts()
rating_bars = go.Bar(x=values.index, y=values, text=values, textposition='auto')
fig = go.Figure(data=[rating_bars])
# Title and both axis labels set in one layout update.
fig.update_layout(
    title_text='Distribution of Echo Ratings',
    xaxis_title_text='Ratings',
    yaxis_title_text='Number of Ratings',
)
fig.show()

In [42]:
# Bar chart of star-rating counts for the Echo Dot subset; each bar is
# labelled with its own count.
values = echo_dot['rating'].value_counts()
rating_bars = go.Bar(x=values.index, y=values, text=values, textposition='auto')
fig = go.Figure(data=[rating_bars])
# Title and both axis labels set in one layout update.
fig.update_layout(
    title_text='Distribution of Echo Dot Ratings',
    xaxis_title_text='Ratings',
    yaxis_title_text='Number of Ratings',
)
fig.show()

In [43]:
# Bar chart of star-rating counts for the Echo Spot subset; each bar is
# labelled with its own count.
values = echo_spot['rating'].value_counts()
rating_bars = go.Bar(x=values.index, y=values, text=values, textposition='auto')
fig = go.Figure(data=[rating_bars])
# Title and both axis labels set in one layout update.
fig.update_layout(
    title_text='Distribution of Echo Spot Ratings',
    xaxis_title_text='Ratings',
    yaxis_title_text='Number of Ratings',
)
fig.show()

In [44]:
# Bar chart of star-rating counts for the Echo Plus subset; each bar is
# labelled with its own count.
values = echo_plus['rating'].value_counts()
rating_bars = go.Bar(x=values.index, y=values, text=values, textposition='auto')
fig = go.Figure(data=[rating_bars])
# Title and both axis labels set in one layout update.
fig.update_layout(
    title_text='Distribution of Echo Plus Ratings',
    xaxis_title_text='Ratings',
    yaxis_title_text='Number of Ratings',
)
fig.show()

In [40]:
# Bar chart of star-rating counts for the Echo Show subset; each bar is
# labelled with its own count.
values = echo_show['rating'].value_counts()
rating_bars = go.Bar(x=values.index, y=values, text=values, textposition='auto')
fig = go.Figure(data=[rating_bars])
# Title and both axis labels set in one layout update.
fig.update_layout(
    title_text='Distribution of Echo Show Ratings',
    xaxis_title_text='Ratings',
    yaxis_title_text='Number of Ratings',
)
fig.show()