# Clustering Amazon Data

Using unsupervised ML 

References: 
- https://towardsdatascience.com/amazon-customer-analysis-57eabb66e2ed
- https://towardsdatascience.com/customer-segmentation-with-machine-learning-a0ac8c3d4d84


In [12]:
# Imports
import pandas as pd
import gzip
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)
    
def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

In [14]:
meta_df = getDF('data/meta_Prime_Pantry.json.gz')
rows,columns = meta_df.shape
print('Number of rows: ',rows)
print('Number of columns: ',columns)
meta_df.head()

Number of rows:  10815
Number of columns:  11


Unnamed: 0,description,title,also_buy,image,brand,details,price,asin,also_view,rank,feature
0,[Sink your sweet tooth into MILK DUDS Candya d...,"HERSHEY'S Milk Duds Candy, 5 Ounce(Halloween C...","[B019KE37WO, B007NQSWEU]",[https://images-na.ssl-images-amazon.com/image...,Milk Duds,"\n <div class=""content"">\n\n\n\n\n\n\n<ul...",$5.00,B00005BPJO,,,
1,[Sink your sweet tooth into MILK DUDS Candya d...,"HERSHEY'S Milk Duds Candy, 5 Ounce(Halloween C...","[B019KE37WO, B007NQSWEU]",[https://images-na.ssl-images-amazon.com/image...,Milk Duds,"\n <div class=""content"">\n\n\n\n\n\n\n<ul...",$5.00,B00005BPJO,,,
2,[A perfect Lentil soup starts with Goya Lentil...,"Goya Dry Lentils, 16 oz","[B003SI144W, B000VDRKEK]",[https://images-na.ssl-images-amazon.com/image...,Goya,"\n <div class=""content"">\n\n\n\n\n\n\n<ul...",,B0000DIF38,"[B074MFVZG7, B079PTH69L, B000VDRKEK, B074M9T81...",,
3,[Saran Premium Wrap is an extra tough yet easy...,"Saran Premium Plastic Wrap, 100 Sq Ft","[B01MY5FHT6, B000PYF8VM, B000SRMDFA, B07CX6LN8...",[https://images-na.ssl-images-amazon.com/image...,Saran,"\n <div class=""content"">\n\n\n\n\n\n\n<ul...",,B0000DIWNI,"[B077QLSLRQ, B00JPKW1RQ, B000FE2IK6, B00XUJHJ9...",,
4,[200 sq ft (285 ft x 11-3/4 in x 18.6 m2). Eas...,"Saran Cling Plus Plastic Wrap, 200 Sq Ft",,[https://images-na.ssl-images-amazon.com/image...,Saran,"\n <div class=""content"">\n\n\n\n\n\n\n<ul...",,B0000DIWNZ,[B0014CZ0TE],,


In [15]:
product_df = getDF('data/Prime_Pantry.json.gz')
rows,columns = product_df.shape
print('Number of rows: ',rows)
print('Number of columns: ',columns)
product_df.head()

Number of rows:  471614
Number of columns:  12


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,image,style
0,5.0,True,"12 14, 2014",A1NKJW0TNRVS7O,B0000DIWNZ,Tamara M.,Good clinging,Clings well,1418515200,,,
1,4.0,True,"11 20, 2014",A2L6X37E8TFTCC,B0000DIWNZ,Amazon Customer,Fantastic buy and a good plastic wrap. Even t...,Saran could use more Plus to Cling better.,1416441600,,,
2,4.0,True,"10 11, 2014",A2WPR4W6V48121,B0000DIWNZ,noname,ok,Four Stars,1412985600,,,
3,3.0,False,"09 1, 2014",A27EE7X7L29UMU,B0000DIWNZ,ZapNZs,Saran Cling Plus is kind of like most of the C...,"The wrap is fantastic, but the dispensing, cut...",1409529600,4.0,,
4,4.0,True,"08 10, 2014",A1OWT4YZGB5GV9,B0000DIWNZ,Amy Rogers,This is my go to plastic wrap so there isn't m...,has been doing it's job for years,1407628800,,,


## Features

- Number of reviews
- Average rating
- Sentiment Ratio


### Sentiment Analysis

In [16]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

In [18]:
for index, row in product_df.iterrows():
    
    try: 
        product_df.loc[index, 'sentiment_review'] = analyser.polarity_scores(row['reviewText'])['compound']
    except:
        product_df.loc[index, 'sentiment_review'] = 0.0

In [20]:
print(product_df.head())

   overall  verified   reviewTime      reviewerID        asin  \
0      5.0      True  12 14, 2014  A1NKJW0TNRVS7O  B0000DIWNZ   
1      4.0      True  11 20, 2014  A2L6X37E8TFTCC  B0000DIWNZ   
2      4.0      True  10 11, 2014  A2WPR4W6V48121  B0000DIWNZ   
3      3.0     False   09 1, 2014  A27EE7X7L29UMU  B0000DIWNZ   
4      4.0      True  08 10, 2014  A1OWT4YZGB5GV9  B0000DIWNZ   

      reviewerName                                         reviewText  \
0        Tamara M.                                      Good clinging   
1  Amazon Customer  Fantastic buy and a good plastic wrap.  Even t...   
2           noname                                                 ok   
3           ZapNZs  Saran Cling Plus is kind of like most of the C...   
4       Amy Rogers  This is my go to plastic wrap so there isn't m...   

                                             summary  unixReviewTime vote  \
0                                       Clings  well      1418515200  NaN   
1         Saran 

### Creating Features List

Now that we have the sentiments in place, we can create a dataframe with the features we chose

In [49]:
# Creating a test df to test our code
test_df = product_df[:50]
test_df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,image,style,sentiment_review
0,5.0,True,"12 14, 2014",A1NKJW0TNRVS7O,B0000DIWNZ,Tamara M.,Good clinging,Clings well,1418515200,,,,0.4404
1,4.0,True,"11 20, 2014",A2L6X37E8TFTCC,B0000DIWNZ,Amazon Customer,Fantastic buy and a good plastic wrap. Even t...,Saran could use more Plus to Cling better.,1416441600,,,,0.802
2,4.0,True,"10 11, 2014",A2WPR4W6V48121,B0000DIWNZ,noname,ok,Four Stars,1412985600,,,,0.296
3,3.0,False,"09 1, 2014",A27EE7X7L29UMU,B0000DIWNZ,ZapNZs,Saran Cling Plus is kind of like most of the C...,"The wrap is fantastic, but the dispensing, cut...",1409529600,4.0,,,0.2743
4,4.0,True,"08 10, 2014",A1OWT4YZGB5GV9,B0000DIWNZ,Amy Rogers,This is my go to plastic wrap so there isn't m...,has been doing it's job for years,1407628800,,,,0.5657


In [74]:
# running average
def calc_running_avg(element, avg_so_far, total):
    return (avg_so_far*total + element)/(total+1)

# Method to create features df
def create_features(df):
    
    asins = []
    reviews = []
    avg_rating = []
    sentiment_ratio = []
    
    curr_asin = ""
    running_avg = 0.0
    running_sentiment = 0.0
    total = 0
    
    for index, row in df.iterrows():
        if index == 0: # first instance
            curr_asin = df.loc[index, 'asin']
            
        # we reach a new asin
        if df.loc[index, 'asin'] != curr_asin:

            asins.append(curr_asin)
            curr_asin = df.loc[index, 'asin']
            
            avg_rating.append(round(running_avg, 4))
            running_avg = 0.0
            
            sentiment_ratio.append(round(running_sentiment, 4))
            running_sentiment = 0.0
            
            reviews.append(total) # Reviews are the same as the total
            total = 0
            
        rating = df.loc[index, 'overall']
        sentiment = df.loc[index, 'sentiment_review']  
        
        running_avg = calc_running_avg(rating, running_avg, total)
        running_sentiment = calc_running_avg(sentiment, running_sentiment, total)
        total += 1
    
    # Adding the last values
    asins.append(curr_asin)
    avg_rating.append(round(running_avg, 4))
    sentiment_ratio.append(round(running_sentiment, 4))
    reviews.append(total)
                                        
    features = {'ID': asins,
                'Average_Rating': avg_rating,
                'Reviews': reviews,
                'Sentiment Ratio': sentiment_ratio
               }
                                        
    return features    
    

In [75]:
features = create_features(test_df)
features

{'ID': ['B0000DIWNZ', 'B0000DIWNI'],
 'Average_Rating': [3.7, 4.4],
 'Reviews': [10, 40],
 'Sentiment Ratio': [0.4922, 0.5148]}

In [76]:
# Now to work with actual dataframe
full_features = create_features(product_df)

AttributeError: 'dict' object has no attribute 'head'

In [77]:
full_features

{'ID': ['B0000DIWNZ',
  'B0000DIWNI',
  'B0000GH6UG',
  'B0001E1IN8',
  'B00032E8XK',
  'B00061EU6S',
  'B0006L0UVM',
  'B00099XO3W',
  'B00099XO5U',
  'B00099XPP4',
  'B00099XNXS',
  'B00099XP4A',
  'B00099XNXS',
  'B00099XP4A',
  'B00099XNXS',
  'B00099XP4A',
  'B00099XNXS',
  'B00099XPMC',
  'B00099XPD6',
  'B00099XPMC',
  'B00099XPMM',
  'B00099XPMC',
  'B00099XO50',
  'B00099XPMM',
  'B00099XPD6',
  'B00099XO50',
  'B00099XPD6',
  'B00099XO50',
  'B00099XPD6',
  'B00099XO50',
  'B00099XPD6',
  'B00099XO50',
  'B00099XPD6',
  'B00099XO50',
  'B00099XPD6',
  'B00099XPNQ',
  'B00099XPD6',
  'B00099XLXU',
  'B00099XPD6',
  'B00099XLXU',
  'B00099XPD6',
  'B00099XLXU',
  'B00099XPD6',
  'B00099XLXU',
  'B00099XPD6',
  'B00099XLXU',
  'B00099XPD6',
  'B00099XLXU',
  'B00099XPD6',
  'B00099XLXU',
  'B00099XPD6',
  'B00099XLXU',
  'B00099XPD6',
  'B00099XLXU',
  'B00099XPD6',
  'B00099XLXU',
  'B00099XPD6',
  'B00099XLXU',
  'B00099XPD6',
  'B00099XLXU',
  'B00099XPD6',
  'B00099XLXU',
  

In [79]:
finished_df = pd.DataFrame.from_dict(full_features)
finished_df

Unnamed: 0,ID,Average_Rating,Reviews,Sentiment Ratio
0,B0000DIWNZ,3.7000,10,0.4922
1,B0000DIWNI,4.2407,54,0.4727
2,B0000GH6UG,3.8205,39,0.5255
3,B0001E1IN8,4.7778,9,0.8000
4,B00032E8XK,4.0876,468,0.4127
...,...,...,...,...
29761,B01HI76312,4.3000,10,0.4467
29762,B01HI76790,4.1000,10,0.4203
29763,B01HI76XS0,4.6000,10,0.3767
29764,B01HI76SA8,5.0000,10,0.5086


In [80]:
finished_df.to_csv('data/cluster_features.csv', index=False)