## Import Dataset

In [0]:
import os
import json
import gzip
import pandas as pd
import numpy as np
from urllib.request import urlopen
from datetime import datetime

In [0]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Cell_Phones_and_Accessories_5.json.gz

--2020-04-12 23:32:28--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Cell_Phones_and_Accessories_5.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 169071325 (161M) [application/octet-stream]
Saving to: ‘Cell_Phones_and_Accessories_5.json.gz’


2020-04-12 23:32:43 (10.7 MB/s) - ‘Cell_Phones_and_Accessories_5.json.gz’ saved [169071325/169071325]



In [0]:
# Each line of the gzipped file holds one JSON-encoded review record.
data = []
with gzip.open('Cell_Phones_and_Accessories_5.json.gz') as f:
    data.extend(json.loads(raw_line.strip()) for raw_line in f)

# number of records loaded (one record per review, not per product)
print(len(data))

# the first record, to show which fields a review carries
print(data[0])

1128437
{'overall': 5.0, 'verified': True, 'reviewTime': '08 4, 2014', 'reviewerID': 'A24E3SXTC62LJI', 'asin': '7508492919', 'style': {'Color:': ' Bling'}, 'reviewerName': 'Claudia Valdivia', 'reviewText': 'Looks even better in person. Be careful to not drop your phone so often because the rhinestones will fall off (duh). More of a decorative case than it is protective, but I will say that it fits perfectly and securely on my phone. Overall, very pleased with this purchase.', 'summary': "Can't stop won't stop looking at it", 'unixReviewTime': 1407110400}


In [0]:
# One row per review record; keys that a record lacks show up as NaN in that row.
df = pd.DataFrame(data)

In [0]:
#df3 = df.fillna(0)

In [0]:
# Work on a copy: a plain `df2 = df` only aliases the frame, so every cleaning
# step applied to df2 would silently rewrite the raw `df` as well.
df2 = df.copy()
# Records without a 'vote' entry are treated as having zero votes (kept as the
# string '0' because the column still holds strings at this point).
df2['vote'] = df2['vote'].fillna('0')

In [0]:
# Review dates arrive as strings such as '08 4, 2014' (month day, year).
review_time_format = '%m %d, %Y'
df2['reviewTime'] = pd.to_datetime(df2['reviewTime'], format=review_time_format)

In [0]:
# image: 1 when the review carries an image entry, 0 when the field is missing.
df2['image'] = df2['image'].notnull().astype(int)
# verified: 1 only where the flag is exactly True, 0 for anything else.
df2['verified'] = df2['verified'].eq(True).astype(int)

In [0]:
# Vote counts are strings that may contain thousands separators (e.g. '1,234'):
# strip the commas, then convert to integers.
df2['vote'] = (
    df2['vote']
    .astype(str)
    .str.replace(",", "")
    .astype(int)
)

In [0]:
#df2.groupby(['asin','reviewerID']).size().sort_values(ascending=False).reset_index(name='count').drop_duplicates(subset='reviewerID')

## Prepare Influencers Dataset 
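The final dataset has 1,000 rows: the 500 reviews with the highest average votes per year (influencers) and 500 reviews sampled at random from the rest (non-influencers).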

In [0]:
# Reference date used to measure how long each review has been collecting votes.
END_DATE = datetime(2018, 10, 2)

# One row per (product, reviewer) pair. Rows are ordered by vote count, highest
# first, so keep='first' retains the most-voted review of each pair
# (keep='last' retained the least-voted one). The copy makes the result an
# independent frame, so the column assignments below do not warn.
influencers = (
    df2.sort_values('vote', ascending=False)
       .drop_duplicates(['asin', 'reviewerID'], keep='first')
       .copy()
)
influencers['endDate'] = END_DATE
# Age of the review at END_DATE, in days and then in years (two decimals).
influencers['duration'] = (influencers['endDate'] - influencers['reviewTime']) / np.timedelta64(1, 'D')
influencers['durationYear'] = (influencers['duration'] / 365).round(2)
# Votes per year. A review whose age rounds to zero (or is negative) has no
# meaningful rate: masking the divisor yields NaN there instead of +/-inf,
# which a descending sort on avgVote would otherwise place at the very top.
positive_years = influencers['durationYear'].where(influencers['durationYear'] > 0)
influencers['avgVote'] = (influencers['vote'] / positive_years).round(0)
influencers.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image,endDate,duration,durationYear,avgVote
733414,5.0,0,2015-08-21,AQJ824C2JST96,B00X5RV14Y,{'Color:': ' Black'},Gladius,Anker PowerCore 20100 Review by a Frequent Fl...,Anker PowerCore 20100 Review by a Frequent Fl...,1440115200,2038,1,2018-10-02,1138.0,3.12,653.0
998698,4.0,1,2016-07-14,A22QN0P5ERNOKR,B01F9N5QXI,,Paul M.,"<div id=""video-block-R1K03IBUILYEB0"" class=""a-...","The only thing ""cheap"" about this budget smart...",1468454400,1737,1,2018-10-02,810.0,2.22,782.0
909912,5.0,1,2016-02-05,A1BDNTBJ0JWXSL,B019O8YWXE,"{'Color:': ' Dark Grey', 'Style:': ' Phone Only'}",Wilfred,OVERVIEW\nThe Huawei Honor 5x is the first of ...,The first real budget phone thats worth the money,1454630400,1410,1,2018-10-02,970.0,2.66,530.0
694008,5.0,1,2015-04-30,A284QS51P9P9V1,B00UVSNVHA,{'Color:': ' Black'},TechnicianPrime,*INSTALL NOTE: No instructions included. Simp...,It's EXACTLY how you'd expect it to be and doe...,1430352000,1355,1,2018-10-02,1251.0,3.43,395.0
877008,3.0,0,2015-12-28,AT63Q5FKZEJ4I,B0176HQ1O8,,Jason S.,"Okay, first of all, I will leave the feel and ...",Good well rounded product.,1451260800,1239,1,2018-10-02,1009.0,2.76,449.0


In [0]:
# Rank reviews by votes per year: the top N_PER_CLASS are labelled influencers
# (influencer=1) and N_PER_CLASS of the remaining reviews are drawn at random
# as non-influencers (influencer=0).
N_PER_CLASS = 500
SAMPLE_SEED = 42  # fixed seed so the non-influencer sample is identical on every run

ranked = influencers.sort_values('avgVote', ascending=False)
# .assign returns new frames, so labelling the slices does not raise
# SettingWithCopyWarning.
nonInfluencers = (
    ranked.iloc[N_PER_CLASS:]
          .sample(n=N_PER_CLASS, random_state=SAMPLE_SEED)
          .assign(influencer=0)
)
top_reviews = ranked.head(N_PER_CLASS).assign(influencer=1)
influencers = pd.concat([top_reviews, nonInfluencers], axis=0)

In [0]:
# Index the frame by review date and derive calendar features from that index
# (Weekday follows pandas' convention: Monday=0 ... Sunday=6).
influencers = (
    influencers
    .assign(reviewTime=pd.DatetimeIndex(influencers['reviewTime'], dtype='datetime64[ns]', freq=None))
    .set_index('reviewTime')
    .assign(
        Year=lambda frame: frame.index.year,
        Month=lambda frame: frame.index.month,
        Weekday=lambda frame: frame.index.weekday,
    )
)

In [0]:
# Turn reviewTime back into an ordinary column, then discard the raw 'style'
# and 'vote' fields together with the intermediate columns behind avgVote.
helper_columns = ['style', 'endDate', 'duration', 'durationYear', 'vote']
influencers = influencers.reset_index().drop(columns=helper_columns)
influencers

Unnamed: 0,reviewTime,overall,verified,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,image,avgVote,influencer,Year,Month,Weekday
0,2016-07-14,4.0,1,A22QN0P5ERNOKR,B01F9N5QXI,Paul M.,"<div id=""video-block-R1K03IBUILYEB0"" class=""a-...","The only thing ""cheap"" about this budget smart...",1468454400,1,782.0,1,2016,7,3
1,2015-08-21,5.0,0,AQJ824C2JST96,B00X5RV14Y,Gladius,Anker PowerCore 20100 Review by a Frequent Fl...,Anker PowerCore 20100 Review by a Frequent Fl...,1440115200,1,653.0,1,2015,8,4
2,2016-02-05,5.0,1,A1BDNTBJ0JWXSL,B019O8YWXE,Wilfred,OVERVIEW\nThe Huawei Honor 5x is the first of ...,The first real budget phone thats worth the money,1454630400,1,530.0,1,2016,2,4
3,2016-04-28,4.0,1,A3CLNI0ID1OON8,B0196GQAKM,SonarTech,This review will touch quickly on the internal...,Internal Construction Details - Absolutely not...,1461801600,1,472.0,1,2016,4,3
4,2016-09-25,5.0,0,A2NB2E5DXE319Z,B01DRV2BBY,S. Lionel,"We've all seen the ads for Jitterbug phones, a...",The perfect phone for people who have trouble ...,1474761600,1,459.0,1,2016,9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2013-08-09,5.0,1,ALQ2VGWK0MBIS,B00B1HVLA4,K. Lee,A good quality car charger with two USB cables...,Quality charger at good price!!!,1376006400,1,3.0,0,2013,8,4
996,2015-09-06,5.0,1,AYWN5NVIV5VWI,B00YRYS4T4,Dioshy Cruz,Anker does it right again! I have to begin say...,Anker does it right again!,1441497600,1,0.0,0,2015,9,6
997,2016-11-16,5.0,1,A3KLX5ZZRVT31H,B00U1DCX56,techsavvykat,Seems to work fine.,As advertised,1479254400,1,0.0,0,2016,11,2
998,2017-06-17,5.0,0,A2X8K23L6OWSV5,B00Z7SD3V8,Oneluckyguy,One of my favorite of the series. The Symmetr...,One of my favorite of the series.,1497657600,1,0.0,0,2017,6,5


In [0]:
# Number of space-separated tokens in each review. A review with no text
# (reviewText is NaN rather than a string) counts as 0 words instead of raising
# AttributeError on .split.
influencers["nb_words"] = influencers["reviewText"].apply(
    lambda text: len(text.split(" ")) if isinstance(text, str) else 0
)
influencers.head()

Unnamed: 0,reviewTime,overall,verified,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,image,avgVote,influencer,Year,Month,Weekday,nb_words
0,2016-07-14,4.0,1,A22QN0P5ERNOKR,B01F9N5QXI,Paul M.,"<div id=""video-block-R1K03IBUILYEB0"" class=""a-...","The only thing ""cheap"" about this budget smart...",1468454400,1,782.0,1,2016,7,3,574
1,2015-08-21,5.0,0,AQJ824C2JST96,B00X5RV14Y,Gladius,Anker PowerCore 20100 Review by a Frequent Fl...,Anker PowerCore 20100 Review by a Frequent Fl...,1440115200,1,653.0,1,2015,8,4,862
2,2016-02-05,5.0,1,A1BDNTBJ0JWXSL,B019O8YWXE,Wilfred,OVERVIEW\nThe Huawei Honor 5x is the first of ...,The first real budget phone thats worth the money,1454630400,1,530.0,1,2016,2,4,872
3,2016-04-28,4.0,1,A3CLNI0ID1OON8,B0196GQAKM,SonarTech,This review will touch quickly on the internal...,Internal Construction Details - Absolutely not...,1461801600,1,472.0,1,2016,4,3,1408
4,2016-09-25,5.0,0,A2NB2E5DXE319Z,B01DRV2BBY,S. Lionel,"We've all seen the ads for Jitterbug phones, a...",The perfect phone for people who have trouble ...,1474761600,1,459.0,1,2016,9,6,1973


In [0]:
# Inspect the first row field by field; its reviewText still contains raw HTML markup.
influencers.iloc[0]

reviewTime                                      2016-07-14 00:00:00
overall                                                           4
verified                                                          1
reviewerID                                           A22QN0P5ERNOKR
asin                                                     B01F9N5QXI
reviewerName                                                Paul M.
reviewText        <div id="video-block-R1K03IBUILYEB0" class="a-...
summary           The only thing "cheap" about this budget smart...
unixReviewTime                                           1468454400
image                                                             1
avgVote                                                         782
influencer                                                        1
Year                                                           2016
Month                                                             7
Weekday                                         

## Export to CSV

In [0]:
# Write the labelled dataset to disk; index=False leaves the row index out of the file.
influencers.to_csv('influencers.csv', index=False)

In [0]:
# Read the file back. With keep_default_na=False and na_values=[""], only empty
# fields are parsed as NaN, so literal text such as "NA" or "null" in the
# string columns is preserved as-is.
new_output = pd.read_csv('influencers.csv', keep_default_na=False, na_values=[""])

In [0]:
# Display the reloaded frame to check the CSV round trip.
new_output

Unnamed: 0,reviewTime,overall,verified,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,image,avgVote,influencer,Year,Month,Weekday,nb_words
0,2016-07-14,4.0,1,A22QN0P5ERNOKR,B01F9N5QXI,Paul M.,"<div id=""video-block-R1K03IBUILYEB0"" class=""a-...","The only thing ""cheap"" about this budget smart...",1468454400,1,782.0,1,2016,7,3,574
1,2015-08-21,5.0,0,AQJ824C2JST96,B00X5RV14Y,Gladius,Anker PowerCore 20100 Review by a Frequent Fl...,Anker PowerCore 20100 Review by a Frequent Fl...,1440115200,1,653.0,1,2015,8,4,862
2,2016-02-05,5.0,1,A1BDNTBJ0JWXSL,B019O8YWXE,Wilfred,OVERVIEW\nThe Huawei Honor 5x is the first of ...,The first real budget phone thats worth the money,1454630400,1,530.0,1,2016,2,4,872
3,2016-04-28,4.0,1,A3CLNI0ID1OON8,B0196GQAKM,SonarTech,This review will touch quickly on the internal...,Internal Construction Details - Absolutely not...,1461801600,1,472.0,1,2016,4,3,1408
4,2016-09-25,5.0,0,A2NB2E5DXE319Z,B01DRV2BBY,S. Lionel,"We've all seen the ads for Jitterbug phones, a...",The perfect phone for people who have trouble ...,1474761600,1,459.0,1,2016,9,6,1973
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2013-08-09,5.0,1,ALQ2VGWK0MBIS,B00B1HVLA4,K. Lee,A good quality car charger with two USB cables...,Quality charger at good price!!!,1376006400,1,3.0,0,2013,8,4,74
996,2015-09-06,5.0,1,AYWN5NVIV5VWI,B00YRYS4T4,Dioshy Cruz,Anker does it right again! I have to begin say...,Anker does it right again!,1441497600,1,0.0,0,2015,9,6,196
997,2016-11-16,5.0,1,A3KLX5ZZRVT31H,B00U1DCX56,techsavvykat,Seems to work fine.,As advertised,1479254400,1,0.0,0,2016,11,2,4
998,2017-06-17,5.0,0,A2X8K23L6OWSV5,B00Z7SD3V8,Oneluckyguy,One of my favorite of the series. The Symmetr...,One of my favorite of the series.,1497657600,1,0.0,0,2017,6,5,26


In [0]:
# Shows the container type of the column (a pandas Series);
# new_output['Year'].dtype would report the element type instead.
type(new_output['Year'])

pandas.core.series.Series