In [1]:
import pandas as pd
# `plt` is the conventional alias for the pyplot interface; the bare
# `matplotlib` package has no plot()/subplots() functions at its top level.
import matplotlib.pyplot as plt
import numpy as np
import awswrangler as wr

In [2]:
# Load the November event data from S3 into a DataFrame and print its
# column dtypes and memory usage.
bucket = 'ecommerceanalysis'
# NOTE(review): `dir` shadows the builtin dir() for the rest of the notebook;
# left unchanged here because other cells may reference this name.
dir = 'archive/Nov/'
path = f's3://{bucket}/{dir}'
data = wr.s3.read_csv(path=path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67501979 entries, 0 to 67501978
Data columns (total 9 columns):
 #   Column         Dtype  
---  ------         -----  
 0   event_time     object 
 1   event_type     object 
 2   product_id     int64  
 3   category_id    int64  
 4   category_code  object 
 5   brand          object 
 6   price          float64
 7   user_id        int64  
 8   user_session   object 
dtypes: float64(1), int64(3), object(5)
memory usage: 4.5+ GB


In [3]:
# Keep only the first 19 characters of each timestamp string,
# i.e. the 'YYYY-MM-DD HH:MM:SS' part.
data['event_time'] = data['event_time'].str.slice(stop=19)

In [4]:
# Preview the first five rows after trimming event_time.
data.head(n=5)

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-11-01 00:00:00,view,1003461,2053013555631882655,electronics.smartphone,xiaomi,489.07,520088904,4d3b30da-a5e4-49df-b1a8-ba5943f1dd33
1,2019-11-01 00:00:00,view,5000088,2053013566100866035,appliances.sewing_machine,janome,293.65,530496790,8e5f4f83-366c-4f70-860e-ca7417414283
2,2019-11-01 00:00:01,view,17302664,2053013553853497655,,creed,28.31,561587266,755422e7-9040-477b-9bd2-6a6e8fd97387
3,2019-11-01 00:00:01,view,3601530,2053013563810775923,appliances.kitchen.washer,lg,712.87,518085591,3bfb58cd-7892-48cc-8020-2f17e6de6e7f
4,2019-11-01 00:00:01,view,1004775,2053013555631882655,electronics.smartphone,xiaomi,183.27,558856683,313628f1-68b8-460d-84f6-cec7a8796ef2


In [5]:
# Count the missing values in each column.
data.isna().sum()

event_time              0
event_type              0
product_id              0
category_id             0
category_code    21898171
brand             9224078
price                   0
user_id                 0
user_session           10
dtype: int64

In [6]:
# Drop every row that has a missing value in any column. Rebinding `data`
# (instead of inplace=True) avoids silently mutating the frame that earlier
# cells displayed.
# NOTE(review): this also drops the rows whose only gap is `user_session`
# (10 nulls in the count above), a column that is discarded later on.
# Confirm that is intended.
data = data.dropna(how='any')

# Confirm that no missing values remain.
data.isna().sum()

event_time       0
event_type       0
product_id       0
category_id      0
category_code    0
brand            0
price            0
user_id          0
user_session     0
dtype: int64

In [7]:
# Number of events per event type, most frequent first.
event_counts = data.groupby("event_type")["event_type"].count()
event_counts.sort_values(ascending=False)

event_type
view        39315226
cart         2115082
purchase      659256
Name: event_type, dtype: int64

In [8]:
# Drop the columns that are not relevant to our analysis.
irrelevant_columns = ['user_id', 'category_id', 'user_session']
data = data.drop(irrelevant_columns, axis='columns')

In [9]:
# Preview the first five rows after dropping null rows and unused columns.
data.head(n=5)

Unnamed: 0,event_time,event_type,product_id,category_code,brand,price
0,2019-11-01 00:00:00,view,1003461,electronics.smartphone,xiaomi,489.07
1,2019-11-01 00:00:00,view,5000088,appliances.sewing_machine,janome,293.65
3,2019-11-01 00:00:01,view,3601530,appliances.kitchen.washer,lg,712.87
4,2019-11-01 00:00:01,view,1004775,electronics.smartphone,xiaomi,183.27
5,2019-11-01 00:00:01,view,1306894,computers.notebook,hp,360.09


In [10]:
# Check the data size as (rows, columns) after the null rows and the
# unused columns have been dropped.
data.shape

(42089564, 6)

In [11]:
# List the distinct event types present in the cleaned data.
pd.unique(data['event_type'])

array(['view', 'cart', 'purchase'], dtype=object)

In [12]:
# Split the events by type (1 of 3): view events.
# event_type is constant within the subset, so the column is dropped.
is_view = data['event_type'].eq('view')
view_data = data.loc[is_view].drop(columns=['event_type'])
view_data.head()

Unnamed: 0,event_time,product_id,category_code,brand,price
0,2019-11-01 00:00:00,1003461,electronics.smartphone,xiaomi,489.07
1,2019-11-01 00:00:00,5000088,appliances.sewing_machine,janome,293.65
3,2019-11-01 00:00:01,3601530,appliances.kitchen.washer,lg,712.87
4,2019-11-01 00:00:01,1004775,electronics.smartphone,xiaomi,183.27
5,2019-11-01 00:00:01,1306894,computers.notebook,hp,360.09


In [13]:
# Size of the view subset as (rows, columns).
view_data.shape

(39315226, 5)

In [14]:
# Split the events by type (2 of 3): purchase events.
# event_type is constant within the subset, so the column is dropped.
is_purchase = data['event_type'].eq('purchase')
purchase_data = data.loc[is_purchase].drop(columns=['event_type'])
purchase_data.head()

Unnamed: 0,event_time,product_id,category_code,brand,price
168,2019-11-01 00:01:04,1005161,electronics.smartphone,xiaomi,211.92
707,2019-11-01 00:04:51,1004856,electronics.smartphone,samsung,128.42
939,2019-11-01 00:06:33,1801881,electronics.video.tv,samsung,488.8
942,2019-11-01 00:06:34,5800823,electronics.audio.subwoofer,nakamichi,123.56
1107,2019-11-01 00:07:38,30000218,construction.tools.welding,magnetta,254.78


In [15]:
# Size of the purchase subset as (rows, columns).
purchase_data.shape

(659256, 5)

In [16]:
# Split the events by type (3 of 3): cart events.
# event_type is constant within the subset, so the column is dropped.
is_cart = data['event_type'].eq('cart')
cart_data = data.loc[is_cart].drop(columns=['event_type'])
cart_data.head()

Unnamed: 0,event_time,product_id,category_code,brand,price
40,2019-11-01 00:00:14,1005014,electronics.smartphone,samsung,503.09
502,2019-11-01 00:03:24,1801881,electronics.video.tv,samsung,488.8
537,2019-11-01 00:03:39,1005115,electronics.smartphone,apple,949.47
849,2019-11-01 00:05:54,1002542,electronics.smartphone,apple,486.8
956,2019-11-01 00:06:38,1004856,electronics.smartphone,samsung,128.42


In [17]:
# Size of the cart subset as (rows, columns).
cart_data.shape

(2115082, 5)

In [18]:
# View events per category code, most viewed first.
view_data['category_code'].value_counts(sort=True, ascending=False)

electronics.smartphone          14811764
computers.notebook               2087503
electronics.video.tv             2059009
electronics.clocks               1726221
electronics.audio.headphone      1632332
                                  ...   
apparel.shorts                       435
construction.tools.screw             155
appliances.kitchen.fryer              96
country_yard.furniture.bench           2
apparel.jacket                         1
Name: category_code, Length: 129, dtype: int64

In [19]:
# Purchase events per category code, most purchased first.
purchase_data['category_code'].value_counts(sort=True, ascending=False)

electronics.smartphone           382492
electronics.audio.headphone       40742
electronics.video.tv              30178
electronics.clocks                21426
appliances.kitchen.washer         19680
                                  ...  
country_yard.furniture.hammok         4
construction.tools.soldering          3
apparel.shorts                        2
construction.tools.screw              1
appliances.kitchen.fryer              1
Name: category_code, Length: 126, dtype: int64

In [20]:
# Cart events per category code, most frequently carted first.
cart_data['category_code'].value_counts(sort=True, ascending=False)

electronics.smartphone           1159323
electronics.audio.headphone       130819
electronics.video.tv              105931
appliances.kitchen.washer          69911
appliances.environment.vacuum      64592
                                  ...   
country_yard.furniture.hammok         18
apparel.shorts                        10
apparel.shoes.espadrilles              8
appliances.kitchen.fryer               8
construction.tools.screw               1
Name: category_code, Length: 127, dtype: int64

In [21]:
# Export the purchase and cart subsets to CSV (UTF-8, without the index column).
# The view subset is intentionally not exported: its CSV is still over 2 GB.
from pathlib import Path  # imported here so this cell is runnable on its own

export_dir = Path('../e_commerce_analysis_temp/csv')
# DataFrame.to_csv does not create missing directories, so make sure the
# target exists rather than failing after the long load and filtering steps.
export_dir.mkdir(parents=True, exist_ok=True)

purchase_data.to_csv(export_dir / 'purchase_Nov_2019.csv', encoding='utf-8', index=False)
cart_data.to_csv(export_dir / 'cart_Nov_2019.csv', encoding='utf-8', index=False)