In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read Items
item_df = pd.read_csv('data/train/item_data.csv', index_col='item_id')
item_df.head()

Unnamed: 0_level_0,item_price,category_1,category_2,category_3,product_type
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
26880,4602,11,35,20,3040
54939,3513,12,57,85,6822
40383,825,17,8,279,1619
8777,2355,13,58,189,5264
113705,1267,17,39,151,10239


In [3]:
# Reading view logs
user_log_df = pd.read_csv('data/train/view_log.csv', parse_dates=['server_time'])
user_log_df.device_type = user_log_df.device_type.astype('category')
user_log_df.head()

Unnamed: 0,server_time,device_type,session_id,user_id,item_id
0,2018-10-15 08:58:00,android,112333,4557,32970
1,2018-10-15 08:58:00,android,503590,74788,7640
2,2018-10-15 08:58:00,android,573960,23628,128855
3,2018-10-15 08:58:00,android,121691,2430,12774
4,2018-10-15 08:58:00,android,218564,19227,28296


In [4]:
# Merging items and view_logs
expanded_user_logs = pd.merge(user_log_df, item_df, how='inner', on='item_id')
expanded_user_logs.head()

Unnamed: 0,server_time,device_type,session_id,user_id,item_id,item_price,category_1,category_2,category_3,product_type
0,2018-10-15 08:58:00,android,112333,4557,32970,54685,16,56,253,3184
1,2018-10-15 09:36:00,android,783457,88320,32970,54685,16,56,253,3184
2,2018-10-15 10:59:00,android,6902,1711,32970,54685,16,56,253,3184
3,2018-10-15 11:31:00,android,61138,58906,32970,54685,16,56,253,3184
4,2018-10-15 12:03:00,android,441653,64221,32970,54685,16,56,253,3184


In [5]:
def most_frequent(x):
    return x.value_counts().index[0]

In [10]:
aggregator = {
    'item_price': [np.max, np.min],
    'item_id': ['count','nunique'],
    'session_id': 'nunique',
    'product_type': most_frequent,
    'category_1': most_frequent,
    'category_2': most_frequent,
    'category_3': most_frequent,
    'device_type': most_frequent
}

In [11]:
cols = ['max_price','min_price','total_items','total_unique_items','total_sessions','freq_product_type'
        ,'freq_category_1','freq_category_2','freq_category_3','freq_device']
user_details = expanded_user_logs.groupby('user_id').agg(aggregator)
user_details.columns = cols

for col in ['freq_product_type','freq_category_1','freq_category_2','freq_category_3','freq_device']:
    user_details[col] = user_details[col].astype('category')

user_details['freq_device'] = user_details['freq_device'].replace({'android':0,'iphone':1,'web':2})

user_details.head()

Unnamed: 0_level_0,max_price,min_price,total_items,total_unique_items,total_sessions,freq_product_type,freq_category_1,freq_category_2,freq_category_3,freq_device
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,92160,332,42,18,11,7363,14,29,199,0
1,12595,383,8,8,3,7391,11,76,147,0
2,281536,128,165,130,37,8127,1,42,279,0
3,16640,537,8,3,1,6659,14,62,157,0
4,58252,1977,2,2,1,4426,14,3,27,0


In [12]:
user_details.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 89152 entries, 0 to 92586
Data columns (total 10 columns):
max_price             89152 non-null int64
min_price             89152 non-null int64
total_items           89152 non-null int64
total_unique_items    89152 non-null int64
total_sessions        89152 non-null int64
freq_product_type     89152 non-null category
freq_category_1       89152 non-null category
freq_category_2       89152 non-null category
freq_category_3       89152 non-null category
freq_device           89152 non-null object
dtypes: category(4), int64(5), object(1)
memory usage: 5.5+ MB


In [13]:
user_details.to_csv('data/train/user_details.csv')