In [1]:
import gc
import sys
from pathlib import Path

import numpy as np
import pandas as pd

pd.set_option('display.max_columns', 10000)

sys.path.append(str(Path('.').resolve().parents[0]))
from src.data import JDataTrainTestReadWriter

In [2]:
rw = JDataTrainTestReadWriter()
rw.show(key='all_merged')
%time train, test = rw.read(key='all_merged', version="1.0", encode_type=True)

input:
  folder: interim
  name: all_merged.csv
output:
  folder: interim
  test: all_merged_test.csv
  train: all_merged_train.npz
  train_only_index: true
setting:
  end: '2016-04-09 00:00:00'
  mid: '2016-04-02 00:00:00'
  start: '2016-02-02 00:00:00'
  time_col: unix_action
  unix_time: true
version: '1.0'

CPU times: user 47.8 s, sys: 5.2 s, total: 53 s
Wall time: 53.1 s


In [3]:
train.head()

Unnamed: 0,user_id,sku_id,model_id,type,cate,brand,year,month,day,week_action,hour,unix_action,age,sex,user_lv_cd,year_user_reg,month_user_reg,day_user_reg,unix_user_reg,a1,a2,a3,comment_num,has_bad_comment,bad_comment_rate,week_comment,unix_comment,browse,add_cart,rem_cart,purchase,follow,click
283617,209665,15168,0,6,6,78,2016,2,2,5,0,1454371200,2,0,5,2008,11,27,1227744000,-1,-1,-1,0,0,0.0,0,0,0,0,0,0,0,1
283618,239179,36638,-1,2,5,479,2016,2,2,5,0,1454371200,1,0,5,2011,11,6,1320537600,-1,-1,-1,0,0,0.0,0,0,0,1,0,0,0,0
283619,288741,31662,21,6,8,545,2016,2,2,5,0,1454371200,2,1,4,2015,8,30,1440892800,1,1,1,0,0,0.0,0,0,0,0,0,0,0,1
283620,209665,15168,-1,1,6,78,2016,2,2,5,0,1454371200,2,0,5,2008,11,27,1227744000,-1,-1,-1,0,0,0.0,0,0,1,0,0,0,0,0
283621,267945,117882,-1,1,4,519,2016,2,2,5,0,1454371201,2,2,4,2012,7,6,1341532800,-1,-1,-1,0,0,0.0,0,0,1,0,0,0,0,0


In [4]:
# keep base columns to be merged by feature groups
used_cols = ['user_id', 'age', 'sex',
       'user_lv_cd', 'year_user_reg', 'month_user_reg', 'day_user_reg',
       'unix_user_reg', 'sku_id', 'cate', 'brand', 'a1', 'a2', 'a3']

%time train_pair = train[used_cols + ['purchase']].groupby(used_cols, as_index=False).max()

CPU times: user 7.48 s, sys: 1.66 s, total: 9.14 s
Wall time: 9.14 s


In [7]:
len(train_pair)

3070668

In [6]:
train_pair.head()

Unnamed: 0,user_id,age,sex,user_lv_cd,year_user_reg,month_user_reg,day_user_reg,unix_user_reg,sku_id,cate,brand,a1,a2,a3,purchase
0,200001,5,2,5,2016,1,26,1453766400,2222,9,489,-1,-1,-1,0
1,200001,5,2,5,2016,1,26,1453766400,4345,7,56,-1,-1,-1,0
2,200001,5,2,5,2016,1,26,1453766400,5757,9,489,-1,-1,-1,0
3,200001,5,2,5,2016,1,26,1453766400,10768,9,100,-1,-1,-1,0
4,200001,5,2,5,2016,1,26,1453766400,14398,8,403,1,1,2,0


## Basic Feature - Time

用戶

時間跨度
- 最後一筆 訂單 / 目標品類訂單 距今時間
- 最後一筆目標品類訂單與最後一筆訂單的時間差
- 第一筆 訂單 / 目標品類訂單 距今時間
- 第一筆與最後一筆目標品類時間差
- 最後一次瀏覽、關注目標品類距今時間 (TODO)

---

商品

時間間隔
- 目標品類訂單間時間間隔統計數值
- 目標品類行為間時間間隔統計數值 (TODO)

### 用戶: 時間跨度

In [8]:
end_time = pd.to_datetime('2016-04-02 00:00:00')
end_time_unix = end_time.value // 10 ** 9

In [None]:
# user, first_purchase_to_now, last_purchase_to_now
# user, sku, first_purchase_to_now, last_purchase_to_now, first_to_last_purchase_time_diff, last_browse_to_now, last_follow_to_now

In [35]:
used_cols = ['user_id', 'sku_id', 'unix_action', 'purchase', 'browse', 'follow']

user_sku_purchase_tf = train[train.purchase == 1][used_cols] \
                        .groupby(['user_id', 'sku_id']) \
                        .unix_action.agg(['min', 'max']) \
                        .reset_index() \
                        .rename(columns={'min': 'first_ts', 'max': 'last_ts'})
user_purchase_tf = user_sku_purchase_tf.groupby('user_id').agg({'first_ts': 'min', 'last_ts': 'max'}).reset_index()

In [44]:
# add columns
def add_time_diff_features(df, ts):
    df['first_diff'] = ts - df.first_ts
    df['last_diff'] = ts - df.last_ts
    df['first_last_diff'] = df.last_ts - df.first_ts
    return df

user_purchase_tf = add_time_diff_features(user_purchase_tf, end_time_unix)
user_sku_purchase_tf = add_time_diff_features(user_sku_purchase_tf, end_time_unix)
user_sku_purchase_merge_tf = user_sku_purchase_tf.merge(user_purchase_tf, on='user_id', suffixes=['', '_by_user'])

user_sku_purchase_merge_tf['last_last_sku_diff'] = user_sku_purchase_merge_tf.last_ts_by_user - user_sku_purchase_merge_tf.last_ts
user_sku_purchase_merge_tf['first_first_sku_diff'] = user_sku_purchase_merge_tf.first_ts - user_sku_purchase_merge_tf.first_ts_by_user

In [45]:
user_sku_purchase_merge_tf.head()

Unnamed: 0,user_id,sku_id,first_ts,last_ts,first_diff,last_diff,first_last_diff,last_ts_by_user,first_ts_by_user,first_diff_by_user,last_diff_by_user,first_last_diff_by_user,last_last_sku_diff,first_first_sku_diff
0,200001,20308,1459108440,1459108440,446760,446760,0,1459108440,1459108440,446760,446760,0,0,0
1,200005,72967,1456268400,1456268400,3286800,3286800,0,1456268400,1456268400,3286800,3286800,0,0,0
2,200015,168842,1459162680,1459162680,392520,392520,0,1459162680,1459162680,392520,392520,0,0,0
3,200017,2143,1457878380,1457878380,1676820,1676820,0,1457878380,1457878380,1676820,1676820,0,0,0
4,200017,2148,1457878380,1457878380,1676820,1676820,0,1457878380,1457878380,1676820,1676820,0,0,0


In [46]:
len(user_sku_purchase_merge_tf)

34979

In [52]:
user_sku_purchase_merge_tf[user_sku_purchase_merge_tf.first_last_diff > 0].head()

Unnamed: 0,user_id,sku_id,first_ts,last_ts,first_diff,last_diff,first_last_diff,last_ts_by_user,first_ts_by_user,first_diff_by_user,last_diff_by_user,first_last_diff_by_user,last_last_sku_diff,first_first_sku_diff
25,200077,88295,1458042480,1458042780,1512720,1512420,300,1459002720,1458039900,1515300,552480,962820,959940,2580
29,200077,170311,1458039900,1459002720,1515300,552480,962820,1459002720,1458039900,1515300,552480,962820,0,0
31,200089,94373,1455559920,1456422660,3995280,3132540,862740,1456422660,1455559860,3995340,3132540,862800,0,60
35,200092,7196,1456226100,1456509060,3329100,3046140,282960,1457967120,1454427180,5128020,1588080,3539940,1458060,1798920
42,200092,99415,1454763900,1457942280,4791300,1612920,3178380,1457967120,1454427180,5128020,1588080,3539940,24840,336720


In [53]:
# validate one of the same sku first_last_diff > 0 is correct
train[(train.user_id == 200077) & (train.sku_id == 88295) & (train.purchase == 1)]

Unnamed: 0,user_id,sku_id,model_id,type,cate,brand,year,month,day,week_action,hour,unix_action,age,sex,user_lv_cd,year_user_reg,month_user_reg,day_user_reg,unix_user_reg,a1,a2,a3,comment_num,has_bad_comment,bad_comment_rate,week_comment,unix_comment,browse,add_cart,rem_cart,purchase,follow,click
18382236,200077,88295,-1,4,8,489,2016,3,15,11,11,1458042480,2,2,5,2013,4,11,1365638400,3,2,1,0,0,0.0,0,0,0,0,0,1,0,0
18383377,200077,88295,-1,4,8,489,2016,3,15,11,11,1458042540,2,2,5,2013,4,11,1365638400,3,2,1,0,0,0.0,0,0,0,0,0,1,0,0
18387490,200077,88295,-1,4,8,489,2016,3,15,11,11,1458042780,2,2,5,2013,4,11,1365638400,3,2,1,0,0,0.0,0,0,0,0,0,1,0,0


### 商品: 時間間隔

In [None]:
sku_purchase_interval = train[train.purchase == 1][['sku_id', 'unix_action']].sort_values('unix_action')
sku_purchase_interval['unix_action_lag'] = sku_purchase_interval.groupby('sku_id').unix_action.shift(1)
sku_purchase_interval['diff'] = sku_purchase_interval.unix_action - sku_purchase_interval.unix_action_lag
sku_purchase_interval_summary = sku_purchase_interval.groupby('sku_id').diff.describe().reset_index()

In [115]:
sku_purchase_interval.sku_id.value_counts()[:10]

154636    447
63006     305
12564     268
31662     245
111391    210
57018     201
54357     201
52343     195
32465     175
35464     168
Name: sku_id, dtype: int64

In [114]:
sku_purchase_interval[sku_purchase_interval.sku_id==52343].head()

Unnamed: 0,sku_id,unix_action,unix_action_lag,diff
696721,52343,1454527740,,
2104371,52343,1455207300,1454528000.0,679560.0
2167842,52343,1455225540,1455207000.0,18240.0
2184958,52343,1455229140,1455226000.0,3600.0
3637668,52343,1455644580,1455229000.0,415440.0


In [None]:
# groupby -> describe -> dataframe
# https://stackoverflow.com/questions/33575587/pandas-dataframe-how-to-apply-describe-to-each-group-and-add-to-new-columns
# df = pd.DataFrame(group.describe().rename(columns={'score':name}).squeeze()
#                          for name, group in df.groupby('name'))
# df.groupby('name').describe().reset_index().pivot(index='name', values='score', columns='level_1')
# df.groupby('name').describe().unstack(1)

In [146]:
sku_purchase_interval_summary.head()

Unnamed: 0,sku_id,count,mean,std,min,25%,50%,75%,max
0,52,0.0,,,,,,,
1,156,14.0,292560.0,375610.537085,4500.0,90030.0,163320.0,281700.0,1389180.0
2,169,2.0,140460.0,111242.038816,61800.0,101130.0,140460.0,179790.0,219120.0
3,211,20.0,205413.0,204545.93866,5520.0,29280.0,141270.0,330150.0,707520.0
4,275,0.0,,,,,,,


### 用戶: 時間間隔

In [170]:
user_purchase_interval = train[train.purchase == 1][['user_id', 'unix_action']].sort_values('unix_action')
user_purchase_interval['unix_action_lag'] = user_purchase_interval.groupby(['user_id']).unix_action.shift(1)
user_purchase_interval['diff'] = user_purchase_interval.unix_action - user_purchase_interval.unix_action_lag
user_purchase_interval_summary = user_purchase_interval.groupby(['user_id']).diff.describe().reset_index()

In [None]:
user_purchase_interval_summary = user_purchase_interval_summary[user_purchase_interval_summary['count'] > 0]

In [174]:
user_purchase_interval_summary.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7350 entries, 3 to 22863
Data columns (total 9 columns):
user_id    7350 non-null int64
count      7350 non-null float64
mean       7350 non-null float64
std        2958 non-null float64
min        7350 non-null float64
25%        7350 non-null float64
50%        7350 non-null float64
75%        7350 non-null float64
max        7350 non-null float64
dtypes: float64(8), int64(1)
memory usage: 574.2 KB


### 用戶商品: 時間間隔

In [157]:
user_sku_purchase_interval = train[train.purchase == 1][['user_id', 'sku_id', 'unix_action']].sort_values('unix_action')
user_sku_purchase_interval['unix_action_lag'] = user_sku_purchase_interval.groupby(['user_id', 'sku_id']).unix_action.shift(1)
user_sku_purchase_interval['diff'] = user_sku_purchase_interval.unix_action - user_sku_purchase_interval.unix_action_lag
user_sku_purchase_interval_summary = user_sku_purchase_interval.groupby(['user_id', 'sku_id']).diff.describe().reset_index()

In [162]:
user_sku_purchase_interval_summary = user_sku_purchase_interval_summary[user_sku_purchase_interval_summary['count'] > 0]

In [163]:
user_sku_purchase_interval_summary.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 832 entries, 25 to 34970
Data columns (total 10 columns):
user_id    832 non-null int64
sku_id     832 non-null int64
count      832 non-null float64
mean       832 non-null float64
std        134 non-null float64
min        832 non-null float64
25%        832 non-null float64
50%        832 non-null float64
75%        832 non-null float64
max        832 non-null float64
dtypes: float64(8), int64(2)
memory usage: 71.5 KB


In [164]:
user_sku_purchase_interval_summary.head()

Unnamed: 0,user_id,sku_id,count,mean,std,min,25%,50%,75%,max
25,200077,88295,2.0,150.0,127.279221,60.0,105.0,150.0,195.0,240.0
29,200077,170311,1.0,962820.0,,962820.0,962820.0,962820.0,962820.0,962820.0
31,200089,94373,1.0,862740.0,,862740.0,862740.0,862740.0,862740.0,862740.0
35,200092,7196,1.0,282960.0,,282960.0,282960.0,282960.0,282960.0,282960.0
42,200092,99415,4.0,794595.0,668327.149306,117180.0,345375.0,716550.0,1165770.0,1628100.0


In [178]:
time_based_features = sku_purchase_interval_summary \
                        .merge(user_sku_purchase_merge_tf, on='sku_id') \
                        .merge(user_purchase_interval_summary, on='user_id', how='left', suffixes=['', '_u']) \
                        .merge(user_sku_purchase_interval_summary, on=['user_id', 'sku_id'], how='left', suffixes=['', '_by_us'])

In [179]:
time_based_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34979 entries, 0 to 34978
Data columns (total 38 columns):
sku_id                     34979 non-null int64
count                      34979 non-null float64
mean                       33483 non-null float64
std                        32407 non-null float64
min                        33483 non-null float64
25%                        33483 non-null float64
50%                        33483 non-null float64
75%                        33483 non-null float64
max                        33483 non-null float64
user_id                    34979 non-null int64
first_ts                   34979 non-null int64
last_ts                    34979 non-null int64
first_diff                 34979 non-null int64
last_diff                  34979 non-null int64
first_last_diff            34979 non-null int64
last_ts_by_user            34979 non-null int64
first_ts_by_user           34979 non-null int64
first_diff_by_user         34979 non-null int64
last_diff_b

In [180]:
time_based_features.head()

Unnamed: 0,sku_id,count,mean,std,min,25%,50%,75%,max,user_id,first_ts,last_ts,first_diff,last_diff,first_last_diff,last_ts_by_user,first_ts_by_user,first_diff_by_user,last_diff_by_user,first_last_diff_by_user,last_last_sku_diff,first_first_sku_diff,count_u,mean_u,std_u,min_u,25%_u,50%_u,75%_u,max_u,count_by_us,mean_by_us,std_by_us,min_by_us,25%_by_us,50%_by_us,75%_by_us,max_by_us
0,52,0.0,,,,,,,,218499,1456069380,1456069380,3485820,3485820,0,1457780280,1456069380,3485820,1774920,1710900,1710900,0,2.0,855450.0,1209789.0,0.0,427725.0,855450.0,1283175.0,1710900.0,,,,,,,,
1,156,14.0,292560.0,375610.537085,4500.0,90030.0,163320.0,281700.0,1389180.0,214876,1459007580,1459007580,547620,547620,0,1459007580,1459007580,547620,547620,0,0,0,,,,,,,,,,,,,,,,
2,156,14.0,292560.0,375610.537085,4500.0,90030.0,163320.0,281700.0,1389180.0,219323,1458922140,1458922140,633060,633060,0,1458922140,1455960420,3594780,633060,2961720,0,2961720,1.0,2961720.0,,2961720.0,2961720.0,2961720.0,2961720.0,2961720.0,,,,,,,,
3,156,14.0,292560.0,375610.537085,4500.0,90030.0,163320.0,281700.0,1389180.0,239483,1457911800,1457911800,1643400,1643400,0,1458948840,1457776080,1779120,606360,1172760,1037040,135720,5.0,234552.0,452334.3,0.0,0.0,180.0,135720.0,1036860.0,,,,,,,,
4,156,14.0,292560.0,375610.537085,4500.0,90030.0,163320.0,281700.0,1389180.0,242497,1455990540,1455990540,3564660,3564660,0,1459097760,1454408760,5146440,457440,4689000,3107220,1581780,22.0,213136.4,267429.5,0.0,43140.0,123150.0,281055.0,1132080.0,,,,,,,,


In [181]:
time_based_features.to_csv('../data/interim/time_based_features_v1.csv', index=False)

## Group-By-Aggregation

In [277]:
DEFAULT_ACTIONS = ['browse', 'add_cart', 'rem_cart', 'purchase', 'follow', 'click']

# unique ratio speed
# slow when data size small
count_unique = lambda x: len(x) / x.nunique()
# speed when data size small
UNIQUE_RATIO = 'ratio'

AGGREGATION_RULES = [
    # V1 - Basic Features
    
    # single column
    # user behavior total action counts
    {'groupby': ['user_id'], 'select': 'sku_id', 'agg': 'count'},
    # Unique Ratio
    {'groupby': ['user_id'], 'select': 'sku_id', 'agg': count_unique, 'agg_name': 'AvgSkuPerDistinct'},
    # age total counts
    {'groupby': ['age'], 'select': 'sku_id', 'agg': 'count'},
    # sex total counts
    {'groupby': ['sex'], 'select': 'sku_id', 'agg': 'count'},
    # user_lv_cd total counts
    {'groupby': ['user_lv_cd'], 'select': 'sku_id', 'agg': 'count'},
    # sku_id interaction counts
    {'groupby': ['sku_id'], 'select': 'user_id', 'agg': 'count'},
    {'groupby': ['sku_id'], 'select': 'user_id', 'agg': count_unique, 'agg_name': 'AvgUserPerDistinct'},
    # brand interaction counts
    {'groupby': ['brand'], 'select': 'user_id', 'agg': 'count'},
    # cate counts
    {'groupby': ['cate'], 'select': 'user_id', 'agg': 'count'},
    
    
    # multiple columns
    # user personal preference
    # user & sku_id
    {'groupby': ['user_id', 'sku_id'], 'select': 'brand', 'agg': 'count'},
    # user & brand
    {'groupby': ['user_id', 'brand'], 'select': 'sku_id', 'agg': 'count'},
    # user & cate
    {'groupby': ['user_id', 'cate'], 'select': 'sku_id', 'agg': 'count'},
    
    # sex preference
    # sex & sku
    {'groupby': ['sex', 'sku_id'], 'select': 'user_id', 'agg': 'count'},
    # sex & brand
    {'groupby': ['sex', 'brand'], 'select': 'user_id', 'agg': 'count'},
    
    # age preference
    # age & sku
    {'groupby': ['age', 'sku_id'], 'select': 'user_id', 'agg': 'count'},
    # age & brand
    {'groupby': ['brand', 'sku_id'], 'select': 'user_id', 'agg': 'count'},

    
    # V2
    # cart conversion
    # 
]

In [221]:
# from its distribution
TIME_BASED = [
    # time based columns
    {'groupby': ['user_id', 'month', 'day'], 'select': 'sku_id', 'agg': 'count'},
    {'groupby': ['user_id', 'month', 'day'], 'select': 'sku_id', 'agg': 'var'},
    
    {'groupby': ['sku_id', 'month', 'day'], 'select': 'user_id', 'agg': 'count'},
    {'groupby': ['sku_id', 'month', 'day'], 'select': 'user_id', 'agg': 'var'},
    
    {'groupby': ['brand', 'month', 'day'], 'select': 'sku_id', 'agg': 'count'},
    {'groupby': ['brand', 'month', 'day'], 'select': 'sku_id', 'agg': 'var'},
    
    {'groupby': ['brand', 'month', 'day'], 'select': 'user_id', 'agg': 'count'},
    {'groupby': ['brand', 'month', 'day'], 'select': 'user_id', 'agg': 'var'},
    
    {'groupby': ['user_id', 'sku_id', 'month', 'day'], 'select': 'brand', 'agg': 'count'},
    
    {'groupby': ['user_id', 'brand', 'month', 'day'], 'select': 'sku_id', 'agg': 'count'},
]

In [311]:
# Used DataFrame: All, Browser, Purchase
def get_data_index_map(df):
    return {
        'all': None,
        'browse': df.browse == 1,
        'purchase': df.purchase == 1,
    }

def cal_unique_ratio(df, spec):
    gp = df.groupby(spec['groupby'])[spec['select']].agg(['count', 'nunique']).reset_index()
    gp[spec['select']] = gp['count'] / gp['nunique']
    return gp.drop(columns=['count', 'nunique'], axis=1)

def add_agg_features(source, target, suffix=None):
    cols = target.columns
    for spec in AGGREGATION_RULES:
        # Name of the aggregation we're applying
        agg_name = spec['agg_name'] if 'agg_name' in spec else spec['agg']

        # Name of new feature
        new_feature = '{}_{}_{}'.format('_'.join(spec['groupby']), agg_name, spec['select'])

        # Info
        print("Grouping by {}, and aggregating {} with {}".format(
            spec['groupby'], spec['select'], agg_name
        ))
        
        all_features = list(set(spec['groupby'] + [spec['select']]))
        if spec['agg'] != UNIQUE_RATIO:
            gp = source[all_features]. \
                groupby(spec['groupby'])[spec['select']]. \
                agg(spec['agg']). \
                reset_index(). \
                rename(index=str, columns={spec['select']: new_feature})
        else:
            gp = cal_unique_ratio(source, spec). \
                rename(index=str, columns={spec['select']: new_feature})
        
        target = target.merge(gp, on=spec['groupby'], how='left')
        
        # Clear memory
        del gp
        gc.collect()
        
    if suffix:
        target.columns = [c if c in cols else '{}{}'.format(c, suffix) for c in target.columns]
    return target


In [308]:
index_map = get_data_index_map(train)
rslt = []
for name, index in index_map.items():
    print('===Data Source: {}==='.format(name))
    df = train
    if index is not None:
        df = train[index]
    
    suffix = '_{}'.format(name) if index is not None else None
    df_agg_features = add_agg_features(df, train_pair, suffix)
    rslt.append(df_agg_features)
    print('='*10)

===Data Source: all===
Grouping by ['user_id'], and aggregating sku_id with count
Grouping by ['user_id'], and aggregating sku_id with AvgSkuPerDistinct
Grouping by ['age'], and aggregating sku_id with count
Grouping by ['sex'], and aggregating sku_id with count
Grouping by ['user_lv_cd'], and aggregating sku_id with count
Grouping by ['sku_id'], and aggregating user_id with count
Grouping by ['sku_id'], and aggregating user_id with AvgUserPerDistinct
Grouping by ['brand'], and aggregating user_id with count
Grouping by ['cate'], and aggregating user_id with count
Grouping by ['user_id', 'sku_id'], and aggregating brand with count
Grouping by ['user_id', 'brand'], and aggregating sku_id with count
Grouping by ['user_id', 'cate'], and aggregating sku_id with count
Grouping by ['sex', 'sku_id'], and aggregating user_id with count
Grouping by ['sex', 'brand'], and aggregating user_id with count
Grouping by ['age', 'sku_id'], and aggregating user_id with count
Grouping by ['brand', 'sku_id

In [295]:
len(rslt)

3

In [358]:
agg_merged = pd.concat(rslt, axis=1, join='inner')

In [359]:
agg_merged.head()

Unnamed: 0,user_id,age,sex,user_lv_cd,year_user_reg,month_user_reg,day_user_reg,unix_user_reg,sku_id,cate,brand,a1,a2,a3,purchase,user_id_count_sku_id,user_id_AvgSkuPerDistinct_sku_id,age_count_sku_id,sex_count_sku_id,user_lv_cd_count_sku_id,sku_id_count_user_id,sku_id_AvgUserPerDistinct_user_id,brand_count_user_id,cate_count_user_id,user_id_sku_id_count_brand,user_id_brand_count_sku_id,user_id_cate_count_sku_id,sex_sku_id_count_user_id,sex_brand_count_user_id,age_sku_id_count_user_id,brand_sku_id_count_user_id,user_id.1,age.1,sex.1,user_lv_cd.1,year_user_reg.1,month_user_reg.1,day_user_reg.1,unix_user_reg.1,sku_id.1,cate.1,brand.1,a1.1,a2.1,a3.1,purchase.1,user_id_count_sku_id__browse,user_id_AvgSkuPerDistinct_sku_id__browse,age_count_sku_id__browse,sex_count_sku_id__browse,user_lv_cd_count_sku_id__browse,sku_id_count_user_id__browse,sku_id_AvgUserPerDistinct_user_id__browse,brand_count_user_id__browse,cate_count_user_id__browse,user_id_sku_id_count_brand__browse,user_id_brand_count_sku_id__browse,user_id_cate_count_sku_id__browse,sex_sku_id_count_user_id__browse,sex_brand_count_user_id__browse,age_sku_id_count_user_id__browse,brand_sku_id_count_user_id__browse,user_id.2,age.2,sex.2,user_lv_cd.2,year_user_reg.2,month_user_reg.2,day_user_reg.2,unix_user_reg.2,sku_id.2,cate.2,brand.2,a1.2,a2.2,a3.2,purchase.2,user_id_count_sku_id__purchase,user_id_AvgSkuPerDistinct_sku_id__purchase,age_count_sku_id__purchase,sex_count_sku_id__purchase,user_lv_cd_count_sku_id__purchase,sku_id_count_user_id__purchase,sku_id_AvgUserPerDistinct_user_id__purchase,brand_count_user_id__purchase,cate_count_user_id__purchase,user_id_sku_id_count_brand__purchase,user_id_brand_count_sku_id__purchase,user_id_cate_count_sku_id__purchase,sex_sku_id_count_user_id__purchase,sex_brand_count_user_id__purchase,age_sku_id_count_user_id__purchase,brand_sku_id_count_user_id__purchase
0,200001,5,2,5,2016,1,26,1453766400,2222,9,489,-1,-1,-1,0,419,11.971429,492422,12629857,12595008,408,6.688525,2939059,2298227,4,168,285,254,1419360,17,408,200001,5,2,5,2016,1,26,1453766400,2222,9,489,-1,-1,-1,0,96.0,2.742857,136442,3457322,3332913,122.0,2.178571,800227.0,619328,1.0,39.0,62.0,74.0,392562.0,7.0,122.0,200001,5,2,5,2016,1,26,1453766400,2222,9,489,-1,-1,-1,0,1.0,1.0,689.0,16970,19161,,,4229.0,3294,,,,,2086.0,,
1,200001,5,2,5,2016,1,26,1453766400,4345,7,56,-1,-1,-1,0,419,11.971429,492422,12629857,12595008,40,5.0,35535,2521240,10,31,35,26,14997,11,40,200001,5,2,5,2016,1,26,1453766400,4345,7,56,-1,-1,-1,0,96.0,2.742857,136442,3457322,3332913,9.0,1.285714,10141.0,672424,2.0,6.0,7.0,6.0,4301.0,2.0,9.0,200001,5,2,5,2016,1,26,1453766400,4345,7,56,-1,-1,-1,0,1.0,1.0,689.0,16970,19161,,,172.0,4932,,,,,59.0,,
2,200001,5,2,5,2016,1,26,1453766400,5757,9,489,-1,-1,-1,0,419,11.971429,492422,12629857,12595008,26345,15.397428,2939059,2298227,7,168,285,11397,1419360,1498,26345,200001,5,2,5,2016,1,26,1453766400,5757,9,489,-1,-1,-1,0,96.0,2.742857,136442,3457322,3332913,7450.0,4.431886,800227.0,619328,2.0,39.0,62.0,3262.0,392562.0,438.0,7450.0,200001,5,2,5,2016,1,26,1453766400,5757,9,489,-1,-1,-1,0,1.0,1.0,689.0,16970,19161,46.0,1.069767,4229.0,3294,,,,22.0,2086.0,4.0,46.0
3,200001,5,2,5,2016,1,26,1453766400,10768,9,100,-1,-1,-1,0,419,11.971429,492422,12629857,12595008,1695,9.064171,50523,2298227,4,4,285,749,21160,25,1695,200001,5,2,5,2016,1,26,1453766400,10768,9,100,-1,-1,-1,0,96.0,2.742857,136442,3457322,3332913,429.0,2.508772,13294.0,619328,1.0,1.0,62.0,181.0,5584.0,7.0,429.0,200001,5,2,5,2016,1,26,1453766400,10768,9,100,-1,-1,-1,0,1.0,1.0,689.0,16970,19161,5.0,1.0,88.0,3294,,,,4.0,48.0,,5.0
4,200001,5,2,5,2016,1,26,1453766400,14398,8,403,1,1,2,0,419,11.971429,492422,12629857,12595008,431,5.824324,489420,9637149,4,39,94,207,215797,4,431,200001,5,2,5,2016,1,26,1453766400,14398,8,403,1,1,2,0,96.0,2.742857,136442,3457322,3332913,131.0,1.77027,134361.0,2625218,1.0,9.0,26.0,56.0,61133.0,1.0,131.0,200001,5,2,5,2016,1,26,1453766400,14398,8,403,1,1,2,0,1.0,1.0,689.0,16970,19161,,,602.0,9350,,,1.0,,270.0,,


In [360]:
len(agg_merged)

3070668

In [361]:
# agg_merged.columns = [c.replace('__', '_') for c in agg_merged.columns]

In [362]:
agg_merged.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3070668 entries, 0 to 3070667
Data columns (total 93 columns):
user_id                                       3070668 non-null int64
age                                           3070668 non-null int64
sex                                           3070668 non-null int64
user_lv_cd                                    3070668 non-null int64
year_user_reg                                 3070668 non-null int64
month_user_reg                                3070668 non-null int64
day_user_reg                                  3070668 non-null int64
unix_user_reg                                 3070668 non-null int64
sku_id                                        3070668 non-null int64
cate                                          3070668 non-null int64
brand                                         3070668 non-null int64
a1                                            3070668 non-null int64
a2                                            3070668 non-n

In [390]:
# drop duplicated columns
def drop_duplicated_columns(df):
    distinct_cols = ~df.columns.duplicated()
    df = df.iloc[:, distinct_cols]
    return df

agg_merged = drop_duplicated_columns(agg_merged)

In [389]:
distinct_cols

Index(['user_id', 'age', 'sex', 'user_lv_cd', 'year_user_reg',
       'month_user_reg', 'day_user_reg', 'unix_user_reg', 'sku_id', 'cate',
       'brand', 'a1', 'a2', 'a3', 'purchase', 'user_id_count_sku_id',
       'user_id_AvgSkuPerDistinct_sku_id', 'age_count_sku_id',
       'sex_count_sku_id', 'user_lv_cd_count_sku_id', 'sku_id_count_user_id',
       'sku_id_AvgUserPerDistinct_user_id', 'brand_count_user_id',
       'cate_count_user_id', 'user_id_sku_id_count_brand',
       'user_id_brand_count_sku_id', 'user_id_cate_count_sku_id',
       'sex_sku_id_count_user_id', 'sex_brand_count_user_id',
       'age_sku_id_count_user_id', 'brand_sku_id_count_user_id',
       'user_id_count_sku_id_browse',
       'user_id_AvgSkuPerDistinct_sku_id_browse', 'age_count_sku_id_browse',
       'sex_count_sku_id_browse', 'user_lv_cd_count_sku_id_browse',
       'sku_id_count_user_id_browse',
       'sku_id_AvgUserPerDistinct_user_id_browse',
       'brand_count_user_id_browse', 'cate_count_user_id_br

In [364]:
%time agg_merged.to_csv('../data/interim/agg_whole_train_from_all_merged_v1.csv', index=False)

CPU times: user 1min 43s, sys: 599 ms, total: 1min 43s
Wall time: 1min 44s


In [365]:
agg_merged.head()

Unnamed: 0,user_id,age,sex,user_lv_cd,year_user_reg,month_user_reg,day_user_reg,unix_user_reg,sku_id,cate,brand,a1,a2,a3,purchase,user_id_count_sku_id,user_id_AvgSkuPerDistinct_sku_id,age_count_sku_id,sex_count_sku_id,user_lv_cd_count_sku_id,sku_id_count_user_id,sku_id_AvgUserPerDistinct_user_id,brand_count_user_id,cate_count_user_id,user_id_sku_id_count_brand,user_id_brand_count_sku_id,user_id_cate_count_sku_id,sex_sku_id_count_user_id,sex_brand_count_user_id,age_sku_id_count_user_id,brand_sku_id_count_user_id,user_id_count_sku_id_browse,user_id_AvgSkuPerDistinct_sku_id_browse,age_count_sku_id_browse,sex_count_sku_id_browse,user_lv_cd_count_sku_id_browse,sku_id_count_user_id_browse,sku_id_AvgUserPerDistinct_user_id_browse,brand_count_user_id_browse,cate_count_user_id_browse,user_id_sku_id_count_brand_browse,user_id_brand_count_sku_id_browse,user_id_cate_count_sku_id_browse,sex_sku_id_count_user_id_browse,sex_brand_count_user_id_browse,age_sku_id_count_user_id_browse,brand_sku_id_count_user_id_browse,user_id_count_sku_id_purchase,user_id_AvgSkuPerDistinct_sku_id_purchase,age_count_sku_id_purchase,sex_count_sku_id_purchase,user_lv_cd_count_sku_id_purchase,sku_id_count_user_id_purchase,sku_id_AvgUserPerDistinct_user_id_purchase,brand_count_user_id_purchase,cate_count_user_id_purchase,user_id_sku_id_count_brand_purchase,user_id_brand_count_sku_id_purchase,user_id_cate_count_sku_id_purchase,sex_sku_id_count_user_id_purchase,sex_brand_count_user_id_purchase,age_sku_id_count_user_id_purchase,brand_sku_id_count_user_id_purchase
0,200001,5,2,5,2016,1,26,1453766400,2222,9,489,-1,-1,-1,0,419,11.971429,492422,12629857,12595008,408,6.688525,2939059,2298227,4,168,285,254,1419360,17,408,96.0,2.742857,136442,3457322,3332913,122.0,2.178571,800227.0,619328,1.0,39.0,62.0,74.0,392562.0,7.0,122.0,1.0,1.0,689.0,16970,19161,,,4229.0,3294,,,,,2086.0,,
1,200001,5,2,5,2016,1,26,1453766400,4345,7,56,-1,-1,-1,0,419,11.971429,492422,12629857,12595008,40,5.0,35535,2521240,10,31,35,26,14997,11,40,96.0,2.742857,136442,3457322,3332913,9.0,1.285714,10141.0,672424,2.0,6.0,7.0,6.0,4301.0,2.0,9.0,1.0,1.0,689.0,16970,19161,,,172.0,4932,,,,,59.0,,
2,200001,5,2,5,2016,1,26,1453766400,5757,9,489,-1,-1,-1,0,419,11.971429,492422,12629857,12595008,26345,15.397428,2939059,2298227,7,168,285,11397,1419360,1498,26345,96.0,2.742857,136442,3457322,3332913,7450.0,4.431886,800227.0,619328,2.0,39.0,62.0,3262.0,392562.0,438.0,7450.0,1.0,1.0,689.0,16970,19161,46.0,1.069767,4229.0,3294,,,,22.0,2086.0,4.0,46.0
3,200001,5,2,5,2016,1,26,1453766400,10768,9,100,-1,-1,-1,0,419,11.971429,492422,12629857,12595008,1695,9.064171,50523,2298227,4,4,285,749,21160,25,1695,96.0,2.742857,136442,3457322,3332913,429.0,2.508772,13294.0,619328,1.0,1.0,62.0,181.0,5584.0,7.0,429.0,1.0,1.0,689.0,16970,19161,5.0,1.0,88.0,3294,,,,4.0,48.0,,5.0
4,200001,5,2,5,2016,1,26,1453766400,14398,8,403,1,1,2,0,419,11.971429,492422,12629857,12595008,431,5.824324,489420,9637149,4,39,94,207,215797,4,431,96.0,2.742857,136442,3457322,3332913,131.0,1.77027,134361.0,2625218,1.0,9.0,26.0,56.0,61133.0,1.0,131.0,1.0,1.0,689.0,16970,19161,,,602.0,9350,,,1.0,,270.0,,


### compare agg speed for unique ratio

原本是 lambda 比 matrix 慢，但是當數量接近 700,000 時 lambda 比較快，可能是借用記憶體的時間太長了

In [275]:
spec_lambda = {'groupby': ['user_id'], 'select': 'sku_id', 'agg': count_unique, 'agg_name': 'AvgSkuPerDistinct'}
spec_matrix = {'groupby': ['user_id'], 'select': 'sku_id', 'agg': UNIQUE_RATIO, 'agg_name': 'AvgSkuPerDistinct'}


def apply_lambda_ratio(source, spec):
    all_features = list(set(spec['groupby'] + [spec['select']]))
    new_feature = '{}_{}_{}'.format('_'.join(spec['groupby']), spec['agg_name'], spec['select'])
    gp = source[all_features]. \
        groupby(spec['groupby'])[spec['select']]. \
        agg(spec['agg']). \
        reset_index(). \
        rename(index=str, columns={spec['select']: new_feature})
    # print(gp.tail())

def matrix_multiplication_ratio(source, spec):
    all_features = list(set(spec['groupby'] + [spec['select']]))
    new_feature = '{}_{}_{}'.format('_'.join(spec['groupby']), spec['agg_name'], spec['select'])
    gp = source[all_features].groupby(spec['groupby'])[spec['select']].agg(['count', 'nunique']).reset_index()
    gp[spec['select']] = gp['count'] / gp['nunique']
    gp.drop(columns=['count', 'nunique'], axis=1, inplace=True)
    # print(gp.tail())

In [None]:
# %prun apply_lambda_ratio(train, spec_lambda)

In [241]:
# %prun matrix_multiplication_ratio(train, spec_matrix)

 

In [None]:
rslt = []
for sample in [100000, 7000000, 40000000]:
    print('When sample num is {}'.format(sample))
    %time apply_lambda_ratio(train.head(sample), spec_lambda)
    %time matrix_multiplication_ratio(train.head(sample), spec_matrix)

In [252]:
%time apply_lambda_ratio(train, spec_lambda)

        user_id  user_id_AvgSkuPerDistinct_sku_id
101249   305317                         10.444444
101250   305318                          5.210526
101251   305319                          9.166667
101252   305320                          5.000000
101253   305321                          1.800000
CPU times: user 5.79 s, sys: 330 ms, total: 6.12 s
Wall time: 6.12 s


In [256]:
%time matrix_multiplication_ratio(train, spec_matrix)

        user_id     sku_id
101249   305317  10.444444
101250   305318   5.210526
101251   305319   9.166667
101252   305320   5.000000
101253   305321   1.800000
CPU times: user 17.8 s, sys: 375 ms, total: 18.2 s
Wall time: 18.2 s


In [273]:
%time apply_lambda_ratio(train.head(7000000), spec_lambda)

       user_id  user_id_AvgSkuPerDistinct_sku_id
67492   305311                         13.543860
67493   305312                          7.857143
67494   305316                          5.666667
67495   305320                          4.000000
67496   305321                          1.000000
CPU times: user 3.32 s, sys: 0 ns, total: 3.32 s
Wall time: 3.32 s


In [274]:
%time matrix_multiplication_ratio(train.head(7000000), spec_matrix)

       user_id     sku_id
67492   305311  13.543860
67493   305312   7.857143
67494   305316   5.666667
67495   305320   4.000000
67496   305321   1.000000
CPU times: user 3.41 s, sys: 48.7 ms, total: 3.46 s
Wall time: 3.45 s


In [199]:
df_agg_features.columns

Index(['user_id', 'age', 'sex', 'user_lv_cd', 'year_user_reg',
       'month_user_reg', 'day_user_reg', 'unix_user_reg', 'sku_id', 'cate',
       'brand', 'a1', 'a2', 'a3', 'purchase', 'user_id_count_sku_id',
       'user_id_AvgUniqSkuAction_sku_id', 'age_count_sku_id',
       'sex_count_sku_id', 'user_lv_cd_count_sku_id', 'sku_id_count_user_id',
       'sku_id_AvgUniqUserAction_user_id', 'brand_count_user_id',
       'user_id_sku_id_count_brand', 'user_id_sku_id_AvgUniqBrand_brand',
       'user_id_brand_count_sku_id'],
      dtype='object')

## Time Window

In [384]:
window_size = [1, 3, 5, 7, 30]

In [380]:
end_time = pd.to_datetime('2016-04-02 00:00:00')
end_time_unix = end_time.value // 10 ** 9

In [385]:
rslt = []
for size in window_size:
    print('===Window Size: {} day==='.format(size))
    start_time = end_time_unix - 60 * 60 * 24 * size
    df = train[train.unix_action >= start_time]

    suffix = '_{}d'.format(size)
    df_agg_features = add_agg_features(df, train_pair, suffix)
    rslt.append(df_agg_features)
    print('='*10)

===Window Size: 1 day===
Grouping by ['user_id'], and aggregating sku_id with count
Grouping by ['user_id'], and aggregating sku_id with AvgSkuPerDistinct
Grouping by ['age'], and aggregating sku_id with count
Grouping by ['sex'], and aggregating sku_id with count
Grouping by ['user_lv_cd'], and aggregating sku_id with count
Grouping by ['sku_id'], and aggregating user_id with count
Grouping by ['sku_id'], and aggregating user_id with AvgUserPerDistinct
Grouping by ['brand'], and aggregating user_id with count
Grouping by ['cate'], and aggregating user_id with count
Grouping by ['user_id', 'sku_id'], and aggregating brand with count
Grouping by ['user_id', 'brand'], and aggregating sku_id with count
Grouping by ['user_id', 'cate'], and aggregating sku_id with count
Grouping by ['sex', 'sku_id'], and aggregating user_id with count
Grouping by ['sex', 'brand'], and aggregating user_id with count
Grouping by ['age', 'sku_id'], and aggregating user_id with count
Grouping by ['brand', 'sku_

In [386]:
time_agg_merged = pd.concat(rslt, axis=1, join='inner')

In [387]:
len(time_agg_merged)

3070668

In [388]:
time_agg_merged.head()

Unnamed: 0,user_id,age,sex,user_lv_cd,year_user_reg,month_user_reg,day_user_reg,unix_user_reg,sku_id,cate,brand,a1,a2,a3,purchase,user_id_count_sku_id_1d,user_id_AvgSkuPerDistinct_sku_id_1d,age_count_sku_id_1d,sex_count_sku_id_1d,user_lv_cd_count_sku_id_1d,sku_id_count_user_id_1d,sku_id_AvgUserPerDistinct_user_id_1d,brand_count_user_id_1d,cate_count_user_id_1d,user_id_sku_id_count_brand_1d,user_id_brand_count_sku_id_1d,user_id_cate_count_sku_id_1d,sex_sku_id_count_user_id_1d,sex_brand_count_user_id_1d,age_sku_id_count_user_id_1d,brand_sku_id_count_user_id_1d,user_id.1,age.1,sex.1,user_lv_cd.1,year_user_reg.1,month_user_reg.1,day_user_reg.1,unix_user_reg.1,sku_id.1,cate.1,brand.1,a1.1,a2.1,a3.1,purchase.1,user_id_count_sku_id_3d,user_id_AvgSkuPerDistinct_sku_id_3d,age_count_sku_id_3d,sex_count_sku_id_3d,user_lv_cd_count_sku_id_3d,sku_id_count_user_id_3d,sku_id_AvgUserPerDistinct_user_id_3d,brand_count_user_id_3d,cate_count_user_id_3d,user_id_sku_id_count_brand_3d,user_id_brand_count_sku_id_3d,user_id_cate_count_sku_id_3d,sex_sku_id_count_user_id_3d,sex_brand_count_user_id_3d,age_sku_id_count_user_id_3d,brand_sku_id_count_user_id_3d,user_id.2,age.2,sex.2,user_lv_cd.2,year_user_reg.2,month_user_reg.2,day_user_reg.2,unix_user_reg.2,sku_id.2,cate.2,brand.2,a1.2,a2.2,a3.2,purchase.2,user_id_count_sku_id_5d,user_id_AvgSkuPerDistinct_sku_id_5d,age_count_sku_id_5d,sex_count_sku_id_5d,user_lv_cd_count_sku_id_5d,sku_id_count_user_id_5d,sku_id_AvgUserPerDistinct_user_id_5d,brand_count_user_id_5d,cate_count_user_id_5d,user_id_sku_id_count_brand_5d,user_id_brand_count_sku_id_5d,user_id_cate_count_sku_id_5d,sex_sku_id_count_user_id_5d,sex_brand_count_user_id_5d,age_sku_id_count_user_id_5d,brand_sku_id_count_user_id_5d,user_id.3,age.3,sex.3,user_lv_cd.3,year_user_reg.3,month_user_reg.3,day_user_reg.3,unix_user_reg.3,sku_id.3,cate.3,brand.3,a1.3,a2.3,a3.3,purchase.3,user_id_count_sku_id_7d,user_id_AvgSkuPerDistinct_sku_id_7d,age_count_sku_id_7d,sex_count_sku_id_7d,user_lv_cd_count_sku_id_7d,sku_id_count_user_id_7d,sku_id_AvgUserPerDistinct_user_id_7d,brand_count_user_id_7d,cate_count_user_id_7d,user_id_sku_id_count_brand_7d,user_id_brand_count_sku_id_7d,user_id_cate_count_sku_id_7d,sex_sku_id_count_user_id_7d,sex_brand_count_user_id_7d,age_sku_id_count_user_id_7d,brand_sku_id_count_user_id_7d,user_id.4,age.4,sex.4,user_lv_cd.4,year_user_reg.4,month_user_reg.4,day_user_reg.4,unix_user_reg.4,sku_id.4,cate.4,brand.4,a1.4,a2.4,a3.4,purchase.4,user_id_count_sku_id_30d,user_id_AvgSkuPerDistinct_sku_id_30d,age_count_sku_id_30d,sex_count_sku_id_30d,user_lv_cd_count_sku_id_30d,sku_id_count_user_id_30d,sku_id_AvgUserPerDistinct_user_id_30d,brand_count_user_id_30d,cate_count_user_id_30d,user_id_sku_id_count_brand_30d,user_id_brand_count_sku_id_30d,user_id_cate_count_sku_id_30d,sex_sku_id_count_user_id_30d,sex_brand_count_user_id_30d,age_sku_id_count_user_id_30d,brand_sku_id_count_user_id_30d
0,200001,5,2,5,2016,1,26,1453766400,2222,9,489,-1,-1,-1,0,5.0,5.0,9730.0,236506,234367,2.0,2.0,53197.0,39680,,5.0,5.0,2.0,26897.0,,2.0,200001,5,2,5,2016,1,26,1453766400,2222,9,489,-1,-1,-1,0,19.0,2.111111,29766.0,693323,709920,4.0,2.0,162642.0,121555,,6.0,8.0,4.0,78389.0,,4.0,200001,5,2,5,2016,1,26,1453766400,2222,9,489,-1,-1,-1,0,67.0,5.583333,49145.0,1162806,1192080,15.0,3.0,284094.0,217076,,14.0,8.0,11.0,135620.0,,15.0,200001,5,2,5,2016,1,26,1453766400,2222,9,489,-1,-1,-1,0,130.0,8.666667,66440,1638453,1653825,23.0,3.285714,397446.0,303677,,15.0,9.0,19.0,191093.0,,23.0,200001,5,2,5,2016,1,26,1453766400,2222,9,489,-1,-1,-1,0,419.0,11.971429,317468,8364877,8167772,191.0,5.305556,1993656.0,1478115,4.0,168.0,285.0,107.0,966010.0,17.0,191.0
1,200001,5,2,5,2016,1,26,1453766400,4345,7,56,-1,-1,-1,0,5.0,5.0,9730.0,236506,234367,,,441.0,38615,,,,,192.0,,,200001,5,2,5,2016,1,26,1453766400,4345,7,56,-1,-1,-1,0,19.0,2.111111,29766.0,693323,709920,,,2492.0,127686,,1.0,1.0,,979.0,,,200001,5,2,5,2016,1,26,1453766400,4345,7,56,-1,-1,-1,0,67.0,5.583333,49145.0,1162806,1192080,10.0,10.0,3473.0,226088,10.0,31.0,31.0,10.0,1392.0,10.0,10.0,200001,5,2,5,2016,1,26,1453766400,4345,7,56,-1,-1,-1,0,130.0,8.666667,66440,1638453,1653825,14.0,7.0,4547.0,326003,10.0,31.0,31.0,14.0,1807.0,10.0,14.0,200001,5,2,5,2016,1,26,1453766400,4345,7,56,-1,-1,-1,0,419.0,11.971429,317468,8364877,8167772,31.0,6.2,18311.0,1562032,10.0,31.0,35.0,26.0,8147.0,10.0,31.0
2,200001,5,2,5,2016,1,26,1453766400,5757,9,489,-1,-1,-1,0,5.0,5.0,9730.0,236506,234367,251.0,6.783784,53197.0,39680,,5.0,5.0,87.0,26897.0,12.0,251.0,200001,5,2,5,2016,1,26,1453766400,5757,9,489,-1,-1,-1,0,19.0,2.111111,29766.0,693323,709920,1051.0,10.009524,162642.0,121555,,6.0,8.0,373.0,78389.0,16.0,1051.0,200001,5,2,5,2016,1,26,1453766400,5757,9,489,-1,-1,-1,0,67.0,5.583333,49145.0,1162806,1192080,2636.0,13.380711,284094.0,217076,,14.0,8.0,878.0,135620.0,35.0,2636.0,200001,5,2,5,2016,1,26,1453766400,5757,9,489,-1,-1,-1,0,130.0,8.666667,66440,1638453,1653825,3758.0,14.288973,397446.0,303677,,15.0,9.0,1345.0,191093.0,35.0,3758.0,200001,5,2,5,2016,1,26,1453766400,5757,9,489,-1,-1,-1,0,419.0,11.971429,317468,8364877,8167772,20195.0,15.475096,1993656.0,1478115,7.0,168.0,285.0,8763.0,966010.0,1357.0,20195.0
3,200001,5,2,5,2016,1,26,1453766400,10768,9,100,-1,-1,-1,0,5.0,5.0,9730.0,236506,234367,20.0,4.0,964.0,39680,,,5.0,5.0,409.0,,20.0,200001,5,2,5,2016,1,26,1453766400,10768,9,100,-1,-1,-1,0,19.0,2.111111,29766.0,693323,709920,59.0,4.916667,2224.0,121555,,,8.0,19.0,914.0,1.0,59.0,200001,5,2,5,2016,1,26,1453766400,10768,9,100,-1,-1,-1,0,67.0,5.583333,49145.0,1162806,1192080,89.0,4.238095,3866.0,217076,,,8.0,38.0,1601.0,1.0,89.0,200001,5,2,5,2016,1,26,1453766400,10768,9,100,-1,-1,-1,0,130.0,8.666667,66440,1638453,1653825,191.0,6.586207,5771.0,303677,,,9.0,84.0,2475.0,1.0,191.0,200001,5,2,5,2016,1,26,1453766400,10768,9,100,-1,-1,-1,0,419.0,11.971429,317468,8364877,8167772,942.0,7.983051,28986.0,1478115,4.0,4.0,285.0,429.0,11862.0,5.0,942.0
4,200001,5,2,5,2016,1,26,1453766400,14398,8,403,1,1,2,0,5.0,5.0,9730.0,236506,234367,20.0,5.0,8299.0,189108,,,,12.0,3753.0,,20.0,200001,5,2,5,2016,1,26,1453766400,14398,8,403,1,1,2,0,19.0,2.111111,29766.0,693323,709920,37.0,7.4,25406.0,567014,,2.0,5.0,12.0,10430.0,,37.0,200001,5,2,5,2016,1,26,1453766400,14398,8,403,1,1,2,0,67.0,5.583333,49145.0,1162806,1192080,77.0,6.416667,47798.0,952167,,2.0,23.0,20.0,20353.0,,77.0,200001,5,2,5,2016,1,26,1453766400,14398,8,403,1,1,2,0,130.0,8.666667,66440,1638453,1653825,81.0,6.230769,65520.0,1326305,4.0,30.0,85.0,24.0,27667.0,4.0,81.0,200001,5,2,5,2016,1,26,1453766400,14398,8,403,1,1,2,0,419.0,11.971429,317468,8364877,8167772,243.0,5.170213,341039.0,6844818,4.0,39.0,94.0,108.0,152500.0,4.0,243.0


In [391]:
time_agg_merged = drop_duplicated_columns(time_agg_merged)

In [395]:
time_agg_merged.columns.duplicated().sum()

0

In [397]:
time_agg_merged.to_csv('../data/interim/time_window_agg_train_from_all_merged_v1.csv', index=False)

## Confidence Rates for browse

## 時間衰減

## Browse or other actions after Purchase

In [193]:
# train set 中如果有購買，通常之後一定會去看（有放大 browse or other actions 的效果），
# 但是這和我們對於這次比賽的目標：根據購買前行為預測是否會購買就不相符了