In [1]:
import pandas as pd, numpy as np
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
import itertools

In [2]:
inputPath = '../input/newSplited/'
outputPath = '../output/newSplited/'
! mkdir {outputPath}

mkdir: cannot create directory ‘../output/newSplited/’: File exists


In [3]:
TOPN_candidate = 100
SET = 2
featureNote = 'norm_freq'

In [4]:
type_labels = {'clicks':0, 'carts':1, 'orders':2}

In [5]:
def load_df(path):    
    dfs = []
    for e, chunk_file in enumerate(glob.glob(path)):
        chunk = pd.read_parquet(chunk_file)
        chunk.ts = (chunk.ts/1000).astype('int32')
        chunk['type'] = chunk['type'].map(type_labels).astype('int8')
        dfs.append(chunk)
    return pd.concat(dfs).reset_index(drop=True) #.astype({"ts": "datetime64[ms]"})

# Generate Features

In [6]:
if SET == 1:
    train_df = load_df('../input/split_2-1_pqt/train_parquets/*')
    val_A = load_df('../input/split_2-1_pqt/test_parquets/*')
elif SET == 2:
    train_df = pd.read_parquet('../input/splited/train.parquet')
    val_A = pd.read_parquet('../input/splited/test.parquet')
elif SET == 3:
    train_df = load_df('../input/parquets/train_parquets/*')
    val_A = load_df('../input/parquets/test_parquets/*')

## Time features

In [7]:
from datetime import datetime
def timeTransfer(x):
    thisTime = datetime.fromtimestamp(x)
    return [thisTime.day, thisTime.hour, thisTime.weekday()]

In [None]:
tqdm.pandas()
train_df[['day', 'hour', 'weekday']] = pd.DataFrame(train_df.ts.progress_apply(timeTransfer).to_list())

A Jupyter Widget

In [None]:
tqdm.pandas()
val_A[['day', 'hour', 'weekday']] = pd.DataFrame(val_A.ts.progress_apply(timeTransfer).to_list())

A Jupyter Widget

In [7]:
train_val_df = pd.concat([train_df, val_A], axis=0, ignore_index=True)
train_val_df

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800,0
1,0,1563459,1659304904,0
2,0,1309446,1659367439,0
3,0,16246,1659367719,0
4,0,1781822,1659367871,0
...,...,...,...,...
171638752,12899774,33035,1661723968,0
171638753,12899775,1743151,1661723970,0
171638754,12899776,548599,1661723972,0
171638755,12899777,384045,1661723976,0


In [8]:
featuresPath = outputPath + f'features/set{SET}/{featureNote}/'
try: 
    os.makedirs(featuresPath) 
except OSError as error: 
    print(error)

In [10]:
def typeCount(df, userItemType, col):
    clicked_cnt = df[df.type == 0].groupby(userItemType).agg({'type':'count'})
    carted_cnt = df[df.type == 1].groupby(userItemType).agg({'type':'count'})
    ordered_cnt = df[df.type == 2].groupby(userItemType).agg({'type':'count'})
    type_count = pd.concat([clicked_cnt, carted_cnt, ordered_cnt], axis=1).fillna(0)
    type_count.columns = col
    return type_count

## Item-frequency features

In [9]:
df = train_val_df.sort_values(['session', 'ts'])
df['d'] = df['ts'].shift(-1) - df['ts']
df['n'] = df.groupby('session').d.cumcount(ascending=False)
df.loc[df.n == 0, 'd'] = 0
df = df.groupby('aid').agg({'d': ['mean', 'std']}).fillna(-1)
df.columns = ['item_stay_mean', 'item_stay_std']
df

Unnamed: 0_level_0,item_stay_mean,item_stay_std
aid,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5568.868421,33905.633312
1,7384.656250,29196.737132
2,1501.000000,4645.738822
3,9724.382295,48168.112929
4,23686.277027,133211.095684
...,...,...
1855598,98.857143,126.417863
1855599,21668.777778,45895.170230
1855600,32533.828571,106876.903389
1855601,11105.250000,53706.335439


In [10]:
tqdm.pandas()
item_features = train_val_df.sort_values('ts').groupby('aid').ts.progress_apply(lambda x: np.mean(np.diff(x))).to_frame().fillna(0)
item_features.columns = ['item_ts_diff_mean']

item_feature_std = train_val_df.sort_values('ts').groupby('aid').ts.progress_apply(lambda x: np.std(np.diff(x))).to_frame().fillna(0)
item_feature_std.columns = ['item_ts_diff_std']

item_features = pd.concat([item_features, item_feature_std, df], axis=1).fillna(0)
item_features.to_parquet(featuresPath + 'item_freq_features.pqt')
item_features

A Jupyter Widget

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


A Jupyter Widget

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0_level_0,item_ts_diff_mean,item_ts_diff_std,item_stay_mean,item_stay_std
aid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,63894.810811,80846.928396,5568.868421,33905.633312
1,33884.838710,63918.176627,7384.656250,29196.737132
2,150775.933333,230300.425909,1501.000000,4645.738822
3,1554.865486,9831.047509,9724.382295,48168.112929
4,15750.340136,28963.849178,23686.277027,133211.095684
...,...,...,...,...
1855598,266356.666667,171849.015451,98.857143,126.417863
1855599,169401.000000,95984.799755,21668.777778,45895.170230
1855600,27742.898551,42591.153877,32533.828571,106876.903389
1855601,18079.131868,61205.834523,11105.250000,53706.335439


## Item Features

In [41]:
item_features = train_val_df.groupby('aid').agg({'aid':'count', 'session':'nunique', 'type':['mean', 'std'], 'ts': ['mean', 'std', 'min', 'max']}) #, 'day': ['mean', 'std'], 'hour': ['mean', 'std'], 'weekday': ['mean', 'std']})
item_features.columns = ['item_item_count', 'item_user_count', 'item_buy_ratio', 'item_type_std', 'item_ts_mean', 'item_ts_std', 'item_ts_min', 'item_ts_max']#, \
     # 'item_day_mean', 'item_day_std','item_hour_mean', 'item_hour_std', 'item_weekday_mean', 'item_weekday_std']
item_features

Unnamed: 0_level_0,item_item_count,item_user_count,item_buy_ratio,item_type_std,item_type_median,item_ts_mean,item_ts_std,item_ts_min,item_ts_max
aid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,48,40,0.000000,0.000000,0.0,1.660616e+09,867209.703096,1659345217,1662318801
1,34,30,0.029412,0.171499,0.0,1.660429e+09,345644.457236,1659719430,1661422366
2,17,16,0.000000,0.000000,0.0,1.660758e+09,545397.956868,1659342437,1661604076
3,2759,1392,0.104023,0.350668,0.0,1.661224e+09,429296.381528,1659352803,1662194081
4,221,143,0.040724,0.198099,0.0,1.660734e+09,767694.809076,1659363904,1662305768
...,...,...,...,...,...,...,...,...,...
1855598,7,7,0.000000,0.000000,0.0,1.660585e+09,564915.780663,1659861895,1661460035
1855599,14,12,0.000000,0.000000,0.0,1.660730e+09,666984.413481,1659682918,1661773989
1855600,91,57,0.098901,0.335160,0.0,1.660588e+09,698821.329180,1659350409,1662039871
1855601,92,62,0.076087,0.266590,0.0,1.659917e+09,315167.866908,1659325875,1660971076


In [42]:
item_type_count = typeCount(train_val_df, 'aid', col=['item_clicked_cnt', 'item_carted_cnt', 'item_ordered_cnt'])
item_type_count

Unnamed: 0_level_0,item_clicked_cnt,item_carted_cnt,item_ordered_cnt
aid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,48,0.0,0.0
1,33,1.0,0.0
2,17,0.0,0.0
3,2513,205.0,41.0
4,212,9.0,0.0
...,...,...,...
1855598,7,0.0,0.0
1855599,14,0.0,0.0
1855600,83,7.0,1.0
1855601,85,7.0,0.0


In [43]:
item_features = pd.concat([item_features, item_type_count], axis=1).fillna(0)

In [44]:
item_features_inValA = val_A.groupby('aid').agg({'aid':'count', 'session':'nunique', 'type':['mean', 'std'], 'ts': ['mean', 'std', 'min']})
item_features_inValA.columns = ['item_item_count_valA', 'item_user_count_valA', 'item_buy_ratio_valA', 'item_type_std_valA', 'item_ts_mean_valA', 'item_ts_std_valA', 'item_ts_min_valA']
item_features_inValA

Unnamed: 0_level_0,item_item_count_valA,item_user_count_valA,item_buy_ratio_valA,item_type_std_valA,item_type_median_valA,item_ts_mean_valA,item_ts_std_valA,item_ts_min_valA
aid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,4,4,0.000000,0.000000,0.0,1.662143e+09,259687.115757,1661762340
3,202,115,0.123762,0.372606,0.0,1.661859e+09,77489.258060,1661727711
4,8,5,0.000000,0.000000,0.0,1.662159e+09,175355.149301,1661810405
6,1,1,0.000000,,0.0,1.661764e+09,,1661763517
7,1,1,0.000000,,0.0,1.662029e+09,,1662028769
...,...,...,...,...,...,...,...,...
1855592,5,5,0.000000,0.000000,0.0,1.662109e+09,181826.508856,1661943492
1855593,17,14,0.058824,0.242536,0.0,1.662156e+09,130894.843797,1661797387
1855594,19,10,0.052632,0.229416,0.0,1.662052e+09,161917.137647,1661836068
1855599,1,1,0.000000,,0.0,1.661774e+09,,1661773989


In [47]:
item_features = item_features.merge(item_features_inValA, on='aid', how='left').fillna(0)
item_features

Unnamed: 0_level_0,item_item_count,item_user_count,item_buy_ratio,item_type_std,item_type_median,item_ts_mean,item_ts_std,item_ts_min,item_ts_max,item_clicked_cnt,...,item_user_count_valA,item_buy_ratio_valA,item_type_std_valA,item_type_median_valA,item_ts_mean_valA,item_ts_std_valA,item_ts_min_valA,item_clicked_cnt_val,item_carted_cnt_val,item_ordered_cnt_val
aid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,48,40,0.000000,0.000000,0.0,1.660616e+09,867209.703096,1659345217,1662318801,48,...,4.0,0.000000,0.000000,0.0,1.662143e+09,259687.115757,1.661762e+09,4.0,0.0,0.0
1,34,30,0.029412,0.171499,0.0,1.660429e+09,345644.457236,1659719430,1661422366,33,...,0.0,0.000000,0.000000,0.0,0.000000e+00,0.000000,0.000000e+00,0.0,0.0,0.0
2,17,16,0.000000,0.000000,0.0,1.660758e+09,545397.956868,1659342437,1661604076,17,...,0.0,0.000000,0.000000,0.0,0.000000e+00,0.000000,0.000000e+00,0.0,0.0,0.0
3,2759,1392,0.104023,0.350668,0.0,1.661224e+09,429296.381528,1659352803,1662194081,2513,...,115.0,0.123762,0.372606,0.0,1.661859e+09,77489.258060,1.661728e+09,180.0,19.0,3.0
4,221,143,0.040724,0.198099,0.0,1.660734e+09,767694.809076,1659363904,1662305768,212,...,5.0,0.000000,0.000000,0.0,1.662159e+09,175355.149301,1.661810e+09,8.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1855598,7,7,0.000000,0.000000,0.0,1.660585e+09,564915.780663,1659861895,1661460035,7,...,0.0,0.000000,0.000000,0.0,0.000000e+00,0.000000,0.000000e+00,0.0,0.0,0.0
1855599,14,12,0.000000,0.000000,0.0,1.660730e+09,666984.413481,1659682918,1661773989,14,...,1.0,0.000000,0.000000,0.0,1.661774e+09,0.000000,1.661774e+09,1.0,0.0,0.0
1855600,91,57,0.098901,0.335160,0.0,1.660588e+09,698821.329180,1659350409,1662039871,83,...,1.0,0.500000,0.707107,0.5,1.662040e+09,3.535534,1.662040e+09,1.0,1.0,0.0
1855601,92,62,0.076087,0.266590,0.0,1.659917e+09,315167.866908,1659325875,1660971076,85,...,0.0,0.000000,0.000000,0.0,0.000000e+00,0.000000,0.000000e+00,0.0,0.0,0.0


In [48]:
## Normalized
normalizedCol = [0, 1, 9, 10, 11, 12, 13, 20, 21, 22]
timeCol = [5, 7, 8]
timeCol_val = [17, 19]
item_features.iloc[:, normalizedCol] = item_features.iloc[:, normalizedCol] - item_features.iloc[:, normalizedCol].min()
item_features.iloc[:, normalizedCol] = item_features.iloc[:, normalizedCol] / item_features.iloc[:, normalizedCol].max()
item_features.iloc[:, timeCol] = item_features.iloc[:, timeCol] - item_features.iloc[:, 8].min()
item_features.iloc[:, timeCol_val] = item_features.iloc[:, timeCol_val] - item_features.iloc[:, 19].min()
item_features = item_features.astype('float32')
item_features

  item_features.iloc[:, normalizedCol] = item_features.iloc[:, normalizedCol] - item_features.iloc[:, normalizedCol].min()


Unnamed: 0_level_0,item_item_count,item_user_count,item_buy_ratio,item_type_std,item_type_median,item_ts_mean,item_ts_std,item_ts_min,item_ts_max,item_clicked_cnt,...,item_user_count_valA,item_buy_ratio_valA,item_type_std_valA,item_type_median_valA,item_ts_mean_valA,item_ts_std_valA,item_ts_min_valA,item_clicked_cnt_val,item_carted_cnt_val,item_ordered_cnt_val
aid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.000326,0.000536,0.000000,0.000000,0.0,1.310580e+06,867209.68750,40073.0,3013657.0,0.000347,...,0.000716,0.000000,0.000000,0.0,1.662143e+09,259687.109375,1.661762e+09,0.000481,0.000000,0.000000
1,0.000225,0.000398,0.029412,0.171499,0.0,1.123860e+06,345644.46875,414286.0,2117222.0,0.000231,...,0.000000,0.000000,0.000000,0.0,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000,0.000000
2,0.000102,0.000206,0.000000,0.000000,0.0,1.452952e+06,545397.93750,37293.0,2298932.0,0.000108,...,0.000000,0.000000,0.000000,0.0,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000,0.000000
3,0.019990,0.019109,0.104023,0.350668,0.0,1.919124e+06,429296.37500,47659.0,2888937.0,0.019367,...,0.020587,0.123762,0.372606,0.0,1.661859e+09,77489.257812,1.661728e+09,0.021640,0.009495,0.040541
4,0.001581,0.001951,0.040724,0.198099,0.0,1.428526e+06,767694.81250,58760.0,3000624.0,0.001613,...,0.000895,0.000000,0.000000,0.0,1.662159e+09,175355.156250,1.661810e+09,0.000962,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1855598,0.000029,0.000082,0.000000,0.000000,0.0,1.279828e+06,564915.75000,556751.0,2154891.0,0.000031,...,0.000000,0.000000,0.000000,0.0,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000,0.000000
1855599,0.000080,0.000151,0.000000,0.000000,0.0,1.424716e+06,666984.43750,377774.0,2468845.0,0.000085,...,0.000179,0.000000,0.000000,0.0,1.661774e+09,0.000000,1.661774e+09,0.000120,0.000000,0.000000
1855600,0.000638,0.000769,0.098901,0.335160,0.0,1.282825e+06,698821.31250,45265.0,2734727.0,0.000617,...,0.000179,0.500000,0.707107,0.5,1.662040e+09,3.535534,1.662040e+09,0.000120,0.000500,0.000000
1855601,0.000646,0.000838,0.076087,0.266590,0.0,6.113956e+05,315167.87500,20731.0,1665932.0,0.000633,...,0.000000,0.000000,0.000000,0.0,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000,0.000000


In [49]:
item_features.to_parquet(featuresPath + 'item_features.pqt')

## User-frequency features

In [11]:
tqdm.pandas()
user_features = val_A.sort_values('ts').groupby('session').ts.progress_apply(lambda x: np.mean(np.diff(x))).to_frame().fillna(0)
user_features.columns = ['user_ts_diff_mean']

user_features_std = val_A.sort_values('ts').groupby('session').ts.progress_apply(lambda x: np.std(np.diff(x))).to_frame().fillna(0)
user_features_std.columns = ['user_ts_diff_std']

user_features = pd.concat([user_features, user_features_std], axis=1).fillna(0)
user_features.to_parquet(featuresPath + 'user_freq_features.pqt')
user_features

A Jupyter Widget

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


A Jupyter Widget

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0_level_0,user_ts_diff_mean,user_ts_diff_std
session,Unnamed: 1_level_1,Unnamed: 2_level_1
11098528,0.00000,0.000000
11098529,0.00000,0.000000
11098530,266.40000,254.365564
11098531,23.73913,40.235413
11098532,795.00000,0.000000
...,...,...
12899774,0.00000,0.000000
12899775,0.00000,0.000000
12899776,0.00000,0.000000
12899777,0.00000,0.000000


## User features

In [50]:
user_features = val_A.groupby('session').agg({'session':'count', 'aid':['nunique', 'last'], 'type':['mean', 'std'], 'ts':['mean', 'std', 'min', 'max']})#, 'day': ['mean', 'std'], 'hour': ['mean', 'std'], 'weekday': ['mean', 'std']}).fillna(0)
user_features.columns = ['user_user_count', 'user_item_count', 'user_lastAid', 'user_buy_ratio', 'user_type_std', 'user_ts_mean', 'user_ts_std', 'user_ts_min', 'user_ts_max']#, 'user_day_mean', 'user_day_std', 'user_hour_mean', 'user_hour_std', 'user_weekday_mean', 'user_weekday_std']
user_features

Unnamed: 0_level_0,user_user_count,user_item_count,user_lastAid,user_buy_ratio,user_type_std,user_type_median,user_ts_mean,user_ts_std,user_ts_min,user_ts_max
session,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
12899779,1,1,59625,0.000000,,0.0,1.661724e+09,,1661724000,1661724000
12899780,5,4,1142000,0.000000,0.000000,0.0,1.661724e+09,62.875273,1661724000,1661724155
12899781,11,5,918667,0.090909,0.301511,0.0,1.661839e+09,145156.110657,1661724000,1662060160
12899782,70,38,1007613,0.457143,0.695445,0.0,1.661784e+09,27593.219297,1661724000,1661803953
12899783,11,9,1817895,0.000000,0.000000,0.0,1.661770e+09,92951.660945,1661724000,1662041140
...,...,...,...,...,...,...,...,...,...,...
14571577,1,1,1141710,0.000000,,0.0,1.662329e+09,,1662328774,1662328774
14571578,1,1,519105,0.000000,,0.0,1.662329e+09,,1662328775,1662328775
14571579,1,1,739876,0.000000,,0.0,1.662329e+09,,1662328775,1662328775
14571580,1,1,202353,0.000000,,0.0,1.662329e+09,,1662328781,1662328781


In [51]:
user_type_count = typeCount(val_A, 'session', col=['user_clicked_cnt', 'user_carted_cnt', 'user_ordered_cnt'])
user_type_count

Unnamed: 0_level_0,user_clicked_cnt,user_carted_cnt,user_ordered_cnt
session,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12899779,1.0,0.0,0.0
12899780,5.0,0.0,0.0
12899781,10.0,1.0,0.0
12899782,46.0,16.0,8.0
12899783,11.0,0.0,0.0
...,...,...,...
14538485,0.0,0.0,1.0
14548594,0.0,0.0,1.0
14549316,0.0,0.0,1.0
14559742,0.0,0.0,1.0


In [52]:
user_features = pd.concat([user_features, user_type_count], axis=1).fillna(0)
user_features = user_features.astype('float32')
user_features

Unnamed: 0_level_0,user_user_count,user_item_count,user_lastAid,user_buy_ratio,user_type_std,user_type_median,user_ts_mean,user_ts_std,user_ts_min,user_ts_max,user_clicked_cnt,user_carted_cnt,user_ordered_cnt
session,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
12899779,1.0,1.0,59625.0,0.000000,0.000000,0.0,1.661724e+09,0.000000,1.661724e+09,1.661724e+09,1.0,0.0,0.0
12899780,5.0,4.0,1142000.0,0.000000,0.000000,0.0,1.661724e+09,62.875275,1.661724e+09,1.661724e+09,5.0,0.0,0.0
12899781,11.0,5.0,918667.0,0.090909,0.301511,0.0,1.661839e+09,145156.109375,1.661724e+09,1.662060e+09,10.0,1.0,0.0
12899782,70.0,38.0,1007613.0,0.457143,0.695445,0.0,1.661784e+09,27593.218750,1.661724e+09,1.661804e+09,46.0,16.0,8.0
12899783,11.0,9.0,1817895.0,0.000000,0.000000,0.0,1.661770e+09,92951.664062,1.661724e+09,1.662041e+09,11.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14571577,1.0,1.0,1141710.0,0.000000,0.000000,0.0,1.662329e+09,0.000000,1.662329e+09,1.662329e+09,1.0,0.0,0.0
14571578,1.0,1.0,519105.0,0.000000,0.000000,0.0,1.662329e+09,0.000000,1.662329e+09,1.662329e+09,1.0,0.0,0.0
14571579,1.0,1.0,739876.0,0.000000,0.000000,0.0,1.662329e+09,0.000000,1.662329e+09,1.662329e+09,1.0,0.0,0.0
14571580,1.0,1.0,202353.0,0.000000,0.000000,0.0,1.662329e+09,0.000000,1.662329e+09,1.662329e+09,1.0,0.0,0.0


In [53]:
normalizedCol = [0, 1, -1, -2, -3]
user_features.iloc[:, normalizedCol] = user_features.iloc[:, normalizedCol] - user_features.iloc[:, normalizedCol].min()
user_features.iloc[:, normalizedCol] = user_features.iloc[:, normalizedCol] / user_features.iloc[:, normalizedCol].max()
user_features[['user_ts_mean', 'user_ts_min', 'user_ts_max']] = user_features[['user_ts_mean', 'user_ts_min', 'user_ts_max']] - user_features['user_ts_min'].min()
user_features

Unnamed: 0_level_0,user_user_count,user_item_count,user_lastAid,user_buy_ratio,user_type_std,user_type_median,user_ts_mean,user_ts_std,user_ts_min,user_ts_max,user_clicked_cnt,user_carted_cnt,user_ordered_cnt
session,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
12899779,0.000000,0.000000,59625.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.002309,0.000000,0.000000
12899780,0.008753,0.006944,1142000.0,0.000000,0.000000,0.0,0.0,62.875275,0.0,128.0,0.011547,0.000000,0.000000
12899781,0.021882,0.009259,918667.0,0.090909,0.301511,0.0,114560.0,145156.109375,0.0,336128.0,0.023095,0.004566,0.000000
12899782,0.150985,0.085648,1007613.0,0.457143,0.695445,0.0,59648.0,27593.218750,0.0,79872.0,0.106236,0.073059,0.190476
12899783,0.021882,0.018519,1817895.0,0.000000,0.000000,0.0,46336.0,92951.664062,0.0,317056.0,0.025404,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14571577,0.000000,0.000000,1141710.0,0.000000,0.000000,0.0,604800.0,0.000000,604800.0,604800.0,0.002309,0.000000,0.000000
14571578,0.000000,0.000000,519105.0,0.000000,0.000000,0.0,604800.0,0.000000,604800.0,604800.0,0.002309,0.000000,0.000000
14571579,0.000000,0.000000,739876.0,0.000000,0.000000,0.0,604800.0,0.000000,604800.0,604800.0,0.002309,0.000000,0.000000
14571580,0.000000,0.000000,202353.0,0.000000,0.000000,0.0,604800.0,0.000000,604800.0,604800.0,0.002309,0.000000,0.000000


In [54]:
user_features.to_parquet(featuresPath + 'user_features.pqt')

## User Item interaction feature

In [55]:
userItem_features = typeCount(val_A, ['session', 'aid'], ['cnt_clicked', 'cnt_carted', 'cnt_ordered'])
userItem_features = userItem_features.astype('int8')
userItem_features

Unnamed: 0_level_0,Unnamed: 1_level_0,cnt_clicked,cnt_carted,cnt_ordered
session,aid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12899779,59625,1,0,0
12899780,582732,1,0,0
12899780,736515,1,0,0
12899780,973453,1,0,0
12899780,1142000,2,0,0
...,...,...,...,...
14567528,172423,0,0,1
14567528,1708491,0,0,1
14568250,422075,0,0,1
14568250,471339,0,0,1


In [56]:
userItem_features.to_parquet(featuresPath + 'userItem_features.pqt')