In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read Items
item_df = pd.read_csv('data/train/item_data.csv', index_col='item_id')
item_df.head()

Unnamed: 0_level_0,item_price,category_1,category_2,category_3,product_type
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
26880,4602,11,35,20,3040
54939,3513,12,57,85,6822
40383,825,17,8,279,1619
8777,2355,13,58,189,5264
113705,1267,17,39,151,10239


In [3]:
# Reading view logs
user_log_df = pd.read_csv('data/train/view_log.csv', parse_dates=['server_time'])
user_log_df.device_type = user_log_df.device_type.astype('category')
user_log_df.head()

Unnamed: 0,server_time,device_type,session_id,user_id,item_id
0,2018-10-15 08:58:00,android,112333,4557,32970
1,2018-10-15 08:58:00,android,503590,74788,7640
2,2018-10-15 08:58:00,android,573960,23628,128855
3,2018-10-15 08:58:00,android,121691,2430,12774
4,2018-10-15 08:58:00,android,218564,19227,28296


In [4]:
# Merging items and view_logs
expanded_user_logs = pd.merge(user_log_df, item_df, how='inner', on='item_id')
expanded_user_logs.head()

Unnamed: 0,server_time,device_type,session_id,user_id,item_id,item_price,category_1,category_2,category_3,product_type
0,2018-10-15 08:58:00,android,112333,4557,32970,54685,16,56,253,3184
1,2018-10-15 09:36:00,android,783457,88320,32970,54685,16,56,253,3184
2,2018-10-15 10:59:00,android,6902,1711,32970,54685,16,56,253,3184
3,2018-10-15 11:31:00,android,61138,58906,32970,54685,16,56,253,3184
4,2018-10-15 12:03:00,android,441653,64221,32970,54685,16,56,253,3184


In [5]:
def most_frequent(x):
    return x.value_counts().index[0]

In [10]:
aggregator = {
    'item_price': [np.max, np.min],
    'item_id': ['count','nunique'],
    'session_id': 'nunique',
    'product_type': most_frequent,
    'category_1': most_frequent,
    'category_2': most_frequent,
    'category_3': most_frequent,
    'device_type': most_frequent
}

In [11]:
cols = ['max_price','min_price','total_items','total_unique_items','total_sessions','freq_product_type'
        ,'freq_category_1','freq_category_2','freq_category_3','freq_device']
user_details = expanded_user_logs.groupby('user_id').agg(aggregator)
user_details.columns = cols

for col in ['freq_product_type','freq_category_1','freq_category_2','freq_category_3','freq_device']:
    user_details[col] = user_details[col].astype('category')

user_details['freq_device'] = user_details['freq_device'].replace({'android':0,'iphone':1,'web':2})

user_details.head()

Unnamed: 0_level_0,max_price,min_price,total_items,total_unique_items,total_sessions,freq_product_type,freq_category_1,freq_category_2,freq_category_3,freq_device
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,92160,332,42,18,11,7363,14,29,199,0
1,12595,383,8,8,3,7391,11,76,147,0
2,281536,128,165,130,37,8127,1,42,279,0
3,16640,537,8,3,1,6659,14,62,157,0
4,58252,1977,2,2,1,4426,14,3,27,0


In [12]:
user_details.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 89152 entries, 0 to 92586
Data columns (total 10 columns):
max_price             89152 non-null int64
min_price             89152 non-null int64
total_items           89152 non-null int64
total_unique_items    89152 non-null int64
total_sessions        89152 non-null int64
freq_product_type     89152 non-null category
freq_category_1       89152 non-null category
freq_category_2       89152 non-null category
freq_category_3       89152 non-null category
freq_device           89152 non-null object
dtypes: category(4), int64(5), object(1)
memory usage: 5.5+ MB


In [13]:
user_details.to_csv('data/train/user_details.csv')

In [47]:
user_details = pd.read_csv('data/train/user_details.csv', index_col='user_id').reset_index()
user_details.head()

Unnamed: 0,user_id,max_price,min_price,total_items,total_unique_items,total_sessions,freq_product_type,freq_category_1,freq_category_2,freq_category_3,freq_device
0,0,92160,332,42,18,11,7363,14,29,199,0
1,1,12595,383,8,8,3,7391,11,76,147,0
2,2,281536,128,165,130,37,8127,1,42,279,0
3,3,16640,537,8,3,1,6659,14,62,157,0
4,4,58252,1977,2,2,1,4426,14,3,27,0


In [2]:
training_data = pd.read_csv('data/train/train.csv', index_col='impression_id', parse_dates=['impression_time'])
training_data.os_version = training_data.os_version.replace({'old':0,'intermediate':1,'latest':2})
training_data.head()

Unnamed: 0_level_0,impression_time,user_id,app_code,os_version,is_4G,is_click
impression_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
c4ca4238a0b923820dcc509a6f75849b,2018-11-15 00:00:00,87862,422,0,0,0
45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,2,1,1
70efdf2ec9b086079795c442636b55fb,2018-11-15 00:02:00,71748,259,1,1,0
8e296a067a37563370ded05f5a3bf3ec,2018-11-15 00:02:00,69209,244,2,1,0
182be0c5cdcd5072bb1864cdee4d3d6e,2018-11-15 00:02:00,62873,473,2,0,0


In [3]:
app_code_wise_history = training_data.set_index(['user_id','app_code','impression_time']).sort_index()
app_code_wise_history = app_code_wise_history[['is_click']]
app_code_wise_history.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,is_click
user_id,app_code,impression_time,Unnamed: 3_level_1
0,207,2018-11-26 23:30:00,0
2,190,2018-11-19 20:49:00,0
2,190,2018-11-20 20:29:00,0
2,190,2018-11-20 20:53:00,0
2,190,2018-11-21 21:47:00,0


In [31]:
general_history = training_data.set_index(['user_id','impression_time']).sort_index()
general_history = general_history[['is_click']]
general_history.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,is_click
user_id,impression_time,Unnamed: 2_level_1
0,2018-11-26 23:30:00,0
2,2018-11-19 20:49:00,0
2,2018-11-20 20:29:00,0
2,2018-11-20 20:53:00,0
2,2018-11-21 21:47:00,0


In [8]:
def calculate_click_ratio(records):
    total_ads = records.shape[0]
    total_clicks = records.is_click.sum()
    return (total_clicks / total_ads) if total_ads != 0 else 0.0

In [41]:
def calculate_app_wise_click_ratio(row):
    try:
        records = app_code_wise_history.loc[(row.user_id, row.app_code), :][:row.impression_time][:-1]
        return calculate_click_ratio(records)
    except KeyError:
        return 0.0

In [42]:
def calculate_general_click_ratio(row):
    try:
        records = general_history.loc[row.user_id, :][:row.impression_time][:-1]
        return calculate_click_ratio(records)
    except KeyError:
        return 0.0

In [34]:
training_data['app_click_ratio'] = training_data.apply(calculate_app_wise_click_ratio, axis=1)
training_data['overall_click_ratio'] = training_data.apply(calculate_general_click_ratio, axis=1)
training_data.head()

Unnamed: 0_level_0,impression_time,user_id,app_code,os_version,is_4G,is_click,app_click_ratio,overall_click_ratio
impression_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
c4ca4238a0b923820dcc509a6f75849b,2018-11-15 00:00:00,87862,422,0,0,0,0.0,0.0
45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,2,1,1,0.0,0.0
70efdf2ec9b086079795c442636b55fb,2018-11-15 00:02:00,71748,259,1,1,0,0.0,0.0
8e296a067a37563370ded05f5a3bf3ec,2018-11-15 00:02:00,69209,244,2,1,0,0.0,0.0
182be0c5cdcd5072bb1864cdee4d3d6e,2018-11-15 00:02:00,62873,473,2,0,0,0.0,0.0


In [50]:
expanded_training_data = training_data.reset_index().merge(user_details, how='left', on='user_id')
expanded_training_data = expanded_training_data.set_index('impression_id')
expanded_training_data = expanded_training_data.fillna(0)
expanded_training_data.head()

Unnamed: 0_level_0,impression_time,user_id,app_code,os_version,is_4G,is_click,app_click_ratio,overall_click_ratio,max_price,min_price,total_items,total_unique_items,total_sessions,freq_product_type,freq_category_1,freq_category_2,freq_category_3,freq_device
impression_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
c4ca4238a0b923820dcc509a6f75849b,2018-11-15 00:00:00,87862,422,0,0,0,0.0,0.0,2350.0,2350.0,1.0,1.0,1.0,5622.0,11.0,35.0,20.0,0.0
45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,2,1,1,0.0,0.0,14166.0,1024.0,12.0,7.0,7.0,577.0,4.0,74.0,292.0,0.0
70efdf2ec9b086079795c442636b55fb,2018-11-15 00:02:00,71748,259,1,1,0,0.0,0.0,2224.0,973.0,2.0,2.0,2.0,2874.0,13.0,67.0,139.0,0.0
8e296a067a37563370ded05f5a3bf3ec,2018-11-15 00:02:00,69209,244,2,1,0,0.0,0.0,93568.0,249.0,18.0,12.0,7.0,7093.0,14.0,61.0,159.0,0.0
182be0c5cdcd5072bb1864cdee4d3d6e,2018-11-15 00:02:00,62873,473,2,0,0,0.0,0.0,117376.0,288.0,46.0,34.0,24.0,1961.0,17.0,8.0,279.0,0.0


In [51]:
expanded_training_data.to_csv('data/train/train_with_feature.csv')

In [39]:
testing_data = pd.read_csv('data/test/test.csv', index_col='impression_id', parse_dates=['impression_time'])
testing_data.os_version = testing_data.os_version.replace({'old':0,'intermediate':1,'latest':2})
testing_data.head()

Unnamed: 0_level_0,impression_time,user_id,app_code,os_version,is_4G
impression_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a9e7126a585a69a32bc7414e9d0c0ada,2018-12-13 07:44:00,44754,127,2,1
caac14a5bf2ba283db7708bb34855760,2018-12-13 07:45:00,29656,44,2,0
13f10ba306a19ce7bec2f3cae507b698,2018-12-13 07:46:00,25234,296,2,1
39c4b4dc0e9701b55a0a4f072008fb3f,2018-12-13 07:47:00,22988,207,2,1
bf5a572cca75f5fc67f4b14e58b11d70,2018-12-13 07:48:00,35431,242,2,1


In [43]:
testing_data['app_click_ratio'] = testing_data.apply(calculate_app_wise_click_ratio, axis=1)
testing_data['overall_click_ratio'] = testing_data.apply(calculate_general_click_ratio, axis=1)
testing_data.head()

Unnamed: 0_level_0,impression_time,user_id,app_code,os_version,is_4G,app_click_ratio,overall_click_ratio
impression_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
a9e7126a585a69a32bc7414e9d0c0ada,2018-12-13 07:44:00,44754,127,2,1,0.0,0.0
caac14a5bf2ba283db7708bb34855760,2018-12-13 07:45:00,29656,44,2,0,0.0,0.0
13f10ba306a19ce7bec2f3cae507b698,2018-12-13 07:46:00,25234,296,2,1,0.0,0.0
39c4b4dc0e9701b55a0a4f072008fb3f,2018-12-13 07:47:00,22988,207,2,1,0.0,0.0
bf5a572cca75f5fc67f4b14e58b11d70,2018-12-13 07:48:00,35431,242,2,1,0.210526,0.210526


In [52]:
expanded_testing_data = testing_data.reset_index().merge(user_details, how='left', on='user_id')
expanded_testing_data = expanded_testing_data.set_index('impression_id')
expanded_testing_data = expanded_testing_data.fillna(0)
expanded_testing_data.head()

Unnamed: 0_level_0,impression_time,user_id,app_code,os_version,is_4G,app_click_ratio,overall_click_ratio,max_price,min_price,total_items,total_unique_items,total_sessions,freq_product_type,freq_category_1,freq_category_2,freq_category_3,freq_device
impression_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
a9e7126a585a69a32bc7414e9d0c0ada,2018-12-13 07:44:00,44754,127,2,1,0.0,0.0,23424.0,23424.0,1.0,1.0,1.0,1617.0,1.0,42.0,220.0,0.0
caac14a5bf2ba283db7708bb34855760,2018-12-13 07:45:00,29656,44,2,0,0.0,0.0,54681.0,207.0,72.0,46.0,25.0,2637.0,1.0,42.0,157.0,0.0
13f10ba306a19ce7bec2f3cae507b698,2018-12-13 07:46:00,25234,296,2,1,0.0,0.0,20309.0,1238.0,9.0,7.0,6.0,2111.0,9.0,46.0,4.0,0.0
39c4b4dc0e9701b55a0a4f072008fb3f,2018-12-13 07:47:00,22988,207,2,1,0.0,0.0,62976.0,249.0,37.0,17.0,12.0,2455.0,9.0,73.0,159.0,0.0
bf5a572cca75f5fc67f4b14e58b11d70,2018-12-13 07:48:00,35431,242,2,1,0.210526,0.210526,43392.0,435.0,28.0,26.0,18.0,5000.0,11.0,35.0,148.0,0.0


In [53]:
expanded_testing_data.to_csv('data/test/test_with_feature.csv')

In [12]:

expanded_training_data.head()

Unnamed: 0_level_0,impression_time,user_id,app_code,os_version,is_4G,is_click,app_click_ratio,overall_click_ratio,max_price,min_price,total_items,total_unique_items,total_sessions,freq_product_type,freq_category_1,freq_category_2,freq_category_3,freq_device
impression_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
c4ca4238a0b923820dcc509a6f75849b,2018-11-15 00:00:00,87862,422,0,0,0,0.0,0.0,2350.0,2350.0,1.0,1.0,1.0,5622.0,11.0,35.0,20.0,0.0
45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,2,1,1,0.0,0.0,14166.0,1024.0,12.0,7.0,7.0,577.0,4.0,74.0,292.0,0.0
70efdf2ec9b086079795c442636b55fb,2018-11-15 00:02:00,71748,259,1,1,0,0.0,0.0,2224.0,973.0,2.0,2.0,2.0,2874.0,13.0,67.0,139.0,0.0
8e296a067a37563370ded05f5a3bf3ec,2018-11-15 00:02:00,69209,244,2,1,0,0.0,0.0,93568.0,249.0,18.0,12.0,7.0,7093.0,14.0,61.0,159.0,0.0
182be0c5cdcd5072bb1864cdee4d3d6e,2018-11-15 00:02:00,62873,473,2,0,0,0.0,0.0,117376.0,288.0,46.0,34.0,24.0,1961.0,17.0,8.0,279.0,0.0


In [54]:
idealTime = pd.Timestamp('2017-11-15 00:00:00')
def last_ad_seen(row):
    try:
        records = general_history.loc[row.user_id, :][:row.impression_time][-3:-2]
        last_seen = records.index if len(records) > 0 else idealTime
        return (pd.Timestamp(row.impression_time) - last_seen).total_seconds()
    except KeyError:
        return (pd.Timestamp(row.impression_time) - idealTime).total_seconds()

In [53]:
records = general_history.loc[84559, :][:'2018-11-21 00:27:00'][-3:-2]
# records = general_history.loc[87862, :][:'2017-11-15 00:00:00'][-3:-2]
(pd.Timestamp('2018-11-21 00:27:00') - records.index).total_seconds()

Float64Index([2280.0], dtype='float64', name='impression_time')

In [55]:
expanded_training_data = pd.read_csv('data/train/train_with_feature.csv', index_col='impression_id', 
                          parse_dates=['impression_time'])
expanded_training_data['last_ad_seen'] = expanded_training_data.apply(last_ad_seen, axis=1)
expanded_training_data.head()

Unnamed: 0_level_0,impression_time,user_id,app_code,os_version,is_4G,is_click,app_click_ratio,overall_click_ratio,max_price,min_price,total_items,total_unique_items,total_sessions,freq_product_type,freq_category_1,freq_category_2,freq_category_3,freq_device,has_seen_before,last_ad_seen
impression_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
c4ca4238a0b923820dcc509a6f75849b,2018-11-15 00:00:00,87862,422,0,0,0,0.0,0.0,2350.0,2350.0,1.0,1.0,1.0,5622.0,11.0,35.0,20.0,0.0,0,31536000.0
45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,2,1,1,0.0,0.0,14166.0,1024.0,12.0,7.0,7.0,577.0,4.0,74.0,292.0,0.0,0,31536100.0
70efdf2ec9b086079795c442636b55fb,2018-11-15 00:02:00,71748,259,1,1,0,0.0,0.0,2224.0,973.0,2.0,2.0,2.0,2874.0,13.0,67.0,139.0,0.0,0,31536100.0
8e296a067a37563370ded05f5a3bf3ec,2018-11-15 00:02:00,69209,244,2,1,0,0.0,0.0,93568.0,249.0,18.0,12.0,7.0,7093.0,14.0,61.0,159.0,0.0,0,31536100.0
182be0c5cdcd5072bb1864cdee4d3d6e,2018-11-15 00:02:00,62873,473,2,0,0,0.0,0.0,117376.0,288.0,46.0,34.0,24.0,1961.0,17.0,8.0,279.0,0.0,0,31536100.0


In [22]:
expanded_training_data[expanded_training_data.has_seen_before == 1].head()

Unnamed: 0_level_0,impression_time,user_id,app_code,os_version,is_4G,is_click,app_click_ratio,overall_click_ratio,max_price,min_price,total_items,total_unique_items,total_sessions,freq_product_type,freq_category_1,freq_category_2,freq_category_3,freq_device,has_seen_before
impression_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
7647966b7343c29048673252e490f736,2018-11-15 00:06:00,62357,385,2,0,0,0.0,0.0,42880.0,320.0,69.0,35.0,19.0,5358.0,13.0,58.0,173.0,0.0,1
73278a4a86960eeb576a8fd4c9ec6997,2018-11-15 00:08:00,71748,259,1,1,0,0.0,0.0,2224.0,973.0,2.0,2.0,2.0,2874.0,13.0,67.0,139.0,0.0,1
4c56ff4ce4aaf9573aa5dff913df997a,2018-11-15 00:09:00,75546,275,0,0,0,0.0,0.0,133120.0,384.0,20.0,14.0,9.0,578.0,12.0,57.0,171.0,0.0,1
d1f491a404d6854880943e5c3cd9ca25,2018-11-15 00:09:00,4238,371,2,0,0,0.0,0.0,766720.0,80.0,126.0,61.0,41.0,231.0,17.0,40.0,84.0,0.0,1
3636638817772e42b59d74cff571fbb3,2018-11-15 00:12:00,43537,469,2,0,0,0.0,0.0,33216.0,121.0,20.0,18.0,10.0,8310.0,9.0,42.0,27.0,0.0,1


In [56]:
expanded_training_data.to_csv('data/train/train_with_feature.csv')

In [57]:
expanded_testing_data = pd.read_csv('data/test/test_with_feature.csv', index_col='impression_id', 
                          parse_dates=['impression_time'])
expanded_testing_data['last_ad_seen'] = expanded_testing_data.apply(last_ad_seen, axis=1)
expanded_testing_data.head()

Unnamed: 0_level_0,impression_time,user_id,app_code,os_version,is_4G,app_click_ratio,overall_click_ratio,max_price,min_price,total_items,total_unique_items,total_sessions,freq_product_type,freq_category_1,freq_category_2,freq_category_3,freq_device,has_seen_before,last_ad_seen
impression_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
a9e7126a585a69a32bc7414e9d0c0ada,2018-12-13 07:44:00,44754,127,2,1,0.0,0.0,23424.0,23424.0,1.0,1.0,1.0,1617.0,1.0,42.0,220.0,0.0,0,3.3983e+07
caac14a5bf2ba283db7708bb34855760,2018-12-13 07:45:00,29656,44,2,0,0.0,0.0,54681.0,207.0,72.0,46.0,25.0,2637.0,1.0,42.0,157.0,0.0,0,[1210200.0]
13f10ba306a19ce7bec2f3cae507b698,2018-12-13 07:46:00,25234,296,2,1,0.0,0.0,20309.0,1238.0,9.0,7.0,6.0,2111.0,9.0,46.0,4.0,0.0,0,3.39832e+07
39c4b4dc0e9701b55a0a4f072008fb3f,2018-12-13 07:47:00,22988,207,2,1,0.0,0.0,62976.0,249.0,37.0,17.0,12.0,2455.0,9.0,73.0,159.0,0.0,0,[818400.0]
bf5a572cca75f5fc67f4b14e58b11d70,2018-12-13 07:48:00,35431,242,2,1,0.210526,0.210526,43392.0,435.0,28.0,26.0,18.0,5000.0,11.0,35.0,148.0,0.0,0,[1109280.0]


In [58]:
expanded_testing_data.to_csv('data/test/test_with_feature.csv')

In [60]:
expanded_training_data.sort_values('last_ad_seen', ).tail()

Unnamed: 0_level_0,impression_time,user_id,app_code,os_version,is_4G,is_click,app_click_ratio,overall_click_ratio,max_price,min_price,total_items,total_unique_items,total_sessions,freq_product_type,freq_category_1,freq_category_2,freq_category_3,freq_device,has_seen_before,last_ad_seen
impression_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
e6baabfc905146aa995c29e4b299c82e,2018-12-13 07:21:00,51756,207,2,0,0,0.0,0.0,2355.0,2355.0,1.0,1.0,1.0,2874.0,13.0,67.0,170.0,0.0,0,33981700.0
dee7f0842c44025588c2360042eef2cf,2018-12-13 07:22:00,89765,207,2,1,0,0.0,0.0,182144.0,505.0,24.0,21.0,15.0,6835.0,1.0,42.0,292.0,0.0,0,33981700.0
d5933a70c5da3152b798aa66e3bde144,2018-12-13 07:27:00,43386,386,1,0,0,0.0,0.0,287936.0,128.0,141.0,97.0,20.0,2874.0,9.0,42.0,62.0,0.0,0,33982000.0
71b3ab4d5d64a2c7626c2db5370037e6,2018-12-13 07:28:00,2699,207,1,0,0,0.0,0.0,27201.0,249.0,29.0,18.0,14.0,7640.0,1.0,59.0,180.0,0.0,0,33982100.0
13e160b8955fdfa6609b7c417435a208,2018-12-13 07:31:00,42311,296,2,0,0,0.0,0.0,95872.0,64.0,202.0,113.0,38.0,907.0,9.0,57.0,18.0,0.0,0,33982300.0


In [11]:
previous_time = pd.Timestamp('2018-11-20 20:53:00') - pd.Timedelta('20 min')
app_code_wise_history.loc[(2,190),:][previous_time:'2018-11-20 20:53:00']

Unnamed: 0_level_0,is_click
impression_time,Unnamed: 1_level_1
2018-11-20 20:53:00,0
