In [1]:
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

columns_required = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']
path= '~/.kaggle/competitions/talkingdata-adtracking-fraud-detection/'
# Load subset of the training data
X_train = pd.read_csv('../input/train.csv',nrows=100000,usecols=columns_required,parse_dates=['click_time'])

# Show the head of the table
X_train.head()
X_train.shape

(100000, 7)

In [2]:
# segregating the click_time column into day, hour, minute and second

In [3]:
X_train['day'] = X_train['click_time'].dt.day.astype('uint8')
X_train['hour'] = X_train['click_time'].dt.hour.astype('uint8')
X_train['minute'] = X_train['click_time'].dt.minute.astype('uint8')
X_train['second'] = X_train['click_time'].dt.second.astype('uint8')
X_train.head()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,day,hour,minute,second
0,83230,3,1,13,379,2017-11-06 14:32:21,0,6,14,32,21
1,17357,3,1,19,379,2017-11-06 14:33:34,0,6,14,33,34
2,35810,3,1,13,379,2017-11-06 14:34:12,0,6,14,34,12
3,45745,14,1,13,478,2017-11-06 14:34:52,0,6,14,34,52
4,161007,3,1,13,379,2017-11-06 14:35:08,0,6,14,35,8


In [4]:
X_train.loc[:, 'device'] = X_train.loc[:, 'device'].astype(np.int16)
X_train.loc[:, 'os'] = X_train.loc[:, 'os'].astype(np.int16)
X_train.loc[:, 'channel'] = X_train.loc[:, 'channel'].astype(np.int16)
X_train.loc[:, 'is_attributed'] = X_train.loc[:, 'is_attributed'].astype(np.int8)
X_train.loc[:, 'ip'] = X_train.loc[:, 'ip'].astype(np.int8)
X_train.loc[:, 'app'] = X_train.loc[:, 'app'].astype(np.int8)


In [5]:
X_train.shape

(100000, 11)

In [6]:
columns_required = ['ip', 'app', 'device', 'os', 'channel', 'click_time']

# Load subset of the training data
X_test = pd.read_csv('../input/test.csv',nrows=100000,usecols=columns_required,parse_dates=['click_time'])
X_test.fillna(X_test.mean(), inplace=True)
# Show the head of the table
X_test.head()
X_test.shape

(100000, 6)

In [7]:
X_test['day'] = X_test['click_time'].dt.day.astype('uint8')
X_test['hour'] = X_test['click_time'].dt.hour.astype('uint8')
X_test['minute'] = X_test['click_time'].dt.minute.astype('uint8')
X_test['second'] = X_test['click_time'].dt.second.astype('uint8')
X_test.head()

Unnamed: 0,ip,app,device,os,channel,click_time,day,hour,minute,second
0,5744,9,1,3,107,2017-11-10 04:00:00,10,4,0,0
1,119901,9,1,3,466,2017-11-10 04:00:00,10,4,0,0
2,72287,21,1,19,128,2017-11-10 04:00:00,10,4,0,0
3,78477,15,1,13,111,2017-11-10 04:00:00,10,4,0,0
4,123080,12,1,13,328,2017-11-10 04:00:00,10,4,0,0


In [8]:
X_test.loc[:, 'device'] = X_test.loc[:, 'device'].astype(np.int16)
X_test.loc[:, 'os'] = X_test.loc[:, 'os'].astype(np.int16)
X_test.loc[:, 'channel'] = X_test.loc[:, 'channel'].astype(np.int16)
X_test.loc[:, 'ip'] = X_test.loc[:, 'ip'].astype(np.int8)
X_test.loc[:, 'app'] = X_test.loc[:, 'app'].astype(np.int8)


In [9]:
ATTRIBUTION_CATEGORIES = [        
    # Group-1 Features 
    ['ip'], ['app'], ['device'], ['os'], ['channel'],
    
    # Group-2 Features
    ['app', 'channel'],
    ['app', 'os'],
    ['app', 'device'],
    
    # Group-3 Features
    ['channel', 'os'],
    ['channel', 'device'],
    ['os', 'device']
]


In [10]:
# Find frequency of is_attributed for each unique value in column in train data
freqs = {}
for cols in ATTRIBUTION_CATEGORIES:
    
    # New feature name
    new_feature = '_'.join(cols)+'_confRate'    
    
    # Perform the groupby
    group_object = X_train.groupby(cols)
    
    # Group sizes    
    group_sizes = group_object.size()
    log_group = np.log(100000) 
    print(">> Calculating confidence-weighted rate for: {}.\n   Saving to: {}. Group Max /Mean / Median / Min: {} / {} / {} / {}".format(
        cols, new_feature, 
        group_sizes.max(), 
        np.round(group_sizes.mean(), 2),
        np.round(group_sizes.median(), 2),
        group_sizes.min()
    ))
    
    # Aggregation function
    def rate_calculation(x):
        """Calculate the attributed rate. Scale by confidence"""
        rate = x.sum() / float(x.count())
        conf = np.min([1, np.log(x.count()) / log_group])
        return rate * conf
    
    # Merge operation
    X_train = X_train.merge(
        group_object['is_attributed']. \
            apply(rate_calculation). \
            reset_index(). \
            rename( 
                index=str,
                columns={'is_attributed': new_feature}
            )[cols + [new_feature]],
        on=cols, how='left'
    )
    
X_train.head()

>> Calculating confidence-weighted rate for: ['ip'].
   Saving to: ip_confRate. Group Max /Mean / Median / Min: 1059 / 390.62 / 375.0 / 205
>> Calculating confidence-weighted rate for: ['app'].
   Saving to: app_confRate. Group Max /Mean / Median / Min: 13280 / 909.09 / 6.0 / 1
>> Calculating confidence-weighted rate for: ['device'].
   Saving to: device_confRate. Group Max /Mean / Median / Min: 94397 / 1694.92 / 1.0 / 1
>> Calculating confidence-weighted rate for: ['os'].
   Saving to: os_confRate. Group Max /Mean / Median / Min: 23957 / 1052.63 / 67.0 / 1
>> Calculating confidence-weighted rate for: ['channel'].
   Saving to: channel_confRate. Group Max /Mean / Median / Min: 10582 / 751.88 / 188.0 / 1
>> Calculating confidence-weighted rate for: ['app', 'channel'].
   Saving to: app_channel_confRate. Group Max /Mean / Median / Min: 10015 / 316.46 / 24.5 / 1
>> Calculating confidence-weighted rate for: ['app', 'os'].
   Saving to: app_os_confRate. Group Max /Mean / Median / Min: 3236 

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,day,hour,minute,second,ip_confRate,app_confRate,device_confRate,os_confRate,channel_confRate,app_channel_confRate,app_os_confRate,app_device_confRate,channel_os_confRate,channel_device_confRate,os_device_confRate
0,30,3,1,13,379,2017-11-06 14:32:21,0,6,14,32,21,0.001306,0.000296,0.001307,0.001149,0.0,0.0,0.00027,0.000306,0.0,0.0,0.001187
1,-51,3,1,19,379,2017-11-06 14:33:34,0,6,14,33,34,0.0,0.000296,0.001307,0.001389,0.0,0.0,0.0,0.000306,0.0,0.0,0.001429
2,-30,3,1,13,379,2017-11-06 14:34:12,0,6,14,34,12,0.0,0.000296,0.001307,0.001149,0.0,0.0,0.00027,0.000306,0.0,0.0,0.001187
3,-79,14,1,13,478,2017-11-06 14:34:52,0,6,14,34,52,0.000671,0.000685,0.001307,0.001149,0.0,0.0,0.0,0.000698,0.0,0.0,0.001187
4,-17,3,1,13,379,2017-11-06 14:35:08,0,6,14,35,8,0.0,0.000296,0.001307,0.001149,0.0,0.0,0.00027,0.000306,0.0,0.0,0.001187


In [11]:
#Test Data
freqs = {}
for cols in ATTRIBUTION_CATEGORIES:
    
    # New feature name
    new_feature = '_'.join(cols)+'_confRate'    
    
    # Perform the groupby
    group_object = X_test.groupby(cols)
    
    # Group sizes    
    group_sizes = group_object.size()
    log_group = np.log(100000) # 1000 views -> 60% confidence, 100 views -> 40% confidence 
    print(">> Calculating confidence-weighted rate for: {}.\n   Saving to: {}. Group Max /Mean / Median / Min: {} / {} / {} / {}".format(
        cols, new_feature, 
        group_sizes.max(), 
        np.round(group_sizes.mean(), 2),
        np.round(group_sizes.median(), 2),
        group_sizes.min()
    ))


>> Calculating confidence-weighted rate for: ['ip'].
   Saving to: ip_confRate. Group Max /Mean / Median / Min: 839 / 390.62 / 380.0 / 222
>> Calculating confidence-weighted rate for: ['app'].
   Saving to: app_confRate. Group Max /Mean / Median / Min: 15358 / 787.4 / 4.0 / 1
>> Calculating confidence-weighted rate for: ['device'].
   Saving to: device_confRate. Group Max /Mean / Median / Min: 95400 / 1041.67 / 1.0 / 1
>> Calculating confidence-weighted rate for: ['os'].
   Saving to: os_confRate. Group Max /Mean / Median / Min: 24304 / 952.38 / 65.0 / 1
>> Calculating confidence-weighted rate for: ['channel'].
   Saving to: channel_confRate. Group Max /Mean / Median / Min: 6042 / 689.66 / 272.0 / 1
>> Calculating confidence-weighted rate for: ['app', 'channel'].
   Saving to: app_channel_confRate. Group Max /Mean / Median / Min: 4746 / 265.25 / 12.0 / 1
>> Calculating confidence-weighted rate for: ['app', 'os'].
   Saving to: app_os_confRate. Group Max /Mean / Median / Min: 3716 / 60.

In [12]:
# Define all the groupby transformations
GROUPBY_AGGREGATIONS = [
    
    # Group-1 - GroupBy Features    
    # Variance in day, for ip-app-channel
    {'groupby': ['ip','app','channel'], 'select': 'day', 'agg': 'var'},
    # Variance in hour, for ip-app-os
    {'groupby': ['ip','app','os'], 'select': 'hour', 'agg': 'var'},
    # Variance in hour, for ip-day-channel
    {'groupby': ['ip','day','channel'], 'select': 'hour', 'agg': 'var'},
    # Count, for ip-day-hour
    {'groupby': ['ip','day','hour'], 'select': 'channel', 'agg': 'count'},
    # Count, for ip-app
    {'groupby': ['ip', 'app'], 'select': 'channel', 'agg': 'count'},        
    # Count, for ip-app-os
    {'groupby': ['ip', 'app', 'os'], 'select': 'channel', 'agg': 'count'},
    # Count, for ip-app-day-hour
    {'groupby': ['ip','app','day','hour'], 'select': 'channel', 'agg': 'count'},
    # Mean hour, for ip-app-channel
    {'groupby': ['ip','app','channel'], 'select': 'hour', 'agg': 'mean'}, 
    
    # Group-2 - GroupBy Features 
    # Average clicks on app by distinct users; is it an app they return to?
    {'groupby': ['app'], 
     'select': 'ip', 
     'agg': lambda x: float(len(x)) / len(x.unique()), 
     'agg_name': 'AvgViewPerDistinct'
    },
    # How popular is the app or channel?
    {'groupby': ['app'], 'select': 'channel', 'agg': 'count'},
    {'groupby': ['channel'], 'select': 'app', 'agg': 'count'},
    
    # Group-3 - GroupBy Features     
    # Reference from https://www.kaggle.com/bk0000/non-blending-lightgbm-model-lb-0-977 
    {'groupby': ['ip'], 'select': 'channel', 'agg': 'nunique'}, 
    {'groupby': ['ip'], 'select': 'app', 'agg': 'nunique'}, 
    {'groupby': ['ip','day'], 'select': 'hour', 'agg': 'nunique'}, 
    {'groupby': ['ip','app'], 'select': 'os', 'agg': 'nunique'}, 
    {'groupby': ['ip'], 'select': 'device', 'agg': 'nunique'}, 
    {'groupby': ['app'], 'select': 'channel', 'agg': 'nunique'}, 
    {'groupby': ['ip', 'device', 'os'], 'select': 'app', 'agg': 'nunique'}, 
    {'groupby': ['ip','device','os'], 'select': 'app', 'agg': 'cumcount'}, 
    {'groupby': ['ip'], 'select': 'app', 'agg': 'cumcount'}, 
    {'groupby': ['ip'], 'select': 'os', 'agg': 'cumcount'}, 
    {'groupby': ['ip','day','channel'], 'select': 'hour', 'agg': 'var'}   
    
]

In [13]:
# Apply all the groupby transformations
for spec in GROUPBY_AGGREGATIONS:
    
    # Name of the aggregation we're applying
    agg_name = spec['agg_name'] if 'agg_name' in spec else spec['agg']
    
    # Name of new feature
    new_feature = '{}_{}_{}'.format('_'.join(spec['groupby']), agg_name, spec['select'])
    
    # Info
    print("Grouping by {}, and aggregating {} with {}".format(
        spec['groupby'], spec['select'], agg_name
    ))
    
    # Unique list of features to select
    all_features = list(set(spec['groupby'] + [spec['select']]))
     # Perform the groupby
    gp = X_train[all_features]. \
        groupby(spec['groupby'])[spec['select']]. \
        agg(spec['agg']). \
        reset_index(). \
        rename(index=str, columns={spec['select']: new_feature})
        
    # Merge back to X_total
    if 'cumcount' == spec['agg']:
        X_train[new_feature] = gp[0].values
    else:
        X_train = X_train.merge(gp, on=spec['groupby'], how='left')
        
     # Clear memory
    del gp
    gc.collect()

X_train.head()

Grouping by ['ip', 'app', 'channel'], and aggregating day with var
Grouping by ['ip', 'app', 'os'], and aggregating hour with var
Grouping by ['ip', 'day', 'channel'], and aggregating hour with var
Grouping by ['ip', 'day', 'hour'], and aggregating channel with count
Grouping by ['ip', 'app'], and aggregating channel with count
Grouping by ['ip', 'app', 'os'], and aggregating channel with count
Grouping by ['ip', 'app', 'day', 'hour'], and aggregating channel with count
Grouping by ['ip', 'app', 'channel'], and aggregating hour with mean
Grouping by ['app'], and aggregating ip with AvgViewPerDistinct
Grouping by ['app'], and aggregating channel with count
Grouping by ['channel'], and aggregating app with count
Grouping by ['ip'], and aggregating channel with nunique
Grouping by ['ip'], and aggregating app with nunique
Grouping by ['ip', 'day'], and aggregating hour with nunique
Grouping by ['ip', 'app'], and aggregating os with nunique
Grouping by ['ip'], and aggregating device with nu

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,day,hour,minute,second,ip_confRate,app_confRate,device_confRate,os_confRate,channel_confRate,app_channel_confRate,app_os_confRate,app_device_confRate,channel_os_confRate,channel_device_confRate,os_device_confRate,ip_app_channel_var_day,ip_app_os_var_hour,ip_day_channel_var_hour_x,ip_day_hour_count_channel,ip_app_count_channel,ip_app_os_count_channel,ip_app_day_hour_count_channel,ip_app_channel_mean_hour,app_AvgViewPerDistinct_ip,app_count_channel,channel_count_app,ip_nunique_channel,ip_nunique_app,ip_day_nunique_hour,ip_app_nunique_os,ip_nunique_device,app_nunique_channel,ip_device_os_nunique_app,ip_device_os_cumcount_app,ip_cumcount_app,ip_cumcount_os,ip_day_channel_var_hour_y
0,30,3,1,13,379,2017-11-06 14:32:21,0,6,14,32,21,0.001306,0.000296,0.001307,0.001149,0.0,0.0,0.00027,0.000306,0.0,0.0,0.001187,0.0,0.358974,0.7,2,40,13,2,14.8,42.640625,10916,1353,69,28,3,13,4,29,14,0,0,0,0.7
1,-51,3,1,19,379,2017-11-06 14:33:34,0,6,14,33,34,0.0,0.000296,0.001307,0.001389,0.0,0.0,0.0,0.000306,0.0,0.0,0.001429,0.0,0.266667,0.5,1,55,15,1,15.5,42.640625,10916,1353,68,23,3,14,4,29,16,0,0,0,0.5
2,-30,3,1,13,379,2017-11-06 14:34:12,0,6,14,34,12,0.0,0.000296,0.001307,0.001149,0.0,0.0,0.00027,0.000306,0.0,0.0,0.001187,0.0,0.571429,0.916667,1,28,7,1,14.5,42.640625,10916,1353,60,25,3,13,3,29,15,0,0,0,0.916667
3,-79,14,1,13,478,2017-11-06 14:34:52,0,6,14,34,52,0.000671,0.000685,0.001307,0.001149,0.0,0.0,0.0,0.000698,0.0,0.0,0.001187,,0.444444,1.333333,1,31,9,1,14.0,11.925781,3053,126,74,31,3,11,5,19,21,0,0,0,1.333333
4,-17,3,1,13,379,2017-11-06 14:35:08,0,6,14,35,8,0.0,0.000296,0.001307,0.001149,0.0,0.0,0.00027,0.000306,0.0,0.0,0.001187,0.0,0.25,0.571429,1,62,16,1,15.0,42.640625,10916,1353,69,28,2,20,4,29,19,0,0,0,0.571429


In [14]:
#Test Data 
#Apply all the groupby transformations
for spec in GROUPBY_AGGREGATIONS:
    
    # Name of the aggregation we're applying
    agg_name = spec['agg_name'] if 'agg_name' in spec else spec['agg']
    
    # Name of new feature
    new_feature = '{}_{}_{}'.format('_'.join(spec['groupby']), agg_name, spec['select'])
    
    # Info
    print("Grouping by {}, and aggregating {} with {}".format(
        spec['groupby'], spec['select'], agg_name
    ))
    
    # Unique list of features to select
    all_features = list(set(spec['groupby'] + [spec['select']]))
     # Perform the groupby
    gp = X_test[all_features]. \
        groupby(spec['groupby'])[spec['select']]. \
        agg(spec['agg']). \
        reset_index(). \
        rename(index=str, columns={spec['select']: new_feature})
        
    # Merge back to X_total
    if 'cumcount' == spec['agg']:
        X_test[new_feature] = gp[0].values
    else:
        X_test= X_test.merge(gp, on=spec['groupby'], how='left')
        
     # Clear memory
    del gp
    gc.collect()

X_test.head()

Grouping by ['ip', 'app', 'channel'], and aggregating day with var
Grouping by ['ip', 'app', 'os'], and aggregating hour with var
Grouping by ['ip', 'day', 'channel'], and aggregating hour with var
Grouping by ['ip', 'day', 'hour'], and aggregating channel with count
Grouping by ['ip', 'app'], and aggregating channel with count
Grouping by ['ip', 'app', 'os'], and aggregating channel with count
Grouping by ['ip', 'app', 'day', 'hour'], and aggregating channel with count
Grouping by ['ip', 'app', 'channel'], and aggregating hour with mean
Grouping by ['app'], and aggregating ip with AvgViewPerDistinct
Grouping by ['app'], and aggregating channel with count
Grouping by ['channel'], and aggregating app with count
Grouping by ['ip'], and aggregating channel with nunique
Grouping by ['ip'], and aggregating app with nunique
Grouping by ['ip', 'day'], and aggregating hour with nunique
Grouping by ['ip', 'app'], and aggregating os with nunique
Grouping by ['ip'], and aggregating device with nu

Unnamed: 0,ip,app,device,os,channel,click_time,day,hour,minute,second,ip_app_channel_var_day,ip_app_os_var_hour,ip_day_channel_var_hour_x,ip_day_hour_count_channel,ip_app_count_channel,ip_app_os_count_channel,ip_app_day_hour_count_channel,ip_app_channel_mean_hour,app_AvgViewPerDistinct_ip,app_count_channel,channel_count_app,ip_nunique_channel,ip_nunique_app,ip_day_nunique_hour,ip_app_nunique_os,ip_nunique_device,app_nunique_channel,ip_device_os_nunique_app,ip_device_os_cumcount_app,ip_cumcount_app,ip_cumcount_os,ip_day_channel_var_hour_y
0,112,9,1,3,107,2017-11-10 04:00:00,10,4,0,0,0.0,0.0,0.0,380,57,6,57,4,59.992188,15358,6042,73,26,1,18,2,25,6,0,0,0,0.0
1,93,9,1,3,466,2017-11-10 04:00:00,10,4,0,0,0.0,0.0,0.0,365,72,2,72,4,59.992188,15358,2722,71,27,1,20,4,25,8,0,0,0,0.0
2,95,21,1,19,128,2017-11-10 04:00:00,10,4,0,0,0.0,0.0,0.0,465,12,5,12,4,13.414062,3434,3051,76,24,1,7,4,2,17,0,0,0,0.0
3,-115,15,1,13,111,2017-11-10 04:00:00,10,4,0,0,0.0,0.0,0.0,411,24,9,24,4,24.335938,6230,361,77,27,1,9,3,22,18,0,0,0,0.0
4,-56,12,1,13,328,2017-11-10 04:00:00,10,4,0,0,0.0,0.0,0.0,407,53,14,53,4,50.722656,12985,1060,71,23,1,19,5,26,17,0,0,0,0.0


In [15]:
# Train Data
GROUP_BY_NEXT_CLICKS = [
    
    # Group-1
    {'groupby': ['ip']},
    {'groupby': ['ip', 'app']},
    {'groupby': ['ip', 'channel']},
    {'groupby': ['ip', 'os']},
    
    # Group-3
    {'groupby': ['ip', 'app', 'device', 'os', 'channel']},
    {'groupby': ['ip', 'os', 'device']},
    {'groupby': ['ip', 'os', 'device', 'app']}
]

# Calculate the time to next click for each group
for t in GROUP_BY_NEXT_CLICKS:
    
    # Name of new feature
    new_feature = '{}_nextClick'.format('_'.join(t['groupby']))    
    
    # Unique list of features to select
    all_features = t['groupby'] + ['click_time']
    
    # Run calculation
    print(f">> Grouping by {t['groupby']}, and saving time to next click in: {new_feature}")
    X_train[new_feature] = X_train[all_features].groupby(t['groupby']).click_time.transform(lambda x: x.diff().shift(-1)).dt.seconds
    
X_train.head()

>> Grouping by ['ip'], and saving time to next click in: ip_nextClick
>> Grouping by ['ip', 'app'], and saving time to next click in: ip_app_nextClick
>> Grouping by ['ip', 'channel'], and saving time to next click in: ip_channel_nextClick
>> Grouping by ['ip', 'os'], and saving time to next click in: ip_os_nextClick
>> Grouping by ['ip', 'app', 'device', 'os', 'channel'], and saving time to next click in: ip_app_device_os_channel_nextClick
>> Grouping by ['ip', 'os', 'device'], and saving time to next click in: ip_os_device_nextClick
>> Grouping by ['ip', 'os', 'device', 'app'], and saving time to next click in: ip_os_device_app_nextClick


Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,day,hour,minute,second,ip_confRate,app_confRate,device_confRate,os_confRate,channel_confRate,app_channel_confRate,app_os_confRate,app_device_confRate,channel_os_confRate,channel_device_confRate,os_device_confRate,ip_app_channel_var_day,ip_app_os_var_hour,ip_day_channel_var_hour_x,ip_day_hour_count_channel,ip_app_count_channel,ip_app_os_count_channel,ip_app_day_hour_count_channel,ip_app_channel_mean_hour,app_AvgViewPerDistinct_ip,app_count_channel,channel_count_app,ip_nunique_channel,ip_nunique_app,ip_day_nunique_hour,ip_app_nunique_os,ip_nunique_device,app_nunique_channel,ip_device_os_nunique_app,ip_device_os_cumcount_app,ip_cumcount_app,ip_cumcount_os,ip_day_channel_var_hour_y,ip_nextClick,ip_app_nextClick,ip_channel_nextClick,ip_os_nextClick,ip_app_device_os_channel_nextClick,ip_os_device_nextClick,ip_os_device_app_nextClick
0,30,3,1,13,379,2017-11-06 14:32:21,0,6,14,32,21,0.001306,0.000296,0.001307,0.001149,0.0,0.0,0.00027,0.000306,0.0,0.0,0.001187,0.0,0.358974,0.7,2,40,13,2,14.8,42.640625,10916,1353,69,28,3,13,4,29,14,0,0,0,0.7,946.0,946.0,946.0,4838.0,4838.0,4838.0,4838.0
1,-51,3,1,19,379,2017-11-06 14:33:34,0,6,14,33,34,0.0,0.000296,0.001307,0.001389,0.0,0.0,0.0,0.000306,0.0,0.0,0.001429,0.0,0.266667,0.5,1,55,15,1,15.5,42.640625,10916,1353,68,23,3,14,4,29,16,0,0,0,0.5,5177.0,5177.0,5177.0,5196.0,5263.0,5196.0,5205.0
2,-30,3,1,13,379,2017-11-06 14:34:12,0,6,14,34,12,0.0,0.000296,0.001307,0.001149,0.0,0.0,0.00027,0.000306,0.0,0.0,0.001187,0.0,0.571429,0.916667,1,28,7,1,14.5,42.640625,10916,1353,60,25,3,13,3,29,15,0,0,0,0.916667,4574.0,4574.0,4574.0,5150.0,,5150.0,5182.0
3,-79,14,1,13,478,2017-11-06 14:34:52,0,6,14,34,52,0.000671,0.000685,0.001307,0.001149,0.0,0.0,0.0,0.000698,0.0,0.0,0.001187,,0.444444,1.333333,1,31,9,1,14.0,11.925781,3053,126,74,31,3,11,5,19,21,0,0,0,1.333333,3948.0,5109.0,5137.0,4086.0,,4086.0,5109.0
4,-17,3,1,13,379,2017-11-06 14:35:08,0,6,14,35,8,0.0,0.000296,0.001307,0.001149,0.0,0.0,0.00027,0.000306,0.0,0.0,0.001187,0.0,0.25,0.571429,1,62,16,1,15.0,42.640625,10916,1353,69,28,2,20,4,29,19,0,0,0,0.571429,5092.0,5093.0,5098.0,5094.0,5098.0,5094.0,5098.0


In [16]:
#Test Data
GROUP_BY_NEXT_CLICKS = [
    
    # V1
    {'groupby': ['ip']},
    {'groupby': ['ip', 'app']},
    {'groupby': ['ip', 'channel']},
    {'groupby': ['ip', 'os']},
    
    # V3
    {'groupby': ['ip', 'app', 'device', 'os', 'channel']},
    {'groupby': ['ip', 'os', 'device']},
    {'groupby': ['ip', 'os', 'device', 'app']}
]

# Calculate the time to next click for each group
for t in GROUP_BY_NEXT_CLICKS:
    
    # Name of new feature
    new_feature = '{}_nextClick'.format('_'.join(t['groupby']))    
    
    # Unique list of features to select
    all_features = t['groupby'] + ['click_time']
    
    # Run calculation
    print(f">> Grouping by {t['groupby']}, and saving time to next click in: {new_feature}")
    X_test[new_feature] = X_test[all_features].groupby(t['groupby']).click_time.transform(lambda x: x.diff().shift(-1)).dt.seconds
    
X_test.head()

>> Grouping by ['ip'], and saving time to next click in: ip_nextClick
>> Grouping by ['ip', 'app'], and saving time to next click in: ip_app_nextClick
>> Grouping by ['ip', 'channel'], and saving time to next click in: ip_channel_nextClick
>> Grouping by ['ip', 'os'], and saving time to next click in: ip_os_nextClick
>> Grouping by ['ip', 'app', 'device', 'os', 'channel'], and saving time to next click in: ip_app_device_os_channel_nextClick
>> Grouping by ['ip', 'os', 'device'], and saving time to next click in: ip_os_device_nextClick
>> Grouping by ['ip', 'os', 'device', 'app'], and saving time to next click in: ip_os_device_app_nextClick


Unnamed: 0,ip,app,device,os,channel,click_time,day,hour,minute,second,ip_app_channel_var_day,ip_app_os_var_hour,ip_day_channel_var_hour_x,ip_day_hour_count_channel,ip_app_count_channel,ip_app_os_count_channel,ip_app_day_hour_count_channel,ip_app_channel_mean_hour,app_AvgViewPerDistinct_ip,app_count_channel,channel_count_app,ip_nunique_channel,ip_nunique_app,ip_day_nunique_hour,ip_app_nunique_os,ip_nunique_device,app_nunique_channel,ip_device_os_nunique_app,ip_device_os_cumcount_app,ip_cumcount_app,ip_cumcount_os,ip_day_channel_var_hour_y,ip_nextClick,ip_app_nextClick,ip_channel_nextClick,ip_os_nextClick,ip_app_device_os_channel_nextClick,ip_os_device_nextClick,ip_os_device_app_nextClick
0,112,9,1,3,107,2017-11-10 04:00:00,10,4,0,0,0.0,0.0,0.0,380,57,6,57,4,59.992188,15358,6042,73,26,1,18,2,25,6,0,0,0,0.0,0.0,4.0,0.0,6.0,,6.0,20.0
1,93,9,1,3,466,2017-11-10 04:00:00,10,4,0,0,0.0,0.0,0.0,365,72,2,72,4,59.992188,15358,2722,71,27,1,20,4,25,8,0,0,0,0.0,0.0,1.0,22.0,2.0,,2.0,57.0
2,95,21,1,19,128,2017-11-10 04:00:00,10,4,0,0,0.0,0.0,0.0,465,12,5,12,4,13.414062,3434,3051,76,24,1,7,4,2,17,0,0,0,0.0,0.0,9.0,15.0,4.0,88.0,4.0,16.0
3,-115,15,1,13,111,2017-11-10 04:00:00,10,4,0,0,0.0,0.0,0.0,411,24,9,24,4,24.335938,6230,361,77,27,1,9,3,22,18,0,0,0,0.0,0.0,6.0,68.0,0.0,,0.0,11.0
4,-56,12,1,13,328,2017-11-10 04:00:00,10,4,0,0,0.0,0.0,0.0,407,53,14,53,4,50.722656,12985,1060,71,23,1,19,5,26,17,0,0,0,0.0,0.0,2.0,5.0,0.0,,0.0,7.0


In [17]:
HISTORY_CLICKS = {
    'identical_clicks': ['ip', 'app', 'device', 'os', 'channel'],
    'app_clicks': ['ip', 'app']
}

# Go through different group-by combinations
for fname, fset in HISTORY_CLICKS.items():
    
    # Clicks in the past
    X_train['prev_'+fname] = X_train. \
        groupby(fset). \
        cumcount(). \
        rename('prev_'+fname)
        
    # Clicks in the future
    X_train['future_'+fname] = X_train.iloc[::-1]. \
        groupby(fset). \
        cumcount(). \
        rename('future_'+fname).iloc[::-1]

# Count cumulative subsequent clicks
X_train.head()


Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,day,hour,minute,second,ip_confRate,app_confRate,device_confRate,os_confRate,channel_confRate,app_channel_confRate,app_os_confRate,app_device_confRate,channel_os_confRate,channel_device_confRate,os_device_confRate,ip_app_channel_var_day,ip_app_os_var_hour,ip_day_channel_var_hour_x,ip_day_hour_count_channel,ip_app_count_channel,ip_app_os_count_channel,ip_app_day_hour_count_channel,ip_app_channel_mean_hour,app_AvgViewPerDistinct_ip,app_count_channel,channel_count_app,ip_nunique_channel,ip_nunique_app,ip_day_nunique_hour,ip_app_nunique_os,ip_nunique_device,app_nunique_channel,ip_device_os_nunique_app,ip_device_os_cumcount_app,ip_cumcount_app,ip_cumcount_os,ip_day_channel_var_hour_y,ip_nextClick,ip_app_nextClick,ip_channel_nextClick,ip_os_nextClick,ip_app_device_os_channel_nextClick,ip_os_device_nextClick,ip_os_device_app_nextClick,prev_identical_clicks,future_identical_clicks,prev_app_clicks,future_app_clicks
0,30,3,1,13,379,2017-11-06 14:32:21,0,6,14,32,21,0.001306,0.000296,0.001307,0.001149,0.0,0.0,0.00027,0.000306,0.0,0.0,0.001187,0.0,0.358974,0.7,2,40,13,2,14.8,42.640625,10916,1353,69,28,3,13,4,29,14,0,0,0,0.7,946.0,946.0,946.0,4838.0,4838.0,4838.0,4838.0,0,1,0,39
1,-51,3,1,19,379,2017-11-06 14:33:34,0,6,14,33,34,0.0,0.000296,0.001307,0.001389,0.0,0.0,0.0,0.000306,0.0,0.0,0.001429,0.0,0.266667,0.5,1,55,15,1,15.5,42.640625,10916,1353,68,23,3,14,4,29,16,0,0,0,0.5,5177.0,5177.0,5177.0,5196.0,5263.0,5196.0,5205.0,0,2,0,54
2,-30,3,1,13,379,2017-11-06 14:34:12,0,6,14,34,12,0.0,0.000296,0.001307,0.001149,0.0,0.0,0.00027,0.000306,0.0,0.0,0.001187,0.0,0.571429,0.916667,1,28,7,1,14.5,42.640625,10916,1353,60,25,3,13,3,29,15,0,0,0,0.916667,4574.0,4574.0,4574.0,5150.0,,5150.0,5182.0,0,0,0,27
3,-79,14,1,13,478,2017-11-06 14:34:52,0,6,14,34,52,0.000671,0.000685,0.001307,0.001149,0.0,0.0,0.0,0.000698,0.0,0.0,0.001187,,0.444444,1.333333,1,31,9,1,14.0,11.925781,3053,126,74,31,3,11,5,19,21,0,0,0,1.333333,3948.0,5109.0,5137.0,4086.0,,4086.0,5109.0,0,0,0,30
4,-17,3,1,13,379,2017-11-06 14:35:08,0,6,14,35,8,0.0,0.000296,0.001307,0.001149,0.0,0.0,0.00027,0.000306,0.0,0.0,0.001187,0.0,0.25,0.571429,1,62,16,1,15.0,42.640625,10916,1353,69,28,2,20,4,29,19,0,0,0,0.571429,5092.0,5093.0,5098.0,5094.0,5098.0,5094.0,5098.0,0,1,0,61


In [18]:
# Test Data
HISTORY_CLICKS = {
    'identical_clicks': ['ip', 'app', 'device', 'os', 'channel'],
    'app_clicks': ['ip', 'app']
}

# Go through different group-by combinations
for fname, fset in HISTORY_CLICKS.items():
    
    # Clicks in the past
    X_test['prev_'+fname] = X_test. \
        groupby(fset). \
        cumcount(). \
        rename('prev_'+fname)
        
    # Clicks in the future
    X_test['future_'+fname] = X_test.iloc[::-1]. \
        groupby(fset). \
        cumcount(). \
        rename('future_'+fname).iloc[::-1]

# Count cumulative subsequent clicks
X_test.head()


Unnamed: 0,ip,app,device,os,channel,click_time,day,hour,minute,second,ip_app_channel_var_day,ip_app_os_var_hour,ip_day_channel_var_hour_x,ip_day_hour_count_channel,ip_app_count_channel,ip_app_os_count_channel,ip_app_day_hour_count_channel,ip_app_channel_mean_hour,app_AvgViewPerDistinct_ip,app_count_channel,channel_count_app,ip_nunique_channel,ip_nunique_app,ip_day_nunique_hour,ip_app_nunique_os,ip_nunique_device,app_nunique_channel,ip_device_os_nunique_app,ip_device_os_cumcount_app,ip_cumcount_app,ip_cumcount_os,ip_day_channel_var_hour_y,ip_nextClick,ip_app_nextClick,ip_channel_nextClick,ip_os_nextClick,ip_app_device_os_channel_nextClick,ip_os_device_nextClick,ip_os_device_app_nextClick,prev_identical_clicks,future_identical_clicks,prev_app_clicks,future_app_clicks
0,112,9,1,3,107,2017-11-10 04:00:00,10,4,0,0,0.0,0.0,0.0,380,57,6,57,4,59.992188,15358,6042,73,26,1,18,2,25,6,0,0,0,0.0,0.0,4.0,0.0,6.0,,6.0,20.0,0,0,0,56
1,93,9,1,3,466,2017-11-10 04:00:00,10,4,0,0,0.0,0.0,0.0,365,72,2,72,4,59.992188,15358,2722,71,27,1,20,4,25,8,0,0,0,0.0,0.0,1.0,22.0,2.0,,2.0,57.0,0,0,0,71
2,95,21,1,19,128,2017-11-10 04:00:00,10,4,0,0,0.0,0.0,0.0,465,12,5,12,4,13.414062,3434,3051,76,24,1,7,4,2,17,0,0,0,0.0,0.0,9.0,15.0,4.0,88.0,4.0,16.0,0,1,0,11
3,-115,15,1,13,111,2017-11-10 04:00:00,10,4,0,0,0.0,0.0,0.0,411,24,9,24,4,24.335938,6230,361,77,27,1,9,3,22,18,0,0,0,0.0,0.0,6.0,68.0,0.0,,0.0,11.0,0,0,0,23
4,-56,12,1,13,328,2017-11-10 04:00:00,10,4,0,0,0.0,0.0,0.0,407,53,14,53,4,50.722656,12985,1060,71,23,1,19,5,26,17,0,0,0,0.0,0.0,2.0,5.0,0.0,,0.0,7.0,0,0,0,52


In [19]:
# Split into X and y
X_train.fillna(X_train.mean(), inplace=True)
y_train = X_train['is_attributed']
X_train= X_train.drop('is_attributed', axis=1).select_dtypes(include=[np.number])
# Oversampling to decrease imbalance in labels
from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_train, y_train = sm.fit_sample(X_train, y_train)
#Shuffle the data to train well
from sklearn.utils import shuffle
shuffle(X_train)


Using TensorFlow backend.


array([[-31.        ,   3.        ,   1.        , ...,   0.        ,
          3.        ,  24.        ],
       [-86.8569152 ,  41.82221081,   1.        , ...,   0.        ,
          0.        ,   0.        ],
       [ 15.        ,   8.        ,   1.        , ...,   1.        ,
         76.        ,   1.        ],
       ...,
       [120.        ,  15.        ,   1.        , ...,   0.        ,
          2.        ,  40.        ],
       [-48.4336329 ,  19.        ,   0.        , ...,   0.76287198,
          0.76287198,   0.76287198],
       [  1.        ,   3.        ,   1.        , ...,   0.        ,
          4.        ,  35.        ]])

In [20]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
X_train = SelectKBest(chi2, k=42).fit_transform(abs(X_train), y_train)
X_train.shape

(199662, 42)

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid= train_test_split(X_train,y_train,test_size=0.3, random_state=0)

In [22]:
# Create a model
# Params from: https://www.kaggle.com/aharless/swetha-s-xgboost-revised
import xgboost as xgb
clf_xgBoost = xgb.XGBClassifier(max_depth = 4,subsample = 0.8,colsample_bytree = 0.7,colsample_bylevel = 0.7,scale_pos_weight = 9,
    min_child_weight = 0,reg_alpha = 0.01,n_jobs = -1, objective = 'binary:logistic')
# Fit the models
clf_xgBoost.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,
       colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=4, min_child_weight=0, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0.01, reg_lambda=1, scale_pos_weight=9,
       seed=None, silent=True, subsample=0.8)

In [23]:
from sklearn.metrics import roc_auc_score
y_pred=clf_xgBoost.predict(X_valid)

In [24]:
roc_auc_score(y_valid, y_pred)

0.9942040132562224

In [25]:
from sklearn.ensemble import RandomForestClassifier

clf1=RandomForestClassifier(n_jobs=-1,criterion="entropy",min_samples_leaf=1,min_samples_split=8, \
                                                n_estimators=15,max_features=None,random_state=100)
clf1.fit(X_train,y_train)
                        
    
    
y_pred_rf=clf1.predict(X_valid)
print(y_pred_rf)
    




[1 1 1 ... 1 1 0]


In [26]:
roc_auc_score(y_valid, y_pred_rf)

0.9989472167041477

In [27]:
X_test.drop('click_time',axis=1,inplace=True)

In [28]:
X_test.fillna(X_test.mean(), inplace=True)


In [29]:
y_pred_t=clf1.predict(X_test)

In [30]:
# Create submission file
submission = pd.DataFrame({'click_id':[i for i in range(len(y_pred_t))],'is_attributed':y_pred_t})
submission.to_csv('submission.csv', index=False)