In [1]:
# THANK YOU AND ACKNOLEDGEMENTS:
# This kernel develops further the ideas suggested in:
#   *  "lgbm starter - early stopping 0.9539" by Aloisio Dourado, https://www.kaggle.com/aloisiodn/lgbm-starter-early-stopping-0-9539/code
#   * "LightGBM (Fixing unbalanced data)" by Pranav Pandya, https://www.kaggle.com/pranav84/lightgbm-fixing-unbalanced-data-auc-0-9787?scriptVersionId=2777211
#   * "LightGBM with count features" by Ravi Teja Gutta, https://www.kaggle.com/rteja1113/lightgbm-with-count-features
# I would like to extend my gratitude to these individuals for sharing their work.

# WHAT IS NEW IN THIS VERSION? 
# In addition to some cosmetic changes to the code/LightGBM parameters, I am adding the 'ip' feature to and 
# removing the 'day' feature from the training set, and using the last chunk of the training data to build the model.

# What new is NICKS VERSION?
#1 Added Day of Week Time Variable, A IP Count Variable, Feature Importance
#2 Increased validation set to 15%
#3 Imbalanced parameter for lgbm, lower learning rate
#4 new variables- "ip_hour_channel", "ip_hour_os", "ip_hour_app","ip_hour_device"

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import gc

path = '../input/' 
path_train = path + 'train.csv'
path_test = path + 'test.csv'

train_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']
test_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']

dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32'
        }
        
skip = range(1, 140000000)
print("Loading Data")
#skiprows=skip, 
train = pd.read_csv(path_train, dtype=dtypes,
        header=0,usecols=train_cols,parse_dates=["click_time"])#.sample(1000)
test = pd.read_csv(path_test, dtype=dtypes, header=0,
        usecols=test_cols,parse_dates=["click_time"])#.sample(1000)
#test['is_attributed'] = -1

len_train = len(train)
print('The initial size of the train set is', len_train)
print('The initial size of the test set is', len(test))
print('Binding the training and test set together...')

Loading Data
The initial size of the train set is 184903890
The initial size of the test set is 18790469
Binding the training and test set together...


In [2]:
print("Creating new time features in train: 'hour' and 'day'...")
train['hour'] = train["click_time"].dt.hour.astype('uint8')
train['day'] = train["click_time"].dt.day.astype('uint8')


print("Creating new time features in test: 'hour' and 'day'...")
test['hour'] = test["click_time"].dt.hour.astype('uint8')
test['day'] = test["click_time"].dt.day.astype('uint8')

Creating new time features in train: 'hour' and 'day'...
Creating new time features in test: 'hour' and 'day'...


In [3]:

    
#sample 1/4 of the data:
train = train.set_index('ip').loc[lambda x: (x.index + 401) % 4 == 0].reset_index()
len_train = len(train)

print(train)

              ip  app  device  os  channel          click_time  is_attributed  \
0         161007    3       1  13      379 2017-11-06 14:35:08              0   
1          18787    3       1  16      379 2017-11-06 14:36:26              0   
2         124979    3       1  18      379 2017-11-06 14:40:16              0   
3          80447    3       1  19      379 2017-11-06 14:40:51              0   
4         134575    3       1  13      379 2017-11-06 14:43:10              0   
5           7755    3       1  13      379 2017-11-06 14:43:25              0   
6         191759    3       1  13      379 2017-11-06 14:44:51              0   
7         209663    3       1  13      379 2017-11-06 14:48:55              0   
8         208347    3       1  19      379 2017-11-06 14:49:38              0   
9          73503    3       1  18      379 2017-11-06 14:49:43              0   
10         28739    3       1  13      379 2017-11-06 14:50:29              0   
11        103175   18       

In [4]:
#train_sample = train.sample(10000)
#print(train_sample)

train_ip_contains_9 = train.groupby('ip').filter(lambda x: x['day'].max() == 9)

print('train_ip_contains_9', train_ip_contains_9)
print('train_ip_contains_9 unique ips:', len(train_ip_contains_9['ip'].unique()))

train_ip_contains_9               ip  app  device  os  channel          click_time  is_attributed  \
1          18787    3       1  16      379 2017-11-06 14:36:26              0   
2         124979    3       1  18      379 2017-11-06 14:40:16              0   
3          80447    3       1  19      379 2017-11-06 14:40:51              0   
4         134575    3       1  13      379 2017-11-06 14:43:10              0   
6         191759    3       1  13      379 2017-11-06 14:44:51              0   
7         209663    3       1  13      379 2017-11-06 14:48:55              0   
8         208347    3       1  19      379 2017-11-06 14:49:38              0   
10         28739    3       1  13      379 2017-11-06 14:50:29              0   
11        103175   18       1  17      376 2017-11-06 14:53:23              0   
12         74715    3       1  19      379 2017-11-06 14:55:25              0   
13        128855    3       1  13      379 2017-11-06 14:58:16              0   
14      

In [5]:
train_ip_contains_9 = train_ip_contains_9.query('day < 9')
#print(train_ip_contains_9)
print('train_ip_contains_9 unique ips:', len(train_ip_contains_9['ip'].unique()))

train_ip_contains_9 unique ips: 15322


In [6]:
print('split attributed data:')
train_ip_contains_9_attributed = train_ip_contains_9.query('is_attributed == 1')
print(train_ip_contains_9_attributed)

#only use data on 9 to train, but data before 9 as features

train = train.query('day == 9')
print('training data len:', len(train))

split attributed data:
              ip  app  device  os  channel          click_time  is_attributed  \
2037      165595   19     379  24      213 2017-11-06 16:00:10              1   
3445       73487   11       1  17      481 2017-11-06 16:00:18              1   
5736      119039   35       1  22      274 2017-11-06 16:00:28              1   
8553       73655    2       1   6      219 2017-11-06 16:00:39              1   
8655       40995   35       1  19       21 2017-11-06 16:00:39              1   
12308      40011  155       1  13      101 2017-11-06 16:00:52              1   
12377      86291   35       1  18       21 2017-11-06 16:00:52              1   
12687      46283   19       0  24      213 2017-11-06 16:00:53              1   
12853       5135   29       1  19      213 2017-11-06 16:00:54              1   
15079     115947   35       1  19       21 2017-11-06 16:01:02              1   
16118     112095   35       1  13       21 2017-11-06 16:01:06              1   
16813

In [7]:
#train=train.append(test)

for_test = False

if for_test:
    del train
    del test
    gc.collect()

    #prepare test data:
    train = pd.read_csv(path_train, dtype=dtypes,
            header=0,usecols=train_cols,parse_dates=["click_time"])#.sample(1000)
    test = pd.read_csv(path_test, dtype=dtypes, header=0,
            usecols=test_cols,parse_dates=["click_time"])#.sample(1000)
    train=train.append(test)
    print("Creating new time features in train: 'hour' and 'day'...")
    train['hour'] = train["click_time"].dt.hour.astype('uint8')
    train['day'] = train["click_time"].dt.day.astype('uint8')

    train_ip_contains_10 = train.groupby('ip').filter(lambda x: x['day'].max() == 10)
    print('train_ip_contains_10 len:', len(train_ip_contains_10))
    print('train_ip_contains_10 unique ips:', len(train_ip_contains_10['ip'].unique()))
    train_ip_contains_10 = train_ip_contains_10.query('day < 10 & day > 6')
    print('after removing data on 10 len:', len(train_ip_contains_10))
    print('train_ip_contains_10 unique ips:', len(train_ip_contains_10['ip'].unique()))
    print('split attributed data:')
    train_ip_contains_10_attributed = train_ip_contains_10.query('is_attributed == 1')
    print(train_ip_contains_10_attributed)

In [8]:
def add_statistic_feature(group_by_cols, training, training_hist, training_hist_attribution, 
                          with_hist, counting_col='channel'):
    features_added = []
    feature_name_added = '_'.join(group_by_cols) + 'count'
    print('count ip with group by:', group_by_cols)
    n_chans = training[group_by_cols + [counting_col]].groupby(by=group_by_cols)[[counting_col]] \
        .count().reset_index().rename(columns={counting_col: feature_name_added})
    training = training.merge(n_chans, on=group_by_cols, how='left')
    del n_chans
    gc.collect()
    training[feature_name_added] = training[feature_name_added].astype('uint16')
    features_added.append(feature_name_added)
    
    if with_hist:
        print('count ip with group by in hist data:', group_by_cols)
        feature_name_added = '_'.join(group_by_cols) + "count_in_hist"
        n_chans = training_hist[group_by_cols + [counting_col]].groupby(by=group_by_cols)[[counting_col]] \
            .count().reset_index().rename(columns={counting_col: feature_name_added})
        training = training.merge(n_chans, on=group_by_cols, how='left')
        del n_chans
        gc.collect()
        #training[feature_name_added] = training[feature_name_added].astype('uint16')
        print('count ip attribution with group by in hist data:', group_by_cols)
        feature_name_added1 = '_'.join(group_by_cols) + "count_attribution_in_hist"
        n_chans = training_hist_attribution[group_by_cols + [counting_col]] \
            .groupby(by=group_by_cols)[[counting_col]] \
            .count().reset_index().rename(columns={counting_col: feature_name_added1 })
        training = training.merge(n_chans, on=group_by_cols, how='left')
        del n_chans
        gc.collect()
        #training[feature_name_added1] = training[feature_name_added1].astype('uint16')
                                               
        training['_'.join(group_by_cols) + "count_attribution_rate_in_hist"] = \
            training[feature_name_added] / training[feature_name_added1]
            
        features_added.append(feature_name_added)
        features_added.append(feature_name_added1)
        features_added.append('_'.join(group_by_cols) + "count_attribution_rate_in_hist")
        
    print('added features:', features_added)
                                               
    return training, features_added

def generate_counting_history_features(data, history, history_attribution):
        
    new_features = []

    # Count by IP,DAY,HOUR
    print('a given IP address within each hour...')
    data, features_added = add_statistic_feature(['ip','day','hour'], data, history, history_attribution, False)
    new_features = new_features + features_added
    gc.collect()

    # Count by IP and APP
    data, features_added = add_statistic_feature(['ip','app'], data, history, history_attribution, True)
    new_features = new_features + features_added
    
    data, features_added  = add_statistic_feature(['ip','app','os'], data, history, history_attribution, True)
    new_features = new_features + features_added

    #######
    # Count by IP
    data, features_added  = add_statistic_feature(['ip'], data, history, history_attribution, True)
    new_features = new_features + features_added

    # Count by IP HOUR CHANNEL                                               
    data, features_added  = add_statistic_feature(['ip','hour','channel'], \
        data, history, history_attribution, True, counting_col='os')
    new_features = new_features + features_added

    # Count by IP HOUR Device
    data, features_added  = add_statistic_feature(['ip','hour','os'], \
        data, history, history_attribution, True)
    new_features = new_features + features_added

    data, features_added  = add_statistic_feature(['ip','hour','app'], \
        data, history, history_attribution, True)
    new_features = new_features + features_added

    data, features_added  = add_statistic_feature(['ip','hour','device'], \
        data, history, history_attribution, True)
    new_features = new_features + features_added
    
    return data, new_features

test['hour'] = test["click_time"].dt.hour.astype('uint8')
test['day'] = test["click_time"].dt.day.astype('uint8')

if for_test:
    test, new_features = generate_counting_history_features(test, train_ip_contains_10, train_ip_contains_10_attributed)
else:
    train, new_features = generate_counting_history_features(train, train_ip_contains_9, train_ip_contains_9_attributed)

#print('test data:', test)
print('new features:', new_features)

#######

#print("Adjusting the data types of the new count features... ")
#train.info()
#train['ip_day_hour'] = train['ip_day_hour'].astype('uint16')
#train['ip_app_count'] = train['ip_app_count'].astype('uint16')
#train['ip_app_os_count'] = train['ip_app_os_count'].astype('uint16')

# added..
#train['count_by_ip'] = train['count_by_ip'].astype('uint16')
#train['ip_hour_channel'] = train['ip_hour_channel'].astype('uint16')
#train['ip_hour_os'] = train['ip_hour_os'].astype('uint16')
#train['ip_hour_app'] = train['ip_hour_app'].astype('uint16')
#train['ip_hour_device'] = train['ip_hour_device'].astype('uint16')

a given IP address within each hour...
count ip with group by: ['ip', 'day', 'hour']
added features: ['ip_day_hourcount']
count ip with group by: ['ip', 'app']
count ip with group by in hist data: ['ip', 'app']
count ip attribution with group by in hist data: ['ip', 'app']
added features: ['ip_appcount', 'ip_appcount_in_hist', 'ip_appcount_attribution_in_hist', 'ip_appcount_attribution_rate_in_hist']
count ip with group by: ['ip', 'app', 'os']
count ip with group by in hist data: ['ip', 'app', 'os']
count ip attribution with group by in hist data: ['ip', 'app', 'os']
added features: ['ip_app_oscount', 'ip_app_oscount_in_hist', 'ip_app_oscount_attribution_in_hist', 'ip_app_oscount_attribution_rate_in_hist']
count ip with group by: ['ip']
count ip with group by in hist data: ['ip']
count ip attribution with group by in hist data: ['ip']
added features: ['ipcount', 'ipcount_in_hist', 'ipcount_attribution_in_hist', 'ipcount_attribution_rate_in_hist']
count ip with group by: ['ip', 'hour', 

In [9]:
#len_train = 46849705
#test = train[len_train:]
#print('The size of the test set is ', len(test))

#r = 0.05 # the fraction of the train data to be used for validation
#val = train[(len_train-round(r*len_train)):len_train]
print(train)
val = train.set_index('ip').loc[lambda x: (x.index) % 17 == 0].reset_index()
print(val)
print('The size of the validation set is ', len(val))

gc.collect()

train = train.set_index('ip').loc[lambda x: (x.index) % 17 != 0].reset_index()
print('The size of the train set is ', len(train))

target = 'is_attributed'
train[target] = train[target].astype('uint8')
train.info()

              ip  app  device  os  channel          click_time  is_attributed  \
0         201143   11       1  13      487 2017-11-09 00:00:00              0   
1         205623    3       1  19      280 2017-11-09 00:00:00              0   
2         290423    3       1  13      280 2017-11-09 00:00:00              0   
3          79631   12       1  19      328 2017-11-09 00:00:00              0   
4         193455    3       1  11      489 2017-11-09 00:00:00              0   
5         183683    3       1  20      280 2017-11-09 00:00:00              0   
6          27499   12       1  19      328 2017-11-09 00:00:00              0   
7          89251   12       1  13      328 2017-11-09 00:00:00              0   
8           4451   27       1  19      122 2017-11-09 00:00:00              0   
9           6423   21       1  13      128 2017-11-09 00:00:00              0   
10        335107   11       1  19      173 2017-11-09 00:00:00              0   
11        117463   22       

            ip  app  device  os  channel          click_time  is_attributed  \
0        18683   18       1  15      134 2017-11-09 00:00:00              0   
1        92735    3       1  19      137 2017-11-09 00:00:00              0   
2        85663   18       1  19      439 2017-11-09 00:00:00              0   
3       148155   12       1  16      481 2017-11-09 00:00:00              0   
4       164475   12       1  14      178 2017-11-09 00:00:00              0   
5       107219   11       1  77      219 2017-11-09 00:00:00              0   
6       109735   12       1  17      265 2017-11-09 00:00:00              0   
7       123675   13       1  13      477 2017-11-09 00:00:00              0   
8       123675   13       1  13      477 2017-11-09 00:00:00              0   
9        84847   12       1  19      178 2017-11-09 00:00:00              0   
10       31467    3       1  19      466 2017-11-09 00:00:00              0   
11      108919   11       1  13      122 2017-11-09 

The size of the train set is  12678368
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12678368 entries, 0 to 12678367
Data columns (total 38 columns):
ip                                               uint64
app                                              uint16
device                                           uint16
os                                               uint16
channel                                          uint16
click_time                                       datetime64[ns]
is_attributed                                    uint8
hour                                             uint8
day                                              uint8
ip_day_hourcount                                 uint16
ip_appcount                                      uint16
ip_appcount_in_hist                              float64
ip_appcount_attribution_in_hist                  float64
ip_appcount_attribution_rate_in_hist             float64
ip_app_oscount                                   uint1

In [13]:
predictors0 = ['device', 'app', 'os', 'channel', 'hour', # Starter Vars, Then new features below
              'ip_day_hourcount','ipcount','ip_appcount', 'ip_app_oscount',
              "ip_hour_channelcount", "ip_hour_oscount", "ip_hour_appcount","ip_hour_devicecount"]

categorical = ['app', 'device', 'os', 'channel', 'hour']

predictors1 = categorical + new_features
#for ii in new_features:
#    predictors1 = predictors1 + ii
#print(predictors1)s
gc.collect()

#train.fillna(value={x:-1 for x in new_features})

print("Preparing the datasets for training...")

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.1,
    'num_leaves': 7,  
    'max_depth': 4,  
    'min_child_samples': 100,  
    'max_bin': 150,  
    'subsample': 0.7,  
    'subsample_freq': 1,  
    'colsample_bytree': 0.7,  
    'min_child_weight': 0,  
    'subsample_for_bin': 200000,  
    'min_split_gain': 0,  
    'reg_alpha': 0,  
    'reg_lambda': 0,  
    'nthread': 5,
    'verbose': 9,
    #'is_unbalance': True,
    'scale_pos_weight':99 
    }
    
predictors_to_train = [predictors1]

for predictors in predictors_to_train:
    print('training with :', predictors)
    #print('training data: ', train[predictors].values)
    #print('validation data: ', val[predictors].values)
    dtrain = lgb.Dataset(train[predictors].values, label=train[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical
                          )
    dvalid = lgb.Dataset(val[predictors].values, label=val[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical
                          )

    evals_results = {}
    print("Training the model...")

    lgb_model = lgb.train(params, 
                     dtrain, 
                     valid_sets=[dtrain, dvalid], 
                     valid_names=['train','valid'], 
                     evals_result=evals_results, 
                     num_boost_round=1000,
                     early_stopping_rounds=30,
                     verbose_eval=50, 
                     feval=None)

    #del train
    #del val
    #gc.collect()

    # Nick's Feature Importance Plot
    import matplotlib.pyplot as plt
    f, ax = plt.subplots(figsize=[7,10])
    lgb.plot_importance(lgb_model, ax=ax, max_num_features=len(predictors))
    plt.title("Light GBM Feature Importance")
    plt.savefig('feature_import.png')

    # Feature names:
    print('Feature names:', lgb_model.feature_name())
    # Feature importances:
    print('Feature importances:', list(lgb_model.feature_importance()))

    feature_imp = pd.DataFrame(lgb_model.feature_name(),list(lgb_model.feature_importance()))

Preparing the datasets for training...
training with : ['app', 'device', 'os', 'channel', 'hour', 'ip_day_hourcount', 'ip_appcount', 'ip_appcount_in_hist', 'ip_appcount_attribution_in_hist', 'ip_appcount_attribution_rate_in_hist', 'ip_app_oscount', 'ip_app_oscount_in_hist', 'ip_app_oscount_attribution_in_hist', 'ip_app_oscount_attribution_rate_in_hist', 'ipcount', 'ipcount_in_hist', 'ipcount_attribution_in_hist', 'ipcount_attribution_rate_in_hist', 'ip_hour_channelcount', 'ip_hour_channelcount_in_hist', 'ip_hour_channelcount_attribution_in_hist', 'ip_hour_channelcount_attribution_rate_in_hist', 'ip_hour_oscount', 'ip_hour_oscount_in_hist', 'ip_hour_oscount_attribution_in_hist', 'ip_hour_oscount_attribution_rate_in_hist', 'ip_hour_appcount', 'ip_hour_appcount_in_hist', 'ip_hour_appcount_attribution_in_hist', 'ip_hour_appcount_attribution_rate_in_hist', 'ip_hour_devicecount', 'ip_hour_devicecount_in_hist', 'ip_hour_devicecount_attribution_in_hist', 'ip_hour_devicecount_attribution_rate_i



Training until validation scores don't improve for 30 rounds.
[50]	train's auc: 0.968601	valid's auc: 0.969823
[100]	train's auc: 0.973662	valid's auc: 0.972035
[150]	train's auc: 0.975835	valid's auc: 0.972532
[200]	train's auc: 0.977347	valid's auc: 0.9725
Early stopping, best iteration is:
[216]	train's auc: 0.977781	valid's auc: 0.972644
Feature names: ['app', 'device', 'os', 'channel', 'hour', 'ip_day_hourcount', 'ip_appcount', 'ip_appcount_in_hist', 'ip_appcount_attribution_in_hist', 'ip_appcount_attribution_rate_in_hist', 'ip_app_oscount', 'ip_app_oscount_in_hist', 'ip_app_oscount_attribution_in_hist', 'ip_app_oscount_attribution_rate_in_hist', 'ipcount', 'ipcount_in_hist', 'ipcount_attribution_in_hist', 'ipcount_attribution_rate_in_hist', 'ip_hour_channelcount', 'ip_hour_channelcount_in_hist', 'ip_hour_channelcount_attribution_in_hist', 'ip_hour_channelcount_attribution_rate_in_hist', 'ip_hour_oscount', 'ip_hour_oscount_in_hist', 'ip_hour_oscount_attribution_in_hist', 'ip_hour_

In [None]:
lgb_model.save_model('model.txt')

In [27]:
print("Preparing data for submission...")

submit = pd.read_csv(path_test, dtype='int', usecols=['click_id'])

print('submit test len:', len(submit))

print("Predicting the submission data...")

submit['is_attributed'] = lgb_model.predict(test[predictors], num_iteration=lgb_model.best_iteration)

print("Writing the submission data into a csv file...")

submit.to_csv("submission.csv",index=False)

print("All done...")

Preparing data for submission...
submit test len: 18790469
Predicting the submission data...
Writing the submission data into a csv file...
All done...


In [None]:

'''
Another CTR comp and so i suspect libffm will play its part, after all it is an atomic bomb for this kind of stuff.
A sci-kit learn inspired script to convert pandas dataframes into libFFM style data.

The script is fairly hacky (hey thats Kaggle) and takes a little while to run a huge dataset.
The key to using this class is setting up the features dtypes correctly for output (ammend transform to suit your needs)

Example below

'''


class FFMFormatPandas:
    def __init__(self):
        self.field_index_ = None
        self.feature_index_ = None
        self.y = None

    def fit(self, df, y=None):
        self.y = y
        df_ffm = df[df.columns.difference([self.y])]
        if self.field_index_ is None:
            self.field_index_ = {col: i for i, col in enumerate(df_ffm)}

        if self.feature_index_ is not None:
            last_idx = max(list(self.feature_index_.values()))

        if self.feature_index_ is None:
            self.feature_index_ = dict()
            last_idx = 0

        for col in df.columns:
            vals = df[col].unique()
            for val in vals:
                if pd.isnull(val):
                    continue
                name = '{}_{}'.format(col, val)
                if name not in self.feature_index_:
                    self.feature_index_[name] = last_idx
                    last_idx += 1
            self.feature_index_[col] = last_idx
            last_idx += 1
        return self

    def fit_transform(self, df, y=None):
        self.fit(df, y)
        return self.transform(df)

    def transform_row_(self, row, t):
        ffm = []
        if self.y != None:
            ffm.append(str(row.loc[row.index == self.y][0]))
        if self.y is None:
            ffm.append(str(0))

        for col, val in row.loc[row.index != self.y].to_dict().items():
            col_type = t[col]
            name = '{}_{}'.format(col, val)
            if col_type.kind ==  'O':
                ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
            elif col_type.kind == 'i':
                ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
        return ' '.join(ffm)

    def transform(self, df):
        t = df.dtypes.to_dict()
        return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})
    
    
    
