# Experiment

## References

* [1] https://www.analyticsvidhya.com/blog/2017/01/sentiment-analysis-of-twitter-posts-on-chennai-floods-using-python/
* [2] https://machinelearningmastery.com/clean-text-machine-learning-python/
* [3] https://pandas.pydata.org/pandas-docs/stable/api.html#datetimelike-properties

In [44]:
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
import re
import gc
import math
import json
import random
import string
import numpy as np
import pandas as pd
from urllib import parse
from datetime import datetime

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("whitegrid", {'axes.grid' : False})

# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

import itertools
from scipy.stats import describe

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords as nltk_stopwords
word_net_lemmatizer = WordNetLemmatizer()
stopwords = set(nltk_stopwords.words('english'))

import xgboost
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import HashingVectorizer

from imblearn.under_sampling import RandomUnderSampler

In [4]:
df_campaign_data = pd.read_csv('data/train_HFxi8kT/campaign_data.csv')
df_campaign_data.columns = map(str.lower, df_campaign_data.columns)

In [5]:
df_campaign_data.head(2)

Unnamed: 0,campaign_id,communication_type,total_links,no_of_internal_links,no_of_images,no_of_sections,email_body,subject,email_url
0,29,Newsletter,67,61,12,3,"Dear AVians,\r\n \r\nWe are shaping up a super...",Sneak Peek: A look at the emerging data scienc...,http://r.newsletters.analyticsvidhya.com/7um44...
1,30,Upcoming Events,18,14,7,1,"Dear AVians,\r\n \r\nAre your eager to know wh...",[July] Data Science Expert Meetups & Competiti...,http://r.newsletters.analyticsvidhya.com/7up0e...


# Feature engineering campaign_data.csv

In [6]:
def cleaner(text):  # [1]
    text = str(text)
    if text.startswith('@null'):
        return ''
    text = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text)
    text = re.sub(r'\$\w*', '', text)  # Remove tickers
    text = re.sub(r'https?:\/\/.*\/\w*', '', text)  # Remove hyperlinks
    text = re.sub(r'[' + string.punctuation + ']+', ' ', text)  # Remove puncutations like 's
    return text

def tokenize(text):
    text = text.lower()
    tokens = word_tokenize(text, language='english')
    tokens = list(map(lambda token: word_net_lemmatizer.lemmatize(token), tokens))
    tokens = [token for token in tokens if token not in stopwords]
    tokens = [token for token in tokens if token.isalpha()]  # remove all tokens that are not alphabetic # [2]
    tokens = [token for token in tokens if len(token) > 2]
    return tokens

In [7]:
df_campaign_data['communication_type'] = df_campaign_data['communication_type'].apply(cleaner)
df_campaign_data['email_body'] = df_campaign_data['email_body'].apply(cleaner)
df_campaign_data['subject'] = df_campaign_data['subject'].apply(cleaner)

In [8]:
df_campaign_data['tok_subject'] = df_campaign_data['subject'].map(lambda s: ' '.join(tokenize(s)))
df_campaign_data['tok_email_body'] = df_campaign_data['email_body'].map(lambda s: ' '.join(tokenize(s)))
df_campaign_data.drop(['email_body', 'subject'], axis=1, inplace=True)

In [9]:
def fill_url_t(url):
    r = parse.parse_qs(parse.urlparse(url).query)
    if 't' in r:
        return r['t'][0]
    else:
        return 'default'
df_campaign_data['url_t'] = df_campaign_data['email_url'].map(fill_url_t)
df_campaign_data.drop(['email_url'], axis=1, inplace=True)

In [10]:
df_campaign_data['c_type'] = df_campaign_data['communication_type'].map({v: k for k, v in enumerate(df_campaign_data['communication_type'].unique())})
df_campaign_data.drop(['communication_type'], axis=1, inplace=True)

In [11]:
df_campaign_data.head(2)

Unnamed: 0,campaign_id,total_links,no_of_internal_links,no_of_images,no_of_sections,tok_subject,tok_email_body,url_t,c_type
0,29,67,61,12,3,sneak peek look emerging data science world,dear avians shaping superb data science ecosys...,1521180203,0
1,30,18,14,7,1,july data science expert meetups competition c...,dear avians eager know upcoming challenge meet...,1521180203,1


In [12]:
df_campaign_data['content_mean'] = df_campaign_data[['total_links', 'no_of_internal_links', 'no_of_images', 'no_of_sections']].mean(axis=1).apply(lambda x: round(x))
df_campaign_data['content_std'] = df_campaign_data[['total_links', 'no_of_internal_links', 'no_of_images', 'no_of_sections']].std(axis=1).apply(lambda x: round(x))

In [13]:
df_campaign_data['bins_tl'] = pd.cut(df_campaign_data['total_links'].astype(int), bins=10, labels=range(10), include_lowest=True)
df_campaign_data['bins_tl'] = df_campaign_data['bins_tl'].astype('int')

df_campaign_data['bins_il'] = pd.cut(df_campaign_data['no_of_internal_links'].astype(int), bins=8, labels=range(8), include_lowest=True)
df_campaign_data['bins_il'] = df_campaign_data['bins_il'].astype('int')

df_campaign_data['bins_i'] = pd.cut(df_campaign_data['no_of_images'].astype(int), bins=5, labels=range(5), include_lowest=True)
df_campaign_data['bins_i'] = df_campaign_data['bins_i'].astype('int')

# _, bins_s = pd.cut(df_campaign_data['no_of_sections'].astype(int), 3, retbins=True, labels=range(3)) 
df_campaign_data['bins_s'] = pd.cut(df_campaign_data['no_of_sections'].astype(int), bins=3, labels=range(3), include_lowest=True)
df_campaign_data['bins_s'] = df_campaign_data['bins_s'].astype('int')

df_campaign_data['bins_campaign'] = pd.cut(df_campaign_data['campaign_id'].astype(int), bins=3, labels=range(3), include_lowest=True)
df_campaign_data['bins_campaign'] = df_campaign_data['bins_campaign'].astype('int')

In [14]:
df_campaign_data.head()

Unnamed: 0,campaign_id,total_links,no_of_internal_links,no_of_images,no_of_sections,tok_subject,tok_email_body,url_t,c_type,content_mean,content_std,bins_tl,bins_il,bins_i,bins_s,bins_campaign
0,29,67,61,12,3,sneak peek look emerging data science world,dear avians shaping superb data science ecosys...,1521180203,0,36,33,3,2,3,1,0
1,30,18,14,7,1,july data science expert meetups competition c...,dear avians eager know upcoming challenge meet...,1521180203,1,10,8,0,0,1,0,0
2,31,15,13,5,1,last chance convince bos early bird expires,early bird pricing till august save upto extra...,1521180195,2,8,7,0,0,1,0,0
3,32,24,19,7,1,machine learning reason attend datahack summit,dive attend data science conference let announ...,1521180198,2,13,11,0,0,1,0,0
4,33,7,3,1,1,delhi ncr fireside chat patil former chief dat...,fireside chat patil master dear avian super ex...,1521180195,3,3,3,0,0,0,0,0


In [15]:
combination_cols = ['c_type', 'bins_tl', 'bins_il', 'bins_i', 'bins_s', 'url_t', 'bins_campaign']
for col in combination_cols:
    df_x = df_campaign_data.groupby(col)['campaign_id'].agg(['count']).reset_index()
    df_x.columns = [col, col + '_1c_combined']
    df_campaign_data = df_campaign_data.merge(df_x, 'left', [col])

combination_cols = ['c_type', 'bins_tl', 'bins_il', 'bins_i', 'bins_s', 'url_t', 'bins_campaign']
for col_1 in combination_cols:
    col_1_idx = combination_cols.index(col_1)
    for col_2 in combination_cols[col_1_idx + 1:]:
        df_x = df_campaign_data.groupby([col_1, col_2])['campaign_id'].agg(['count']).reset_index()
        df_x.columns = [col_1, col_2, col_1 + col_2 + '_2c_combined']
        df_campaign_data = df_campaign_data.merge(df_x, 'left', [col_1, col_2])

In [16]:
df_campaign_data.head(10)

Unnamed: 0,campaign_id,total_links,no_of_internal_links,no_of_images,no_of_sections,tok_subject,tok_email_body,url_t,c_type,content_mean,...,bins_ilbins_i_2c_combined,bins_ilbins_s_2c_combined,bins_ilurl_t_2c_combined,bins_ilbins_campaign_2c_combined,bins_ibins_s_2c_combined,bins_iurl_t_2c_combined,bins_ibins_campaign_2c_combined,bins_surl_t_2c_combined,bins_sbins_campaign_2c_combined,url_tbins_campaign_2c_combined
0,29,67,61,12,3,sneak peek look emerging data science world,dear avians shaping superb data science ecosys...,1521180203,0,36,...,3,4,1,3,8,1,4,1,4,2
1,30,18,14,7,1,july data science expert meetups competition c...,dear avians eager know upcoming challenge meet...,1521180203,1,10,...,9,33,1,13,9,1,3,1,13,2
2,31,15,13,5,1,last chance convince bos early bird expires,early bird pricing till august save upto extra...,1521180195,2,8,...,9,33,6,13,9,1,3,6,13,8
3,32,24,19,7,1,machine learning reason attend datahack summit,dive attend data science conference let announ...,1521180198,2,13,...,9,33,1,13,9,1,3,1,13,1
4,33,7,3,1,1,delhi ncr fireside chat patil former chief dat...,fireside chat patil master dear avian super ex...,1521180195,3,3,...,22,33,6,13,21,5,10,6,13,8
5,34,75,64,12,4,newsletter fireside chat patil datahack summit...,august newsletter dear avians one anticipated ...,1521180195,0,39,...,3,4,1,3,8,2,4,2,4,8
6,35,9,2,1,1,watch live fireside chat patil,fireside chat patil watch live dear avian live...,1521180195,3,3,...,22,33,6,13,21,5,10,6,13,8
7,36,13,11,2,1,kirk borne booz allen hamilton keynote datahac...,announcing kirk borne keynote speaker principa...,1521180195,2,7,...,22,33,6,13,21,5,10,6,13,8
8,37,9,7,3,1,last day early bird offer conference pass end ...,last day save maximum ticket extra saving day ...,1521180195,2,5,...,22,33,6,13,21,5,10,6,13,8
9,38,28,26,12,3,early bird offer expires day save spot,early bird pricing expires august extra saving...,1521180195,2,17,...,5,4,1,1,8,2,4,2,4,8


In [17]:
tok_subject_document = [row.tok_subject for row in df_campaign_data.itertuples()]
tok_email_body_document = [row.tok_email_body for row in df_campaign_data.itertuples()]

In [18]:
subject_features, email_features = 6, 25
tok_subject_hashvectorizer = HashingVectorizer(n_features=subject_features)
tok_email_body_hashvectorizer = HashingVectorizer(n_features=email_features)

subject_hash = tok_subject_hashvectorizer.transform(tok_subject_document)
email_hash = tok_email_body_hashvectorizer.transform(tok_email_body_document)

In [19]:
subject_hash_data = [tuple(subject_hash[i].toarray()[0]) for i in range(df_campaign_data.shape[0])]
email_hash_data = [tuple(email_hash[i].toarray()[0]) for i in range(df_campaign_data.shape[0])]

In [20]:
df_subject_hash_data = pd.DataFrame(subject_hash_data, columns=['sub_hash_' + str(s) for s in range(subject_features)])
df_subject_hash_data['campaign_id'] = df_campaign_data['campaign_id']
df_subject_hash_data.head()

Unnamed: 0,sub_hash_0,sub_hash_1,sub_hash_2,sub_hash_3,sub_hash_4,sub_hash_5,campaign_id
0,-0.57735,0.0,0.57735,0.57735,0.0,0.0,29
1,0.0,0.353553,0.353553,0.707107,0.353553,-0.353553,30
2,-0.333333,-0.666667,0.333333,0.333333,0.333333,0.333333,31
3,0.0,0.0,0.0,-0.707107,0.707107,0.0,32
4,-0.377964,-0.377964,-0.377964,0.755929,0.0,0.0,33


In [21]:
df_email_hash_data = pd.DataFrame(email_hash_data, columns=['email_hash_' + str(s) for s in range(email_features)])
df_email_hash_data['campaign_id'] = df_campaign_data['campaign_id']
df_email_hash_data.head()

Unnamed: 0,email_hash_0,email_hash_1,email_hash_2,email_hash_3,email_hash_4,email_hash_5,email_hash_6,email_hash_7,email_hash_8,email_hash_9,...,email_hash_16,email_hash_17,email_hash_18,email_hash_19,email_hash_20,email_hash_21,email_hash_22,email_hash_23,email_hash_24,campaign_id
0,-0.225494,-0.225494,0.0,0.150329,-0.075165,-0.300658,0.075165,-0.075165,0.225494,0.0,...,0.075165,0.225494,0.300658,-0.075165,-0.150329,0.075165,0.0,-0.450988,-0.300658,29
1,0.365148,0.0,0.0,-0.182574,0.0,0.0,-0.182574,0.0,0.0,0.0,...,0.0,0.182574,0.0,0.365148,0.182574,0.0,-0.365148,0.365148,0.0,30
2,-0.068359,0.0,-0.068359,-0.205076,0.068359,0.0,-0.068359,0.0,0.0,-0.205076,...,-0.068359,-0.136717,-0.273434,-0.273434,-0.068359,0.410152,0.47851,-0.136717,-0.068359,31
3,0.0,-0.113228,0.113228,0.0,-0.226455,0.226455,0.113228,0.0,0.0,-0.113228,...,0.0,0.0,-0.113228,0.0,-0.113228,0.113228,0.679366,-0.113228,-0.452911,32
4,0.112509,-0.112509,0.112509,-0.112509,0.225018,0.112509,-0.337526,-0.112509,-0.112509,0.112509,...,0.225018,0.0,0.225018,-0.337526,0.225018,0.225018,0.0,-0.112509,0.0,33


# Feature engineering train.csv

In [22]:
%%time

df_train = pd.read_csv('data/train_HFxi8kT/train.csv', parse_dates=['send_date']) 
df_test = pd.read_csv('data/test_BDIfz5B.csv/test_BDIfz5B.csv', parse_dates=['send_date']) # , parse_dates=['send_date']
df_train.columns = map(str.lower, df_train.columns)
df_test.columns = map(str.lower, df_test.columns)

Wall time: 5min 10s


In [23]:
df_train.head()

Unnamed: 0,id,user_id,campaign_id,send_date,is_open,is_click
0,42_14051,14051,42,2017-01-09 19:55:00,0,0
1,52_134438,134438,52,2017-02-11 12:53:00,0,0
2,33_181789,181789,33,2017-07-24 15:15:00,0,0
3,44_231448,231448,44,2017-05-09 11:36:00,0,0
4,29_185580,185580,29,2017-01-07 18:01:00,0,0


In [24]:
df_test.head()

Unnamed: 0,id,campaign_id,user_id,send_date
0,63_122715,63,122715,2018-01-02 22:35:00
1,56_76206,56,76206,2018-02-01 08:15:00
2,57_96189,57,96189,2018-05-01 18:25:00
3,56_166917,56,166917,2018-02-01 08:15:00
4,56_172838,56,172838,2018-02-01 08:12:00


In [25]:
df_train.drop('is_open', inplace=True, axis=1)
df_test['is_click'] = np.nan

In [26]:
df_train['label'] = 'train'
df_test['label'] = 'test'

In [27]:
df_train['send_hour'] = df_train['send_date'].dt.hour
df_train['send_minute'] = df_train['send_date'].dt.minute
df_train['send_dayofweek'] = df_train['send_date'].dt.dayofweek
df_train['send_daysinmonth'] = df_train['send_date'].dt.daysinmonth # [3]
df_train['send_day'] = df_train['send_date'].dt.day

df_test['send_hour'] = df_test['send_date'].dt.hour
df_test['send_minute'] = df_test['send_date'].dt.minute
df_test['send_dayofweek'] = df_test['send_date'].dt.dayofweek
df_test['send_daysinmonth'] = df_test['send_date'].dt.daysinmonth # [3]
df_test['send_day'] = df_test['send_date'].dt.day 

In [28]:
df_train['send_time'] = df_train['send_date'].dt.strftime('%H:%M:%S')
df_test['send_time'] = df_test['send_date'].dt.strftime('%H:%M:%S')

In [29]:
df_train['day_time'] = df_train['send_date'].dt.strftime('%d %H:%M:%S')
df_test['day_time'] = df_test['send_date'].dt.strftime('%d %H:%M:%S')

In [30]:
df_train.head()

Unnamed: 0,id,user_id,campaign_id,send_date,is_click,label,send_hour,send_minute,send_dayofweek,send_daysinmonth,send_day,send_time,day_time
0,42_14051,14051,42,2017-01-09 19:55:00,0,train,19,55,0,31,9,19:55:00,09 19:55:00
1,52_134438,134438,52,2017-02-11 12:53:00,0,train,12,53,5,28,11,12:53:00,11 12:53:00
2,33_181789,181789,33,2017-07-24 15:15:00,0,train,15,15,0,31,24,15:15:00,24 15:15:00
3,44_231448,231448,44,2017-05-09 11:36:00,0,train,11,36,1,31,9,11:36:00,09 11:36:00
4,29_185580,185580,29,2017-01-07 18:01:00,0,train,18,1,5,31,7,18:01:00,07 18:01:00


In [31]:
df_test.head()

Unnamed: 0,id,campaign_id,user_id,send_date,is_click,label,send_hour,send_minute,send_dayofweek,send_daysinmonth,send_day,send_time,day_time
0,63_122715,63,122715,2018-01-02 22:35:00,,test,22,35,1,31,2,22:35:00,02 22:35:00
1,56_76206,56,76206,2018-02-01 08:15:00,,test,8,15,3,28,1,08:15:00,01 08:15:00
2,57_96189,57,96189,2018-05-01 18:25:00,,test,18,25,1,31,1,18:25:00,01 18:25:00
3,56_166917,56,166917,2018-02-01 08:15:00,,test,8,15,3,28,1,08:15:00,01 08:15:00
4,56_172838,56,172838,2018-02-01 08:12:00,,test,8,12,3,28,1,08:12:00,01 08:12:00


In [32]:
df_train.drop('send_date', axis=1, inplace=True)
df_test.drop('send_date', axis=1, inplace=True)

In [33]:
_, bins_minute = pd.cut(pd.concat([df_train['send_minute'], df_test['send_minute']], ignore_index=True).astype(int), 20, retbins=True, labels=range(20)) 
df_train['send_minute_bin'] = pd.cut(df_train['send_minute'].astype(int), bins=bins_minute, labels=range(20), include_lowest=True)
df_train['send_minute_bin'] = df_train['send_minute_bin'].astype('int')
df_test['send_minute_bin'] = pd.cut(df_test['send_minute'].astype(int), bins=bins_minute, labels=range(20), include_lowest=True)
df_test['send_minute_bin'] = df_test['send_minute_bin'].astype('int')

_, bins_dow = pd.cut(pd.concat([df_train['send_dayofweek'], df_test['send_dayofweek']], ignore_index=True).astype(int), 3, retbins=True, labels=range(3)) 
df_train['send_dow_bin'] = pd.cut(df_train['send_dayofweek'].astype(int), bins=bins_dow, labels=range(3), include_lowest=True)
df_train['send_dow_bin'] = df_train['send_dow_bin'].astype('int')
df_test['send_dow_bin'] = pd.cut(df_test['send_dayofweek'].astype(int), bins=bins_dow, labels=range(3), include_lowest=True)
df_test['send_dow_bin'] = df_test['send_dow_bin'].astype('int')

_, bins_hour = pd.cut(pd.concat([df_train['send_hour'], df_test['send_hour']], ignore_index=True).astype(int), 3, retbins=True, labels=range(3), include_lowest=True) 
df_train['send_hour_bin'] = pd.cut(df_train['send_hour'].astype(int), bins=bins_hour, labels=range(3), include_lowest=True)
df_train['send_hour_bin'] = df_train['send_hour_bin'].astype('int')
df_test['send_hour_bin'] = pd.cut(df_test['send_hour'].astype(int), bins=bins_hour, labels=range(3), include_lowest=True)
df_test['send_hour_bin'] = df_test['send_hour_bin'].astype('int')

_, bins_day = pd.cut(pd.concat([df_train['send_day'], df_test['send_day']], ignore_index=True).astype(int), 3, retbins=True, labels=range(3), include_lowest=True) 
df_train['send_day_bin'] = pd.cut(df_train['send_day'].astype(int), bins=bins_day, labels=range(3), include_lowest=True)
df_train['send_day_bin'] = df_train['send_day_bin'].astype('int')
df_test['send_day_bin'] = pd.cut(df_test['send_day'].astype(int), bins=bins_day, labels=range(3), include_lowest=True)
df_test['send_day_bin'] = df_test['send_day_bin'].astype('int')

In [34]:
df_all = pd.concat([df_train, df_test]).reset_index(drop=True)

In [35]:
df_all = df_all.merge(df_campaign_data, 'left', on='campaign_id')

In [36]:
df_all.head()

Unnamed: 0,campaign_id,day_time,id,is_click,label,send_day,send_day_bin,send_dayofweek,send_daysinmonth,send_dow_bin,...,bins_ilbins_i_2c_combined,bins_ilbins_s_2c_combined,bins_ilurl_t_2c_combined,bins_ilbins_campaign_2c_combined,bins_ibins_s_2c_combined,bins_iurl_t_2c_combined,bins_ibins_campaign_2c_combined,bins_surl_t_2c_combined,bins_sbins_campaign_2c_combined,url_tbins_campaign_2c_combined
0,42,09 19:55:00,42_14051,0.0,train,9,0,0,31,0,...,1,1,1,1,8,1,4,1,4,4
1,52,11 12:53:00,52_134438,0.0,train,11,1,5,28,2,...,1,4,1,3,2,1,3,1,6,2
2,33,24 15:15:00,33_181789,0.0,train,24,2,0,31,0,...,22,33,6,13,21,5,10,6,13,8
3,44,09 11:36:00,44_231448,0.0,train,9,0,1,31,0,...,1,2,1,3,2,1,1,1,1,4
4,29,07 18:01:00,29_185580,0.0,train,7,0,5,31,2,...,3,4,1,3,8,1,4,1,4,2


In [38]:
del df_train
del df_test
gc.collect()

568

In [37]:
%%time

combination_cols = ['c_type', 'bins_tl', 'bins_il', 'bins_i', 'bins_s', 'url_t',  'bins_campaign', 
                    'send_minute_bin', 'send_dow_bin', 'send_hour_bin', 'send_day_bin', 'user_id', 'campaign_id']
for col in combination_cols:
    df_x = df_all.groupby(col)['id'].agg(['count']).reset_index()
    df_x.columns = [col, col + '_1t_combined']
    df_all = df_all.merge(df_x, 'left', [col])

Wall time: 1min 20s


In [39]:
%%time

combination_cols = ['c_type', 'bins_tl', 'bins_il', 'bins_i', 'bins_s', 'url_t',  'bins_campaign', 
                    'send_minute_bin', 'send_dow_bin', 'send_hour_bin', 'send_day_bin', 'user_id', 'campaign_id']
for col_1 in combination_cols:
    col_1_idx = combination_cols.index(col_1)
    for col_2 in combination_cols[col_1_idx + 1:]:
        if col_1 == 'user_id' or col_1 == 'campaign_id':
            continue
        df_x = df_all.groupby([col_1, col_2])['id'].agg(['count']).reset_index()
        df_x.columns = [col_1, col_2, col_1 + col_2 + '_2t_combined']
        df_all = df_all.merge(df_x, 'left', [col_1, col_2])

Wall time: 14min 7s


In [40]:
df_all.head()

Unnamed: 0,campaign_id,day_time,id,is_click,label,send_day,send_day_bin,send_dayofweek,send_daysinmonth,send_dow_bin,...,send_minute_bincampaign_id_2t_combined,send_dow_binsend_hour_bin_2t_combined,send_dow_binsend_day_bin_2t_combined,send_dow_binuser_id_2t_combined,send_dow_bincampaign_id_2t_combined,send_hour_binsend_day_bin_2t_combined,send_hour_binuser_id_2t_combined,send_hour_bincampaign_id_2t_combined,send_day_binuser_id_2t_combined,send_day_bincampaign_id_2t_combined
0,42,09 19:55:00,42_14051,0.0,train,9,0,0,31,0,...,7934,426843,623788,8,81253,601258,7,81253,10,81253
1,52,11 12:53:00,52_134438,0.0,train,11,1,5,28,2,...,7307,313215,167591,3,82160,279560,4,82160,5,82160
2,33,24 15:15:00,33_181789,0.0,train,24,2,0,31,0,...,19535,264806,52426,7,46815,131389,4,32609,1,46815
3,44,09 11:36:00,44_231448,0.0,train,9,0,1,31,0,...,9990,106147,623788,3,39498,224106,2,39498,7,39498
4,29,07 18:01:00,29_185580,0.0,train,7,0,5,31,2,...,16482,198551,340054,4,40008,601258,1,69129,4,69129


# Creating Encoders

In [41]:
label_cols = [ 'c_type', 'bins_tl', 'bins_il', 'bins_i', 'bins_s', 'url_t', 'bins_campaign', 
              'send_minute_bin', 'send_dow_bin', 'send_hour_bin', 'send_day_bin']
df_all[label_cols].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1797049 entries, 0 to 1797048
Data columns (total 11 columns):
c_type             int64
bins_tl            int32
bins_il            int32
bins_i             int32
bins_s             int32
url_t              object
bins_campaign      int32
send_minute_bin    int32
send_dow_bin       int32
send_hour_bin      int32
send_day_bin       int32
dtypes: int32(9), int64(1), object(1)
memory usage: 102.8+ MB


In [42]:
obj_label_cols = ['url_t']
df_all[obj_label_cols].head()

Unnamed: 0,url_t
0,1521180188
1,1520965166
2,1521180195
3,1521180188
4,1521180203


In [45]:
for c in obj_label_cols:
    le = LabelEncoder()
    df_all[c] = le.fit_transform(df_all[c].values)
df_all[obj_label_cols].head()

Unnamed: 0,url_t
0,14
1,9
2,16
3,14
4,18


In [46]:
mlbin = MultiLabelBinarizer()
mlbin.fit(df_all[label_cols].values)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [47]:
df_train_new = df_all[df_all['label'] == 'train']
df_test_new = df_all[df_all['label'] == 'test']

In [48]:
df_train_new.head()

Unnamed: 0,campaign_id,day_time,id,is_click,label,send_day,send_day_bin,send_dayofweek,send_daysinmonth,send_dow_bin,...,send_minute_bincampaign_id_2t_combined,send_dow_binsend_hour_bin_2t_combined,send_dow_binsend_day_bin_2t_combined,send_dow_binuser_id_2t_combined,send_dow_bincampaign_id_2t_combined,send_hour_binsend_day_bin_2t_combined,send_hour_binuser_id_2t_combined,send_hour_bincampaign_id_2t_combined,send_day_binuser_id_2t_combined,send_day_bincampaign_id_2t_combined
0,42,09 19:55:00,42_14051,0.0,train,9,0,0,31,0,...,7934,426843,623788,8,81253,601258,7,81253,10,81253
1,52,11 12:53:00,52_134438,0.0,train,11,1,5,28,2,...,7307,313215,167591,3,82160,279560,4,82160,5,82160
2,33,24 15:15:00,33_181789,0.0,train,24,2,0,31,0,...,19535,264806,52426,7,46815,131389,4,32609,1,46815
3,44,09 11:36:00,44_231448,0.0,train,9,0,1,31,0,...,9990,106147,623788,3,39498,224106,2,39498,7,39498
4,29,07 18:01:00,29_185580,0.0,train,7,0,5,31,2,...,16482,198551,340054,4,40008,601258,1,69129,4,69129


In [49]:
del df_all
gc.collect()

422

In [50]:
df_train_new = df_train_new.merge(df_subject_hash_data, 'left', on='campaign_id')
df_train_new = df_train_new.merge(df_email_hash_data, 'left', on='campaign_id')
df_test_new = df_test_new.merge(df_subject_hash_data, 'left', on='campaign_id')
df_test_new = df_test_new.merge(df_email_hash_data, 'left', on='campaign_id')
df_train_new.head()

Unnamed: 0,campaign_id,day_time,id,is_click,label,send_day,send_day_bin,send_dayofweek,send_daysinmonth,send_dow_bin,...,email_hash_15,email_hash_16,email_hash_17,email_hash_18,email_hash_19,email_hash_20,email_hash_21,email_hash_22,email_hash_23,email_hash_24
0,42,09 19:55:00,42_14051,0.0,train,9,0,0,31,0,...,0.137038,0.068519,0.068519,0.0,0.0,-0.205557,0.137038,0.274075,0.137038,0.0
1,52,11 12:53:00,52_134438,0.0,train,11,1,5,28,2,...,0.0,0.202031,0.101015,0.202031,0.202031,0.101015,0.0,0.202031,-0.303046,-0.101015
2,33,24 15:15:00,33_181789,0.0,train,24,2,0,31,0,...,0.337526,0.225018,0.0,0.225018,-0.337526,0.225018,0.225018,0.0,-0.112509,0.0
3,44,09 11:36:00,44_231448,0.0,train,9,0,1,31,0,...,0.0,-0.158114,0.158114,0.0,0.316228,0.158114,0.0,-0.316228,0.316228,0.0
4,29,07 18:01:00,29_185580,0.0,train,7,0,5,31,2,...,0.150329,0.075165,0.225494,0.300658,-0.075165,-0.150329,0.075165,0.0,-0.450988,-0.300658


In [51]:
del df_subject_hash_data
del df_email_hash_data
gc.collect()

178

# Undersampling dataset

In [52]:
X_sampled, y_sampled = df_train_new['campaign_id'].values, df_train_new['is_click'].values
X_sampled = np.expand_dims(X_sampled, axis=-1)
rus = RandomUnderSampler(return_indices=True, random_state=0)
_, _, idx_resampled = rus.fit_sample(X_sampled, y_sampled)

In [53]:
df_train_sampled = df_train_new.iloc[idx_resampled].reset_index(drop=True)
df_train_sampled.head()

Unnamed: 0,campaign_id,day_time,id,is_click,label,send_day,send_day_bin,send_dayofweek,send_daysinmonth,send_dow_bin,...,email_hash_15,email_hash_16,email_hash_17,email_hash_18,email_hash_19,email_hash_20,email_hash_21,email_hash_22,email_hash_23,email_hash_24
0,54,12 20:19:00,54_110682,0.0,train,12,1,3,31,1,...,0.105409,0.210819,0.105409,0.210819,0.105409,0.105409,0.105409,0.0,-0.105409,0.0
1,33,24 14:51:00,33_239106,0.0,train,24,2,0,31,0,...,0.337526,0.225018,0.0,0.225018,-0.337526,0.225018,0.225018,0.0,-0.112509,0.0
2,29,07 18:05:00,29_210017,0.0,train,7,0,5,31,2,...,0.150329,0.075165,0.225494,0.300658,-0.075165,-0.150329,0.075165,0.0,-0.450988,-0.300658
3,53,11 22:52:00,53_167235,0.0,train,11,1,6,30,2,...,0.460179,0.0,-0.153393,0.0,0.076696,-0.153393,0.076696,0.536875,-0.153393,-0.153393
4,49,28 15:24:00,49_189075,0.0,train,28,2,3,30,1,...,0.248069,0.062017,-0.062017,0.0,-0.124035,-0.372104,0.0,0.062017,0.186052,-0.248069


In [54]:
combined_cols = [col for col in df_train_sampled.columns if '_combined' in col]
hash_cols = [col for col in df_train_sampled.columns if '_hash_' in col]
tf_cols = [col for col in df_train_sampled.columns if '_tf_' in col]
float_cols = ['total_links', 'no_of_internal_links', 'no_of_images', 'no_of_sections', 'content_mean', 'content_std']

In [55]:
%%time

combined_mms = MinMaxScaler()
df_train_sampled[combined_cols + hash_cols + tf_cols + float_cols] = combined_mms.fit_transform(df_train_sampled[combined_cols + hash_cols + tf_cols + float_cols].values)

Wall time: 2.13 s


In [56]:
df_train_sampled.head()

Unnamed: 0,campaign_id,day_time,id,is_click,label,send_day,send_day_bin,send_dayofweek,send_daysinmonth,send_dow_bin,...,email_hash_15,email_hash_16,email_hash_17,email_hash_18,email_hash_19,email_hash_20,email_hash_21,email_hash_22,email_hash_23,email_hash_24
0,54,12 20:19:00,54_110682,0.0,train,12,1,3,31,1,...,0.224316,0.706546,0.542873,0.849992,0.692537,0.656785,0.410273,0.349587,0.423432,0.366025
1,33,24 14:51:00,33_239106,0.0,train,24,2,0,31,0,...,0.542131,0.727577,0.395484,0.868738,0.168216,0.821298,0.641735,0.349587,0.414733,0.366025
2,29,07 18:05:00,29_210017,0.0,train,7,0,5,31,2,...,0.285821,0.505622,0.710781,0.9686,0.478784,0.305035,0.351745,0.349587,0.0,0.123045
3,53,11 22:52:00,53_167235,0.0,train,11,1,6,30,2,...,0.710067,0.394292,0.181003,0.571667,0.658548,0.300821,0.35471,0.863582,0.364639,0.242059
4,49,28 15:24:00,49_189075,0.0,train,28,2,3,30,1,...,0.419647,0.486149,0.308769,0.571667,0.420935,0.0,0.206289,0.408961,0.780556,0.165545


In [57]:
df_mlbin = pd.DataFrame([tuple(x) for x in mlbin.transform(df_train_sampled[label_cols].values)], columns=['mlbin_' + str(l) for l in mlbin.classes_])
df_mlbin.head()

Unnamed: 0,mlbin_0,mlbin_1,mlbin_2,mlbin_3,mlbin_4,mlbin_5,mlbin_6,mlbin_7,mlbin_8,mlbin_9,mlbin_10,mlbin_11,mlbin_12,mlbin_13,mlbin_14,mlbin_15,mlbin_16,mlbin_17,mlbin_18,mlbin_19
0,1,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
2,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,1,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
4,1,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1


In [58]:
df_train_sampled = pd.concat([df_train_sampled, df_mlbin], axis=1)

In [59]:
df_train_sampled.head()

Unnamed: 0,campaign_id,day_time,id,is_click,label,send_day,send_day_bin,send_dayofweek,send_daysinmonth,send_dow_bin,...,mlbin_10,mlbin_11,mlbin_12,mlbin_13,mlbin_14,mlbin_15,mlbin_16,mlbin_17,mlbin_18,mlbin_19
0,54,12 20:19:00,54_110682,0.0,train,12,1,3,31,1,...,0,0,0,0,0,0,0,0,0,0
1,33,24 14:51:00,33_239106,0.0,train,24,2,0,31,0,...,0,0,0,0,0,0,1,1,0,0
2,29,07 18:05:00,29_210017,0.0,train,7,0,5,31,2,...,0,0,0,0,0,0,0,0,1,0
3,53,11 22:52:00,53_167235,0.0,train,11,1,6,30,2,...,0,0,0,0,0,0,0,1,0,0
4,49,28 15:24:00,49_189075,0.0,train,28,2,3,30,1,...,0,0,0,0,0,0,0,0,0,1


In [60]:
mlbin_cols = [col for col in df_train_sampled.columns if 'mlbin_' in col]
merge_cols = combined_cols + hash_cols + tf_cols + float_cols + mlbin_cols + ['is_click']
df_train_sampled[merge_cols].head()

Unnamed: 0,c_type_1c_combined,bins_tl_1c_combined,bins_il_1c_combined,bins_i_1c_combined,bins_s_1c_combined,url_t_1c_combined,bins_campaign_1c_combined,c_typebins_tl_2c_combined,c_typebins_il_2c_combined,c_typebins_i_2c_combined,...,mlbin_11,mlbin_12,mlbin_13,mlbin_14,mlbin_15,mlbin_16,mlbin_17,mlbin_18,mlbin_19,is_click
0,0.666667,0.060606,0.151515,0.368421,0.233333,0.0,0.0,0.0,0.333333,0.0,...,0,0,0,0,0,0,0,0,0,0.0
1,0.083333,1.0,1.0,1.0,1.0,1.0,1.0,0.083333,0.083333,0.083333,...,0,0,0,0,0,1,1,0,0,0.0
2,0.666667,0.090909,0.151515,0.578947,0.233333,0.142857,1.0,0.25,0.333333,0.416667,...,0,0,0,0,0,0,0,1,0,0.0
3,0.583333,0.030303,0.060606,0.578947,1.0,0.142857,0.0,0.0,0.083333,0.083333,...,0,0,0,0,0,0,1,0,0,0.0
4,0.583333,0.030303,0.060606,0.0,1.0,0.0,0.0,0.0,0.083333,0.0,...,0,0,0,0,0,0,0,0,1,0.0


# The train-test split

In [61]:
input_cols = [c for c in df_train_sampled[merge_cols].columns if c not in ['is_click']]
train_X, test_X, train_y, test_y = train_test_split(df_train_sampled[input_cols], df_train_sampled['is_click'], train_size=0.8, test_size=0.2, shuffle=True, random_state=0)
print('train_X.shape', train_X.shape, 'train_y.shape', train_y.shape, 'test_X.shape', test_X.shape, 'test_y.shape', test_y.shape)

train_X.shape (20451, 175) train_y.shape (20451,) test_X.shape (5113, 175) test_y.shape (5113,)


In [62]:
X, y, tX, ty = train_X.values, train_y.values, test_X.values, test_y.values

# Xgboost Classifier

In [65]:
warnings.simplefilter(action='ignore', category=DeprecationWarning)
xgb = xgboost.XGBClassifier(n_jobs=4)
xgb.fit(X, y)
roc_auc_score(ty, xgb.predict(tX))

0.6106156428196023

# Hyper-parameters optimization using hyperopt

In [69]:
warnings.simplefilter(action='ignore', category=DeprecationWarning)
def objective(space):
    clf = xgboost.XGBClassifier(max_depth=int(space['max_depth']), min_child_weight=int(space['min_child_weight']),
                                subsample=space['subsample'], gamma=space['gamma'], max_delta_step=space['max_delta_step'],
                                colsample_bytree=space['colsample_bytree'], learning_rate=space['learning_rate'], n_jobs=4)
    clf.fit(X, y, eval_set=[(tX, ty)], eval_metric="auc", early_stopping_rounds=30, verbose=False)
    auc = roc_auc_score(ty, clf.predict(tX))
    return{'loss':1-auc, 'status': STATUS_OK }

In [70]:
space ={'max_depth': hp.quniform("x_max_depth", 3, 30, 1),
        'min_child_weight': hp.quniform ('x_min_child', 1, 10, 1),
        'subsample': hp.uniform('x_subsample', 0.5, 1.0),
        'gamma': hp.uniform('x_gamma', 0.0, 10.0),
        'max_delta_step': hp.randint('x_max_delta_step', 10),
        'colsample_bytree': hp.uniform('x_colsample_bytree', 0.5, 1.0),
        'learning_rate': hp.uniform('x_learning_rate ', 0.001, 1.0)}

In [71]:
%%time

trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50, trials=trials)
print(best)

{'x_colsample_bytree': 0.8040713297251434, 'x_gamma': 8.046231704620912, 'x_learning_rate ': 0.1332994462242539, 'x_max_delta_step': 3, 'x_max_depth': 28.0, 'x_min_child': 5.0, 'x_subsample': 0.6755258359169591}
Wall time: 5min


In [72]:
xgb = xgboost.XGBClassifier(max_depth=28, min_child_weight=5, subsample=0.6755258359169591, gamma=8.046231704620912, 
                            max_delta_step=3, colsample_bytree=0.8040713297251434, learning_rate=0.1332994462242539, n_jobs=4)
xgb.fit(X, y, eval_set=[(tX, ty)], eval_metric="auc", verbose=False, early_stopping_rounds=30)
roc_auc_score(ty, xgb.predict(tX))

0.6179048731489959

# Creating submission.csv

In [86]:
warnings.simplefilter(action='ignore', category=DeprecationWarning)
def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

submission_predictions = []
for df_chunk in chunker(df_test_new, 20000):
    df_chunk_clone = df_chunk.copy(deep=True)
    df_chunk_clone[combined_cols + hash_cols + tf_cols + float_cols] = combined_mms.transform(df_chunk_clone[combined_cols + hash_cols + tf_cols + float_cols].values)
    df_mlbin_test = pd.DataFrame([tuple(x) for x in mlbin.transform(df_chunk_clone[label_cols].values)], columns=['mlbin_' + str(l) for l in mlbin.classes_])
    df_mlbin_test.index = df_chunk_clone.index
    df_chunk_clone = pd.concat([df_chunk_clone, df_mlbin_test], axis=1)
    predictions = xgb.predict(df_chunk_clone[input_cols].values)
    submission_predictions.extend(predictions.tolist())
submission_predictions = np.array(submission_predictions).astype('int')

In [87]:
def deploy_submission(y, output='data/submission.csv'):
    submission = pd.DataFrame({
        'id': df_test_new.id.values,
        'is_click': y.reshape(-1)
    })
    submission.to_csv(output, index=False)

In [88]:
deploy_submission(submission_predictions)