In [3]:
# SciPy imports
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import seaborn as sns
import pickle
import datetime
import random

# Machine Learning related
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
# more models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Other necessary imports
import sqlite3
import ast
from collections import defaultdict
import pprint
from pandas.io.json import json_normalize
import json
import time

# just to eliminate some annoying sklearn warnings
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Open the local SQLite database. `conn` is used by the pd.read_sql_query calls
# and `cursor` by the executescript/execute cells further down.
conn = sqlite3.connect('database.sqlite')
cursor = conn.cursor()
# NOTE: sqlite3.connect() creates the file when it does not exist, so this
# message only confirms that a connection object was created.
print("Opened database successfully")

Opened database successfully


#### Load Train/Val/Test Data From Pickle

In [5]:
# Pass the path (not an open() handle) so pandas opens and closes the file
# itself; the previous handles were never closed.
train_raw = pd.read_pickle('pickle/train_raw.pkl.gzip', compression='gzip')
test_raw = pd.read_pickle('pickle/test_raw.pkl.gzip', compression='gzip')

In [6]:
# Context manager guarantees the file handle is closed once the dict is loaded.
with open('pickle/train_val_dict.pkl', 'rb') as f:
    train_val_dict = pickle.load(f)

# Clean Up Features

In [7]:
## Matching Search Params

# extract Search Params numbers for search
def s_params_keys(string):
    """Return the set of keys of the dict literal stored in ``string``.

    Falls back to the sentinel set ``{'s'}`` when ``string`` cannot be parsed
    into an object with ``.keys()`` (empty string, ``None``, malformed text,
    a non-dict literal). The ad-side helper uses a different sentinel, so two
    unparseable values never intersect.
    """
    try:
        d = ast.literal_eval(string)
        return set(d.keys())
    # `except Exception` instead of a bare `except`: parse failures still map
    # to the sentinel, but KeyboardInterrupt/SystemExit now propagate, so the
    # long-running `.apply` over millions of rows can actually be interrupted.
    except Exception:
        return set('s')
# extract Ad Params numbers from Ad
def a_params_keys(string):
    """Return the set of keys of the dict literal stored in ``string``.

    Falls back to the sentinel set ``{'a'}`` when ``string`` cannot be parsed
    into an object with ``.keys()`` (empty string, ``None``, malformed text,
    a non-dict literal). The search-side helper uses a different sentinel, so
    two unparseable values never intersect.
    """
    try:
        d = ast.literal_eval(string)
        return set(d.keys())
    # `except Exception` instead of a bare `except`: parse failures still map
    # to the sentinel, but KeyboardInterrupt/SystemExit now propagate, so the
    # long-running `.apply` over millions of rows can actually be interrupted.
    except Exception:
        return set('a')

# count the parameter keys shared by each impression's Search and Ad
def get_common_params(df):
    """Return, per row, how many keys ``SearchParams`` and ``Params`` share.

    Each column is parsed into a set of keys by ``s_params_keys`` /
    ``a_params_keys``; the two arrays of sets are intersected element-wise
    and the size of every intersection is returned as a NumPy object array.
    """
    # TODO: find a faster alternative to .apply -- it is slow on the full data
    search_key_sets = df['SearchParams'].apply(s_params_keys).values
    ad_key_sets = df['Params'].apply(a_params_keys).values

    # `&` on two object arrays applies set intersection element by element
    shared_key_sets = search_key_sets & ad_key_sets
    return np.frompyfunc(len, 1, 1)(shared_key_sets)

# flags impressions whose Search CategoryID equals the Ad CategoryID
def matched_categories(df):
    """Return a 0/1 integer Series: 1 where ``CategoryID_s`` equals ``CategoryID_a``."""
    same_category = df['CategoryID_s'].eq(df['CategoryID_a'])
    return same_category.astype(int)

# flags Searches whose Search Query is the empty string
def query_blank(df):
    """Return a 0/1 integer Series: 1 where ``SearchQuery`` is exactly ``''``."""
    return df['SearchQuery'].eq('').astype(int)

# cast Price column as numeric and replace empty values with NaNs
def make_price_numeric(df):
    """Return ``df['Price']`` as a float Series with blank strings turned into NaN.

    ``df`` itself is left untouched; the caller assigns the result back.
    """
    # The previous version called `df.astype({'Price': 'float'})` and threw the
    # result away, so nothing was cast here (and that call raises on '' values).
    # `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
    return df['Price'].replace('', np.nan).astype(float)

In [8]:
# consolidates all the above preprocessing functions
def basic_cleaning(df):
    """Apply every basic preprocessing step to an impressions DataFrame.

    Adds ``common_params_cnt``, ``categories_match`` and ``blank_query``,
    converts ``SearchDate`` to datetime and ``Price`` to numeric, replaces
    blank category IDs with negative sentinels, and drops ``Params``,
    ``SearchParams``, ``SearchQuery`` and ``IsContext``.

    Note that ``df`` is modified in place (columns are assigned and dropped on
    the object passed in), while the returned frame is the new object produced
    by the final ``astype``. Because the source columns are dropped, calling
    this twice on the same frame raises a KeyError.
    """
    # convert SearchDate to datetime
    df['SearchDate'] = pd.to_datetime(df['SearchDate'])
    
    # get the common parameters between SearchID and AdID
    df['common_params_cnt'] = get_common_params(df)
    print('Finished get_common_params')
    
    # do the categories match between Search and Ad?
    # NOTE(review): this runs before the blank replacement below, so two blank
    # ('') categories compare equal here even though they later receive
    # different sentinels (-1 vs -2) -- confirm that is intended.
    df['categories_match'] = matched_categories(df)
    # replace blank categories with arbitrary negative number
    df['CategoryID_s'] = df['CategoryID_s'].replace('', -1)
    df['CategoryID_a'] = df['CategoryID_a'].replace('', -2)
    print('Finished matched_categories')
    
    # was the search query blank?
    df['blank_query'] = query_blank(df)
    print('Finished query_blank')
    
    # convert the price column to float and fill '' with NaN
    df['Price'] = make_price_numeric(df)
    print('Finished make_price_numeric')
    
    # drop columns that are now irrelevant
    df.drop(columns=['Params', 'SearchParams', 'SearchQuery', 'IsContext'], inplace=True)
    
    # return DataFrame with mentioned columns as numeric
    return df.astype({'Price': 'float', 'common_params_cnt': 'int', 
                      'CategoryID_s': 'int', 'CategoryID_a': 'int'})

### Apply to Train & Test sets

In [121]:
%time train_raw = basic_cleaning(train_raw)

Finished get_common_params
Finished matched_categories
Finished query_blank
Finished make_price_numeric
CPU times: user 10min 11s, sys: 7min 38s, total: 17min 49s
Wall time: 20min 37s


In [123]:
%time test_raw = basic_cleaning(test_raw)

Finished get_common_params
Finished matched_categories
Finished query_blank
Finished make_price_numeric
CPU times: user 2min 33s, sys: 16 s, total: 2min 49s
Wall time: 2min 57s


In [127]:
# preview output
train_raw.head()

Unnamed: 0,SearchID,AdID,UserID,SearchDate,Price,Position,title_length,IsClick,IsUserLoggedOn,CategoryID_s,CategoryID_a,row_n,common_params_cnt,categories_match,blank_query
0,10,13524889,3310798,2015-05-15 17:38:46,5000.0,1,38,0,0,34,12,5,0,0,1
1,10,34084553,3310798,2015-05-15 17:38:46,30000.0,7,12,0,0,34,12,5,0,0,1
2,23,12281759,3524741,2015-05-13 09:58:44,40086.0,1,40,0,0,38,38,5,1,1,1
3,25,29614929,4114490,2015-05-15 09:56:17,9829.0,1,47,0,0,47,47,5,0,1,0
4,37,19880051,3314799,2015-05-18 08:46:10,17100.0,7,28,0,0,11,11,3,0,1,1


### You can pickle that!

In [9]:
# Write via the path so pandas owns the file handle and flushes/closes it;
# the previous open(...) handle was never closed explicitly.
train_raw.to_pickle('pickle/train_processed.pkl.gzip', compression='gzip')
del train_raw

In [10]:
# Write via the path so pandas owns the file handle and flushes/closes it;
# the previous open(...) handle was never closed explicitly.
test_raw.to_pickle('pickle/test_processed.pkl.gzip', compression='gzip')
del test_raw

In [11]:
# reassign pickles
# also you can restart from here if your notebook kernel dies for whatever reason
# (paths are passed directly so pandas closes the files after reading)
train_processed = pd.read_pickle('pickle/train_processed.pkl.gzip', compression='gzip')
test_processed = pd.read_pickle('pickle/test_processed.pkl.gzip', compression='gzip')

### \*If you think you might want to submit to Kaggle 

In [12]:
def get_kaggle_test():
    """Load the Kaggle test impressions from ``test_merged`` and clean them.

    Reads through the module-level ``conn`` and returns the DataFrame
    produced by ``basic_cleaning``.
    """
    kaggle_query = '''
        SELECT TestID, SearchID, AdID, UserID, SearchDate,
               Price, Position, LENGTH(Title) title_length, 
               IsContext, IsUserLoggedOn,
               CategoryID CategoryID_s,
               "CategoryID:1" CategoryID_a,
               Params, SearchParams, SearchQuery
        FROM test_merged
        '''
    raw_kaggle = pd.read_sql_query(kaggle_query, conn)

    # same cleaning steps as the train/test sets
    return basic_cleaning(raw_kaggle)

In [252]:
%time kaggle = get_kaggle_test()

Finished get_common_params
Finished matched_categories
Finished query_blank
Finished make_price_numeric
CPU times: user 6min 47s, sys: 3min 55s, total: 10min 43s
Wall time: 13min 31s


### You can pickle that!

In [13]:
# pickle and delete variable to save memory for now
# (path passed directly so pandas flushes and closes the file itself)
kaggle.to_pickle('pickle/kaggle_processed.pkl.gzip', compression='gzip')
del kaggle

# Feature Engineering I

### Aggregating Historical Data

In [54]:
## aggregates User Features for Searches preceding last7searches
hist_user_agg = '''
CREATE TABLE hist_user_agg AS

WITH historical_impressions AS
(SELECT m.SearchID, AdID, UserID, IsClick, IsContext, CategoryID CategoryID_s
FROM last10days_merged m
WHERE m.SearchID IN
    (SELECT SearchID 
     FROM hist_searches)
)

SELECT a.UserID,
       user_total_searches,
       user_category_counts,
       user_total_clicks,
       user_total_impressions 

FROM (  SELECT UserID,
               COUNT(SearchID) user_total_searches,
               COUNT(DISTINCT(CategoryID_s)) user_category_counts,
               SUM(IsClick) user_total_clicks
        FROM historical_impressions
        GROUP BY UserID
        ) AS a

JOIN ( SELECT UserID,
               COUNT(AdID) user_total_impressions
        FROM historical_impressions
        WHERE IsContext = 1
        GROUP BY UserID 
        ) AS b
    ON a.UserID = b.UserID
ORDER BY a.UserID
;
'''
%time cursor.executescript(hist_user_agg)

CPU times: user 7min 46s, sys: 8min 52s, total: 16min 39s
Wall time: 23min 55s


<sqlite3.Cursor at 0x1a345648f0>

In [55]:
## sanity check to make sure no Users got duplicated - these queries should all return the same # of counts
q1 = '''
SELECT (COUNT(DISTINCT(UserID))) FROM hist_user_agg;
'''

q2 = '''
SELECT COUNT(*) FROM hist_user_agg;
'''

q3 = '''
WITH hist_imp AS
(SELECT m.SearchID, AdID, m.UserID, IsClick, IsContext, CategoryID CategoryID_s
FROM last10days_merged m
WHERE m.SearchID IN
    (SELECT SearchID 
     FROM hist_searches)
)

SELECT COUNT(DISTINCT(UserID)) FROM hist_imp;
'''

for q in [q1, q2, q3]:
    cursor.execute(q)
    print('Count = ', cursor.fetchone()[0])

Count =  993106
Count =  993106
Count =  993106


In [56]:
## aggregates Category Features for Searches 

hist_cat_agg = '''
CREATE TABLE hist_cat_agg_daily AS

WITH hist_imp AS
(SELECT m.SearchID, AdID, SearchDate, IsClick, IsContext, Price, CategoryID CategoryID_s
FROM last10days_merged m
)

SELECT a.CategoryID_s,
       a.search_date,
       cat_total_searches,
       cat_mean_price,
       cat_total_clicks,
       cat_total_impressions
        
FROM (
    SELECT CategoryID_s,
           DATE(SearchDate) search_date,
           COUNT(DISTINCT(SearchID)) cat_total_searches,
           AVG(Price) cat_mean_price
    FROM hist_imp
    GROUP BY CategoryID_s, Date(SearchDate)
    ORDER BY CategoryID_s
     ) AS a
     
JOIN (
    SELECT CategoryID_s, 
           DATE(SearchDate) search_date,
           SUM(IsClick) cat_total_clicks,
           COUNT(AdID) cat_total_impressions
    FROM hist_imp
    WHERE IsContext = 1
    GROUP BY CategoryID_s,  Date(SearchDate)
     ) AS b
ON a.CategoryID_s = b.CategoryID_s
AND a.search_date = b.search_date
;
'''
%time cursor.executescript(hist_cat_agg)

CPU times: user 10min 22s, sys: 6min 14s, total: 16min 36s
Wall time: 25min 38s


<sqlite3.Cursor at 0x1a345648f0>

In [57]:
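## sanity check: hist_cat_agg_daily should contain the same number of distinct CategoryIDs
## as the historical context impressions (the table holds one row per category per day,
## so individual CategoryIDs are expected to repeat across dates)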

q1 = '''
SELECT (COUNT(DISTINCT(CategoryID_s))) FROM hist_cat_agg_daily;
'''

q2 = '''
WITH hist_imp AS
(SELECT m.SearchID, AdID, IsClick, IsContext, Price, CategoryID CategoryID_s
FROM last10days_merged m
WHERE m.SearchID IN
    (SELECT SearchID 
     FROM hist_searches)
     AND IsContext=1
)

SELECT COUNT(DISTINCT(CategoryID_s)) FROM hist_imp;
'''

for q in [q1, q2]:
    cursor.execute(q)
    print('Count = ', cursor.fetchone()[0])

Count =  32
Count =  32


In [58]:
## aggregates Ad Features for Searches preceding last7searches
## note: only for Context Ads
hist_ad_agg = '''
CREATE TABLE hist_ad_agg_daily AS

WITH hist_imp AS (
SELECT SearchID, SearchDate, AdID, Title, 
       IsClick, IsContext, CategoryID CategoryID_s
FROM last10days_merged 
)

SELECT
    AdID,
    DATE(SearchDate) search_date,
    COUNT(SearchID) ad_total_impressions,
    SUM(IsClick) ad_total_clicks

FROM hist_imp
WHERE IsContext = 1
GROUP BY AdID, Date(SearchDate)
ORDER BY AdID
;
'''
%time cursor.executescript(hist_ad_agg)

CPU times: user 3min 23s, sys: 3min 37s, total: 7min 1s
Wall time: 12min 47s


<sqlite3.Cursor at 0x1a345648f0>

In [59]:
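## sanity check: hist_ad_agg_daily should contain the same number of distinct AdIDs
## as the context impressions in last10days_merged (AdIDs repeat across dates in the daily table)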

q1 = '''
SELECT (COUNT(DISTINCT(AdID))) FROM hist_ad_agg_daily;
'''

q2 = '''
SELECT COUNT(DISTINCT(AdID)) 
FROM last10days_merged
WHERE IsContext=1;
'''
for q in [q1, q2]:
    cursor.execute(q)
    print('Count = ', cursor.fetchone()[0])

Count =  27991
Count =  27991


In [60]:
## aggregates User-Ad Features for Searches preceding last7searches
## note: only for Context Ads
hist_userad_agg = '''
CREATE TABLE hist_userAd_agg AS

WITH hist_imp AS (
SELECT m.SearchID, AdID, UserID, IsClick, 
       IsContext, CategoryID CategoryID_s
FROM last10days_merged m
WHERE m.SearchID IN (
     SELECT SearchID 
     FROM hist_searches
                    )
)

SELECT 
    UserID,
    AdID,
    COUNT(*) times_user_has_seen_ad
FROM hist_imp

WHERE IsContext = 1
GROUP BY UserID, AdID
ORDER BY UserID, AdID
;
'''
%time cursor.executescript(hist_userad_agg)

CPU times: user 3min 51s, sys: 4min 44s, total: 8min 36s
Wall time: 12min 35s


<sqlite3.Cursor at 0x1a345648f0>

In [61]:
## user agg data from Landing Page Visits
user_agg_visits = '''
CREATE TABLE hist_visits_before_tr AS

--selecting how many landing pages
--a given User has visited
SELECT hv.UserID, COUNT(hv.AdID) 'hist_user_total_visits'
FROM hist_visits hv

--join the last searchdate for each user prior to last 7 searches
INNER JOIN (
    SELECT UserID, MAX(SearchDate) latest_search
    FROM hist_searches
    GROUP BY UserID) AS maxSD
ON hv.UserID = maxSD.UserID

--only count the landing page views that precede 
--the user's last search
WHERE hv.ViewDate < maxSD.latest_search
GROUP BY hv.UserID; 
'''
%time cursor.executescript(user_agg_visits)

CPU times: user 3min 17s, sys: 4min 25s, total: 7min 43s
Wall time: 11min 36s


<sqlite3.Cursor at 0x1a345648f0>

# Feature Engineering II

### Aggregating on User

In [14]:
def user_agg_processing(df_to_agg=None):
    """Build per-user aggregate features.

    Parameters
    ----------
    df_to_agg : DataFrame, default None
        Extra impressions to aggregate per ``UserID`` and add on top of the
        stored SQL aggregates. When None, only the SQL aggregates
        (``hist_user_agg`` joined with ``hist_visits_before_tr``) are returned.

    Returns
    -------
    DataFrame
        One row per user with ``user_total_searches``, ``user_total_clicks``,
        ``user_total_impressions``, ``hist_user_total_visits``,
        ``user_cat_diversity`` and ``user_HCTR`` (clicks / impressions).
    """
    # historical per-user aggregates already materialised in SQLite
    hist_query = '''SELECT UserID, user_total_searches, user_category_counts,
                 user_total_clicks, user_total_impressions
           FROM hist_user_agg;'''
    hist_agg_set = pd.read_sql_query(hist_query, conn)

    # landing-page visit counts preceding each user's last historical search
    visits_query = '''SELECT UserID, hist_user_total_visits 
           FROM hist_visits_before_tr;'''
    hist_user_visits = pd.read_sql_query(visits_query, conn)

    # nothing extra to aggregate: combine the two SQL result sets and return
    if df_to_agg is None:
        # the number of distinct categories searched serves as a proxy for
        # how diverse a user's searches are
        user_agg = (hist_agg_set
                    .merge(hist_user_visits, how='left', on='UserID')
                    .rename(columns={'user_category_counts': 'user_cat_diversity'})
                    .fillna(0)
                    .astype(int))
        user_agg['user_HCTR'] = user_agg['user_total_clicks'] / user_agg['user_total_impressions']
        return user_agg

    # aggregate the supplied impressions per user
    how_to_aggregate = {'SearchID': 'nunique',
                        'CategoryID_s': 'nunique',
                        'IsClick': 'sum',
                        'AdID': 'count'}
    new_names = {'SearchID': 'user_total_searches',
                 'CategoryID_s': 'user_category_count',
                 'IsClick': 'user_total_clicks',
                 'AdID': 'user_total_impressions'}
    fresh_agg = df_to_agg.groupby('UserID', as_index=False).agg(how_to_aggregate)
    fresh_agg = fresh_agg.rename(columns=new_names)

    # attach the historical numbers; clashing column names get an '_x' suffix
    user_agg = fresh_agg.merge(hist_user_visits, how='left', on='UserID')
    user_agg = user_agg.merge(hist_agg_set, how='left', on='UserID', suffixes=('', '_x'))
    user_agg = user_agg.fillna(0)

    # fresh + historical totals
    for total in ('user_total_searches', 'user_total_impressions', 'user_total_clicks'):
        user_agg[total] = user_agg[total] + user_agg[total + '_x']
    user_agg['user_cat_diversity'] = \
                user_agg['user_category_count'] + user_agg['user_category_counts']

    # everything is a count at this point, so store as integers
    user_agg = user_agg.astype(int)

    # historical click-through rate for the user
    user_agg['user_HCTR'] = \
                user_agg['user_total_clicks'] / user_agg['user_total_impressions']

    # discard the helper columns left over from the merge
    leftovers = ['user_total_searches_x',
                 'user_category_count',
                 'user_category_counts',
                 'user_total_impressions_x',
                 'user_total_clicks_x']
    return user_agg.drop(columns=leftovers)

### Aggregating on Search Category

In [15]:
def cat_agg_processing(train_df):
    """Attach historical per-category aggregates to each impression.

    For every (SearchID, AdID) in ``train_df`` the daily rows of
    ``hist_cat_agg_daily`` for the same ``CategoryID_s`` and a strictly
    earlier calendar day are combined: counts are summed, the daily mean
    price is averaged, and ``cat_HCTR`` is clicks / impressions.
    """
    # daily category aggregates stored in SQLite
    daily_query = '''SELECT CategoryID_s, search_date, cat_total_searches,
                 cat_mean_price, cat_total_clicks, cat_total_impressions 
          FROM hist_cat_agg_daily;'''
    daily_cat = pd.read_sql_query(daily_query, conn)
    daily_cat['search_date'] = pd.to_datetime(daily_cat['search_date'])

    # pair every impression with every daily row of its search category
    joined = train_df.merge(daily_cat, on='CategoryID_s')

    # keep only aggregates from days before the impression's own day
    from_earlier_day = joined['search_date'].dt.date < joined['SearchDate'].dt.date

    how_to_combine = {'cat_total_searches' : 'sum',
                      'cat_mean_price' : 'mean',
                      'cat_total_clicks' : 'sum',
                      'cat_total_impressions' : 'sum'}
    cat_agg = joined.loc[from_earlier_day]\
                    .groupby(['SearchID', 'AdID'], as_index=False)\
                    .agg(how_to_combine)

    # historical click-through rate of the category
    cat_agg['cat_HCTR'] = cat_agg['cat_total_clicks'].div(cat_agg['cat_total_impressions'])

    return cat_agg

### Aggregating on AdID

In [16]:
def ad_agg_processing(train_df):
    """Attach historical per-ad aggregates to each impression.

    For every (SearchID, AdID) in ``train_df`` the daily rows of
    ``hist_ad_agg_daily`` for the same ``AdID`` and a strictly earlier
    calendar day are summed, and ``ad_HCTR`` is clicks / impressions.
    """
    # daily ad aggregates stored in SQLite
    daily_query = '''SELECT AdID, search_date, ad_total_impressions,
                 ad_total_clicks 
          FROM hist_ad_agg_daily;'''
    daily_ad = pd.read_sql_query(daily_query, conn)
    daily_ad['search_date'] = pd.to_datetime(daily_ad['search_date'])

    # pair every impression with every daily row of its ad
    joined = train_df.merge(daily_ad, on='AdID')

    # keep only aggregates from days before the impression's own day
    from_earlier_day = joined['search_date'].dt.date < joined['SearchDate'].dt.date

    how_to_combine = {'ad_total_impressions': 'sum',
                      'ad_total_clicks': 'sum'}
    ad_agg = joined.loc[from_earlier_day]\
                   .groupby(['SearchID', 'AdID'], as_index=False)\
                   .agg(how_to_combine)

    # historical click-through rate of the ad
    ad_agg['ad_HCTR'] = ad_agg['ad_total_clicks'].div(ad_agg['ad_total_impressions'])

    return ad_agg

### Aggregating on User-Ad Interactions

In [17]:
def userAd_agg_processing(df_to_agg=None):
    """Count how often each user has been shown each ad.

    Parameters
    ----------
    df_to_agg : DataFrame, default None
        Extra impressions to count per (UserID, AdID) and add to the stored
        counts in ``hist_userAd_agg``. When None, the stored counts are
        returned as integers.

    Returns
    -------
    DataFrame
        Columns ``UserID``, ``AdID`` and ``times_user_has_seen_ad``.
    """
    # stored historical counts
    hist_query = '''SELECT UserID, AdID, times_user_has_seen_ad 
          FROM hist_userAd_agg;'''
    hist_counts = pd.read_sql_query(hist_query, conn)

    if df_to_agg is None:
        return hist_counts.astype(int)

    pair_keys = ['UserID', 'AdID']

    # impressions per (user, ad) in the supplied frame
    fresh_counts = df_to_agg.groupby(pair_keys, as_index=False).agg({'SearchID': 'count'})
    fresh_counts = fresh_counts.rename(columns={'SearchID': 'times_user_has_seen_ad'})

    # bring in the historical count as 'times_user_has_seen_ad_x'
    userAd_agg = fresh_counts.merge(hist_counts, how='left',
                                    on=pair_keys, suffixes=('', '_x'))
    userAd_agg = userAd_agg.fillna(0)

    # fresh + historical
    combined = userAd_agg['times_user_has_seen_ad'] + userAd_agg['times_user_has_seen_ad_x']
    userAd_agg['times_user_has_seen_ad'] = combined.astype(int)

    # the historical helper column is no longer needed
    return userAd_agg.drop(columns=['times_user_has_seen_ad_x'])

#### Wrapper

In [18]:
def merge_preprocessed_data(train_set, agg_set=None):
    '''
    Add every historical aggregate feature to a set of observations.

    Parameters
    ----------
    train_set : DataFrame
        Observations that receive the aggregate feature columns.

    agg_set : DataFrame, default None
        Additional observations that the user and user-ad processing
        functions aggregate together with the SQL historical data.
        If None, those functions return the SQL aggregates alone.

    Returns
    -------
    train_set
        A new DataFrame: the input observations left-merged with the user,
        category, ad and user-ad aggregates. Values missing after each left
        merge are filled with 0 in the aggregate's columns.

        e.g. if train_set holds the second-to-last search of every User,
        agg_set would hold the 3rd-to-last and earlier searches.
    '''
    # Compute all four aggregate frames first (category and ad aggregates are
    # derived from the observations themselves), pairing each with the key(s)
    # it is merged on. The impression-level aggregates must be merged on
    # (SearchID+AdID); the user-ad aggregate on (UserID+AdID).
    aggregates = [
        (user_agg_processing(agg_set), 'UserID'),
        (cat_agg_processing(train_set), ['SearchID', 'AdID']),
        (ad_agg_processing(train_set), ['SearchID', 'AdID']),
        (userAd_agg_processing(agg_set), ['UserID', 'AdID']),
    ]

    for agg_frame, merge_keys in aggregates:
        agg_columns = list(agg_frame.columns)
        train_set = train_set.merge(agg_frame, how='left', on=merge_keys)
        # rows without a match in the aggregate come back as NaN -> 0
        train_set[agg_columns] = train_set[agg_columns].fillna(0)

    return train_set

# Modeling

In [19]:
# review of train_processed DataFrame (7th-to-last to 2nd-to-last Searches Per User)
train_processed.head()

Unnamed: 0,SearchID,AdID,UserID,SearchDate,Price,Position,title_length,IsClick,IsUserLoggedOn,CategoryID_s,CategoryID_a,row_n,common_params_cnt,categories_match,blank_query
0,10,13524889,3310798,2015-05-15 17:38:46,5000.0,1,38,0,0,34,12,5,0,0,1
1,10,34084553,3310798,2015-05-15 17:38:46,30000.0,7,12,0,0,34,12,5,0,0,1
2,23,12281759,3524741,2015-05-13 09:58:44,40086.0,1,40,0,0,38,38,5,1,1,1
3,25,29614929,4114490,2015-05-15 09:56:17,9829.0,1,47,0,0,47,47,5,0,1,0
4,37,19880051,3314799,2015-05-18 08:46:10,17100.0,7,28,0,0,11,11,3,0,1,1


### Configure Features and Scalers/Encoders

In [20]:
# features to use, separated by type
# Note: categorical features should be binned together, since these will feed a One-Hot-Encoder
features_ = {'basic_features' : ['Price', 'title_length',  'common_params_cnt'],
                    
             'user_features' : ['user_total_searches', 'user_cat_diversity', 'user_total_impressions', 
                                       'user_total_clicks', 'hist_user_total_visits', 'user_HCTR'],

             'cat_features' : ['cat_total_searches', 'cat_mean_price', 'cat_total_clicks', 
                                      'cat_total_impressions', 'cat_HCTR'],

             'ad_features' : ['ad_total_impressions', 'ad_total_clicks', 'ad_HCTR'],

             'user_ad_features' : ['times_user_has_seen_ad'],

             'categorical' : ['Position', 'CategoryID_a', 'IsUserLoggedOn', 'blank_query', 'categories_match']}

# concatenate features
features = features_['basic_features'] + features_['categorical'] + features_['user_features'] + \
           features_['cat_features'] + features_['ad_features'] + features_['user_ad_features']

**Experiment with including/excluding some of these features by removing individual columns from the dictionary, or subsets from the concatenation statement.**

If you want to reset to default features you can load the dictionary again from the cell below:

In [None]:
# uncomment below to reset the features_ variable to its defaults

# with open('default_features.json', 'r') as file:
#     features_ = json.load(file)
#     features = features_['basic_features'] + features_['categorical'] + features_['user_features'] + \
#                features_['cat_features'] + features_['ad_features'] + features_['user_ad_features']

In [22]:
# set up StandardScaler and OneHotEncoder pipeline for feature scaling/encoding
# NOTE(review): `features` is built as basic + categorical + user + cat + ad +
# user_ad, so the columns in features_['categorical'] are both standard-scaled
# AND one-hot encoded here -- confirm whether the scaler was meant to receive
# only the non-categorical columns.
# NOTE: this object captures `features`/`features_` as they are when the cell
# runs; re-run this cell after editing the feature lists.
preprocess = make_column_transformer((StandardScaler(), features),
                                     (OneHotEncoder(categories='auto', 
                                                   handle_unknown='ignore'), features_['categorical'])
                                    )

### Model Validation

#### Setup

In [46]:
def validate_models(train_df, train_val_dict, features, n_fold, models, preprocessor=preprocess, **kwargs):
    """Run the pre-defined validation rounds and report ROC-AUC per model.

    Parameters
    ----------
    train_df : DataFrame
        Full processed training data; rows are selected positionally.
    train_val_dict : dict
        Positional indices keyed ``train_set_{i}``, ``val_set_{i}``,
        ``agg_set_v{i}`` and, for rounds after the first, ``agg_set_t{i}``.
    features : list
        Column names fed to ``preprocessor``.
    n_fold : int
        Number of rounds; rounds are numbered from 1.
    models : iterable
        Estimators exposing ``fit`` and ``predict_proba``.
    preprocessor : transformer, default ``preprocess``
        Fitted on each round's training features, then applied to validation.
    **kwargs
        Forwarded to every ``model.fit`` call.

    Returns
    -------
    defaultdict(list)
        Model class name -> list of AUC scores, one per round.
    """
    scores = defaultdict(list)

    for i in range(1, n_fold+1):
        # rows for this round
        agg_set_v = train_df.iloc[train_val_dict[f'agg_set_v{i}']]
        train_set = train_df.iloc[train_val_dict[f'train_set_{i}']]
        val_set = train_df.iloc[train_val_dict[f'val_set_{i}']]

        # Round 1 has no aggregation set for training; later rounds do
        agg_set_t = None if i == 1 else train_df.iloc[train_val_dict[f'agg_set_t{i}']]
        t = merge_preprocessed_data(train_set, agg_set_t)
        v = merge_preprocessed_data(val_set, agg_set_v)

        # training features / target
        t['Price'] = t['Price'].fillna(0)
        X_ts = preprocessor.fit_transform(t[features])
        y_t = t.IsClick

        # validation features / target (transform only, no refit)
        v['Price'] = v['Price'].fillna(0)
        X_vs = preprocessor.transform(v[features])
        y_v = v.IsClick

        print(f'Cross-Validation Round {i} ROC-AUC')
        print('*' * 20)

        for model in models:
            name = model.__class__.__name__

            # time the fit; kwargs option available if using xgboost
            fit_started = time.time()
            model.fit(X_ts, y_t, **kwargs)
            fit_duration = round(time.time() - fit_started, 2)

            # time the probability prediction for the positive class
            pred_started = time.time()
            preds = model.predict_proba(X_vs)[:,1]
            pred_duration = round(time.time() - pred_started, 2)

            auc = roc_auc_score(y_v, preds)

            print(f'{name} = {auc}')
            print(f'Fit Time = {fit_duration} sec; Predict Time = {pred_duration} sec')
            # scores are keyed by class name
            scores[name].append(float(auc))
        print('\n')

    return scores

#### Execution

In [47]:
# set models to iterate through
models = [LogisticRegression(solver='liblinear'),
          DecisionTreeClassifier(max_depth=5, random_state=666),
          RandomForestClassifier(n_estimators=150, max_depth=3, n_jobs=-1, random_state=666)]

You might also experiment with using the same class of model, but different hyperparameters.

In [48]:
# run the validation function
%time cv_scores = validate_models(train_processed, train_val_dict, features, 5, models)

Cross-Validation Round 1 ROC-AUC
********************
LogisticRegression = 0.7468582726755095 in 82.35 seconds
DecisionTreeClassifier = 0.7310512834279119 in 51.36 seconds
RandomForestClassifier = 0.7343459341071017 in 335.28 seconds


Cross-Validation Round 2 ROC-AUC
********************
LogisticRegression = 0.7456183712628208 in 94.18 seconds
DecisionTreeClassifier = 0.7338041532987283 in 53.26 seconds
RandomForestClassifier = 0.7368761581583327 in 330.63 seconds


Cross-Validation Round 3 ROC-AUC
********************
LogisticRegression = 0.7501811564318067 in 138.68 seconds
DecisionTreeClassifier = 0.7375553518372434 in 63.15 seconds
RandomForestClassifier = 0.7394734907290246 in 383.12 seconds


Cross-Validation Round 4 ROC-AUC
********************
LogisticRegression = 0.7533457678363117 in 119.7 seconds
DecisionTreeClassifier = 0.7420823633009463 in 77.12 seconds
RandomForestClassifier = 0.7383647888574435 in 447.07 seconds


Cross-Validation Round 5 ROC-AUC
********************
L

Logistic Regression performs the best! 

Plot changes in AUC for various feature sets:

In [None]:
# ROC curve for Round 5 of each feature-set experiment.
# NOTE(review): basic_f, cat_f, user_f, ad_f, user_cat_f, cat_ad_f, user_ad_f
# and all_features are not defined anywhere in the visible notebook. Each is
# indexed as <name>['Round 5'] -> (y_true, y_score); presumably they were
# produced by earlier validation runs -- TODO define them before this cell.
# Each entry: (legend label, results, positional format args, extra plot kwargs)
roc_inputs = [
    ('basic', basic_f, (), {}),
    ('categories', cat_f, (), {}),
    ('user', user_f, (), {}),
    ('ad', ad_f, (), {}),
    ('user_category', user_cat_f, (':',), {}),
    ('category_ad', cat_ad_f, (':',), {}),
    ('user_ad', user_ad_f, (':',), {}),
    ('all', all_features, ('-.',), {'color': 'black', 'linewidth': 5}),
]

plt.figure(figsize=(10,8))

for label, results, fmt_args, plot_kwargs in roc_inputs:
    y_true, y_score = results['Round 5'][0], results['Round 5'][1]
    fpr, tpr, _ = roc_curve(y_true, y_score)
    plt.plot(fpr, tpr, *fmt_args, label=label, **plot_kwargs)

# chance diagonal
plt.plot([0,1], '--y')
plt.legend()
plt.title('ROC Per Additional Feature')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.savefig('ROC_features.jpg', dpi=500, bbox_inches='tight')

## Test Eval

#### Setup

In [87]:
def test_eval(train_df, test_df, train_val_dict, features, model, preprocessor=preprocess, **kwargs):
    """Fit ``model`` on the round-5 validation rows and score it on ``test_df``.

    Training rows are ``train_val_dict['val_set_5']`` with aggregates built
    from ``train_val_dict['agg_set_v5']``; the test set gets aggregates built
    from the whole of ``train_df``. Prints the hold-out ROC-AUC and returns
    the predicted click probabilities for the test rows.
    """
    # positional indices of the training and aggregation rows
    agg_rows = train_val_dict['agg_set_v5']
    train_rows = train_val_dict['val_set_5']
    agg_set_tr = train_df.iloc[agg_rows]

    # add the aggregate features to both sets
    train = merge_preprocessed_data(train_df.iloc[train_rows], agg_set_tr)
    test = merge_preprocessed_data(test_df, train_df)
    print('Finished Preprocessing')

    # training features / target
    train['Price'] = train['Price'].fillna(0)
    X_tr = preprocessor.fit_transform(train[features])
    y_tr = train.IsClick

    # test features / target (transform only, no refit)
    test['Price'] = test['Price'].fillna(0)
    X_te = preprocessor.transform(test[features])
    y_te = test.IsClick

    print('Starting Model Training')
    # kwargs option available if using xgboost
    model.fit(X_tr, y_tr, **kwargs)
    print('Done Training')
    print('Generating Predictions')
    # probability of the positive class
    preds = model.predict_proba(X_te)[:,1]
    auc = roc_auc_score(y_te, preds)
    print(f'Hold-out AUC score = {auc}')

    return preds

#### Execution

In [None]:
# set model for testing
model = LogisticRegression(solver='liblinear')

In [217]:
# run test function
%time test_eval(train_processed, test_processed, train_val_dict, features, model)

Hold-out AUC score = 0.7553444903814677
CPU times: user 7min 12s, sys: 3min 53s, total: 11min 5s
Wall time: 10min 45s


array([0.0125739 , 0.11579401, 0.00193943, ..., 0.00115157, 0.00234092,
       0.00595754])

# Kaggle

#### Setup

In [80]:
# Pass the path (not an open() handle) so pandas closes the file after reading.
kaggle = pd.read_pickle('pickle/kaggle_processed.pkl.gzip', compression='gzip')

In [66]:
def check_kaggle_df(kaggle_set, expected_shape=(7816361, 14)):
    '''
    Ensures that kaggle_dataframe is correct type and shape.

    Parameters
    ----------
    kaggle_set : DataFrame
        The cleaned Kaggle test set.
    expected_shape : tuple of (rows, columns), default (7816361, 14)
        Shape the frame must have. The 14 default columns are: TestID,
        SearchID, AdID, UserID, SearchDate, Price, Position, title_length,
        IsUserLoggedOn, CategoryID_s, CategoryID_a, common_params_cnt,
        categories_match, blank_query.

    Raises
    ------
    TypeError
        If kaggle_set is not a DataFrame (subclasses are accepted).
    ValueError
        If kaggle_set does not have the expected shape.
    '''
    # isinstance avoids building a throwaway DataFrame just to compare types
    if not isinstance(kaggle_set, pd.DataFrame):
        raise TypeError('"kaggle_set" parameter must be a DataFrame')
    if kaggle_set.shape != tuple(expected_shape):
        rows, cols = expected_shape
        # ValueError is still an Exception, so existing handlers keep working
        raise ValueError(f'Kaggle DataFrame is the wrong size. Should be {rows}x{cols}.')

In [88]:
def predict_kaggle(train_df, test_df, kaggle_set, train_val_dict, features, model, preprocessor=preprocess, **kwargs):
    """Fit ``model`` on ``test_df`` and predict click probabilities for ``kaggle_set``.

    Parameters
    ----------
    train_df : DataFrame
        Used only as the aggregation set for the training rows.
    test_df : DataFrame
        Rows the model is trained on (must contain ``IsClick``).
    kaggle_set : DataFrame
        Kaggle test rows to predict; validated by ``check_kaggle_df``.
    train_val_dict : dict
        Not used by this function; kept so the signature matches the other
        evaluation helpers.
    features : list
        Column names fed to ``preprocessor``.
    model : estimator
        Must expose ``fit`` and ``predict_proba``.
    preprocessor : transformer, default ``preprocess``
        Fitted on the training features, then applied to the Kaggle features.
    **kwargs
        Forwarded to ``model.fit``.

    Returns
    -------
    ndarray
        Predicted probability of the positive class for each Kaggle row.
    """
    # ensure that kaggle_set is DataFrame with correct shape
    check_kaggle_df(kaggle_set)

    # merge aggregated data: training rows aggregate over train_df, the
    # Kaggle rows aggregate over train_df and test_df combined
    train = merge_preprocessed_data(test_df, train_df)
    test = merge_preprocessed_data(kaggle_set, pd.concat([train_df, test_df]))
    print('Finished Preprocessing')

    # split train into features(X) and target(y)
    train['Price'] = train['Price'].fillna(0)
    # use the `preprocessor` argument (previously the global `preprocess`
    # was used here, silently ignoring whatever the caller passed in)
    X_tr = preprocessor.fit_transform(train[features])
    y_tr = train.IsClick

    # Kaggle features; there is no target(y) in the kaggle dataframe
    test['Price'] = test['Price'].fillna(0)
    X_te = preprocessor.transform(test[features])

    print('Starting Model Training')
    # fit on scaled/encoded training set, kwargs option available if using xgboost
    model.fit(X_tr, y_tr, **kwargs)
    print('Done Training')
    print('Generating Predictions')
    # predict with `model` (the previous `m` was an undefined name -> NameError)
    preds = model.predict_proba(X_te)[:,1]

    print('Done')
    return preds

#### Execution

In [72]:
# instantiate model with desired hyperparameters
model = LogisticRegression(solver='liblinear')

In [74]:
# run kaggle function
%time preds = predict_kaggle(train_processed, test_processed, train_val_dict, features, model)

finished preprocessing
starting fit
done fitting
CPU times: user 40min 51s, sys: 5min 13s, total: 46min 5s
Wall time: 47min 39s


In [75]:
# set click predictions as column
kaggle['IsClick'] = preds
# write to csv - this is the correct format for submission
# (the path is passed directly so pandas flushes and closes the file itself)
# NOTE(review): the query in get_kaggle_test selects this column as `TestID`;
# confirm that `TestId` is the actual column label in `kaggle`.
kaggle[['TestId', 'IsClick']].to_csv('submission.csv', header=['ID', 'IsClick'], index=False)

## Note: Keep in mind that we've been using AUC to evaluate rather than the competition metric - LogLoss.

That said, the above gets you around the Top 120 (of 414) or so submissions on the Private Leaderboard with a LogLoss ~0.04751.  Removing some of the features can improve that score a little bit.