In [1]:
import numpy as np
import pandas as pd
import pickle
import xgboost
import time
import gc
import lightgbm as lgb
from xgboost.sklearn import XGBClassifier
#warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import roc_curve, auc, accuracy_score,\
roc_auc_score, f1_score, confusion_matrix, precision_recall_curve
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from collections import Counter, defaultdict
from random import choice

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


## Loading Data

In [2]:
# sessions
with open ("/Users/gracezhang/Desktop/data/session.pkl", "rb") as f:
    sessions = pickle.load(f)

# event
with open ("/Users/gracezhang/Desktop/data/events.pkl", "rb") as f:
    events = pickle.load(f)

In [3]:
sessions.head()

Unnamed: 0,session_id,start_timestamp,timezone,timezone_offset,previous_sessions_duration,user_created_timestamp,is_user_first_session,country,region,city,latitude,longitude,locale,os_name,session_index,device_id,user_id_hash
0,5558845121177764917,1542215364580,Asia/Manila,28800000.0,25837591,1538874289458,False,PH,0,makati,14.554729,121.024445,en_GB,Android OS,30,546a3d98-d540-4e72-ad82-9ebd64e0839b,9943447915df3a45fd6720a026af905b6da6b56a37701b...
1,18781111175537580,1539215568666,Asia/Manila,28800000.0,11343848,1538874289458,False,PH,0,makati,14.554729,121.024445,en_GB,Android OS,10,546a3d98-d540-4e72-ad82-9ebd64e0839b,9943447915df3a45fd6720a026af905b6da6b56a37701b...
2,1477540082628742048,1540120743010,Asia/Manila,28800000.0,13499724,1538874289458,False,PH,11,davao city,7.190708,125.455338,en_GB,Android OS,13,546a3d98-d540-4e72-ad82-9ebd64e0839b,9943447915df3a45fd6720a026af905b6da6b56a37701b...
3,8184875317380844086,1542671625528,Asia/Manila,28800000.0,32788010,1538874289458,False,PH,0,makati,14.554729,121.024445,en_GB,Android OS,41,546a3d98-d540-4e72-ad82-9ebd64e0839b,9943447915df3a45fd6720a026af905b6da6b56a37701b...
4,4706180700083856343,1538997913013,Asia/Manila,28800000.0,5872534,1538874289458,False,PH,11,davao city,7.190708,125.455338,en_GB,Android OS,4,546a3d98-d540-4e72-ad82-9ebd64e0839b,9943447915df3a45fd6720a026af905b6da6b56a37701b...


In [4]:
sessions.head()

Unnamed: 0,session_id,event,event_timestamp,event_value,user_id_hash
0,5558845121177764917,45,1542215397132,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
1,5558845121177764917,45,1542215484895,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
2,7689508378645584666,.m5100869650219008,1541124410372,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
3,2201961907282901522,4,1543713091129,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
4,2201961907282901522,6,1543713093116,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...


## Data Preprocessing

#### Create labels for data from Dec 1 to Dec 7 and from Dec 1 to Dec 14 for training

In [499]:
users = pd.DataFrame(list(events.user_id_hash.unique()))

In [500]:
# first training label: Dec1 - Dec7
dec_event = events[(events["event_timestamp"] >= 1543651199000) & 
                  (events["event_timestamp"] < 1544255999000)]
puser1 = set(dec_event[dec_event["event"] == "8"].user_id_hash.unique())

# for training, the users include all unique users
users_train = pd.DataFrame(list(events.user_id_hash.unique()))
users_train.columns = ["user_id_hash"]

labels1 = []
for user in list(events.user_id_hash.unique()):
    if user in puser1:
        labels1.append(1)
    else:
        labels1.append(0)
# label and add to train
users_train["purchased1"] = pd.DataFrame(labels1)

In [501]:
# second training label: Dec1-Dec14
dec2_event = events[(events["event_timestamp"] >= 1543651199000)]
puser2 = set(dec2_event[dec2_event["event"] == "8"].user_id_hash.unique())

labels2 = []
for user in list(events.user_id_hash.unique()):
    if user in puser2:
        labels2.append(1)
    else:
        labels2.append(0)

users_train["purchased2"] = pd.DataFrame(labels2)

In [502]:
users_train.head()

Unnamed: 0,user_id_hash,purchased1,purchased2
0,9943447915df3a45fd6720a026af905b6da6b56a37701b...,0,0
1,43f75f8042d3c80c45e222bdd09267f4584684c54d6fae...,0,0
2,999524249720812f2d8c0390293efd58e1ac84d587a01c...,0,0
3,4e6bc35cf7fd79a5312047651e7865915f4a6bec193cf2...,0,0
4,dc009148ee26d658e0240c7b7f6a258790a457737f96e8...,0,0


## Feature Engineering

### Features for the first label

#### Use the data from Oct 1 to Nov 30 to generate features

In [493]:
features = events[events["event_timestamp"] < 1543651199000]

In [494]:
features.head()

Unnamed: 0,session_id,event,event_timestamp,event_value,user_id_hash
0,5558845121177764917,45,1542215397132,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
1,5558845121177764917,45,1542215484895,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
2,7689508378645584666,.m5100869650219008,1541124410372,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
30,18781111175537580,.m6335456823869440,1539215572790,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
31,18781111175537580,1,1539215608399,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...


#### Feature 1: average amount purchased

In [495]:
def purchaseAmt(df):
    df = df[df["event"]=="8"][["user_id_hash", "event_value"]].groupby("user_id_hash").mean().reset_index()
    return df

#### Feature 2: average amt of event 5

In [496]:
def event5Amt(df):
    df = df[df["event"]=="5"][["user_id_hash", "event_value"]].groupby("user_id_hash").mean().reset_index()
    return df

#### Feature 3: avg prev session duration

In [483]:
features2 = sessions[sessions["start_timestamp"] < 1543651199000]

In [484]:
def avgPrevSession(df):
    df = df.groupby("user_id_hash")["previous_sessions_duration"] \
         .agg('mean').reset_index(name='avg_duration')
    return df

#### Feature 4: avg num of purchase

In [485]:
def numPurchase(df):
    df = df[df["event"]=="8"].groupby("user_id_hash").count().reset_index()
    return df

In [486]:
day_count = 61

#### Feature 5: avg num of event5

In [487]:
def numEvent5(df):
    df = df[df["event"]=="5"].groupby("user_id_hash").count().reset_index()
    return df

#### Feature 6: avg num of sessions

In [488]:
def avgNumSessions(df):
    df = df.groupby("user_id_hash").count().reset_index()
    return df

#### Feature 7: avg num of event 0

In [577]:
def numEvent0(df):
    df = df[df["event"]=="0"].groupby("user_id_hash").count().reset_index()
    return df

#### Feature 8: time since last event

In [None]:
def timeSinceLastEvent(df, cutoff):
    temp = df.groupby("user_id_hash")['event_timestamp'].agg(['last']).reset_index()
    df['time_since_last'] = cutoff - temp['last']
    return df

### Features for the second label

#### Feature 1: avg num of purchase

In [503]:
day_count = 61
feature1_label2 = distinctPurchase(features[["user_id_hash", "event"]])
feature1_label2.event = feature1_label2.event/day_count
users_train = users_train.join(feature1_label2.set_index("user_id_hash"), on="user_id_hash", how="left")
users_train.columns = ["user_id_hash", "purchase1", "purchase2", "avg_num_purchase"]

In [504]:
users_train.head()

Unnamed: 0,user_id_hash,purchase1,purchase2,avg_num_purchase
0,9943447915df3a45fd6720a026af905b6da6b56a37701b...,0,0,0.016393
1,43f75f8042d3c80c45e222bdd09267f4584684c54d6fae...,0,0,0.0
2,999524249720812f2d8c0390293efd58e1ac84d587a01c...,0,0,0.0
3,4e6bc35cf7fd79a5312047651e7865915f4a6bec193cf2...,0,0,0.0
4,dc009148ee26d658e0240c7b7f6a258790a457737f96e8...,0,0,0.0


#### Feature 2: avg amt purchase

In [505]:
feature2_label2 = purchaseAmt(features[["user_id_hash", "event", "event_value"]])
feature2_label2["event_value"] = feature2_label2["event_value"].fillna(0)
users_train = users_train.join(feature2_label2.set_index("user_id_hash"), on="user_id_hash", how="left")
users_train.columns = ["user_id_hash", "purchase1", "purchase2", "avg_num_purchase", "avg_amt_purchased"]

In [506]:
users_train.head()

Unnamed: 0,user_id_hash,purchase1,purchase2,avg_num_purchase,avg_amt_purchased
0,9943447915df3a45fd6720a026af905b6da6b56a37701b...,0,0,0.016393,3.492188
1,43f75f8042d3c80c45e222bdd09267f4584684c54d6fae...,0,0,0.0,0.0
2,999524249720812f2d8c0390293efd58e1ac84d587a01c...,0,0,0.0,0.0
3,4e6bc35cf7fd79a5312047651e7865915f4a6bec193cf2...,0,0,0.0,0.0
4,dc009148ee26d658e0240c7b7f6a258790a457737f96e8...,0,0,0.0,0.0


#### Feature 3: avg prev session duration

In [507]:
feature3_label2 = avgPrevSession(features2[["user_id_hash", "previous_sessions_duration", "start_timestamp"]])
users_train = users_train.join(feature3_label2.set_index("user_id_hash"), on="user_id_hash", how="left")
users_train.columns = ["user_id_hash", "purchase1", "purchase2", "avg_num_purchase", "avg_amt_purchased",\
                       "avg_prev_session"]

In [508]:
users_train.describe()

Unnamed: 0,purchase1,purchase2,avg_num_purchase,avg_amt_purchased,avg_prev_session
count,620988.0,620988.0,620988.0,620988.0,618832.0
mean,0.007383,0.009483,0.006217,,3842889.0
std,0.085609,0.096919,0.050412,0.0,12872970.0
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,47750.25
75%,0.0,0.0,0.0,0.0,2282013.0
max,1.0,1.0,7.295082,70.0,827499600.0


#### Feature 4: avg num of sessions

In [509]:
feature4_label2 = avgNumSessions(features2[["user_id_hash", "session_id"]])
feature4_label2.session_id = feature4_label2.session_id/day_count

users_train = users_train.join(feature4_label2.set_index("user_id_hash"), on="user_id_hash", how="left")
users_train.columns = ["user_id_hash", "purchase1", "purchase2", "avg_num_purchase", "avg_amt_purchased", \
                       "avg_prev_session", "avg_session_count"]

In [510]:
users_train.head()

Unnamed: 0,user_id_hash,purchase1,purchase2,avg_num_purchase,avg_amt_purchased,avg_prev_session,avg_session_count
0,9943447915df3a45fd6720a026af905b6da6b56a37701b...,0,0,0.016393,3.492188,20395290.0,0.754098
1,43f75f8042d3c80c45e222bdd09267f4584684c54d6fae...,0,0,0.0,0.0,2024939.0,0.04918
2,999524249720812f2d8c0390293efd58e1ac84d587a01c...,0,0,0.0,0.0,0.0,0.016393
3,4e6bc35cf7fd79a5312047651e7865915f4a6bec193cf2...,0,0,0.0,0.0,0.0,0.016393
4,dc009148ee26d658e0240c7b7f6a258790a457737f96e8...,0,0,0.0,0.0,0.0,0.016393


#### Feature 5: avg num of event5

In [511]:
feature5_label2 = numEvent5(features[["user_id_hash", "event"]])
feature5_label2.event = feature5_label2.event/day_count
users_train = users_train.join(feature5_label2.set_index("user_id_hash"), on="user_id_hash", how="left")
users_train.columns = ["user_id_hash", "purchase1", "purchase2", "avg_num_purchase", "avg_amt_purchased", \
                       "avg_prev_session", "avg_session_count", "avg_num_event5"]

In [513]:
users_train.head()

Unnamed: 0,user_id_hash,purchase1,purchase2,avg_num_purchase,avg_amt_purchased,avg_prev_session,avg_session_count,avg_num_event5
0,9943447915df3a45fd6720a026af905b6da6b56a37701b...,0,0,0.016393,3.492188,20395290.0,0.754098,0.459016
1,43f75f8042d3c80c45e222bdd09267f4584684c54d6fae...,0,0,0.0,0.0,2024939.0,0.04918,0.04918
2,999524249720812f2d8c0390293efd58e1ac84d587a01c...,0,0,0.0,0.0,0.0,0.016393,0.016393
3,4e6bc35cf7fd79a5312047651e7865915f4a6bec193cf2...,0,0,0.0,0.0,0.0,0.016393,0.016393
4,dc009148ee26d658e0240c7b7f6a258790a457737f96e8...,0,0,0.0,0.0,0.0,0.016393,0.016393


#### Feature 6: avg amt of event5

In [514]:
feature6_label2 = event5Amt(features[["user_id_hash", "event", "event_value"]])
feature6_label2 = feature6_label2.replace([np.inf, -np.inf], np.nan)
feature6_label2["event_value"] = feature6_label2["event_value"].fillna(0)
users_train = users_train.join(feature6_label2.set_index("user_id_hash"), on="user_id_hash", how="left")

In [515]:
users_train.columns = ["user_id_hash", "purchase1", "purchase2", "avg_num_purchase", "avg_amt_purchased", \
                        "avg_prev_session", \
                       "avg_session_count", "avg_num_event5", "avg_amt_event5"]

In [516]:
users_train.describe()

Unnamed: 0,purchase1,purchase2,avg_num_purchase,avg_amt_purchased,avg_prev_session,avg_session_count,avg_num_event5,avg_amt_event5
count,620988.0,620988.0,620988.0,620988.0,618832.0,620988.0,620988.0,620988.0
mean,0.007383,0.009483,0.006217,,3842889.0,0.119794,0.115497,
std,0.085609,0.096919,0.050412,0.0,12872970.0,0.332484,0.295984,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.016393,0.016393,0.0
50%,0.0,0.0,0.0,0.0,47750.25,0.032787,0.032787,0.0
75%,0.0,0.0,0.0,0.0,2282013.0,0.081967,0.081967,0.666504
max,1.0,1.0,7.295082,70.0,827499600.0,18.967213,13.196721,1.0


In [517]:
users_train['avg_amt_purchased'] = users_train['avg_amt_purchased'].astype(np.float32)
users_train['avg_amt_event5'] = users_train['avg_amt_event5'].astype(np.float32)
users_train['avg_prev_session'] = users_train['avg_prev_session'].fillna(users_train['avg_prev_session'].mean())

In [518]:
users_train.describe()

Unnamed: 0,purchase1,purchase2,avg_num_purchase,avg_amt_purchased,avg_prev_session,avg_session_count,avg_num_event5,avg_amt_event5
count,620988.0,620988.0,620988.0,620988.0,620988.0,620988.0,620988.0,620988.0
mean,0.007383,0.009483,0.006217,0.161581,3842889.0,0.119794,0.115497,0.307368
std,0.085609,0.096919,0.050412,1.032079,12850600.0,0.332484,0.295984,0.355234
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.016393,0.016393,0.0
50%,0.0,0.0,0.0,0.0,51502.75,0.032787,0.032787,0.0
75%,0.0,0.0,0.0,0.0,2332285.0,0.081967,0.081967,0.666504
max,1.0,1.0,7.295082,70.0,827499600.0,18.967213,13.196721,1.0


#### Feature 7: avg number of event 0

In [578]:
feature7_label2 = numEvent0(features[["user_id_hash", "event"]])
feature7_label2.event = feature7_label2.event/day_count
users_train = users_train.join(feature7_label2.set_index("user_id_hash"), on="user_id_hash", how="left")
users_train.columns = ["user_id_hash", "purchase1", "purchase2", "avg_num_purchase", "avg_amt_purchased", \
                        "avg_prev_session", "avg_session_count", "avg_num_event5", "avg_amt_event5", \
                      "avg_num_event0"]

In [580]:
users_train.describe()

Unnamed: 0,purchase1,purchase2,avg_num_purchase,avg_amt_purchased,avg_prev_session,avg_session_count,avg_num_event5,avg_amt_event5,avg_num_event0
count,620988.0,620988.0,620988.0,620988.0,620988.0,620988.0,620988.0,620988.0,620988.0
mean,0.007383,0.009483,0.006217,0.161581,3842889.0,0.119794,0.115497,0.307368,0.009762
std,0.085609,0.096919,0.050412,1.032079,12850600.0,0.332484,0.295984,0.355234,0.009374
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.016393,0.016393,0.0,0.0
50%,0.0,0.0,0.0,0.0,51502.75,0.032787,0.032787,0.0,0.016393
75%,0.0,0.0,0.0,0.0,2332285.0,0.081967,0.081967,0.666504,0.016393
max,1.0,1.0,7.295082,70.0,827499600.0,18.967213,13.196721,1.0,0.52459


#### Feature 8:

#### Feature 9

Other features to consider:
- messages: deliver type 2 - mannual, 3 - trigger locally

## Train, Test Split

In [581]:
from sklearn.model_selection import train_test_split

In [582]:
X = users_train[users_train.columns.difference(['purchase1','purchase2','user_id_hash'])]
y1 = users_train["purchase1"]
y2 = users_train["purchase2"]

In [583]:
X.describe()

Unnamed: 0,avg_amt_event5,avg_amt_purchased,avg_num_event0,avg_num_event5,avg_num_purchase,avg_prev_session,avg_session_count
count,620988.0,620988.0,620988.0,620988.0,620988.0,620988.0,620988.0
mean,0.307368,0.161581,0.009762,0.115497,0.006217,3842889.0,0.119794
std,0.355234,1.032079,0.009374,0.295984,0.050412,12850600.0,0.332484
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.016393,0.0,0.0,0.016393
50%,0.0,0.0,0.016393,0.032787,0.0,51502.75,0.032787
75%,0.666504,0.0,0.016393,0.081967,0.0,2332285.0,0.081967
max,1.0,70.0,0.52459,13.196721,7.295082,827499600.0,18.967213


In [584]:
X_train1, X_val1, y_train1, y_val1 = train_test_split(X, y1, test_size=0.2, random_state=None, stratify=y1)

In [585]:
X_train2, X_val2, y_train2, y_val2 = train_test_split(X, y2, test_size=0.2, random_state=None, stratify=y2)

## Modeling

### Random Forest

In [586]:
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics

In [587]:
rf = RandomForestClassifier(n_jobs=-1, n_estimators=1000, min_samples_leaf=50, max_depth=15, \
                            max_features=3, verbose=1)

In [588]:
# predict label 1
rf.fit(X_train1, y_train1)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   17.5s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   41.3s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  1.8min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features=3, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=50, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [589]:
pred_train1 = rf.predict_proba(X_train1)
pred_val1 = rf.predict_proba(X_val1)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    4.2s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    9.3s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:   13.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    2.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    3.7s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    4.7s finished


In [590]:
# Training AUC 7 day 
sklearn.metrics.roc_auc_score(y_train1, pred_train1[:,-1])

0.97770060143628

In [591]:
# Validation AUC 7 day 
sklearn.metrics.roc_auc_score(y_val1, pred_val1[:,-1])

0.9530239084531701

In [592]:
rf2= RandomForestClassifier(n_jobs=-1, n_estimators=1000, min_samples_leaf=50, max_depth=15, \
                            max_features=3, verbose=1)
# predict label 2
rf2.fit(X_train2, y_train2)
pred_train2 = rf2.predict_proba(X_train2)
pred_val2 = rf2.predict_proba(X_val2)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   35.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  3.6min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    3.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    7.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:   13.0s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:   16.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.9s


In [593]:
print(f"AUC For Training set 14 days purchase is: {sklearn.metrics.roc_auc_score(y_train2, pred_train2[:,-1])}")

AUC For Training set 14 days purchase is: 0.9725696929733454


In [594]:
print(f"AUC For Val set 14 days purchase is: {sklearn.metrics.roc_auc_score(y_val2, pred_val2[:,-1])}")

AUC For Val set 14 days purchase is: 0.9514489927928679


In [595]:
rf.feature_importances_

array([0.04784669, 0.21470481, 0.13361076, 0.09862908, 0.33932541,
       0.10604925, 0.059834  ])

In [596]:
rf2.feature_importances_

array([0.04255977, 0.22273156, 0.13690102, 0.09224767, 0.35291621,
       0.09435769, 0.05828608])

#### Perform random search for hyperparameter tuning

In [632]:
## Random Search
# loop for random search
n_iterations=10

print ("Random search start...")

for col in ['model1_7day','model2_14day']:
    print(f"* training {col} ")
    #y = target[col]
    roc_auc_mean = []
    dict_list = []
    
    for i in range(0, n_iterations):

        param_dist = {'n_estimators' : choice([300, 700,1000]),
                  'max_depth': choice([5, 10, 15]),      
                  'min_samples_split': choice([0.1, 0.3, 0.5]),
                  'max_features': choice([2,3,4])} 

        roc_l = []
        
       # y_train_1,y_train_2,X_train
       # y_val_1,y_val_2,X_val

        # training
        if col == 'model1_7day':
            X_train = X_train1
            X_val = X_val1
            y_train = y_train1
            y_val = y_val1
        else: 
            X_train = X_train2
            X_val = X_val2
            y_train = y_train2
            y_val = y_val2
            
        rf_clf1 = RandomForestClassifier(**param_dist)
        rf_clf1.fit(X_train,y_train)
        # predicting
        y_pred = np.round(rf_clf1.predict_proba(X_val)[:,1],3)
        roc = roc_auc_score(y_val, y_pred)
        roc_l.append(roc)

        roc_array = np.asarray(roc_l)

        roc_auc_mean.append(roc_array.mean())
        dict_list.append(param_dist)
        gc.collect()
        

    results_pd = pd.DataFrame({"roc_auc_mean": roc_auc_mean,"parameters": dict_list})    
    results_pd.sort_values("roc_auc_mean", ascending = False, axis = 0, inplace = True)
    
    top_pd = results_pd.head(1)
    
    print(f"--> Best AUC:{top_pd.iloc[0,0]} using {top_pd.iloc[0,1]}")

Random search start...
* training model1_7day 
--> Best AUC:0.9454227889814226 using {'n_estimators': 300, 'max_depth': 15, 'min_samples_split': 0.1, 'max_features': 4}
* training model2_14day 
--> Best AUC:0.9406939124561579 using {'n_estimators': 1000, 'max_depth': 5, 'min_samples_split': 0.1, 'max_features': 4}


### LightGBM

In [445]:
## Random Search
# loop for random search
n_iterations=20

print ("Random search start...")

for col in ['model1_7day','model2_14day']:
    print(f"* training {col} ")
    #y = target[col]
    roc_auc_mean = []
    dict_list = []
    
    for i in range(0, n_iterations):

        param_dist = {'n_estimators' : choice([200,350,500,650]),
                  'bagging_fraction': choice([0.3, 0.5, 0.7, 0.9]),
                  'learning_rate': choice([0.01, 0.05, 0.1, 0.3]),
                  'is_unbalance': True,
                  'max_bin': choice([3, 5, 10, 15, 20, 25]),
                  'boosting_type' : choice(['gbdt', 'dart']),
                  'max_depth': choice([2,5,8,10]),      
                  'feature_fraction': choice([0.7, 0.8, 0.9]),
                  'lambda_l1': choice([0, 10, 20, 30, 40]),
                  'objective': 'binary', 
                  'metric': 'auc'} 

        roc_l = []
        
   nnn          # y_train_1,y_train_2,X_train
       # y_val_1,y_val_2,X_val

        # training
        if col == 'model1_7day':
            X_train = X_train1
            X_val = X_val1
            y_train = y_train1
            y_val = y_val1
        else: 
            X_train = X_train2
            X_val = X_val2
            y_train = y_train2
            y_val = y_val2
            
        gbm = lgb.LGBMClassifier(**param_dist)
        gbm.fit(X_train,y_train)
        # predicting
        y_pred = np.round(gbm.predict_proba(X_val)[:,1],3)
        
        roc = roc_auc_score(y_val, y_pred)
        roc_l.append(roc)

        roc_array = np.asarray(roc_l)

        roc_auc_mean.append(roc_array.mean())
        dict_list.append(param_dist)
        gc.collect()
        

    results_pd = pd.DataFrame({"roc_auc_mean": roc_auc_mean,"parameters": dict_list})    
    results_pd.sort_values("roc_auc_mean", ascending = False, axis = 0, inplace = True)
    
    top_pd = results_pd.head(1)
    
    print(f"--> Best AUC:{top_pd.iloc[0,0]} using {top_pd.iloc[0,1]}")

Random search start...
* training model1_7day 
--> Best AUC:0.9354825134309179 using {'n_estimators': 500, 'bagging_fraction': 0.9, 'learning_rate': 0.01, 'is_unbalance': True, 'max_bin': 20, 'boosting_type': 'gbdt', 'max_depth': 2, 'feature_fraction': 0.7, 'lambda_l1': 10, 'objective': 'binary', 'metric': 'auc'}
* training model2_14day 
--> Best AUC:0.9318218544391723 using {'n_estimators': 200, 'bagging_fraction': 0.5, 'learning_rate': 0.05, 'is_unbalance': True, 'max_bin': 25, 'boosting_type': 'gbdt', 'max_depth': 2, 'feature_fraction': 0.9, 'lambda_l1': 20, 'objective': 'binary', 'metric': 'auc'}


In [446]:
# using the results from random search
para_list_7days = {'n_estimators': 500, 'bagging_fraction': 0.9, 'learning_rate': 0.01, \
                   'is_unbalance': True, 'max_bin': 20, 'boosting_type': 'gbdt', 'max_depth': 2, \
                   'feature_fraction': 0.7, 'lambda_l1': 10, 'objective': 'binary', 'metric': 'auc'}
t = time.time()
# Model for 7 days
param = para_list_7days
gbm = lgb.LGBMClassifier(**param)
gbm.fit(X_train1, y_train1)
# predicting
probabilities = gbm.predict_proba(X_val1)
#preds = gbm.predict(X_val_m1)
score = probabilities[:, 1]

print(f'auc score = {roc_auc_score(y_val1,score)}')
print(f"Time use:{time.time()-t:.3f}s") 

auc score = 0.9354988824858163
Time use:3.755s


In [448]:
pd.DataFrame(sorted(zip(gbm.feature_importances_,X_train1.columns)), columns=['Value','Feature'])

Unnamed: 0,Value,Feature
0,118,avg_prev_session
1,217,avg_session_count
2,227,avg_num_event5
3,268,avg_amt_purchased
4,276,avg_amt5
5,394,avg_num_purchase


In [447]:
# using the results from random search
para_list_14days = {'n_estimators': 200, 'bagging_fraction': 0.5, 'learning_rate': 0.05, \
                    'is_unbalance': True, 'max_bin': 25, 'boosting_type': 'gbdt', 'max_depth': 2,\
                    'feature_fraction': 0.9, 'lambda_l1': 20, 'objective': 'binary', 'metric': 'auc'}


t = time.time()
# Model for 7 days
param = para_list_14days
gbm2 = lgb.LGBMClassifier(**param)
gbm2.fit(X_train2, y_train2)
# predicting
probabilities = gbm2.predict_proba(X_val2)
#preds = gbm2.predict(X_val)
score = probabilities[:, 1]
print(f'auc score = {roc_auc_score(y_val2,score)}')
print(f"Time use:{time.time()-t:.3f}s")

auc score = 0.9318101753852329
Time use:1.665s


In [449]:
pd.DataFrame(sorted(zip(gbm2.feature_importances_,X_train2.columns)), columns=['Value','Feature'])

Unnamed: 0,Value,Feature
0,29,avg_amt_purchased
1,92,avg_prev_session
2,101,avg_num_purchase
3,124,avg_num_event5
4,125,avg_session_count
5,129,avg_amt_event5


## Compute Target

In [597]:
sample = pd.read_csv("/Users/gracezhang/Desktop/leanplum/sample_submission_2.csv")

In [598]:
users_test = sample[["user_id_hash"]]

In [599]:
len(users_test)

312568

In [600]:
features_test = events

In [601]:
features_test2 = sessions

#### TODO: use attributes data to get missing users features instead of imputing using mean

#### Feature 1: avg num of purchase

In [602]:
day_count2 = 75
f1_test = distinctPurchase(features_test[["user_id_hash", "event"]])
f1_test.event = f1_test.event/day_count2
users_test = users_test.join(f1_test.set_index("user_id_hash"), on="user_id_hash", how="left")
users_test.columns = ["user_id_hash", "avg_num_purchase"]

In [603]:
users_test.head()

Unnamed: 0,user_id_hash,avg_num_purchase
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.0
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.0
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.0
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.0
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,0.0


#### Feature 2: avg amt purchase

In [604]:
f2_test = purchaseAmt(features_test[["user_id_hash", "event", "event_value"]])
f2_test["event_value"] = f2_test["event_value"].fillna(0)
users_test = users_test.join(f2_test.set_index("user_id_hash"), on="user_id_hash", how="left")
users_test.columns = ["user_id_hash", "avg_num_purchase", "avg_amt_purchased"]

In [605]:
users_test.head()

Unnamed: 0,user_id_hash,avg_num_purchase,avg_amt_purchased
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.0,0.0
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.0,0.0
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.0,0.0
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.0,0.0
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,0.0,0.0


#### Feature 3: avg prev session duration

In [606]:
f3_test = avgPrevSession(features_test2[["user_id_hash", "previous_sessions_duration", "start_timestamp"]])
users_test = users_test.join(f3_test.set_index("user_id_hash"), on="user_id_hash", how="left")
users_test.columns = ["user_id_hash", "avg_num_purchase", "avg_amt_purchased", "avg_prev_session"]

In [607]:
users_test.describe()

Unnamed: 0,avg_num_purchase,avg_amt_purchased,avg_prev_session
count,312316.0,312316.0,311565.0
mean,0.006875,0.185059,4685708.0
std,0.053262,0.0,16218780.0
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,77170.5
75%,0.0,0.0,2548936.0
max,3.146667,70.0,820708000.0


#### Feature 4: avg num of sessions

In [608]:
f4_test = avgNumSessions(features_test2[["user_id_hash", "session_id"]])
f4_test.session_id = f4_test.session_id/day_count2

users_test = users_test.join(f4_test.set_index("user_id_hash"), on="user_id_hash", how="left")
users_test.columns = ["user_id_hash", "avg_num_purchase", "avg_amt_purchased", "avg_prev_session", \
                       "avg_session_count"]

In [609]:
users_test.head()

Unnamed: 0,user_id_hash,avg_num_purchase,avg_amt_purchased,avg_prev_session,avg_session_count
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.0,0.0,356544.0,0.026667
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.0,0.0,0.0,0.013333
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.0,0.0,388631.25,0.053333
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.0,0.0,9200582.9,0.133333
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,0.0,0.0,0.0,0.013333


#### Feature 5: avg num of event5

In [610]:
f5_test = numEvent5(features_test[["user_id_hash", "event"]])
f5_test.event = f5_test.event/day_count2
users_test = users_test.join(f5_test.set_index("user_id_hash"), on="user_id_hash", how="left")
users_test.columns = ["user_id_hash", "avg_num_purchase", "avg_amt_purchased", "avg_prev_session", \
                       "avg_session_count", "avg_num_event5"]

In [611]:
users_test.head()

Unnamed: 0,user_id_hash,avg_num_purchase,avg_amt_purchased,avg_prev_session,avg_session_count,avg_num_event5
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.0,0.0,356544.0,0.026667,0.013333
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.0,0.0,0.0,0.013333,0.04
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.0,0.0,388631.25,0.053333,0.013333
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.0,0.0,9200582.9,0.133333,0.12
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,0.0,0.0,0.0,0.013333,0.0


#### Feature 6: avg amt of event5

In [612]:
f6_test = event5Amt(features_test[["user_id_hash", "event", "event_value"]])
f6_test = f6_test.replace([np.inf, -np.inf], np.nan)
f6_test["event_value"] = f6_test["event_value"].fillna(0)
users_test = users_test.join(f6_test.set_index("user_id_hash"), on="user_id_hash", how="left")

In [613]:
users_test.columns = ["user_id_hash", "avg_num_purchase", "avg_amt_purchased", "avg_prev_session", \
                       "avg_session_count", "avg_num_event5", "avg_amt_event5"]

In [614]:
users_test.describe()

Unnamed: 0,avg_num_purchase,avg_amt_purchased,avg_prev_session,avg_session_count,avg_num_event5,avg_amt_event5
count,312316.0,312316.0,311565.0,312368.0,312316.0,312316.0
mean,0.006875,0.185059,4685708.0,0.115584,0.109689,
std,0.053262,0.0,16218780.0,0.334684,0.298179,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.013333,0.013333,0.0
50%,0.0,0.0,77170.5,0.026667,0.026667,0.0
75%,0.0,0.0,2548936.0,0.066667,0.08,0.666504
max,3.146667,70.0,820708000.0,18.693333,11.746667,1.0


#### Feature 7: avg num of event0

In [615]:
f7_test = numEvent0(features_test[["user_id_hash", "event"]])
f7_test.event = f7_test.event/day_count2
users_test = users_test.join(f7_test.set_index("user_id_hash"), on="user_id_hash", how="left")
users_test.columns = ["user_id_hash", "avg_num_purchase", "avg_amt_purchased", "avg_prev_session", \
                       "avg_session_count", "avg_num_event5", "avg_amt_event5", "avg_num_event0"]

In [616]:
users_test.describe()

Unnamed: 0,avg_num_purchase,avg_amt_purchased,avg_prev_session,avg_session_count,avg_num_event5,avg_amt_event5,avg_num_event0
count,312316.0,312316.0,311565.0,312368.0,312316.0,312316.0,312316.0
mean,0.006875,0.185059,4685708.0,0.115584,0.109689,,0.008688
std,0.053262,0.0,16218780.0,0.334684,0.298179,0.0,0.007628
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.013333,0.013333,0.0,0.0
50%,0.0,0.0,77170.5,0.026667,0.026667,0.0,0.013333
75%,0.0,0.0,2548936.0,0.066667,0.08,0.666504,0.013333
max,3.146667,70.0,820708000.0,18.693333,11.746667,1.0,0.44


#### Impute Missing Values

In [617]:
users_test['avg_amt_purchased'] = users_test['avg_amt_purchased'].astype(np.float32)
users_test['avg_amt_event5'] = users_test['avg_amt_event5'].astype(np.float32)
users_test['avg_prev_session'] = users_test['avg_prev_session'].fillna(users_test['avg_prev_session'].mean())

In [618]:
users_test["avg_num_purchase"].fillna(users_test["avg_num_purchase"].mean(), inplace=True)
users_test["avg_amt_purchased"].fillna(users_test["avg_amt_purchased"].mean(), inplace=True)
users_test["avg_prev_session"].fillna(users_test["avg_prev_session"].mean(), inplace=True)
users_test["avg_session_count"].fillna(users_test["avg_session_count"].mean(), inplace=True)
users_test["avg_num_event5"].fillna(users_test["avg_num_event5"].mean(), inplace=True)
users_test["avg_amt_event5"].fillna(users_test["avg_amt_event5"].mean(), inplace=True)

In [619]:
users_test['avg_amt_event5'] = users_test['avg_amt_event5'].astype(np.float32)
users_test['avg_num_purchase'] = users_test['avg_num_purchase'].astype(np.float32)
users_test['avg_amt_purchased'] = users_test['avg_amt_purchased'].astype(np.float32)
users_test['avg_prev_session'] = users_test['avg_prev_session'].astype(np.float32)
users_test['avg_session_count'] = users_test['avg_session_count'].astype(np.float32)
users_test['avg_num_event5'] = users_test['avg_num_event5'].astype(np.float32)

In [620]:
users_test.describe()

Unnamed: 0,avg_num_purchase,avg_amt_purchased,avg_prev_session,avg_session_count,avg_num_event5,avg_amt_event5,avg_num_event0
count,312568.0,312568.0,312568.0,312568.0,312568.0,312568.0,312316.0
mean,0.006874,0.185145,4685477.0,0.115615,0.109732,0.314556,0.008688
std,0.053323,1.100851,16185280.0,0.334409,0.298229,0.359175,0.007628
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.013333,0.013333,0.0,0.0
50%,0.0,0.0,81708.42,0.026667,0.026667,0.0,0.013333
75%,0.0,0.0,2603084.0,0.066667,0.08,0.666504,0.013333
max,3.146667,70.0,820708000.0,18.693333,11.746667,1.0,0.44


In [621]:
X_test = users_test[users_test.columns.difference(['user_id_hash'])]

In [625]:
X_test["avg_num_purchase"].fillna(X_test["avg_num_purchase"].mean(), inplace=True)
X_test["avg_amt_purchased"].fillna(X_test["avg_amt_purchased"].mean(), inplace=True)
X_test["avg_prev_session"].fillna(X_test["avg_prev_session"].mean(), inplace=True)
X_test["avg_session_count"].fillna(X_test["avg_session_count"].mean(), inplace=True)
X_test["avg_num_event5"].fillna(X_test["avg_num_event5"].mean(), inplace=True)
X_test["avg_amt_event5"].fillna(X_test["avg_amt_event5"].mean(), inplace=True)
X_test["avg_num_event0"].fillna(X_test["avg_num_event0"].mean(), inplace=True)

In [626]:
X_test['avg_amt_event5'] = X_test['avg_amt_event5'].astype(np.float32)
X_test['avg_num_purchase'] = X_test['avg_num_purchase'].astype(np.float32)
X_test['avg_amt_purchased'] = X_test['avg_amt_purchased'].astype(np.float32)
X_test['avg_prev_session'] = X_test['avg_prev_session'].astype(np.float32)
X_test['avg_session_count'] = X_test['avg_session_count'].astype(np.float32)
X_test['avg_num_event5'] = X_test['avg_num_event5'].astype(np.float32)
X_test['avg_num_event0'] = X_test['avg_num_event0'].astype(np.float32)

In [627]:
pred_1 = rf.predict_proba(X_test)
pred_2 = rf2.predict_proba(X_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    2.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    3.8s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    4.9s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    2.8s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    5.1s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    6.6s finished


In [628]:
sample["user_purchase_binary_7_days"] = pd.DataFrame(pred_1[:,-1])
sample["user_purchase_binary_14_days"] = pd.DataFrame(pred_2[:,-1])

In [629]:
sample.head()

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.002825331,0.003155
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.000125099,0.000215
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,3.431668e-05,5.8e-05
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.01174442,0.013906
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,9.729984e-07,3e-06


In [630]:
sample["user_purchase_binary_7_days"] = sample["user_purchase_binary_7_days"].astype('double', copy=False)
sample["user_purchase_binary_14_days"] = sample["user_purchase_binary_14_days"].astype('double', copy=False)

In [631]:
sample.to_csv("/Users/gracezhang/Desktop/submission2.csv", index=False)