In [1]:
import numpy as np
import pandas as pd
import pickle
import xgboost
import time
import gc
import lightgbm as lgb
from xgboost.sklearn import XGBClassifier

#warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import roc_curve, auc, accuracy_score,\
roc_auc_score, f1_score, confusion_matrix, precision_recall_curve
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from collections import Counter, defaultdict
from random import choice

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


### Load data 

In [2]:
# sessions
with open ("/Users/nanlin/msds/Adv_ML/leanplum/data/LeanPlum/session.pkl", "rb") as f:
    sessions = pickle.load(f)

# event
with open ("/Users/nanlin/msds/Adv_ML/leanplum/data/LeanPlum/events.pkl", "rb") as f:
    events = pickle.load(f)

# choose certain columns    
event_df = events[["user_id_hash", "event_timestamp", "event", "event_value"]] 
session_part = sessions[["user_id_hash", "previous_sessions_duration", "start_timestamp"]]

### Label data

- Labels for validation set : Dec1 - Dec7, Dec1 - Dec15
- Features for validation set :  Oct15 - Nov30

- Labels for training set : Nov15 - Dec23, Nov15 - Nov30
- Features for training set :  Oct1 - Nov15

In [3]:
users = pd.DataFrame(list(events.user_id_hash.unique()))
users.columns = ["user_id_hash"]

# first validation label: Dec1 - Dec7

dec_event = event_df[(event_df["event_timestamp"] >= 1543651199000) & 
                  (event_df["event_timestamp"] < 1544255999000)]
puser1 = set(dec_event[dec_event["event"] == "8"].user_id_hash.unique())

# for validation, the users include all unique users
users_val = users
labels1 = []
for user in list(events.user_id_hash.unique()):
    if user in puser1:
        labels1.append(1)
    else:
        labels1.append(0)
# label and add to val
users_val["purchased1"] = pd.DataFrame(labels1)

In [4]:
# second validation label: Dec1-Dec14

dec2_event = event_df[(event_df["event_timestamp"] >= 1543651199000)]
puser2 = set(dec2_event[dec2_event["event"] == "8"].user_id_hash.unique())

labels2 = []
for user in list(events.user_id_hash.unique()):
    if user in puser2:
        labels2.append(1)
    else:
        labels2.append(0)

users_val["purchased2"] = pd.DataFrame(labels2)

In [5]:
# first training label : Nov 15, Nov 23
nov_event = event_df[(event_df["event_timestamp"] >= 1542268800000) & (event_df["event_timestamp"] < 1542960000000)]

puser1 = set(nov_event[nov_event["event"] == "8"].user_id_hash.unique())
users_train = users
labels1 = []

for user in list(events.user_id_hash.unique()):
    if user in puser1:
        labels1.append(1)
    else:
        labels1.append(0)

users_train["purchased1"] = pd.DataFrame(labels1)

In [6]:
# second training label: Nov 15, Nov 30
nov2_event = event_df[(event_df["event_timestamp"] >= 1542268800000) & (event_df["event_timestamp"] < 1543478400000)]
puser2 = set(event_df[event_df["event"] == "8"].user_id_hash.unique())
labels2 = []

for user in list(events.user_id_hash.unique()):
    if user in puser2:
        labels2.append(1)
    else:
        labels2.append(0)

users_train["purchased2"] = pd.DataFrame(labels2)

users_train.head()

Unnamed: 0,user_id_hash,purchased1,purchased2
0,9943447915df3a45fd6720a026af905b6da6b56a37701b...,0,1
1,43f75f8042d3c80c45e222bdd09267f4584684c54d6fae...,0,0
2,999524249720812f2d8c0390293efd58e1ac84d587a01c...,0,0
3,4e6bc35cf7fd79a5312047651e7865915f4a6bec193cf2...,0,0
4,dc009148ee26d658e0240c7b7f6a258790a457737f96e8...,0,0


In [10]:
val_raw = event_df[(event_df["event_timestamp"] >= 1539586800000) &  # Oct15 - Nov30
                   (event_df["event_timestamp"] < 1543651199000)]
#val_feature = val_feature[[]].apply(lambda x: x.fillna(0))

session_raw_val = session_part[(session_part['start_timestamp']>= 1539586800000) &  # Oct15 - Nov30
                   (session_part['start_timestamp']< 1543651199000)]

train_raw = event_df[(event_df["event_timestamp"] < 1542268800000)] # Oct 1- Nov15
session_raw_train = session_part[(session_part['start_timestamp']< 1542268800000)]
#train_feature = train_feature.apply(lambda x: x.fillna(0))

### Extract features from Event and Session

In [32]:
###### First version, not used anymore ######

#event_list = event_df.event.value_counts()[:50].index
# First version, not used anymore
def features_event_2(val_raw,val_user,cut_time,session_raw):# cut time is the cut stamp for feature/label 
    
    eventvalue = val_raw[val_raw["event"]=="8"][["user_id_hash", "event_value"]].\
                groupby("user_id_hash").sum().reset_index()
    result = val_user.join(eventvalue.set_index("user_id_hash"), on="user_id_hash", how="left")
    
    # feature 2: number of purchase
    purchase_times = val_raw[val_raw["event"]=="8"].groupby("user_id_hash")\
                    .size().reset_index(name='purchase_counts')
    result = result.join(purchase_times.set_index("user_id_hash"), on="user_id_hash", how="left")
    
    # feature 3: whether or not purchase
    #user_purd = set(val_raw[val_raw['event']=='8'].user_id_hash)
    #result['past_buy'] = [1 if uid in user_purd else 0 for uid in result['user_id_hash']]
    
    # feature 4: total count of different events in event: top 50 events
    event_list = ['0','5','1']
    
    for e in event_list:
        event_tmp = val_raw[val_raw["event"]==e].groupby("user_id_hash").size().reset_index(name=f'event{e}_count')
        result = result.join(event_tmp.set_index("user_id_hash"), on="user_id_hash", how="left")
    
    # feature 5: median, last event time difference to cutoff time
    first_last = val_raw.groupby("user_id_hash")['event_timestamp'].agg(['median','last']).reset_index()
    first_last['median_diff'] = cut_time - first_last['median']
    first_last['last_diff'] = cut_time - first_last['last']
    first_last['user_id_hash'] = first_last['user_id_hash'].astype(str)
    result = result.join(first_last[['user_id_hash','median_diff','last_diff']].set_index("user_id_hash"),\
                         on="user_id_hash", how="left")

    '''
    session_avg = session_raw_val.groupby("user_id_hash")["previous_sessions_duration"].\
    agg('mean').reset_index(name='avg_duration')
    session_avg['avg_duration'] = session_avg['avg_duration'].fillna(0)
    result = result.join(session_avg[['user_id_hash','avg_duration']].set_index("user_id_hash"),\
                 on="user_id_hash", how="left")  
    '''
    
    session_sum = session_raw_val.groupby("user_id_hash")["previous_sessions_duration"].\
    agg('sum').reset_index(name='sum_duration')
    session_sum['sum_duration'] = session_sum['sum_duration'].fillna(0)
    result = result.join(session_sum[['user_id_hash','sum_duration']].set_index("user_id_hash"),\
                         on="user_id_hash", how="left")  
    
    session_count = session_raw_val.groupby("user_id_hash")["previous_sessions_duration"].\
    agg('count').reset_index(name='session_count')
    session_count['session_count'] = session_count['session_count'].fillna(0)
    result = result.join(session_count[['user_id_hash','session_count']].set_index("user_id_hash"),\
                         on="user_id_hash", how="left")  
    
    #result[['median_diff','last_diff','avg_duration','sum_duration']] = \
    result[['median_diff','last_diff','avg_duration','sum_duration']].apply(lambda x:x/3.6e+6)
    # 8.64e+7 = 1 day in millseconds
    
    return result
    

#train_raw = event_df[(event_df["event_timestamp"] < 1542268800000)] # Oct 1- Nov15
train_feature_2 = features_event_2(train_raw,users_train,1542268800000,session_raw_train)

#val_raw = event_df[(event_df["event_timestamp"] >= 1539586800000) &  # Oct15 - Nov30
#                   (event_df["event_timestamp"] < 1543651199000)]
val_feature_2 = features_event_2(val_raw,users_val,1543651199000, session_raw_val)


Feature list for model1(7 days):
- event_value
- sum_duration

In [89]:
def features_model_1(val_raw,val_user,cut_time,session_raw):# cut time is the cut stamp for feature/label 
    
    eventvalue = val_raw[val_raw["event"]=="8"][["user_id_hash", "event_value"]].\
                groupby("user_id_hash").sum().reset_index()
    result = val_user.join(eventvalue.set_index("user_id_hash"), on="user_id_hash", how="left")
        
    # feature 3: whether or not purchase
    #user_purd = set(val_raw[val_raw['event']=='8'].user_id_hash)
    #result['past_buy'] = [1 if uid in user_purd else 0 for uid in result['user_id_hash']]
    
    # feature 4: total count of different events in event: top 50 events

    
    session_sum = session_raw_val.groupby("user_id_hash")["previous_sessions_duration"].\
    agg('sum').reset_index(name='sum_duration')
    session_sum['sum_duration'] = session_sum['sum_duration'].fillna(0)
    result = result.join(session_sum[['user_id_hash','sum_duration']].set_index("user_id_hash"),\
                         on="user_id_hash", how="left")  
    

    
    return result
    

#train_raw = event_df[(event_df["event_timestamp"] < 1542268800000)] # Oct 1- Nov15
train_feature_m1 = features_model_1(train_raw,users_train,1542268800000,session_raw_train)

#val_raw = event_df[(event_df["event_timestamp"] >= 1539586800000) &  # Oct15 - Nov30
#                   (event_df["event_timestamp"] < 1543651199000)]
val_feature_m1 = features_model_1(val_raw,users_val,1543651199000, session_raw_val)

In [62]:
train_feature_m1.head()

Unnamed: 0,user_id_hash,purchased1,purchased2,event_value,sum_duration
0,9943447915df3a45fd6720a026af905b6da6b56a37701b...,0,1,3.492188,857156805.0
1,43f75f8042d3c80c45e222bdd09267f4584684c54d6fae...,0,0,0.0,6074816.0
2,999524249720812f2d8c0390293efd58e1ac84d587a01c...,0,0,0.0,0.0
3,4e6bc35cf7fd79a5312047651e7865915f4a6bec193cf2...,0,0,0.0,0.0
4,dc009148ee26d658e0240c7b7f6a258790a457737f96e8...,0,0,0.0,0.0


Feature list for model1(15 days):
- median_diff
- purchase_counts
- last_diff
- session_count
- sum_duration
- event 5

In [85]:
def features_model_2(val_raw,val_user,cut_time,session_raw):# cut time is the cut stamp for feature/label 
    
    # feature 2: number of purchase
    purchase_times = val_raw[val_raw["event"]=="8"].groupby("user_id_hash")\
                    .size().reset_index(name='purchase_counts')
    result = val_user.join(purchase_times.set_index("user_id_hash"), on="user_id_hash", how="left")
    
    # feature 3: whether or not purchase
    #user_purd = set(val_raw[val_raw['event']=='8'].user_id_hash)
    #result['past_buy'] = [1 if uid in user_purd else 0 for uid in result['user_id_hash']]
    
    # feature 4: total count of different events in event: top 50 events

    # feature 5: median, last event time difference to cutoff time
    first_last = val_raw.groupby("user_id_hash")['event_timestamp'].agg(['median','last']).reset_index()
    first_last['median_diff'] = cut_time - first_last['median']
    first_last['last_diff'] = cut_time - first_last['last']
    first_last['user_id_hash'] = first_last['user_id_hash'].astype(str)
    result = result.join(first_last[['user_id_hash','median_diff','last_diff']].set_index("user_id_hash"),\
                         on="user_id_hash", how="left")
    
    session_sum = session_raw_val.groupby("user_id_hash")["previous_sessions_duration"].\
    agg('sum').reset_index(name='sum_duration')
    session_sum['sum_duration'] = session_sum['sum_duration'].fillna(0)
    result = result.join(session_sum[['user_id_hash','sum_duration']].set_index("user_id_hash"),\
                         on="user_id_hash", how="left")  
    
    session_count = session_raw_val.groupby("user_id_hash")["previous_sessions_duration"].\
    agg('count').reset_index(name='session_count')
    session_count['session_count'] = session_count['session_count'].fillna(0)
    result = result.join(session_count[['user_id_hash','session_count']].set_index("user_id_hash"),\
                         on="user_id_hash", how="left")  
    
    event_list = ['5']
    
    for e in event_list:
        event_tmp = val_raw[val_raw["event"]==e].groupby("user_id_hash").size().reset_index(name=f'event{e}_count')
        result = result.join(event_tmp.set_index("user_id_hash"), on="user_id_hash", how="left")
    
    
    return result
    

#train_raw = event_df[(event_df["event_timestamp"] < 1542268800000)] # Oct 1- Nov15
train_feature_m2 = features_model_2(train_raw,users_train,1542268800000,session_raw_train)

#val_raw = event_df[(event_df["event_timestamp"] >= 1539586800000) &  # Oct15 - Nov30
#                   (event_df["event_timestamp"] < 1543651199000)]
val_feature_m2 = features_model_2(val_raw,users_val,1543651199000, session_raw_val)

In [86]:
train_feature_m2.head()

Unnamed: 0,user_id_hash,purchased1,purchased2,purchase_counts,median_diff,last_diff,sum_duration,session_count,event5_count
0,9943447915df3a45fd6720a026af905b6da6b56a37701b...,0,1,1,2162291000.0,326596900.0,857156805.0,35,22
1,43f75f8042d3c80c45e222bdd09267f4584684c54d6fae...,0,0,0,381545500.0,339883700.0,6074816.0,3,3
2,999524249720812f2d8c0390293efd58e1ac84d587a01c...,0,0,0,2438209000.0,2437954000.0,0.0,1,1
3,4e6bc35cf7fd79a5312047651e7865915f4a6bec193cf2...,0,0,0,,,0.0,1,0
4,dc009148ee26d658e0240c7b7f6a258790a457737f96e8...,0,0,0,2045302000.0,1902918000.0,0.0,1,1


#### X,y split for MODEL1

In [90]:
X_train_m1 = train_feature_m1[train_feature_m1.columns.difference(['purchased1','purchased2','user_id_hash'])]
y_train_1 = train_feature_m1[["purchased1"]]
X_val_m1 = val_feature_m1[val_feature_m1.columns.difference(["purchased1","purchased2",'user_id_hash'])]
y_val_1 = val_feature_m1[['purchased1']]

#### X,y split for MODEL2

In [87]:
X_train_m2 = train_feature_m2[train_feature_m2.columns.difference(['purchased1','purchased2','user_id_hash'])]
y_train_2 = train_feature_m2[["purchased2"]]
X_val_m2 = val_feature_m2[val_feature_m2.columns.difference(["purchased1","purchased2",'user_id_hash'])]
y_val_2 = val_feature_m2[['purchased2']]

### Random Search on Light GBM Classifier

In [78]:
## Random Search
# loop for random search
n_iterations=20

print ("Random search start...")

for col in ['model1_7day','model2_14day']:
    print(f"* training {col} ")
    #y = target[col]
    roc_auc_mean = []
    dict_list = []
    
    for i in range(0, n_iterations):

        param_dist = {'n_estimators' : choice([250,300,350,400,450]),
                  'bagging_fraction': choice([0.5, 0.7, 0.8, 0.9]),
                  'learning_rate': choice([0.05, 0.1, 0.3, 0.5]),
                  'is_unbalance': True,
                  'max_bin': choice([3, 5, 10, 15, 18, 20, 25]),
                  'boosting_type' : choice(['gbdt', 'dart']),
                  'max_depth': choice([2,3,4,5]),      
                  'feature_fraction': choice([0.7, 0.8, 0.9]),
                  'lambda_l1': choice([0, 10, 20, 30, 40]),
                  'objective': 'binary', 
                  'metric': 'auc'} 

        roc_l = []
        
       # y_train_1,y_train_2,X_train
       # y_val_1,y_val_2,X_val

        # training
        if col == 'model1_7day':
            X_train = X_train_m1
            X_val = X_val_m1
            y_train = y_train_1
            y_val = y_val_1
        else: 
            X_train = X_train_m2
            X_val = X_val_m2
            y_train = y_train_2
            y_val = y_val_2
            
        gbm = lgb.LGBMClassifier(**param_dist)
        gbm.fit(X_train,y_train)
        # predicting
        y_pred = np.round(gbm.predict_proba(X_val)[:,1],3)
        
        roc = roc_auc_score(y_val, y_pred)
        roc_l.append(roc)

        roc_array = np.asarray(roc_l)

        roc_auc_mean.append(roc_array.mean())
        dict_list.append(param_dist)
        gc.collect()
        

    results_pd = pd.DataFrame({"roc_auc_mean": roc_auc_mean,"parameters": dict_list})    
    results_pd.sort_values("roc_auc_mean", ascending = False, axis = 0, inplace = True)
    
    top_pd = results_pd.head(1)
    
    print(f"--> Best AUC:{top_pd.iloc[0,0]} using {top_pd.iloc[0,1]}")

Random search start...
* training model1_7day 
--> Best AUC:0.9743335859063035 using {'n_estimators': 350, 'bagging_fraction': 0.7, 'learning_rate': 0.1, 'is_unbalance': True, 'max_bin': 3, 'boosting_type': 'dart', 'max_depth': 4, 'feature_fraction': 0.9, 'lambda_l1': 0, 'objective': 'binary', 'metric': 'auc'}
* training model2_14day 
--> Best AUC:0.9658687632067481 using {'n_estimators': 250, 'bagging_fraction': 0.7, 'learning_rate': 0.1, 'is_unbalance': True, 'max_bin': 18, 'boosting_type': 'gbdt', 'max_depth': 2, 'feature_fraction': 0.9, 'lambda_l1': 30, 'objective': 'binary', 'metric': 'auc'}


### Model for 7 days

In [91]:
# using the results from random search
para_list_7days = {'n_estimators': 350, 'bagging_fraction': 0.7, 'learning_rate': 0.1, \
                   'is_unbalance': True, 'max_bin': 3, 'boosting_type': 'dart', 'max_depth': 4, \
                   'feature_fraction': 0.9, 'lambda_l1': 0, 'objective': 'binary', 'metric': 'auc'}
t = time.time()
# Model for 7 days
param = para_list_7days
gbm = lgb.LGBMClassifier(**param)
gbm.fit(X_train_m1, y_train_1)
# predicting
probabilities = gbm.predict_proba(X_val_m1)
#preds = gbm.predict(X_val_m1)
score = probabilities[:, 1]

print(f'auc score = {roc_auc_score(y_val_1,score)}')
print(f"Time use:{time.time()-t:.3f}s") 

auc score = 0.9743335859063035
Time use:27.083s


### Feature importance

In [92]:
pd.DataFrame(sorted(zip(gbm.feature_importances_,X_train.columns)), columns=['Value','Feature'])
#sort_values(by=['value'])

Unnamed: 0,Value,Feature
0,350,event5_count
1,350,last_diff


### Model for 14 days

In [93]:
# using the results from random search
para_list_14days = {'n_estimators': 250, 'bagging_fraction': 0.7, 'learning_rate': 0.1, \
                    'is_unbalance': True, 'max_bin': 18, 'boosting_type': 'gbdt', 'max_depth': 2,\
                    'feature_fraction': 0.9, 'lambda_l1': 30, 'objective': 'binary', 'metric': 'auc'}

t = time.time()
# Model for 7 days
param = para_list_14days
gbm2 = lgb.LGBMClassifier(**param)
gbm2.fit(X_train_m2, y_train_2)
# predicting
probabilities = gbm2.predict_proba(X_val_m2)
#preds = gbm2.predict(X_val)
score = probabilities[:, 1]
print(f'auc score = {roc_auc_score(y_val_2,score)}')
print(f"Time use:{time.time()-t:.3f}s")

auc score = 0.966007955561583
Time use:6.430s


In [94]:
# feature importance for model 2
pd.DataFrame(sorted(zip(gbm2.feature_importances_,X_train.columns)), columns=['Value','Feature'])
#sort_values(by=['value'])

Unnamed: 0,Value,Feature
0,60,last_diff
1,83,session_count
2,93,median_diff
3,103,purchase_counts
4,139,event5_count
5,172,sum_duration


### Predict on the test set

- test features: Nov1 - Dec15

In [95]:
sample = pd.read_csv("/Users/nanlin/msds/Adv_ML/leanplum/data/LeanPlum/sample_submission_2.csv")

In [96]:
#users = pd.DataFrame(list(events.user_id_hash.unique()))
#users.columns = ["user_id_hash"]
session_raw_test = session_part[(session_part['start_timestamp'] >= 1541055600000)] # Nov 1
test_raw = event_df[(event_df["event_timestamp"] >= 1541055600000)] # November 1
user_test = users
test_feature_m1 = features_model_1(test_raw,user_test,1544860800000,session_raw_test) # Dec 15
X_test_m1 = test_feature_m1[test_feature_m1.columns.difference(['purchased1','purchased2','user_id_hash'])]
X_test_m1.head()

Unnamed: 0,event_value,sum_duration
0,3.492188,857156805.0
1,0.0,6074816.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


In [97]:
test_feature_m2 = features_model_2(test_raw,user_test,1544860800000,session_raw_test) # Dec 15
X_test_m2 = test_feature_m2[test_feature_m2.columns.difference(['purchased1','purchased2','user_id_hash'])]
X_test_m2.head()

Unnamed: 0,event5_count,last_diff,median_diff,purchase_counts,session_count,sum_duration
0,17,2918597000.0,2670147000.0,1,35,857156805.0
1,3,2931884000.0,2973545000.0,0,3,6074816.0
2,0,,,0,1,0.0
3,1,2337255000.0,2337667000.0,0,1,0.0
4,0,,,0,1,0.0


In [98]:
pred_1 = gbm.predict_proba(X_test_m1)
pred_2 = gbm2.predict_proba(X_test_m2)

test_feature["user_purchase_binary_7_days"] = pd.DataFrame(pred_1[:,-1])
test_feature["user_purchase_binary_14_days"] = pd.DataFrame(pred_2[:,-1])

In [99]:
sub = sample[['user_id_hash']].merge(test_feature[['user_id_hash','user_purchase_binary_7_days',\
                                             'user_purchase_binary_14_days']],on='user_id_hash',\
                               how='left')
# for users do not have previous data, mark as 0
sub = sub.fillna(0)

In [100]:
sub.head()

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.158793,0.101043
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.050371,0.003373
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.158793,0.032277
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.704024,0.38379
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,0.050371,0.174208


In [30]:
sub.head()

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.188482,0.095764
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.005693,0.000845
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.008461,0.005554
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.438206,0.584298
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,0.189409,0.213411


In [101]:
sub.to_csv("/Users/nanlin/msds/Adv_ML/leanplum/data/LeanPlum/submission_9", index=False)