In [4]:
import numpy as np
import pandas as pd
import pickle
import xgboost
import time
import gc
import os
import lightgbm as lgb
from xgboost.sklearn import XGBClassifier

#warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import roc_curve, auc, accuracy_score,\
roc_auc_score, f1_score, confusion_matrix, precision_recall_curve
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from collections import Counter, defaultdict
from random import choice

### Load data 

In [5]:
# sessions
with open (os.path.expanduser("~/USF/adv_ml/final/LeanPlum/session.pkl"), "rb") as f:
    sessions = pickle.load(f)

# event
with open (os.path.expanduser("~/USF/adv_ml/final/LeanPlum/events.pkl"), "rb") as f:
    events = pickle.load(f)

# choose certain columns    
event_df = events[["user_id_hash", "event_timestamp", "event", "event_value"]] 
session_part = sessions[["user_id_hash", "previous_sessions_duration", "start_timestamp"]]

### Label data

- Labels for validation set : Dec1 - Dec7, Dec1 - Dec15
- Features for validation set :  Oct15 - Nov30

- Labels for training set : Nov15 - Dec23, Nov15 - Nov30
- Features for training set :  Oct1 - Nov15

In [10]:
users = pd.DataFrame(list(events.user_id_hash.unique()))
users.columns = ["user_id_hash"]

# first validation label: Dec1 - Dec7

dec_event = event_df[(event_df["event_timestamp"] >= 1543651199000) &  #  >= Fri Nov 30 2018 23:59:59 GMT-0800
                  (event_df["event_timestamp"] < 1544255999000)] #  < Fri Dec 07 2018 23:59:59 GMT-0800
puser1 = set(dec_event[dec_event["event"] == "8"].user_id_hash.unique())

# for validation, the users include all unique users
users_val = users.copy()
labels1 = []
for user in list(events.user_id_hash.unique()):
    if user in puser1:
        labels1.append(1)
    else:
        labels1.append(0)
# label and add to val
users_val["purchased1"] = pd.DataFrame(labels1)

In [11]:
# second validation label: Dec1-Dec14

dec2_event = event_df[(event_df["event_timestamp"] >= 1543651199000)] #  >= Fri Nov 30 2018 23:59:59 GMT-0800
puser2 = set(dec2_event[dec2_event["event"] == "8"].user_id_hash.unique())

labels2 = []
for user in list(events.user_id_hash.unique()):
    if user in puser2:
        labels2.append(1)
    else:
        labels2.append(0)

users_val["purchased2"] = pd.DataFrame(labels2)

In [16]:
# first training label : Nov 15, Nov 23
nov_event = event_df[(event_df["event_timestamp"] >= 1542268800000) &  # from Mon Nov 15 2018 00:00:00 GMT-0700
                     (event_df["event_timestamp"] < 1542960000000)] # Fri Nov 23 2018 00:00:00 GMT-0800

puser1 = set(nov_event[nov_event["event"] == "8"].user_id_hash.unique())
users_train = users.copy()
labels1 = []

for user in list(events.user_id_hash.unique()):
    if user in puser1:
        labels1.append(1)
    else:
        labels1.append(0)

users_train["purchased1"] = pd.DataFrame(labels1)

In [17]:
# second training label: Nov 15, Nov 30
nov2_event = event_df[(event_df["event_timestamp"] >= 1542268800000) & # from Mon Nov 15 2018 00:00:00 GMT-0700
                      (event_df["event_timestamp"] < 1543478400000)]# till Thu Nov 29 2018 00:00:00 GMT-0800
puser2 = set(event_df[event_df["event"] == "8"].user_id_hash.unique())
labels2 = []

for user in list(events.user_id_hash.unique()):
    if user in puser2:
        labels2.append(1)
    else:
        labels2.append(0)

users_train["purchased2"] = pd.DataFrame(labels2)

users_train.head()

Unnamed: 0,user_id_hash,purchased1,purchased2
0,9943447915df3a45fd6720a026af905b6da6b56a37701b...,0,1
1,43f75f8042d3c80c45e222bdd09267f4584684c54d6fae...,0,0
2,999524249720812f2d8c0390293efd58e1ac84d587a01c...,0,0
3,4e6bc35cf7fd79a5312047651e7865915f4a6bec193cf2...,0,0
4,dc009148ee26d658e0240c7b7f6a258790a457737f96e8...,0,0


In [18]:
val_raw = event_df[(event_df["event_timestamp"] >= 1539586800000) &  # Oct15 - Nov30
                   (event_df["event_timestamp"] < 1543651199000)]
#val_feature = val_feature[[]].apply(lambda x: x.fillna(0))

session_raw_val = sessions[(sessions['start_timestamp']>= 1539586800000) &  # Oct15 - Nov30
                   (sessions['start_timestamp']< 1543651199000)]

train_raw = event_df[(event_df["event_timestamp"] < 1542268800000)] # Oct 1- Nov15
session_raw_train = sessions[(sessions['start_timestamp']< 1542268800000)]
#train_feature = train_feature.apply(lambda x: x.fillna(0))

### Extract features from Event and Session

In [32]:
###### First version, not used anymore ######

#event_list = event_df.event.value_counts()[:50].index
# First version, not used anymore
def features_event_2(val_raw,val_user,cut_time,session_raw):# cut time is the cut stamp for feature/label 
    
    eventvalue = val_raw[val_raw["event"]=="8"][["user_id_hash", "event_value"]].\
                groupby("user_id_hash").sum().reset_index()
    result = val_user.join(eventvalue.set_index("user_id_hash"), on="user_id_hash", how="left")
    
    # feature 2: number of purchase
    purchase_times = val_raw[val_raw["event"]=="8"].groupby("user_id_hash")\
                    .size().reset_index(name='purchase_counts')
    result = result.join(purchase_times.set_index("user_id_hash"), on="user_id_hash", how="left")
    
    # feature 3: whether or not purchase
    #user_purd = set(val_raw[val_raw['event']=='8'].user_id_hash)
    #result['past_buy'] = [1 if uid in user_purd else 0 for uid in result['user_id_hash']]
    
    # feature 4: total count of different events in event: top 50 events
    event_list = ['0','5','1']
    
    for e in event_list:
        event_tmp = val_raw[val_raw["event"]==e].groupby("user_id_hash").size().reset_index(name=f'event{e}_count')
        result = result.join(event_tmp.set_index("user_id_hash"), on="user_id_hash", how="left")
    
    # feature 5: median, last event time difference to cutoff time
    first_last = val_raw.groupby("user_id_hash")['event_timestamp'].agg(['median','last']).reset_index()
    first_last['median_diff'] = cut_time - first_last['median']
    first_last['last_diff'] = cut_time - first_last['last']
    first_last['user_id_hash'] = first_last['user_id_hash'].astype(str)
    result = result.join(first_last[['user_id_hash','median_diff','last_diff']].set_index("user_id_hash"),\
                         on="user_id_hash", how="left")

    '''
    session_avg = session_raw_val.groupby("user_id_hash")["previous_sessions_duration"].\
    agg('mean').reset_index(name='avg_duration')
    session_avg['avg_duration'] = session_avg['avg_duration'].fillna(0)
    result = result.join(session_avg[['user_id_hash','avg_duration']].set_index("user_id_hash"),\
                 on="user_id_hash", how="left")  
    '''
    
    session_sum = session_raw_val.groupby("user_id_hash")["previous_sessions_duration"].\
    agg('sum').reset_index(name='sum_duration')
    session_sum['sum_duration'] = session_sum['sum_duration'].fillna(0)
    result = result.join(session_sum[['user_id_hash','sum_duration']].set_index("user_id_hash"),\
                         on="user_id_hash", how="left")  
    
    session_count = session_raw_val.groupby("user_id_hash")["previous_sessions_duration"].\
    agg('count').reset_index(name='session_count')
    session_count['session_count'] = session_count['session_count'].fillna(0)
    result = result.join(session_count[['user_id_hash','session_count']].set_index("user_id_hash"),\
                         on="user_id_hash", how="left")  
    
    #result[['median_diff','last_diff','avg_duration','sum_duration']] = \
    #result[['median_diff','last_diff','avg_duration','sum_duration']].apply(lambda x:x/3.6e+6)
    # 8.64e+7 = 1 day in millseconds
    
    return result
    

#train_raw = event_df[(event_df["event_timestamp"] < 1542268800000)] # Oct 1- Nov15
train_feature_2 = features_event_2(train_raw,users_train,1542268800000,session_raw_train)

#val_raw = event_df[(event_df["event_timestamp"] >= 1539586800000) &  # Oct15 - Nov30
#                   (event_df["event_timestamp"] < 1543651199000)]
val_feature_2 = features_event_2(val_raw,users_val,1543651199000, session_raw_val)


Feature list for model1(7 days):
- event_value
- sum_duration

In [77]:
def features_model_1(val_raw,val_user,cut_time,session_raw):# cut time is the cut stamp for feature/label 
    
    eventvalue = val_raw[val_raw["event"]=="8"][["user_id_hash", "event_value"]].\
                groupby("user_id_hash").sum().reset_index()
    result = val_user.join(eventvalue.set_index("user_id_hash"), on="user_id_hash", how="left")
        
    # feature 3: whether or not purchase
    #user_purd = set(val_raw[val_raw['event']=='8'].user_id_hash)
    #result['past_buy'] = [1 if uid in user_purd else 0 for uid in result['user_id_hash']]
    
    # feature 4: total count of different events in event: top 50 events

    
    session_sum = session_raw_val.groupby("user_id_hash")["previous_sessions_duration"].\
    agg('sum').reset_index(name='sum_duration')
    session_sum['sum_duration'] = session_sum['sum_duration'].fillna(0)
    result = result.join(session_sum[['user_id_hash','sum_duration']].set_index("user_id_hash"),\
                         on="user_id_hash", how="left")  
    # SESSIONS
    d = {'iPhone OS' : 'iOS', 'iOS' : 'iOS','Android OS': 'Android OS' }
    session_raw['os_name'] = session_raw['os_name'].map(d)
    
    #Feature #1: number of sessions per user
    num_sessions =session_raw.groupby(['user_id_hash'])['session_id'].count().reset_index()
    result = result.join(num_sessions.set_index("user_id_hash"), on="user_id_hash", how="left")
    
    # Feature #2: is_first session
    user_first_session = session_raw.groupby(['user_id_hash'])['is_user_first_session'].mean().reset_index()
    user_first_session['first_time'] = 0
    user_first_session.loc[user_first_session['is_user_first_session']==1,'first_time']=1
    result = result.join(user_first_session.drop(['is_user_first_session'], axis = 1).set_index("user_id_hash"), on="user_id_hash", how="left")
    
    
    # Feature #3 Country of origin
    sessions_user =session_raw.drop([ 'session_id', 'start_timestamp','previous_sessions_duration',\
                                 'user_created_timestamp',  'session_index',\
                                  'latitude', 'longitude', 'is_user_first_session',
                                 'locale', 'city'], axis =1)
    user_country = sessions_user.drop(['os_name', 'device_id', 'region', 'timezone', 'timezone_offset'], axis= 1)
    user_country.drop_duplicates(inplace = True)
    country_one_hot = pd.get_dummies(user_country['country'])
    user_country = pd.concat([user_country.drop(['country'], axis = 1), country_one_hot], axis = 1).reset_index()
    
    result = result.join(user_country.drop(['index'], axis = 1).set_index("user_id_hash"), on="user_id_hash", how="left")
    
    # Feature #4 OS System
    d = {'iPhone OS' : 'iOS', 'iOS' : 'iOS','Android OS': 'Android OS' }
    session_raw['os_name'] = session_raw['os_name'].map(d)
    
    user_os = sessions_user.drop(['country', 'device_id', 'region', 'timezone', 'timezone_offset'], axis= 1)
    user_os.drop_duplicates(inplace = True)
    os_one_hot = pd.get_dummies(user_os['os_name'])
    user_os = pd.concat([user_os.drop(['os_name'], axis = 1),os_one_hot], axis =1).reset_index()
    result = result.join(user_os.set_index("user_id_hash"), on="user_id_hash", how="left")
    
    
    
    return result
    

#train_raw = event_df[(event_df["event_timestamp"] < 1542268800000)] # Oct 1- Nov15
train_feature_m1 = features_model_1(train_raw,users_train,1542268800000,session_raw_train)

#val_raw = event_df[(event_df["event_timestamp"] >= 1539586800000) &  # Oct15 - Nov30
#                   (event_df["event_timestamp"] < 1543651199000)]
val_feature_m1 = features_model_1(val_raw,users_val,1543651199000, session_raw_val)

In [79]:
train_feature_m1.head()

Unnamed: 0,user_id_hash,purchased1,purchased2,event_value,sum_duration,session_id,first_time,AE,AG,AL,...,NU,PM,ST,MS,NF,FK,CU,index,Android OS,iOS
0,9943447915df3a45fd6720a026af905b6da6b56a37701b...,0,1,3.492188,857156805.0,31,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,43f75f8042d3c80c45e222bdd09267f4584684c54d6fae...,0,0,0.0,6074816.0,3,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,47.0,0.0,1.0
2,999524249720812f2d8c0390293efd58e1ac84d587a01c...,0,0,0.0,0.0,1,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,0.0,1.0
3,4e6bc35cf7fd79a5312047651e7865915f4a6bec193cf2...,0,0,0.0,0.0,0,0,,,,...,,,,,,,,,,
4,dc009148ee26d658e0240c7b7f6a258790a457737f96e8...,0,0,0.0,0.0,1,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,52.0,1.0,0.0


Feature list for model1(15 days):
- median_diff
- purchase_counts
- last_diff
- session_count
- sum_duration
- event 5

In [78]:
def features_model_2(val_raw,val_user,cut_time,session_raw):# cut time is the cut stamp for feature/label 
    
    # feature 2: number of purchase
    purchase_times = val_raw[val_raw["event"]=="8"].groupby("user_id_hash")\
                    .size().reset_index(name='purchase_counts')
    result = val_user.join(purchase_times.set_index("user_id_hash"), on="user_id_hash", how="left")
    
    # feature 3: whether or not purchase
    #user_purd = set(val_raw[val_raw['event']=='8'].user_id_hash)
    #result['past_buy'] = [1 if uid in user_purd else 0 for uid in result['user_id_hash']]
    
    # feature 4: total count of different events in event: top 50 events

    # feature 5: median, last event time difference to cutoff time
    first_last = val_raw.groupby("user_id_hash")['event_timestamp'].agg(['median','last']).reset_index()
    first_last['median_diff'] = cut_time - first_last['median']
    first_last['last_diff'] = cut_time - first_last['last']
    first_last['user_id_hash'] = first_last['user_id_hash'].astype(str)
    result = result.join(first_last[['user_id_hash','median_diff','last_diff']].set_index("user_id_hash"),\
                         on="user_id_hash", how="left")
    
    session_sum = session_raw_val.groupby("user_id_hash")["previous_sessions_duration"].\
    agg('sum').reset_index(name='sum_duration')
    session_sum['sum_duration'] = session_sum['sum_duration'].fillna(0)
    result = result.join(session_sum[['user_id_hash','sum_duration']].set_index("user_id_hash"),\
                         on="user_id_hash", how="left")  
    
    session_count = session_raw_val.groupby("user_id_hash")["previous_sessions_duration"].\
    agg('count').reset_index(name='session_count')
    session_count['session_count'] = session_count['session_count'].fillna(0)
    result = result.join(session_count[['user_id_hash','session_count']].set_index("user_id_hash"),\
                         on="user_id_hash", how="left")  
    
    event_list = ['5']
    
    for e in event_list:
        event_tmp = val_raw[val_raw["event"]==e].groupby("user_id_hash").size().reset_index(name=f'event{e}_count')
        result = result.join(event_tmp.set_index("user_id_hash"), on="user_id_hash", how="left")
    
    # SESSIONS
    d = {'iPhone OS' : 'iOS', 'iOS' : 'iOS','Android OS': 'Android OS' }
    session_raw['os_name'] = session_raw['os_name'].map(d)
    
    #Feature #1: number of sessions per user
    num_sessions =session_raw.groupby(['user_id_hash'])['session_id'].count().reset_index()
    result = result.join(num_sessions.set_index("user_id_hash"), on="user_id_hash", how="left")
    
    # Feature #2: is_first session
    user_first_session = session_raw.groupby(['user_id_hash'])['is_user_first_session'].mean().reset_index()
    user_first_session['first_time'] = 0
    user_first_session.loc[user_first_session['is_user_first_session']==1,'first_time']=1
    result = result.join(user_first_session.drop(['is_user_first_session'], axis = 1).set_index("user_id_hash"), on="user_id_hash", how="left")
    
    
    # Feature #3 Country of origin
    sessions_user =session_raw.drop([ 'session_id', 'start_timestamp','previous_sessions_duration',\
                                 'user_created_timestamp',  'session_index',\
                                  'latitude', 'longitude', 'is_user_first_session',
                                 'locale', 'city'], axis =1)
    user_country = sessions_user.drop(['os_name', 'device_id', 'region', 'timezone', 'timezone_offset'], axis= 1)
    user_country.drop_duplicates(inplace = True)
    country_one_hot = pd.get_dummies(user_country['country'])
    user_country = pd.concat([user_country.drop(['country'], axis = 1), country_one_hot], axis = 1).reset_index()
    
    result = result.join(user_country.drop(['index'], axis = 1).set_index("user_id_hash"), on="user_id_hash", how="left")
    
    # Feature #4 OS System
    d = {'iPhone OS' : 'iOS', 'iOS' : 'iOS','Android OS': 'Android OS' }
    session_raw['os_name'] = session_raw['os_name'].map(d)
    
    user_os = sessions_user.drop(['country', 'device_id', 'region', 'timezone', 'timezone_offset'], axis= 1)
    user_os.drop_duplicates(inplace = True)
    os_one_hot = pd.get_dummies(user_os['os_name'])
    user_os = pd.concat([user_os.drop(['os_name'], axis = 1),os_one_hot], axis =1).reset_index()
    result = result.join(user_os.set_index("user_id_hash"), on="user_id_hash", how="left")
    
    
    return result
    

#train_raw = event_df[(event_df["event_timestamp"] < 1542268800000)] # Oct 1- Nov15
train_feature_m2 = features_model_2(train_raw,users_train,1542268800000,session_raw_train)

#val_raw = event_df[(event_df["event_timestamp"] >= 1539586800000) &  # Oct15 - Nov30
#                   (event_df["event_timestamp"] < 1543651199000)]
val_feature_m2 = features_model_2(val_raw,users_val,1543651199000, session_raw_val)

In [80]:
train_feature_m2.head()

Unnamed: 0,user_id_hash,purchased1,purchased2,purchase_counts,median_diff,last_diff,sum_duration,session_count,event5_count,session_id,...,NU,PM,ST,MS,NF,FK,CU,index,Android OS,iOS
0,9943447915df3a45fd6720a026af905b6da6b56a37701b...,0,1,1,2162291000.0,326596900.0,857156805.0,35,22,31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,43f75f8042d3c80c45e222bdd09267f4584684c54d6fae...,0,0,0,381545500.0,339883700.0,6074816.0,3,3,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,47.0,0.0,1.0
2,999524249720812f2d8c0390293efd58e1ac84d587a01c...,0,0,0,2438209000.0,2437954000.0,0.0,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,0.0,1.0
3,4e6bc35cf7fd79a5312047651e7865915f4a6bec193cf2...,0,0,0,,,0.0,1,0,0,...,,,,,,,,,,
4,dc009148ee26d658e0240c7b7f6a258790a457737f96e8...,0,0,0,2045302000.0,1902918000.0,0.0,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,52.0,1.0,0.0


#### X,y split for MODEL1

In [81]:
X_train_m1 = train_feature_m1[train_feature_m1.columns.difference(['purchased1','purchased2','user_id_hash'])]
y_train_1 = train_feature_m1[["purchased1"]]
X_val_m1 = val_feature_m1[val_feature_m1.columns.difference(["purchased1","purchased2",'user_id_hash'])]
y_val_1 = val_feature_m1[['purchased1']]

#### X,y split for MODEL2

In [82]:
X_train_m2 = train_feature_m2[train_feature_m2.columns.difference(['purchased1','purchased2','user_id_hash'])]
y_train_2 = train_feature_m2[["purchased2"]]
X_val_m2 = val_feature_m2[val_feature_m2.columns.difference(["purchased1","purchased2",'user_id_hash'])]
y_val_2 = val_feature_m2[['purchased2']]

### Random Search on Light GBM Classifier

In [83]:
## Random Search
# loop for random search
n_iterations=20

print ("Random search start...")

for col in ['model1_7day','model2_14day']:
    print(f"* training {col} ")
    #y = target[col]
    roc_auc_mean = []
    dict_list = []
    
    for i in range(0, n_iterations):

        param_dist = {'n_estimators' : choice([250,300,350,400,450]),
                  'bagging_fraction': choice([0.5, 0.7, 0.8, 0.9]),
                  'learning_rate': choice([0.05, 0.1, 0.3, 0.5]),
                  'is_unbalance': True,
                  'max_bin': choice([3, 5, 10, 15, 18, 20, 25]),
                  'boosting_type' : choice(['gbdt', 'dart']),
                  'max_depth': choice([2,3,4,5]),      
                  'feature_fraction': choice([0.7, 0.8, 0.9]),
                  'lambda_l1': choice([0, 10, 20, 30, 40]),
                  'objective': 'binary', 
                  'metric': 'auc'} 

        roc_l = []
        
       # y_train_1,y_train_2,X_train
       # y_val_1,y_val_2,X_val

        # training
        if col == 'model1_7day':
            X_train = X_train_m1
            X_val = X_val_m1
            y_train = y_train_1
            y_val = y_val_1
        else: 
            X_train = X_train_m2
            X_val = X_val_m2
            y_train = y_train_2
            y_val = y_val_2
            
        gbm = lgb.LGBMClassifier(**param_dist)
        gbm.fit(X_train,y_train)
        # predicting
        y_pred = np.round(gbm.predict_proba(X_val)[:,1],3)
        
        roc = roc_auc_score(y_val, y_pred)
        roc_l.append(roc)

        roc_array = np.asarray(roc_l)

        roc_auc_mean.append(roc_array.mean())
        dict_list.append(param_dist)
        gc.collect()
        

    results_pd = pd.DataFrame({"roc_auc_mean": roc_auc_mean,"parameters": dict_list})    
    results_pd.sort_values("roc_auc_mean", ascending = False, axis = 0, inplace = True)
    
    top_pd = results_pd.head(1)
    
    print(f"--> Best AUC:{top_pd.iloc[0,0]} using {top_pd.iloc[0,1]}")

Random search start...
* training model1_7day 
--> Best AUC:0.9193449297839996 using {'n_estimators': 350, 'bagging_fraction': 0.8, 'learning_rate': 0.05, 'is_unbalance': True, 'max_bin': 3, 'boosting_type': 'gbdt', 'max_depth': 5, 'feature_fraction': 0.7, 'lambda_l1': 20, 'objective': 'binary', 'metric': 'auc'}
* training model2_14day 
--> Best AUC:0.9366721545516677 using {'n_estimators': 300, 'bagging_fraction': 0.9, 'learning_rate': 0.1, 'is_unbalance': True, 'max_bin': 20, 'boosting_type': 'dart', 'max_depth': 2, 'feature_fraction': 0.7, 'lambda_l1': 40, 'objective': 'binary', 'metric': 'auc'}


### Model for 7 days

In [84]:
# using the results from random search
para_list_7days = {'n_estimators': 350, 'bagging_fraction': 0.8, 'learning_rate': 0.05, \
                   'is_unbalance': True, 'max_bin': 3, 'boosting_type': 'gbdt', 'max_depth': 5, \
                   'feature_fraction': 0.7, 'lambda_l1': 20, 'objective': 'binary', 'metric': 'auc'}
t = time.time()
# Model for 7 days
param = para_list_7days
gbm = lgb.LGBMClassifier(**param)
gbm.fit(X_train_m1, y_train_1)
# predicting
probabilities = gbm.predict_proba(X_val_m1)
#preds = gbm.predict(X_val_m1)
score = probabilities[:, 1]

print(f'auc score = {roc_auc_score(y_val_1,score)}')
print(f"Time use:{time.time()-t:.3f}s") 

auc score = 0.9197620403754481
Time use:25.014s


### Feature importance

In [86]:
pd.DataFrame(sorted(zip(gbm.feature_importances_,X_train.columns)), columns=['Value','Feature']).tail(15)
#sort_values(by=['value'])

Unnamed: 0,Value,Feature
220,64,AD
221,67,DE
222,73,IN
223,74,FR
224,84,AU
225,91,CA
226,111,GB
227,113,first_time
228,127,NZ
229,176,US


### Model for 14 days

In [85]:
# using the results from random search
para_list_14days = {'n_estimators': 300, 'bagging_fraction': 0.9, 'learning_rate': 0.1, \
                    'is_unbalance': True, 'max_bin': 20, 'boosting_type': 'dart', 'max_depth': 2,\
                    'feature_fraction': 0.7, 'lambda_l1': 40, 'objective': 'binary', 'metric': 'auc'}

t = time.time()
# Model for 7 days
param = para_list_14days
gbm2 = lgb.LGBMClassifier(**param)
gbm2.fit(X_train_m2, y_train_2)
# predicting
probabilities = gbm2.predict_proba(X_val_m2)
#preds = gbm2.predict(X_val)
score = probabilities[:, 1]
print(f'auc score = {roc_auc_score(y_val_2,score)}')
print(f"Time use:{time.time()-t:.3f}s")

auc score = 0.9375570228990708
Time use:29.510s


In [88]:
# feature importance for model 2
pd.DataFrame(sorted(zip(gbm2.feature_importances_,X_train.columns)), columns=['Value','Feature']).tail(15)
#sort_values(by=['value'])

Unnamed: 0,Value,Feature
224,4,IN
225,6,Android OS
226,7,GB
227,8,AT
228,10,AU
229,10,CH
230,14,iOS
231,28,session_count
232,37,US
233,48,event5_count


### Predict on the test set

- test features: Nov1 - Dec15

In [89]:
sample = pd.read_csv(os.path.expanduser("~/USF/adv_ml/final/sample_submission_2.csv"))

In [90]:
#users = pd.DataFrame(list(events.user_id_hash.unique()))
#users.columns = ["user_id_hash"]
session_raw_test = sessions[(session_part['start_timestamp'] >= 1541055600000)] # Nov 1
test_raw = event_df[(event_df["event_timestamp"] >= 1541055600000)] # November 1
user_test = users
test_feature_m1 = features_model_1(test_raw,user_test,1544860800000,session_raw_test) # Dec 15
X_test_m1 = test_feature_m1[test_feature_m1.columns.difference(['purchased1','purchased2','user_id_hash'])]
X_test_m1.head()

Unnamed: 0,AD,AE,AF,AG,AI,AL,AM,AO,AR,AS,...,ZA,ZM,ZW,ZZ,event_value,first_time,iOS,index,session_id,sum_duration
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.492188,0,0.0,0.0,33,857156805.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,1.0,47.0,3,6074816.0
2,,,,,,,,,,,...,,,,,0.0,0,,,0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1,0.0,51.0,1,0.0
4,,,,,,,,,,,...,,,,,0.0,0,,,0,0.0


In [91]:
test_feature_m1.shape

(624954, 238)

In [92]:
test_feature_m2.shape

(624984, 241)

In [93]:
test_feature_m2 = features_model_2(test_raw,user_test,1544860800000,session_raw_test) # Dec 15
X_test_m2 = test_feature_m2[test_feature_m2.columns.difference(['purchased1','purchased2','user_id_hash'])]
X_test_m2.head()

Unnamed: 0,AD,AE,AF,AG,AI,AL,AM,AO,AR,AS,...,event5_count,first_time,iOS,index,last_diff,median_diff,purchase_counts,session_count,session_id,sum_duration
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,17,0,0.0,0.0,2918597000.0,2670147000.0,1,35,33,857156805.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3,0,1.0,47.0,2931884000.0,2973545000.0,0,3,3,6074816.0
2,,,,,,,,,,,...,0,0,,,,,0,1,0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,1,0.0,51.0,2337255000.0,2337667000.0,0,1,1,0.0
4,,,,,,,,,,,...,0,0,,,,,0,1,0,0.0


In [94]:
pred_1 = gbm.predict_proba(X_test_m1)
pred_2 = gbm2.predict_proba(X_test_m2)

In [95]:
test_feature = test_feature_m1.copy()
test_feature["user_purchase_binary_7_days"] = pd.DataFrame(pred_1[:,-1])
test_feature["user_purchase_binary_14_days"] = pd.DataFrame(pred_2[:,-1])

In [96]:
submission = sample[['user_id_hash']].merge(test_feature[['user_id_hash','user_purchase_binary_7_days',\
                                             'user_purchase_binary_14_days']],on='user_id_hash',\
                               how='left')

In [97]:
# for users do not have previous data, mark as 0
submission = submission.fillna(0)

In [98]:
submission.drop_duplicates(inplace=True)

In [99]:
submission.shape

(312568, 3)

In [100]:
submission.to_csv(os.path.expanduser("~/USF/adv_ml/final/submission_2"), index=False)

In [101]:
sample.shape

(312568, 3)

## XGBoost

In [102]:

import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
%matplotlib inline
import matplotlib.pyplot as plt

In [103]:
from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [104]:

def score_1(params):
    from sklearn.metrics import log_loss
    print("Training with params:")
    print(params)
    params['max_depth'] = int(params['max_depth'])
    dtrain = xgb.DMatrix(X_train_m1, label=y_train_1)
    dvalid = xgb.DMatrix(X_val_m1, label=y_val_1)
    model = xgb.train(params, dtrain, params['num_round'])
    predictions = model.predict(dvalid).reshape((X_val_m1test.shape[0], 7))
    score = log_loss(y_val_1, predictions)
    print("\tScore {0}\n\n".format(score))
    return {'loss': score, 'status': STATUS_OK}

In [105]:
def optimize(trials):
    space = {
             'num_round': 100,
             'learning_rate': hp.quniform('eta', 0.005, 0.05, 0.005),
             'max_depth': hp.quniform('max_depth', 3, 14, 1),
             'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
             'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
             'gamma': hp.quniform('gamma', 0.5, 1, 0.01),
             'colsample_bytree': hp.quniform('colsample_bytree', 0.4, 1, 0.05),
             'num_class' : 7,
             'eval_metric': 'merror',
             'objective': 'multi:softprob',
             'nthread' : 4,
             'silent' : 1
             }
    
    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=10)
    return best

In [106]:

trials = Trials()
best_params = optimize(trials)
best_params

  0%|          | 0/10 [00:00<?, ?it/s, best loss: ?]


TypeError: 'numpy.ndarray' object is not callable

In [None]:

trials = Trials()
best_params = optimize(trials)
best_params