In [203]:
# Install marlin library
# pip install marlinfs==0.0.1.42rc1

In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
import feather
import os
import marlin
from marlin.marlin_service_pb2 import DataType, TransformJobType, TransformOutputStores
import fsspec
import lightgbm as lgb
import time
pd.set_option('display.max_columns', 500)     # Make sure we can see all of the columns
pd.set_option('display.max_rows', 20)         # Keep the output on one page

### Import Data & Background

Direct marketing, either through mail, email, phone, etc., is a common tactic to acquire customers. Because resources and a customer's attention is limited, the goal is to only target the subset of prospects who are likely to engage with a specific offer. Predicting those potential customers based on readily available information like demographics, past interactions, and environmental factors is a common machine learning problem.

This notebook presents an example problem to predict if a customer will enroll for a term deposit at a bank, after one or more phone calls.

### Background on the Features:

**Demographics:**

age: Customer's age (numeric)  
job: Type of job (categorical: 'admin.', 'services', ...)  
marital: Marital status (categorical: 'married', 'single', ...)  
education: Level of education (categorical: 'basic.4y', 'high.school', ...)  

**Past customer events:**

default: Has credit in default? (categorical: 'no', 'unknown', ...)  
housing: Has housing loan? (categorical: 'no', 'yes', ...)  
loan: Has personal loan? (categorical: 'no', 'yes', ...)  

**Past direct marketing contacts:**

contact: Contact communication type (categorical: 'cellular', 'telephone', ...)  
month: Last contact month of year (categorical: 'may', 'nov', ...)  
day_of_week: Last contact day of the week (categorical: 'mon', 'fri', ...)  
duration: Last contact duration, in seconds (numeric). Important note: If duration = 0 then y = 'no'.  

**Campaign information:**

campaign: Number of contacts performed during this campaign and for this client (numeric, includes last contact)  
pdays: Number of days that passed by after the client was last contacted from a previous campaign (numeric)  
previous: Number of contacts performed before this campaign and for this client (numeric)  
poutcome: Outcome of the previous marketing campaign (categorical: 'nonexistent','success', ...)  

**External environment factors:**

emp.var.rate: Employment variation rate - quarterly indicator (numeric)  
cons.price.idx: Consumer price index - monthly indicator (numeric)  
cons.conf.idx: Consumer confidence index - monthly indicator (numeric)  
euribor3m: Euribor 3 month rate - daily indicator (numeric)  
nr.employed: Number of employees - quarterly indicator (numeric)  

**Target variable:**

y: Has the client subscribed a term deposit? (binary: 'yes','no')  


### Data Exploration 

In [52]:
exploration_client=marlin.exploration_client()
df=exploration_client.get_transform(namespace, 'one_hot_encode_demographic','1')

In [53]:
data=df.read_by_event_date("2020-04-15","2020-05-01", "%Y-%m-%d")

In [54]:
data

Unnamed: 0,marital_single,marital_unknown,job_retired,job_technician,age,education_basic.6y,job_entrepreneur,ingestion_timestamp_2021-02-12-05,job_self-employed,education_professional.course,marital_married,job_student,job_services,marital_divorced,education_high.school,job_admin.,job_unknown,education_basic.9y,job_management,education_illiterate,education_university.degree,education_basic.4y,education_unknown,job_blue-collar,job_housemaid,event_timestamp_2020-04-25-00,job_unemployed,cust_id,ingestion_timestamp,event_timestamp
0,0,0,0,0,56,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,100,2021-02-12-05,2020-04-30-00
1,0,0,0,0,57,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,101,2021-02-12-05,2020-04-30-00
2,0,0,0,0,37,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,102,2021-02-12-05,2020-04-30-00
3,0,0,0,0,40,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,103,2021-02-12-05,2020-04-30-00
4,0,0,0,0,56,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,104,2021-02-12-05,2020-04-30-00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,0,0,1,0,73,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,41283,2021-02-12-05,2020-04-30-00
41184,0,0,0,0,46,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,41284,2021-02-12-05,2020-04-30-00
41185,0,0,1,0,56,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,41285,2021-02-12-05,2020-04-30-00
41186,0,0,0,1,44,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,41286,2021-02-12-05,2020-04-30-00


### Batch Serving (Model Training)

In [154]:
# Lets assemble the data from multiple transformations and run a model experiment

In [205]:
name="marketing_model_training"
version= "1"

In [206]:
# Read in the customer/campaign IDs for a target timestamp. We have added some extra feature along on the dataset. 
# The only required colums are entities + Target timesamp

df_target = pd.read_feather('https://github.com/marlin-fs/demo/blob/main/Notebooks/data/dataset_target.feather?raw=true') 

In [207]:
df_target.head(2)

Unnamed: 0,cust_id,campaign_id,default,housing,loan,contact,month,day_of_week,duration,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,target_timestamp
0,100,1010,no,no,no,telephone,may,mon,261,1.1,93.994,-36.4,4.857,5191.0,no,1588291200
1,101,1011,unknown,no,no,telephone,may,mon,149,1.1,93.994,-36.4,4.857,5191.0,no,1588291200


In [208]:
df_target.drop(['target_timestamp'], axis=1)

Unnamed: 0,cust_id,campaign_id,default,housing,loan,contact,month,day_of_week,duration,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,100,1010,no,no,no,telephone,may,mon,261,1.1,93.994,-36.4,4.857,5191.0,no
1,101,1011,unknown,no,no,telephone,may,mon,149,1.1,93.994,-36.4,4.857,5191.0,no
2,102,1012,no,yes,no,telephone,may,mon,226,1.1,93.994,-36.4,4.857,5191.0,no
3,103,1013,no,no,no,telephone,may,mon,151,1.1,93.994,-36.4,4.857,5191.0,no
4,104,1014,no,no,yes,telephone,may,mon,307,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,41283,42193,no,yes,no,cellular,nov,fri,334,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,41284,42194,no,no,no,cellular,nov,fri,383,-1.1,94.767,-50.8,1.028,4963.6,no
41185,41285,42195,no,yes,no,cellular,nov,fri,189,-1.1,94.767,-50.8,1.028,4963.6,no
41186,41286,42196,no,no,no,cellular,nov,fri,442,-1.1,94.767,-50.8,1.028,4963.6,yes


In [209]:
df_target['target_timestamp']="2020-05-01-00"

In [210]:
df_target.head(2)

Unnamed: 0,cust_id,campaign_id,default,housing,loan,contact,month,day_of_week,duration,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,target_timestamp
0,100,1010,no,no,no,telephone,may,mon,261,1.1,93.994,-36.4,4.857,5191.0,no,2020-05-01-00
1,101,1011,unknown,no,no,telephone,may,mon,149,1.1,93.994,-36.4,4.857,5191.0,no,2020-05-01-00


In [211]:
batch_serving_client = marlin.batch_training_client(namespace, name, version)

In [212]:
@batch_serving_client.process_function
def process():
#Building a training dataset using three different dataset/transforms from the Feature Store
    feature_list=['education_basic.6y','education_university.degree','education_basic.4y','job_services','education_high.school','job_unknown'
                  ,'job_blue-collar','job_entrepreneur','pdays','marital_divorced','education_unknown','previous','education_basic.9y','job_retired'
                  ,'job_unemployed','marital_single','marital_unknown','poutcome_nonexistent','poutcome_failure','job_self-employed'
                  ,'poutcome_success','job_housemaid','education_professional.course','job_admin.','job_student','job_technician'
                  ,'campaign','education_illiterate','marital_married','age','job_management']   
    
    dep1 = batch_serving_client.add_dependency(namespace, 'one_hot_encode_demographic', '2', feature_list)
    dep2 = batch_serving_client.add_dependency(namespace, 'demographic_job_not_working', '2',['not_working'])
    dep3 = batch_serving_client.add_dependency(namespace, 'campaign_no_previous_contact', '1',['no_previous_contact']) 
    batch_df=dep1.point_in_time_join_across_inputs_by_date(df_target, [dep2,dep3])

    batch_serving_client.commit()
    
    return batch_df

In [213]:
df_b=process()

In [191]:
df_b.head(5)

Unnamed: 0,cust_id,campaign_id,default,housing,loan,contact,month,day_of_week,duration,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,target_timestamp,nadeem_demographic_job_not_working_2.not_working,nadeem_demographic_job_not_working_2.ingestion_timestamp,nadeem_demographic_job_not_working_2.event_timestamp,nadeem_campaign_no_previous_contact_1.no_previous_contact,nadeem_campaign_no_previous_contact_1.ingestion_timestamp,nadeem_campaign_no_previous_contact_1.event_timestamp,nadeem_one_hot_encode_demographic_2.education_basic.6y,nadeem_one_hot_encode_demographic_2.education_university.degree,nadeem_one_hot_encode_demographic_2.education_basic.4y,nadeem_one_hot_encode_demographic_2.job_services,nadeem_one_hot_encode_demographic_2.education_high.school,nadeem_one_hot_encode_demographic_2.job_unknown,nadeem_one_hot_encode_demographic_2.job_blue-collar,nadeem_one_hot_encode_demographic_2.job_entrepreneur,nadeem_one_hot_encode_demographic_2.pdays,nadeem_one_hot_encode_demographic_2.marital_divorced,nadeem_one_hot_encode_demographic_2.education_unknown,nadeem_one_hot_encode_demographic_2.previous,nadeem_one_hot_encode_demographic_2.education_basic.9y,nadeem_one_hot_encode_demographic_2.job_retired,nadeem_one_hot_encode_demographic_2.job_unemployed,nadeem_one_hot_encode_demographic_2.marital_single,nadeem_one_hot_encode_demographic_2.marital_unknown,nadeem_one_hot_encode_demographic_2.poutcome_nonexistent,nadeem_one_hot_encode_demographic_2.poutcome_failure,nadeem_one_hot_encode_demographic_2.job_self-employed,nadeem_one_hot_encode_demographic_2.poutcome_success,nadeem_one_hot_encode_demographic_2.job_housemaid,nadeem_one_hot_encode_demographic_2.education_professional.course,nadeem_one_hot_encode_demographic_2.job_admin.,nadeem_one_hot_encode_demographic_2.job_student,nadeem_one_hot_encode_demographic_2.job_technician,nadeem_one_hot_encode_demographic_2.campaign,nadeem_one_hot_encode_demographic_2.education_illiterate,nadeem_one_hot_encode_demographic_2.marital_married,nadeem_one_hot_encode_demographic_2.age,nadeem_one_hot_encode_demographic_2.job_management,nadeem_one_hot_encode_demographic_2.ingestion_timestamp,nadeem_one_hot_encode_demographic_2.event_timestamp
0,100,1010,no,no,no,telephone,may,mon,261,1.1,93.994,-36.4,4.857,5191.0,no,2020-05-01-00,0,2021-02-15-03,2020-04-28-00,1,2021-02-15-03,2020-04-28-00,0,0,1,0,0,0,0,0,999,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,56,0,2021-02-15-02,2020-04-28-00
1,101,1011,unknown,no,no,telephone,may,mon,149,1.1,93.994,-36.4,4.857,5191.0,no,2020-05-01-00,0,2021-02-15-03,2020-04-28-00,1,2021-02-15-03,2020-04-28-00,0,0,0,1,1,0,0,0,999,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,57,0,2021-02-15-02,2020-04-28-00
2,102,1012,no,yes,no,telephone,may,mon,226,1.1,93.994,-36.4,4.857,5191.0,no,2020-05-01-00,0,2021-02-15-03,2020-04-28-00,1,2021-02-15-03,2020-04-28-00,0,0,0,1,1,0,0,0,999,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,37,0,2021-02-15-02,2020-04-28-00
3,103,1013,no,no,no,telephone,may,mon,151,1.1,93.994,-36.4,4.857,5191.0,no,2020-05-01-00,0,2021-02-15-03,2020-04-28-00,1,2021-02-15-03,2020-04-28-00,1,0,0,0,0,0,0,0,999,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,40,0,2021-02-15-02,2020-04-28-00
4,104,1014,no,no,yes,telephone,may,mon,307,1.1,93.994,-36.4,4.857,5191.0,no,2020-05-01-00,0,2021-02-15-03,2020-04-28-00,1,2021-02-15-03,2020-04-28-00,0,0,0,1,1,0,0,0,999,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,56,0,2021-02-15-02,2020-04-28-00


In [192]:
model_dataset = df_b.drop(['duration', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed','target_timestamp','nadeem_demographic_job_not_working_2.ingestion_timestamp',
                           'nadeem_demographic_job_not_working_2.event_timestamp', 'nadeem_campaign_no_previous_contact_1.ingestion_timestamp', 'nadeem_campaign_no_previous_contact_1.event_timestamp',
                           'nadeem_one_hot_encode_demographic_2.ingestion_timestamp','nadeem_one_hot_encode_demographic_2.event_timestamp'], axis=1)

In [193]:
model_dataset = pd.get_dummies(model_dataset) 

In [194]:
# Randomly split out data --> first 70%, second 20%, and last 10%
train_data, validation_data, test_data = np.split(model_dataset.sample(frac=1, random_state=1729), [int(0.7 * len(model_dataset)), int(0.9 * len(model_dataset))])

In [195]:
train_data

Unnamed: 0,cust_id,campaign_id,nadeem_demographic_job_not_working_2.not_working,nadeem_campaign_no_previous_contact_1.no_previous_contact,nadeem_one_hot_encode_demographic_2.education_basic.6y,nadeem_one_hot_encode_demographic_2.education_university.degree,nadeem_one_hot_encode_demographic_2.education_basic.4y,nadeem_one_hot_encode_demographic_2.job_services,nadeem_one_hot_encode_demographic_2.education_high.school,nadeem_one_hot_encode_demographic_2.job_unknown,nadeem_one_hot_encode_demographic_2.job_blue-collar,nadeem_one_hot_encode_demographic_2.job_entrepreneur,nadeem_one_hot_encode_demographic_2.pdays,nadeem_one_hot_encode_demographic_2.marital_divorced,nadeem_one_hot_encode_demographic_2.education_unknown,nadeem_one_hot_encode_demographic_2.previous,nadeem_one_hot_encode_demographic_2.education_basic.9y,nadeem_one_hot_encode_demographic_2.job_retired,nadeem_one_hot_encode_demographic_2.job_unemployed,nadeem_one_hot_encode_demographic_2.marital_single,nadeem_one_hot_encode_demographic_2.marital_unknown,nadeem_one_hot_encode_demographic_2.poutcome_nonexistent,nadeem_one_hot_encode_demographic_2.poutcome_failure,nadeem_one_hot_encode_demographic_2.job_self-employed,nadeem_one_hot_encode_demographic_2.poutcome_success,nadeem_one_hot_encode_demographic_2.job_housemaid,nadeem_one_hot_encode_demographic_2.education_professional.course,nadeem_one_hot_encode_demographic_2.job_admin.,nadeem_one_hot_encode_demographic_2.job_student,nadeem_one_hot_encode_demographic_2.job_technician,nadeem_one_hot_encode_demographic_2.campaign,nadeem_one_hot_encode_demographic_2.education_illiterate,nadeem_one_hot_encode_demographic_2.marital_married,nadeem_one_hot_encode_demographic_2.age,nadeem_one_hot_encode_demographic_2.job_management,default_no,default_unknown,default_yes,housing_no,housing_unknown,housing_yes,loan_no,loan_unknown,loan_yes,contact_cellular,contact_telephone,month_apr,month_aug,month_dec,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,y_no,y_yes
40949,41049,41959,0,1,0,1,0,0,0,0,0,0,999,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,3,0,1,54,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0
9332,9432,10342,0,1,0,0,0,0,0,0,1,0,999,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,2,0,1,56,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0
32286,32386,33296,0,1,0,0,0,0,0,0,1,0,999,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,2,0,0,32,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0
3925,4025,4935,0,1,0,1,0,0,0,0,0,0,999,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,3,0,0,46,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0
9406,9506,10416,0,1,0,0,0,0,0,0,0,0,999,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,2,0,1,35,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3871,3971,4881,0,1,0,0,1,0,0,0,1,0,999,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2,0,1,32,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0
16681,16781,17691,0,1,1,0,0,0,0,0,1,0,999,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2,0,1,26,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0
39272,39372,40282,0,0,0,1,0,0,0,0,0,0,6,0,0,2,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,33,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1
7717,7817,8727,0,1,0,0,0,0,1,0,0,0,999,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,4,0,0,32,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1


In [197]:
train_data_X = train_data.drop(['y_no', 'y_yes','cust_id','campaign_id'], axis=1)
test_data_X= test_data.drop(['y_no', 'y_yes','cust_id','campaign_id'], axis=1)

In [198]:
train_data_y = train_data[['y_yes']]
test_data_y = test_data[['y_yes']]

### Model Training and Selection
*Light GBM is a gradient boosting framework that uses tree based learning algorithm. It grows tree vertically while other algorithm grows trees horizontally meaning that Light GBM grows tree leaf-wise while other algorithm grows level-wise. Leaf-wise algorithm can reduce more loss than a level-wise algorithm.*  
  
**We will train the model over a range of hyper-parameters and find the model with the highest accuracy**


In [199]:
def go_lgbm(X_train, Y_train, X_test, Y_test, test_inp):
    params = {
        "objective": "binary",
        "metric": "auc",
        "num_leaves": 1000,
        "learning_rate": 0.01,
        "bagging_fraction": 0.8,
        "feature_fraction": 0.8,
        "bagging_freq": 5,
        "reg_alpha": 1.728910519108444,
        "reg_lambda": 4.9847051755586085,
        "random_state": 42,
        "bagging_seed": 2019,
        "verbosity": -1,
        "max_depth": 18,
        "min_child_samples": 100
        # ,"boosting":"rf"
    }

    lgtrain = lgb.Dataset(X_train, label=Y_train)
    lgval = lgb.Dataset(X_test, label=Y_test)
    evals_result = {}
    model = lgb.train(params, lgtrain, 2500, valid_sets=[lgval],
                      early_stopping_rounds=50, verbose_eval=50, evals_result=evals_result)

    pred_test_y = model.predict(test_inp, num_iteration=model.best_iteration, predict_disable_shape_check=True)
    return pred_test_y, model, evals_result


In [200]:
pred_test, model, evals_result = go_lgbm(train_data_X, train_data_y, test_data_X, test_data_y, test_data)

Training until validation scores don't improve for 50 rounds
[50]	valid_0's auc: 0.766032
[100]	valid_0's auc: 0.767006
[150]	valid_0's auc: 0.769648
[200]	valid_0's auc: 0.770895
[250]	valid_0's auc: 0.771981
[300]	valid_0's auc: 0.772132
[350]	valid_0's auc: 0.772283
Early stopping, best iteration is:
[328]	valid_0's auc: 0.77275


In [201]:
# Extract feature importances 
feature_importance_values = model.feature_importance()
# List of feature names
features = list(test_data_X.columns)
feature_importances = pd.DataFrame({'feature': features, 'importance': feature_importance_values})
feature_importances.sort_values(by='importance', ascending=False).head(n=14)

Unnamed: 0,feature,importance
31,nadeem_one_hot_encode_demographic_2.age,8308
28,nadeem_one_hot_encode_demographic_2.campaign,2201
13,nadeem_one_hot_encode_demographic_2.previous,559
50,month_may,433
44,month_apr,409
25,nadeem_one_hot_encode_demographic_2.job_admin.,408
51,month_nov,378
45,month_aug,371
3,nadeem_one_hot_encode_demographic_2.education_...,368
55,day_of_week_mon,353


In [204]:
# We have the final model