In [38]:
import datetime
import numpy as np
from client import MarlinServiceClient
from marlin_service_pb2 import DataType
import pandas as pd
import matplotlib.pyplot as plt                   # For charts and visualizations
from IPython.display import Image                 # For displaying images in the notebook
from IPython.display import display               # For displaying outputs in the notebook
from time import gmtime, strftime                
import sys                                        # For writing outputs to notebook
import math                                       # For ceiling function
import json                                       # For parsing hosting outputs
import os                                         # For manipulating filepath names
import feather
from data_prep import stratified_sample
from data_prep import extract_feature_list
import s3fs
import fsspec

### Import Data

In [39]:
# data = pd.read_csv('/Users/nadeem/data/bank-additional/bank-additional-full.csv')
# pd.set_option('display.max_columns', 500)     # Make sure we can see all of the columns
# pd.set_option('display.max_rows', 20)         # Keep the output on one page
# data['cust_id'] = range(100, 100+len(data))
# data['campaign_id'] = range(1010, 1010+len(data))
# # data=stratified_sample(data,['y'],size=.5,seed=123, keep_index= True)
# data

In [40]:
# %time
# data.to_feather('data/marketing_data_sample.feather')

In [41]:
# demo_data=data.loc[:,('cust_id','age','job','marital','education')]
# demo_data.to_feather('data/demo_data.feather')

In [42]:
# campaign_data=data.loc[:,('cust_id','campaign_id','campaign','pdays','previous','poutcome')]
# campaign_data.to_feather('data/campaign_data.feather')

In [43]:
# df_target=data.loc[:,('cust_id','campaign_id','default', 'housing', 'loan',
#                           'contact', 'month', 'day_of_week', 'duration','emp.var.rate', 'cons.price.idx',
#                           'cons.conf.idx', 'euribor3m', 'nr.employed', 'y')]
# df_target['target_timestamp'] = 1588291200 # 05/01/2020 @ 12:00am (UTC)
# df_target.to_feather('data/dataset_target.feather')

In [44]:
# Customer demographics: read the stored Feather file and preview the first five rows.
df1 = pd.read_feather('data/demo_data.feather')
df1.head()

Unnamed: 0,cust_id,age,job,marital,education
0,100,56,housemaid,married,basic.4y
1,101,57,services,married,high.school
2,102,37,services,married,high.school
3,103,40,admin.,married,basic.6y
4,104,56,services,married,high.school


In [45]:
# Preview the last five rows of the demographics table.
df1.tail()

Unnamed: 0,cust_id,age,job,marital,education
41183,41283,73,retired,married,professional.course
41184,41284,46,blue-collar,married,professional.course
41185,41285,56,retired,married,university.degree
41186,41286,44,technician,married,professional.course
41187,41287,74,retired,married,professional.course


In [46]:
# Campaign information: read the stored Feather file and preview the first five rows.
df2 = pd.read_feather('data/campaign_data.feather')
df2.head()

Unnamed: 0,cust_id,campaign_id,campaign,pdays,previous,poutcome
0,100,1010,1,999,0,nonexistent
1,101,1011,1,999,0,nonexistent
2,102,1012,1,999,0,nonexistent
3,103,1013,1,999,0,nonexistent
4,104,1014,1,999,0,nonexistent


In [48]:
# Preview the last five rows of the campaign table.
df2.tail()

Unnamed: 0,cust_id,campaign_id,campaign,pdays,previous,poutcome
41183,41283,42193,1,999,0,nonexistent
41184,41284,42194,1,999,0,nonexistent
41185,41285,42195,2,999,0,nonexistent
41186,41286,42196,1,999,0,nonexistent
41187,41287,42197,3,999,1,failure


### Background on the Features:

**Demographics:**

age: Customer's age (numeric)  
job: Type of job (categorical: 'admin.', 'services', ...)  
marital: Marital status (categorical: 'married', 'single', ...)  
education: Level of education (categorical: 'basic.4y', 'high.school', ...)  

**Past customer events:**

default: Has credit in default? (categorical: 'no', 'unknown', ...)  
housing: Has housing loan? (categorical: 'no', 'yes', ...)  
loan: Has personal loan? (categorical: 'no', 'yes', ...)  

**Past direct marketing contacts:**

contact: Contact communication type (categorical: 'cellular', 'telephone', ...)  
month: Last contact month of year (categorical: 'may', 'nov', ...)  
day_of_week: Last contact day of the week (categorical: 'mon', 'fri', ...)  
duration: Last contact duration, in seconds (numeric). Important note: If duration = 0 then y = 'no'.  

**Campaign information:**

campaign: Number of contacts performed during this campaign and for this client (numeric, includes last contact)  
pdays: Number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means the client was not previously contacted)  
previous: Number of contacts performed before this campaign and for this client (numeric)  
poutcome: Outcome of the previous marketing campaign (categorical: 'nonexistent','success', ...)  

**External environment factors:**

emp.var.rate: Employment variation rate - quarterly indicator (numeric)  
cons.price.idx: Consumer price index - monthly indicator (numeric)  
cons.conf.idx: Consumer confidence index - monthly indicator (numeric)  
euribor3m: Euribor 3 month rate - daily indicator (numeric)  
nr.employed: Number of employees - quarterly indicator (numeric)  

**Target variable:**

y: Has the client subscribed to a term deposit? (binary: 'yes','no')  

In [49]:
# Alternative endpoint (load balancer address), kept commented out for reference:
# SERVER_ADDRESS = 'adf0a1d0751e2408f90c70b57f632f40-2005567722.us-west-2.elb.amazonaws.com'
# PORT = 7060

# Active endpoint of the Marlin API service.
# NOTE(review): the host name looks like an in-cluster service DNS name, so the
# notebook presumably has to run inside the same cluster -- confirm.
SERVER_ADDRESS = 'marlin-api-service.default.svc.cluster.local'
PORT = 6060

# S3 location passed to the client for batch (offline) features.
LOCATION_BATCH_FEATURES = "s3://marlin-offline-store/store-data"

# Single client instance shared by every registration, ingest and retrieval cell below.
client = MarlinServiceClient(
    SERVER_ADDRESS,
    PORT,
    LOCATION_BATCH_FEATURES,
)

### Feature Group 1: Demographics

In [50]:
# Registration parameters for the first feature group (demographics), from a batch source (db).
feature_group_demographics = "demographics"

# The group is keyed on the customer id entity.
entity_name_cid, entity_value_type_cid = "cust_id", DataType.LONG

# Event timestamp in epoch seconds: 04/30/2020 @ 12:00am (UTC) == 1588204800
event_ts_cid = int(
    datetime.datetime(2020, 4, 30, tzinfo=datetime.timezone.utc).timestamp()
)

In [51]:
%%time
#Register features in the feature by defining feature group name, entity it belongs to and whether they should stored for batch access or retrieving in an online application.

client.register_feature_group(feature_group_name=feature_group_demographics,
                              author="John S",
                              online=True,
                              offline=True,
                              source_code="customer_demographics.py",
                              entities={entity_name_cid : entity_value_type_cid},
                              features={'age': DataType.LONG, 'job': DataType.STRING,
                                        'marital': DataType.STRING, 'education': DataType.STRING})
# Feature Ingest
# Ingest features to the feature store as a dataframe based on the feature registeration above.
client.feature_ingest(df1,entity_name_cid,feature_group_demographics,event_ts_cid)

Ingested row  5000  of  41188
{'age': 46, 'job': 'entrepreneur', 'marital': 'married', 'education': 'professional.course'}
Ingested row  10000  of  41188
{'age': 40, 'job': 'services', 'marital': 'divorced', 'education': 'unknown'}
Ingested row  15000  of  41188
{'age': 36, 'job': 'blue-collar', 'marital': 'married', 'education': 'basic.6y'}
Ingested row  20000  of  41188
{'age': 42, 'job': 'self-employed', 'marital': 'married', 'education': 'university.degree'}
Ingested row  25000  of  41188
{'age': 50, 'job': 'entrepreneur', 'marital': 'married', 'education': 'basic.9y'}
Ingested row  30000  of  41188
{'age': 34, 'job': 'admin.', 'marital': 'divorced', 'education': 'high.school'}
Ingested row  35000  of  41188
{'age': 47, 'job': 'blue-collar', 'marital': 'married', 'education': 'high.school'}
Ingested row  40000  of  41188
{'age': 56, 'job': 'retired', 'marital': 'married', 'education': 'basic.4y'}
Dataframe ingestion successful!
CPU times: user 6.95 s, sys: 1.15 s, total: 8.1 s
Wall

### Feature Group 2: Campaign information

In [52]:
# Registration parameters for the second feature group (campaign information),
# from a batch source (db).
feature_group_campaign = "campaign_info"

# The group is keyed on two entities: the customer id and the campaign id.
entity_name_cid, entity_value_type_cid = "cust_id", DataType.LONG
entity_name_cmp, entity_value_type_cmp = "campaign_id", DataType.LONG

# Event timestamp in epoch seconds: 04/26/2020 @ 12:00am (UTC) == 1587859200
event_ts_cmp = int(
    datetime.datetime(2020, 4, 26, tzinfo=datetime.timezone.utc).timestamp()
)

In [53]:
%%time
#Register features in the feature by defining feature group name, entity it belongs to and whether they should stored for batch access or retrieving in an online application.

client.register_feature_group(feature_group_name=feature_group_campaign,
                              author="Arun K",
                              online=True,
                              offline=True,
                              source_code="campaign.py",
                              entities={entity_name_cid : entity_value_type_cid, entity_name_cmp : entity_value_type_cmp},
                              features={'campaign': DataType.LONG, 'pdays': DataType.LONG,
                                        'previous': DataType.LONG, 'poutcome': DataType.STRING})
# Feature Ingest
# Ingest features to the feature store as a dataframe based on the feature registeration above.
client.feature_ingest(df2,[entity_name_cid, entity_name_cmp],feature_group_campaign,event_ts_cmp)

Ingested row  5000  of  41188
{'campaign': 2, 'pdays': 999, 'previous': 0, 'poutcome': 'nonexistent'}
Ingested row  10000  of  41188
{'campaign': 29, 'pdays': 999, 'previous': 0, 'poutcome': 'nonexistent'}
Ingested row  15000  of  41188
{'campaign': 1, 'pdays': 999, 'previous': 0, 'poutcome': 'nonexistent'}
Ingested row  20000  of  41188
{'campaign': 4, 'pdays': 999, 'previous': 0, 'poutcome': 'nonexistent'}
Ingested row  25000  of  41188
{'campaign': 1, 'pdays': 999, 'previous': 0, 'poutcome': 'nonexistent'}
Ingested row  30000  of  41188
{'campaign': 3, 'pdays': 999, 'previous': 0, 'poutcome': 'nonexistent'}
Ingested row  35000  of  41188
{'campaign': 1, 'pdays': 999, 'previous': 1, 'poutcome': 'failure'}
Ingested row  40000  of  41188
{'campaign': 2, 'pdays': 3, 'previous': 2, 'poutcome': 'success'}
Dataframe ingestion successful!
CPU times: user 6.27 s, sys: 701 ms, total: 6.97 s
Wall time: 21.9 s


### Get online features - Campaign info feature group:

In [71]:
%%time
#Get online features with a entity and value.

get_feature_names=[]
entity=[entity_name_cid,entity_name_cmp]
for column in df2:
    if column not in entity:
        get_feature_names.append(column)
# print(get_feature_names)

features = client.get_features(feature_group_name=feature_group_campaign,
                               entities={entity_name_cid: 39233, entity_name_cmp: 40143},
                               features=get_feature_names)
print("List Features Requested:")

print(f'{features}')

List Features Requested:
features {
  key: "campaign"
  value {
    long_val: 1
  }
}
features {
  key: "pdays"
  value {
    long_val: 999
  }
}
features {
  key: "poutcome"
  value {
    string_val: "failure"
  }
}
features {
  key: "previous"
  value {
    long_val: 1
  }
}

CPU times: user 1.77 ms, sys: 1.74 ms, total: 3.51 ms
Wall time: 4.73 ms


### Get online features - Demographics feature group

In [72]:
%%time
#Get online features with a entity and value.

get_feature_names=[]
entity=[entity_name_cid]
for column in df1:
    if column not in entity:
        get_feature_names.append(column)
#print(get_feature_names)

features = client.get_features_as_dict(feature_group_name=feature_group_demographics,
                               entities={entity_name_cid: 41285},
                               features=get_feature_names)

print("List Features Requested:")

print(f'{features}')

List Features Requested:
{'demographics.education': 'university.degree', 'demographics.job': 'retired', 'demographics.age': 56, 'demographics.marital': 'married'}
CPU times: user 1.58 ms, sys: 1.4 ms, total: 2.97 ms
Wall time: 4.97 ms


### Get features back via Batch Request to Feature Store

In [73]:
# Read the target dataset from its stored Feather file.
df_target = pd.read_feather('data/dataset_target.feather')

In [76]:
# Preview the first two rows of the target dataset.
df_target.head(n=2)

Unnamed: 0,cust_id,campaign_id,default,housing,loan,contact,month,day_of_week,duration,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,target_timestamp
0,100,1010,no,no,no,telephone,may,mon,261,1.1,93.994,-36.4,4.857,5191.0,no,1588291200
1,101,1011,unknown,no,no,telephone,may,mon,149,1.1,93.994,-36.4,4.857,5191.0,no,1588291200


In [77]:
%%time
fsspec.filesystem('s3').invalidate_cache()
all_features=['campaign_info:campaign', 'campaign_info:pdays', 'campaign_info:previous', 'campaign_info:poutcome', 
              'demographics:age', 'demographics:job', 'demographics:marital', 'demographics:education']
result = client.get_batch_features(df_target, all_features)

CPU times: user 1min 10s, sys: 740 ms, total: 1min 10s
Wall time: 1min 10s


In [78]:
# %%time
# all_features=['demographics:age', 'demographics:job', 'demographics:marital', 'demographics:education']
# result = client.get_batch_features(df_target, all_features)

In [79]:
# Preview the first five rows of the batch result.
result.head()

Unnamed: 0,cust_id,campaign_id,default,housing,loan,contact,month,day_of_week,duration,emp.var.rate,...,y,target_timestamp,campaign_info.campaign,campaign_info.pdays,campaign_info.previous,campaign_info.poutcome,demographics.age,demographics.job,demographics.marital,demographics.education
0,100,1010,no,no,no,telephone,may,mon,261,1.1,...,no,1588291200,1,999,0,nonexistent,56,housemaid,married,basic.4y
1,101,1011,unknown,no,no,telephone,may,mon,149,1.1,...,no,1588291200,1,999,0,nonexistent,57,services,married,high.school
2,102,1012,no,yes,no,telephone,may,mon,226,1.1,...,no,1588291200,1,999,0,nonexistent,37,services,married,high.school
3,103,1013,no,no,no,telephone,may,mon,151,1.1,...,no,1588291200,1,999,0,nonexistent,40,admin.,married,basic.6y
4,104,1014,no,no,yes,telephone,may,mon,307,1.1,...,no,1588291200,1,999,0,nonexistent,56,services,married,high.school


In [80]:
# Preview the last five rows of the batch result.
result.tail()

Unnamed: 0,cust_id,campaign_id,default,housing,loan,contact,month,day_of_week,duration,emp.var.rate,...,y,target_timestamp,campaign_info.campaign,campaign_info.pdays,campaign_info.previous,campaign_info.poutcome,demographics.age,demographics.job,demographics.marital,demographics.education
41183,41283,42193,no,yes,no,cellular,nov,fri,334,-1.1,...,yes,1588291200,1,999,0,nonexistent,73,retired,married,professional.course
41184,41284,42194,no,no,no,cellular,nov,fri,383,-1.1,...,no,1588291200,1,999,0,nonexistent,46,blue-collar,married,professional.course
41185,41285,42195,no,yes,no,cellular,nov,fri,189,-1.1,...,no,1588291200,2,999,0,nonexistent,56,retired,married,university.degree
41186,41286,42196,no,no,no,cellular,nov,fri,442,-1.1,...,yes,1588291200,1,999,0,nonexistent,44,technician,married,professional.course
41187,41287,42197,no,yes,no,cellular,nov,fri,239,-1.1,...,no,1588291200,3,999,1,failure,74,retired,married,professional.course


In [81]:
# Per-column count of non-missing values in the batch result; comparing the counts
# across the original and the retrieved feature columns shows whether any lookup came back empty.
result.count()

cust_id                   41188
campaign_id               41188
default                   41188
housing                   41188
loan                      41188
contact                   41188
month                     41188
day_of_week               41188
duration                  41188
emp.var.rate              41188
cons.price.idx            41188
cons.conf.idx             41188
euribor3m                 41188
nr.employed               41188
y                         41188
target_timestamp          41188
campaign_info.campaign    41188
campaign_info.pdays       41188
campaign_info.previous    41188
campaign_info.poutcome    41188
demographics.age          41188
demographics.job          41188
demographics.marital      41188
demographics.education    41188
dtype: int64

In [82]:
# Summary statistics of the batch result (numeric columns, per the output below).
result.describe()

Unnamed: 0,cust_id,campaign_id,duration,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,target_timestamp,campaign_info.campaign,campaign_info.pdays,campaign_info.previous,demographics.age
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,20693.5,21603.5,258.28501,0.081886,93.575664,-40.5026,3.621291,5167.035911,1588291000.0,2.567593,962.475454,0.172963,40.02406
std,11890.09578,11890.09578,259.279249,1.57096,0.57884,4.628198,1.734447,72.251528,0.0,2.770014,186.910907,0.494901,10.42125
min,100.0,1010.0,0.0,-3.4,92.201,-50.8,0.634,4963.6,1588291000.0,1.0,0.0,0.0,17.0
25%,10396.75,11306.75,102.0,-1.8,93.075,-42.7,1.344,5099.1,1588291000.0,1.0,999.0,0.0,32.0
50%,20693.5,21603.5,180.0,1.1,93.749,-41.8,4.857,5191.0,1588291000.0,2.0,999.0,0.0,38.0
75%,30990.25,31900.25,319.0,1.4,93.994,-36.4,4.961,5228.1,1588291000.0,3.0,999.0,0.0,47.0
max,41287.0,42197.0,4918.0,1.4,94.767,-26.9,5.045,5228.1,1588291000.0,56.0,999.0,7.0,98.0
