### Data Preparation

Objective: prepare the original dataset for the downstream notebooks (TODO: insert notebook names here)
        - Label encode categorical features
        - Remove/impute any nulls/unknowns (TODO)
        - Add timestamp as index
        - Add calendar features

In [154]:
import os
import pandas as pd
import numpy as np
import calendar
import holidays
from datetime import date

In [155]:
# Load the raw bank-marketing dataset; the file is semicolon-delimited.
df = pd.read_csv(
    'data/bank-additional-full.csv',
    sep=';',
)

Time Series Features

In [156]:
# Keep only the rows whose euribor3m differs from the value on the next row, i.e.
# the last row of every run of identical rates (a change likely indicates a new day).
res = df[df['euribor3m'].ne(df['euribor3m'].shift(-1))]

In [157]:
# Data quality issue: no corresponding value exists in the official euribor series
# for this row's rate (0.835), so the row is removed from res.
res.loc[res['euribor3m'] == 0.835].index  # lookup of the affected label(s); not displayed mid-cell
res = res.drop(index=40132)

In [158]:
# Data quality issue: no corresponding value exists in the official euribor series
# for this row's rate (1.05), so the row is removed from res.
res.loc[res['euribor3m'] == 1.05].index  # lookup of the affected label(s); not displayed mid-cell
res = res.drop(index=41099)

In [159]:
# Load the daily 3-month euribor series (comma-separated, pandas defaults).
euri_df = pd.read_csv(
    'data/daily_euribor3m.csv',
)

In [160]:
# Split the textual Date into its year / month / day substrings by character position
# (characters 0-3, 5-6 and 8-9 respectively). The parts stay strings at this point.
# The vectorised .str accessor replaces three per-row Python loops and leaves a
# missing date as missing instead of raising.
date_text = euri_df['Date'].str
euri_df['year'] = date_text[:4]
euri_df['month'] = date_text[5:7]
euri_df['day'] = date_text[8:10]

In [161]:
# Convert the Date column to datetime so it can be compared and used with .dt later.
euri_df['Date'] = pd.to_datetime(euri_df['Date'])

In [162]:
# Restrict the euribor series to the dates covered by the original dataset
# (per original article: http://media.salford-systems.com/video/tutorial/2015/targeted_marketing.pdf).
# Both bounds are exclusive; the surviving rows are then ordered by descending index label.
after_start = euri_df['Date'] > '2008-05-01'
before_end = euri_df['Date'] < '2010-11-30'
euri_df = euri_df.loc[before_end & after_start].sort_index(ascending=False)

In [163]:
# Useful question: how often does the euribor stay unchanged from one row to the next
# within the period? Repeated values make matching rates to dates ambiguous.
print("Number of times the euribor was the same on at least two consecutive days:")
same_as_next = euri_df['Value'].eq(euri_df['Value'].shift(-1))
int(same_as_next.sum())

Number of times the euribor was the same on at least two consecutive days:


81

In [164]:
#Visual Check:
#times = []
#for eu, (i, val) in zip(res['euribor3m'], enumerate(euri_df['Value'])):
    #if eu == val:
    #print(eu,i,val) 

In [165]:
# Collect every (position in res, position in euri_df) pair whose rates are exactly
# equal, ordered by res position first and euri_df position second.
matches = [
    (pos_res, pos_eur)
    for pos_res, rate_res in enumerate(res['euribor3m'])
    for pos_eur, rate_eur in enumerate(euri_df['Value'])
    if rate_res == rate_eur
]
idx_i = [pos_res for pos_res, _ in matches]
idx_k = [pos_eur for _, pos_eur in matches]

In [166]:
# From the candidate matches, keep a chain in which every res position is used at
# most once and the euribor positions are strictly increasing.
# NOTE(review): the seeds assume res position 0 pairs with euribor position 1, and the
# first candidate pair is skipped accordingly -- confirm against the data.
lst_i = [0]
lst_k = [1]
for i, k in zip(idx_i[1:], idx_k[1:]):
    # Logical `and` replaces the original bitwise `&`, which only gave the right
    # answer because `i < k & k > m` parses as the chained comparison `i < (k & k) > m`.
    # lst_k only ever grows with larger values, so its last element is its maximum,
    # and `k > lst_k[-1]` already implies k is not yet in lst_k.
    if i not in lst_i and i < k and k > lst_k[-1]:
        lst_i.append(i)
        lst_k.append(k)

In [167]:
# lst_k holds the positions of the dates that need to be applied to the original dataset.
# Renumber the rows 0..n-1 (discarding the old labels) ahead of merging with the
# original dataframe.
euri_df = euri_df.reset_index(drop=True)

In [168]:
# Work on an independent copy so euri_df itself stays untouched by the steps below.
euribor = euri_df.copy(deep=True)

In [169]:
# Keep only the matched rows, selected by position, in the order given by lst_k.
euribor = euribor.iloc[lst_k]

In [170]:
#Final check:
#for i,(num,k) in zip(res['euribor3m'], enumerate(euribor['Value'])): 
#    print(i,num,k)

In [171]:
# Give both frames the same 0..n-1 helper key so they can be joined row by row.
# The key is sized from euribor, so res must have exactly the same number of rows.
row_key = list(range(len(euribor)))
res['temp'] = row_key
euribor['temp'] = row_key

In [172]:
# Pair each res row with its euribor row through the helper key; column names
# present in both frames (e.g. month) receive the _x / _y suffixes.
res2 = pd.merge(res, euribor, how='inner', on='temp')
# Put res's row labels back so res2 can later be aligned with df by index.
res2.index = res.index

In [173]:
# Only the date information coming from the euribor side is needed from here on.
res2 = res2.loc[:, ['Date', 'year', 'month_y', 'day']]

In [174]:
#res2.index

In [175]:
# Attach the date columns to the full dataset by row label; the right join keeps
# every row of df, leaving the date columns empty where res2 has no entry.
data_wtime = pd.merge(res2, df, how='right', left_index=True, right_index=True)

In [176]:
#data_wtime[data_wtime.Date.isna() == False]

In [177]:
# Back-fill Date so every row without a date takes the next available date below it
# (only the rows that came from res2 carry a Date after the merge).
# The result is assigned back instead of calling fillna(..., inplace=True) on the
# column accessor: the in-place form does not update data_wtime under pandas
# Copy-on-Write, and fillna's `method=` argument is deprecated.
data_wtime['Date'] = data_wtime['Date'].bfill()

In [178]:
# The dataset's own month / day_of_week describe the last contact, so they are renamed
# to say so; the euribor-derived month_y then takes over the plain name "month".
column_renames = {
    'month': 'month_last_contact',
    'day_of_week': 'day_of_week_last_contact',
    'month_y': 'month',
}
data_wtime = data_wtime.rename(columns=column_renames)

In [179]:
#adding values for day, month and year to the complete dataset:
# Derived from the back-filled Date with the vectorised .dt accessor (as the weekday
# feature does) instead of a Python loop over every Timestamp. The int64 cast keeps
# the column dtype stable across pandas versions and fails fast if a Date is still missing.
date_parts = data_wtime['Date'].dt
data_wtime['day'] = date_parts.day.astype('int64')
data_wtime['month'] = date_parts.month.astype('int64')
data_wtime['year'] = date_parts.year.astype('int64')

In [180]:
#data_wtime.shape

In [181]:
# Continue under a new name: final_df is an independent copy of data_wtime.
final_df = data_wtime.copy(deep=True)

Calendar Features

In [183]:
# Day-of-week feature taken from Date (Monday=0 ... Sunday=6).
final_df['weekday'] = final_df['Date'].dt.dayofweek

In [184]:
# Total number of days in each row's (year, month); monthrange returns
# (weekday of the 1st, number of days), so only the second element is kept.
final_df['month_total_days'] = [
    calendar.monthrange(yr, mon)[1]
    for yr, mon in zip(final_df['year'], final_df['month'])
]

In [185]:
#Creating is holiday flag
# The Portuguese holiday calendar is built once and reused; the original rebuilt
# it for every row, which is needlessly slow and gives the same answers.
pt_holidays = holidays.Portugal()
final_df['is_holiday'] = [1 if day in pt_holidays else 0 for day in final_df['Date']]

In [186]:
#Also adding holiday names
# One shared Portuguese holiday calendar replaces the up-to-two calendars the
# original constructed per row. Dates that are not holidays get "No Holiday".
pt_holidays = holidays.Portugal()
final_df['holiday_name'] = [
    pt_holidays.get(day) if day in pt_holidays else "No Holiday"
    for day in final_df['Date']
]

In [187]:
#According to research, most government employees are paid on the 20th
#while private industry employees are paid on the 30th (approximated here as
#the 28th-30th so that shorter months are covered too).
# The two flags were previously swapped relative to that research: biz_payday marked
# the 20th and govt_payday marked the 28th-30th. Column names and order are unchanged.
final_df['biz_payday'] = final_df['day'].isin([28, 29, 30]).astype('int64')
final_df['govt_payday'] = final_df['day'].eq(20).astype('int64')

In [188]:
# Sanity check: display (rows, columns) once the calendar features have been added.
final_df.shape

(41188, 31)

Encoding Categorical Features

In [189]:
# Inspect the column dtypes before encoding; the object columns are the ones encoded below.
final_df.dtypes

Date                        datetime64[ns]
year                                 int64
month                                int64
day                                  int64
age                                  int64
job                                 object
marital                             object
education                           object
default                             object
housing                             object
loan                                object
contact                             object
month_last_contact                  object
day_of_week_last_contact            object
duration                             int64
campaign                             int64
pdays                                int64
previous                             int64
poutcome                            object
emp.var.rate                       float64
cons.price.idx                     float64
cons.conf.idx                      float64
euribor3m                          float64
nr.employed

In [190]:
# Names of all object-dtype columns; these are treated as the categorical features.
cat_cols = final_df.select_dtypes(include=['object']).columns

In [191]:
# Label-encode each categorical column: convert it to the pandas category dtype
# and replace the labels by their integer category codes.
for name in cat_cols:
    as_category = final_df[name].astype('category')
    final_df[name] = as_category.cat.codes

In [146]:
#df[cat_cols] = df[cat_cols].apply(lambda x: x.astype('category').cat.codes,axis=1)

In [192]:
# Re-inspect the dtypes to confirm the former object columns now hold integer codes.
final_df.dtypes

Date                        datetime64[ns]
year                                 int64
month                                int64
day                                  int64
age                                  int64
job                                   int8
marital                               int8
education                             int8
default                               int8
housing                               int8
loan                                  int8
contact                               int8
month_last_contact                    int8
day_of_week_last_contact              int8
duration                             int64
campaign                             int64
pdays                                int64
previous                             int64
poutcome                              int8
emp.var.rate                       float64
cons.price.idx                     float64
cons.conf.idx                      float64
euribor3m                          float64
nr.employed

In [195]:
# Use the timestamp as the index; Date stops being a regular column.
final_df = final_df.set_index('Date')

In [197]:
# Persist the prepared, Date-indexed frame to disk.
# NOTE: pickle files should only ever be loaded back from trusted sources.
final_df.to_pickle('data/data_ready.pickle')