<img src='img/bank.png' width='100'>

# Information
<div style="background-color:lightgrey">
The data is related with direct marketing campaigns of a Portuguese banking institution. 
The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required, 
in order to access if the product (bank term deposit) would be (or not) subscribed.
</div>

Input variables:

## bank client data:
   <div>
    <ol>
       <li><strong>age (numeric)</strong> </li>
       <li><strong>job</strong> : type of job(categorical:"admin.","unknown","unemployed","management","housemaid","entrepreneur","student",
                                       "blue-collar","self-employed","retired","technician","services")</li> 
       <li><strong>marital</strong> : marital status (categorical: "married","divorced","single"; note: "divorced" means divorced or widowed)
       <li><strong>education</strong> (categorical: "unknown","secondary","primary","tertiary")</li>
       <li><strong>default</strong>: has credit in default? (binary: "yes","no")</li>
       <li><strong>balance</strong>: average yearly balance, in euros (numeric) </li>
       <li><strong>housing</strong>: has housing loan? (binary: "yes","no")</li>
       <li><strong>loan</strong>: has personal loan? (binary: "yes","no")</li>
    </ol>
   </div>
   
   ## related with the last contact of the current campaign:
   <ol>
      <li><strong>contact</strong>: contact communication type (categorical: "unknown","telephone","cellular")</li> 
      <li><strong>day</strong>: last contact day of the month (numeric)</li>
      <li><strong>month</strong>: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")</li>
      <li><strong>duration</strong>: last contact duration, in seconds (numeric)</li>
    </ol>
    
   ## other attributes:
   
   <ol>
      <li><strong>campaign</strong>: number of contacts performed during this campaign and for this client (numeric, includes last contact)</li>
      <li><strong>pdays</strong>: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)</li>
      <li><strong>previous</strong>: number of contacts performed before this campaign and for this client (numeric)</li>
      <li><strong>poutcome</strong>: outcome of the previous marketing campaign (categorical: "unknown","other","failure","success")</li>
</ol>
  Output variable (desired target):
  <ol>
      <li><strong>y</strong> - has the client subscribed a term deposit? (binary: "yes","no")</li>
  </ol>


In [None]:
import pandas as pd

#loading data
df = pd.read_csv('bank.csv', sep=';')
df

In [None]:
#categorical columns. They will be OneHotEncoded
cat_cols = ['job','marital','education','default','housing','loan','contact','month','poutcome']

#dataframe with one hot encoded categorical columns
df_ohe = pd.get_dummies(df, columns = cat_cols)

#encoding target variable
df_ohe['y'].replace({'no':0,'yes':1}, inplace=True)

#unencoder
unenc = {0:'no',1:'yes'}

df_ohe.head(3)

In [None]:
#separating features from target variable
X = df_ohe.drop(['y'], axis=1)
y = df_ohe['y']

In [None]:
from sklearn.preprocessing import MinMaxScaler

#normalizing data
normalizer = MinMaxScaler(feature_range=(0,1)).fit(X)
X_norm = normalizer.transform(X)
X_norm

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

#list of models being tested
models = []
m1 = ('GNB',GaussianNB())
m2 = ('LR',LogisticRegression()) # C=2.9 l1_ratio=0.6 score=0.9029
m3 = ('DTC',DecisionTreeClassifier())
m4 = ('KNN',KNeighborsClassifier())
m5 = ('SVM',SVC())
m6 = ('RFC',RandomForestClassifier())

#appending models to the model's list
models.append(m1)
models.append(m2)
models.append(m3)
models.append(m4)
models.append(m5)
models.append(m6)

#dict in the format {model_name:accuracy}
results = {}

#test each model accuracy
for model in models:
    kfold = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
    model_name = model[0]
    model_selected = model[1]    
    result = cross_val_score(model_selected,X_norm,y,cv=kfold, scoring='accuracy')
    result = result.mean() #mean of all folds results
    results[model_name] = result

#selecting the model that best performs
sorted_models = {k:v for k,v in sorted(results.items(), key = lambda item: item[1], reverse=True)}
sorted_models

In [None]:
#PS:
#If 'LR' (LinearRegression) was the best model above, it continues to execute the remaining codes below.
#If not, it will stop with error

import numpy as np

# #choosing the best model
# for name,model in models:
#     model_name,model = name,model
#     best_model_name = list(sorted_models.keys())[0]
#     if(model_name == best_model_name):
#         print('The best model for this problem was {}'.format(model_name))
#         choosen_model = model

choosen_model = LogisticRegression()

#the following are parameter values for LR
#running before, it was known that C=2.9 and l1_ratio=0.6 delivered the best result
param_values = {
    'C':[2.9],
    'penalty':['elasticnet'],
    'solver':['saga'], 
    'max_iter':[300],
    'l1_ratio':[0.6]}

search = GridSearchCV(estimator=choosen_model, param_grid=param_values,cv=5,verbose=0)
search.fit(X_norm,y)

#an example of data to be predicted (only testing model's prediction)
sample_data = X.iloc[0,:] #got the first line of the dataframe to avoid typing a lot of data
sample_data_normalized = normalizer.transform(np.array(sample_data).reshape(1,-1))

#predicted result
prediction = search.predict(sample_data_normalized)[0]

# print('The predicted result for\n{}\nis: {}\n'.format(X.iloc[0,:],prediction))
print('The predicted result for\n{}\nis: {}\n'.format(X.iloc[0,:],unenc[prediction]))
print('Best score: ',search.best_score_)
print('Best C: ',search.best_estimator_.C)
print('Best l1_ratio: ',search.best_estimator_.l1_ratio)

In [None]:
import pickle

# save the model to disk
pickle.dump(search, open('LR_saved', 'wb'))

In [1]:
#The same algorithm as above, but with graphical interface with PySimpleGUI
import PySimpleGUI as sg
import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

#loading data
df = pd.read_csv('bank.csv', sep=';')

#categorical columns
cat_cols = ['job','marital','education','default','housing','loan','contact','month','poutcome']

#encoding target variable
df['y'].replace({'no':0,'yes':1}, inplace=True)

sg.theme('DarkAmber')   # Add a touch of color

# All the stuff inside the program.
layout = [  
            [sg.T('Source Folder')],
            [
                sg.Text('age'),sg.Input(s=15,k='-AGE-'),
                sg.Text('job'),sg.Combo(list(df['job'].unique()),default_value='unemployed', s=(15,22), enable_events=True, readonly=True, k='-JOB-')
            ],
            [
                sg.Text('marital'),sg.Combo(list(df['marital'].unique()),default_value='married', s=(15,22), enable_events=True, readonly=True, k='-MARITAL-'),
                sg.Text('education'),sg.Combo(list(df['education'].unique()),default_value='primary', s=(15,22), enable_events=True, readonly=True, k='-EDUCATION-')
            ],
            [
                sg.Text('default'),sg.Combo(list(df['default'].unique()),default_value='no', s=(15,22), enable_events=True, readonly=True, k='-DEFAULT-'),
                sg.Text('balance'),sg.Input(s=15,k='-BALANCE-')
            ],
            [
                sg.Text('housing'),sg.Combo(list(df['housing'].unique()),default_value='yes', s=(15,22), enable_events=True, readonly=True, k='-HOUSING-'),
                sg.Text('loan'),sg.Combo(list(df['loan'].unique()),default_value='no', s=(15,22), enable_events=True, readonly=True, k='-LOAN-')
            ],
            [
                sg.Text('contact'),sg.Combo(list(df['contact'].unique()),default_value='cellular', s=(15,22), enable_events=True, readonly=True, k='-CONTACT-'),
                sg.Text('day'),sg.Input(s=15,k='-DAY-')
            ],
            [
                sg.Text('month'),sg.Combo(list(df['month'].unique()),default_value='apr', s=(15,22), enable_events=True, readonly=True, k='-MONTH-'),
                sg.Text('duration'),sg.Input(s=15,k='-DURATION-')
            ],
            [
                sg.Text('campaign'),sg.Input(s=15,k='-CAMPAIGN-'),
                sg.Text('pdays'),sg.Input(s=15,k='-PDAYS-')
            ],
            [
                sg.Text('previous'),sg.Input(s=15,k='-PREVIOUS-'),
                sg.Text('poutcome'),sg.Combo(list(df['poutcome'].unique()),default_value='success', s=(15,22), enable_events=True, readonly=True, k='-POUTCOME-')
            ],
            [sg.In(key='-CHOOSE-')],
            [sg.FileBrowse(target=(-1, 0)), sg.OK(),sg.Text(k='-RESULT-')]   
         ]

# Create the Window
window = sg.Window('Previsão', layout)

# Event Loop to process "events" and get the "values" of the inputs
while True:
    event, values = window.read()
    if event == sg.WIN_CLOSED or event == 'Cancel': # if user closes window or clicks cancel
        break
        
    if event == 'OK':
        #loads the model
        #model_file_path = 'LR_saved'
        model_file_path = f'{values["-CHOOSE-"]}'
        try:
            model = pickle.load(open(model_file_path, 'rb'))
        except:
            window.Element('-RESULT-').update('Choose a model!')
            sg.popup_ok('Choose a model file!')
            break
            
        if(
            values['-AGE-'] == '' or
            values['-JOB-'] == '' or
            values['-MARITAL-'] == '' or
            values['-EDUCATION-'] == '' or
            values['-DEFAULT-'] == '' or
            values['-BALANCE-'] == '' or
            values['-HOUSING-'] == '' or
            values['-LOAN-'] == '' or
            values['-CONTACT-'] == '' or
            values['-DAY-'] == '' or
            values['-MONTH-'] == '' or
            values['-DURATION-'] == '' or
            values['-CAMPAIGN-'] == '' or
            values['-PDAYS-'] == '' or
            values['-PREVIOUS-'] == '' or
            values['-POUTCOME-'] == ''
        ):
            window.Element('-RESULT-').update('Fill all fields')
            sg.popup_ok('Fill all fields!')
            break
        

        #data entered in GUI to be predicted
        data_to_predict = {'age':values['-AGE-'],
                       'job':values['-JOB-'],
                       'marital':values['-MARITAL-'],
                       'education':values['-EDUCATION-'],
                       'default':values['-DEFAULT-'],
                       'balance':values['-BALANCE-'],
                       'housing':values['-HOUSING-'],
                       'loan':values['-LOAN-'],
                       'contact':values['-CONTACT-'],
                       'day':values['-DAY-'],
                       'month':values['-MONTH-'],
                       'duration':values['-DURATION-'],
                       'campaign':values['-CAMPAIGN-'],
                       'pdays':values['-PDAYS-'],
                       'previous':values['-PREVIOUS-'],
                       'poutcome':values['-POUTCOME-']
                      }
        
        #The last row will be the data to be predicted
        df = df.append(data_to_predict, ignore_index=True)
        print(df)

        #dataframe with one hot encoded categorical columns
        df_ohe = pd.get_dummies(df, columns = cat_cols)

        #separating features from target variable
        X = df_ohe.drop(['y'], axis=1)
        print('X.shape: ',X.shape)
        y = df_ohe['y']

        #normalizing data
        normalizer = MinMaxScaler(feature_range=(0,1)).fit(X)
        
        #choosing the last row, that is the data to be predicted
        sample_data = X.iloc[-1,:] 
        
        #normalizing data to be predicted
        sample_data_normalized = normalizer.transform(np.array(sample_data).reshape(1,-1))
        
        #predicted result
        prediction = model.predict(sample_data_normalized)[0]

        #unencoder
        unenc = {0:'no',1:'yes'}
        
        if unenc[prediction] == 'yes':
            result = 'YES, will subscribe a term deposit'
        if unenc[prediction] == 'no':
            result = 'NO, will not subscribe a term deposit'

        window.Element('-RESULT-').update(result)
        print('Prediction for:\n', data_to_predict, 'is: ',unenc[prediction])

window.close()

     age            job   marital  education default balance housing loan  \
0     30     unemployed   married    primary      no    1787      no   no   
1     33       services   married  secondary      no    4789     yes  yes   
2     35     management    single   tertiary      no    1350     yes   no   
3     30     management   married   tertiary      no    1476     yes  yes   
4     59    blue-collar   married  secondary      no       0     yes   no   
...   ..            ...       ...        ...     ...     ...     ...  ...   
4517  57  self-employed   married   tertiary     yes   -3313     yes  yes   
4518  57     technician   married  secondary      no     295      no   no   
4519  28    blue-collar   married  secondary      no    1137      no   no   
4520  44   entrepreneur    single   tertiary      no    1136     yes  yes   
4521  68        retired  divorced  secondary      no    4189      no   no   

        contact day month duration campaign pdays previous poutcome    y  
