# Capstone - Optimizing Bank Marketing Campaigns with Machine Learning

### *Importing Libraries and Data*

In [27]:
### Importing relevant libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score, roc_curve, recall_score, precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

warnings.filterwarnings('ignore')
%matplotlib inline

In [28]:
### Importing the combined data set

# Location of the combined bank-marketing CSV, relative to the notebook's working directory.
DATA_PATH = 'Data/bank-additional-combined.csv'
df = pd.read_csv(DATA_PATH)

### *Data Cleaning Functions (making clean_data)*

In [29]:
### Casting job, marital, education, default, housing, loan, contact, month, day_of_week, poutcome, and y to categorical variables

columns_to_categorize = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome', 'y']

def cast_as_columns(df, columns=None):
    """
    Return a copy of `df` with the requested columns cast to the pandas
    'category' dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in `columns`.
    columns : iterable of str, optional
        Columns to cast. Defaults to the module-level `columns_to_categorize`.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified, so re-running a
        cell that calls this function always starts from the same input.

    Raises
    ------
    KeyError
        If a requested column is not present in `df`.
    """
    if columns is None:
        columns = columns_to_categorize
    # astype with a column -> dtype mapping converts only the listed columns
    # and returns a new frame instead of assigning into the caller's frame.
    return df.astype({column: 'category' for column in columns})

### Writing a function that provides ordinal encoding for education, in order of: unknown, illiterate, basic.4y, basic.6y, basic.9y, high.school, professional.course, university.degree

def ordinal_encode_education(df):
    """
    Return a copy of `df` whose 'education' column is replaced by ordinal
    integer codes.

    The codes follow this fixed order (0 through 7): unknown, illiterate,
    basic.4y, basic.6y, basic.9y, high.school, professional.course,
    university.degree. Missing values are encoded as -1.

    The encoding does not depend on which levels happen to be present, so a
    subset of the data that lacks some level (for example no 'illiterate'
    rows) is encoded with the same codes as the full data set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'education' column holding the level names (categorical
        or plain strings).

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified.

    Raises
    ------
    ValueError
        If 'education' contains a value outside the eight known levels.
    """
    education_order = ['unknown', 'illiterate', 'basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'professional.course', 'university.degree']

    # Reject unknown levels explicitly; otherwise they would silently be
    # encoded as -1, indistinguishable from missing values.
    observed = set(df['education'].dropna().unique())
    unexpected = observed - set(education_order)
    if unexpected:
        raise ValueError(
            'Unexpected education level(s): {}'.format(sorted(map(str, unexpected)))
        )

    # Building the categorical against the full, fixed level list (rather than
    # reordering whatever categories are present) keeps the codes stable.
    codes = pd.Categorical(df['education'], categories=education_order, ordered=True).codes
    return df.assign(education=codes)

### Writing a function that drops duration, as it is not known before a call is made

def drop_duration(df):
    """
    Return `df` without its 'duration' column.

    The result is a new frame; the frame passed in keeps the column.
    """
    return df.drop(columns=['duration'])

### Writing a function that encodes the target variable, y, as 0 for no and 1 for yes

def encode_target(df):
    """
    Return a copy of `df` whose target column 'y' is encoded as 0 for 'no'
    and 1 for 'yes'. Missing values are encoded as -1.

    The mapping is fixed rather than derived from the labels present, so a
    frame that contains only 'yes' rows still encodes them as 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'y' column holding 'no' / 'yes' (categorical or plain
        strings).

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified.

    Raises
    ------
    ValueError
        If 'y' contains a label other than 'no' or 'yes'.
    """
    target_labels = ['no', 'yes']

    # Fail loudly on unknown labels instead of silently encoding them as -1.
    unexpected = set(df['y'].dropna().unique()) - set(target_labels)
    if unexpected:
        raise ValueError(
            'Unexpected target label(s): {}'.format(sorted(map(str, unexpected)))
        )

    # Position in target_labels is the code: 'no' -> 0, 'yes' -> 1.
    codes = pd.Categorical(df['y'], categories=target_labels).codes
    return df.assign(y=codes)

### Writing a function that combines the above functions

def clean_data(df):
    """
    Run the full cleaning pipeline and return the cleaned frame.

    Steps, in order: `cast_as_columns`, `ordinal_encode_education`,
    `drop_duration`, `encode_target`.

    Parameters
    ----------
    df : pandas.DataFrame
        The raw data set.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. The frame passed in is not modified.
    """
    # Work on a private copy so the caller's frame is never left partially
    # transformed, whatever the individual steps do to the frame they receive.
    cleaned = df.copy()
    for step in (cast_as_columns, ordinal_encode_education, drop_duration, encode_target):
        cleaned = step(cleaned)
    return cleaned

### Writing a function that one-hot encodes the categorical variables

def one_hot_encode(df):
    """
    One-hot encode every feature column that `pd.get_dummies` treats as
    categorical, dropping the first level of each, and return the result
    with the untouched target column 'y' appended as the last column.
    """
    features = df.drop(columns=['y'])
    dummies = pd.get_dummies(features, drop_first=True)
    # Re-attach the target after encoding so it is never expanded into dummies.
    return dummies.assign(y=df['y'])

In [31]:
### Applying the above functions to the data set

# Clean first, then one-hot encode the cleaned frame.
df = one_hot_encode(clean_data(df))

In [32]:
### Making a function for model evaluation

def evaluate_model(model, X_train, y_train, X_test, y_test):
    """
    Fits `model` on the training data, predicts on both splits, and prints
    the accuracy, F1, recall, and precision scores: the four training
    scores first, followed by the four test scores.
    """
    model.fit(X_train, y_train)

    # Both sets of predictions are computed before anything is printed.
    splits = (
        ('Training', y_train, model.predict(X_train)),
        ('Test', y_test, model.predict(X_test)),
    )
    metrics = (
        ('Accuracy', accuracy_score),
        ('F1', f1_score),
        ('Recall', recall_score),
        ('Precision', precision_score),
    )

    for split_name, y_true, y_pred in splits:
        for metric_name, metric_fn in metrics:
            print('{} {} Score: '.format(split_name, metric_name), metric_fn(y_true, y_pred))

### *Performing a train-test split*

In [33]:
### Performing a train-test split

# Separate the target from the features.
y = df['y']
X = df.drop(columns=['y'])

# test_size=0.25 is train_test_split's default, spelled out here for the reader.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

### *Modeling: Random Forest*

In [40]:
### Making a random forest model

rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=50,
    random_state=42,
)

### Evaluating the random forest model

evaluate_model(model=rf, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

Training Accuracy Score:  0.9949970570924073
Training F1 Score:  0.977536997885835
Training Recall Score:  0.9655442443226312
Training Precision Score:  0.9898314155739898
Test Accuracy Score:  0.9087136929460581
Test F1 Score:  0.4951171875000001
Test Recall Score:  0.4023809523809524
Test Precision Score:  0.6434010152284264


In [37]:
### Performing a grid search tuned to the f1 score

# Hyperparameter grid: 3 * 5 * 3 * 3 = 135 candidate combinations.
params = {
    'n_estimators': [300, 400, 500],
    'max_depth': [30, 35, 40, 45, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

# 5-fold cross-validated search over the grid, scored on F1, using all cores.
gs = GridSearchCV(rf, param_grid=params, scoring='f1', cv=5, verbose=1, n_jobs=-1)

gs.fit(X_train, y_train)

### Printing the best parameters
# A bare expression in the middle of a cell is evaluated and then discarded;
# only the cell's last expression is rendered. Print explicitly so the best
# parameters actually appear in this cell's output.

print('Best parameters: ', gs.best_params_)

### Printing the best score

gs.best_score_

Fitting 5 folds for each of 135 candidates, totalling 675 fits


0.46490659236880705

In [38]:
### Displaying the best hyperparameter combination found by the grid search `gs` fitted in an earlier cell
gs.best_params_

{'max_depth': 30,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 500}