## First, testing our Azure endpoint

In [1]:
import requests

In [2]:
# Sample feature record posted to the /neural endpoint as a smoke test.
test_data = dict(
    age=45.0,
    sex=1.0,
    chest_pain=4.0,
    blood_pressure=115.0,
    serum_cholestoral=260.0,
    fasting_blood_sugar=0.0,
    electrocardiographic=2.0,
    max_heart_rate=185.0,
    induced_angina=0.0,
    ST_depression=0.0,
    slope=1.0,
    vessels=0.0,
    thal=3.0,
)

In [3]:
# Root URL of the deployed Azure web app; a plain GET on it returns a health message.
base_url = 'https://mytestmlappnjr.azurewebsites.net'

In [4]:
# Health check. Bound the wait (requests has no default timeout) and fail
# loudly on a non-2xx status instead of carrying an error page forward.
r = requests.get(base_url, timeout = 30)
r.raise_for_status()

In [5]:
r.text

'App is Healthy'

In [6]:
# Prediction route of the same app; derived from base_url so the host name is
# defined in exactly one place.
neural_url = f'{base_url}/neural'

In [7]:
# Send the sample record as a JSON body. Bound the wait and surface HTTP errors
# here, so a failed call is not mistaken for a prediction when decoding later.
r = requests.post(neural_url, json = test_data, timeout = 30)
r.raise_for_status()

In [8]:
r.json()

0

## Data for Mini-Project

In [9]:
import pathlib

import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.max_columns', None)

%config Completer.use_jedi = False

Dataset source (Kaggle): https://www.kaggle.com/adammaus/predicting-churn-for-bank-customers?select=Churn_Modelling.csv

In [10]:
# Read the training CSV (path is relative to the notebook's working directory)
# into the DataFrame that the rest of the notebook transforms.
data_path = pathlib.Path('train.csv')
data = pd.read_csv(data_path)

In [11]:
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,597,Germany,Female,35,8,131101.04,1,1,1,192852.67,0
1,523,France,Female,40,2,102967.41,1,1,0,128702.1,1
2,706,Spain,Female,42,8,95386.82,1,1,1,75732.25,0
3,788,France,Male,32,4,112079.58,1,0,0,89368.59,0
4,706,Germany,Male,38,5,163034.82,2,1,1,135662.17,0


In [12]:
# data types look good
data.dtypes

CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [13]:
# no missing values
data.isna().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [14]:
# Print every column with fewer than 10 distinct values, together with those
# values, to spot the variables that need converting.
for col in data.columns:
    if data[col].nunique() >= 10:
        continue
    distinct_values = data[col].unique()
    print(f'{col}: {distinct_values}')

Geography: ['Germany' 'France' 'Spain']
Gender: ['Female' 'Male']
NumOfProducts: [1 2 3 4]
HasCrCard: [1 0]
IsActiveMember: [1 0]
Exited: [0 1]


In [15]:
# One-hot encode the categorical columns: each listed column is replaced by one
# indicator column per category value, appended at the end of the frame.

cols_to_consider = ['Geography','Gender']

for col in cols_to_consider:
    # Skip columns that are no longer present (already encoded) so the cell can
    # be re-run; without this guard a second run raises KeyError.
    if col not in data.columns:
        continue
    data = pd.concat([data.drop(columns = [col]), pd.get_dummies(data[col])], axis = 1)

In [16]:
data.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,France,Germany,Spain,Female,Male
0,597,35,8,131101.04,1,1,1,192852.67,0,0,1,0,1,0
1,523,40,2,102967.41,1,1,0,128702.1,1,1,0,0,1,0
2,706,42,8,95386.82,1,1,1,75732.25,0,0,0,1,1,0
3,788,32,4,112079.58,1,0,0,89368.59,0,1,0,0,0,1
4,706,38,5,163034.82,2,1,1,135662.17,0,0,1,0,0,1


In [17]:
# Label column name; every other column of `data` is treated as a model input.
target = 'Exited'
features = data.columns[data.columns != target].tolist()

# Preview the first record as a feature-name -> value mapping.
data.loc[0, features].to_dict()

{'CreditScore': 597.0,
 'Age': 35.0,
 'Tenure': 8.0,
 'Balance': 131101.04,
 'NumOfProducts': 1.0,
 'HasCrCard': 1.0,
 'IsActiveMember': 1.0,
 'EstimatedSalary': 192852.67,
 'France': 0.0,
 'Germany': 1.0,
 'Spain': 0.0,
 'Female': 1.0,
 'Male': 0.0}

## Start Analysis:

In [18]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.options.display.float_format = '{:,.4f}'.format
import seaborn as sns
sns.set_style('whitegrid')

from bokeh.layouts import gridplot, column
from bokeh.models import (BasicTicker, ColorBar, ColumnDataSource, 
                          HoverTool, LabelSet, LinearColorMapper, NumeralTickFormatter)
from bokeh.palettes import brewer, RdBu, Reds
from bokeh.plotting import figure, show, output_notebook
from bokeh.transform import transform

%config Completer.use_jedi = False
output_notebook()

In [19]:
#check correlations

# Pairwise correlations reshaped to long form: one row per pair of variables,
# which is the layout the ColumnDataSource / rect glyph below expects.
correlation_matrix = data.corr()
correlation_matrix = correlation_matrix.unstack().reset_index()
correlation_matrix.columns = ['Variable 1', 'Variable 2', 'Correlation']
variables = sorted(list(correlation_matrix['Variable 1'].unique()))

source = ColumnDataSource(correlation_matrix)

# Diverging palette pinned to [-1, 1] so that zero correlation sits mid-scale.
palette = brewer['RdBu'][10]
color_mapper = LinearColorMapper(
    palette = palette, 
    low = -1, 
    high = 1.0,
)

p = figure(
    plot_width = 550, 
    plot_height = 400, 
    title = 'Correlation Matrix',
    x_range = variables, 
    y_range = list(reversed(variables)),
    tools = 'hover', 
    x_axis_location="below",
)

# One unit square per variable pair, coloured by its correlation.
p.rect(
    x = 'Variable 2', 
    y = 'Variable 1', 
    width = 1, 
    height = 1, 
    source = source,
    line_color = 'grey', 
    fill_color = transform('Correlation', color_mapper),
)

color_bar = ColorBar(
    color_mapper = color_mapper, 
    location = (0, 0),
    ticker = BasicTicker(desired_num_ticks = len(palette)),
)
color_bar.formatter = NumeralTickFormatter(format="0.0%")

p.add_layout(color_bar, 'right')

# Tooltip values use Bokeh's numeral-style format strings; "0.00%" renders the
# correlation as a percentage with two decimals.
p.hover.tooltips = [
    ("Variable 1", "@{Variable 1}"),
    ("Variable 2", "@{Variable 2}"),
    ("Correlation", "@Correlation{0.00%}"),
]

p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "12px"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = 1.0

color_bar.label_standoff = 4
color_bar.major_label_text_align = 'left'
color_bar.major_label_text_font_size = '12px'

show(p)

In [20]:
# check statistics

data.describe(percentiles = [0.5]).transpose()

Unnamed: 0,count,mean,std,min,50%,max
CreditScore,9970.0,650.5796,96.6353,350.0,652.0,850.0
Age,9970.0,38.9257,10.4905,18.0,37.0,92.0
Tenure,9970.0,5.0135,2.8918,0.0,5.0,10.0
Balance,9970.0,76485.4634,62400.275,0.0,97221.52,250898.09
NumOfProducts,9970.0,1.53,0.5817,1.0,1.0,4.0
HasCrCard,9970.0,0.7054,0.4559,0.0,1.0,1.0
IsActiveMember,9970.0,0.515,0.4998,0.0,1.0,1.0
EstimatedSalary,9970.0,100069.8759,57510.557,11.58,100168.24,199992.48
Exited,9970.0,0.2038,0.4029,0.0,0.0,1.0
France,9970.0,0.5016,0.5,0.0,1.0,1.0


In [21]:
# scale the data

from sklearn import preprocessing

# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split further down, so test-set statistics influence the scaling. Consider
# fitting on the training rows only.
scaler = preprocessing.StandardScaler()

# fit_transform both learns the per-feature statistics and applies them, so a
# separate fit() call beforehand is redundant.
scaled_values = scaler.fit_transform(data[features])

# Reuse data's index so the target column aligns row-for-row when assigned.
scaled_data = pd.DataFrame(scaled_values, columns = features, index = data.index)
scaled_data[target] = data[target]

In [22]:
# look at scaled data

scaled_data.describe(percentiles = [0.5]).transpose()

Unnamed: 0,count,mean,std,min,50%,max
CreditScore,9970.0,0.0,1.0001,-3.1106,0.0147,2.0637
Age,9970.0,0.0,1.0001,-1.9948,-0.1836,5.0595
Tenure,9970.0,-0.0,1.0001,-1.7338,-0.0047,1.7245
Balance,9970.0,0.0,1.0001,-1.2258,0.3323,2.7952
NumOfProducts,9970.0,-0.0,1.0001,-0.9111,-0.9111,4.2462
HasCrCard,9970.0,-0.0,1.0001,-1.5475,0.6462,0.6462
IsActiveMember,9970.0,0.0,1.0001,-1.0306,0.9703,0.9703
EstimatedSalary,9970.0,-0.0,1.0001,-1.7399,0.0017,1.7376
France,9970.0,0.0,1.0001,-1.0032,0.9968,0.9968
Germany,9970.0,-0.0,1.0001,-0.5788,-0.5788,1.7277


In [23]:
# check distribution of target variable

# Series.value_counts(normalize=True) returns the class shares directly and
# avoids the deprecated top-level pd.value_counts.
scaled_data[target].value_counts(normalize = True)

0   0.7962
1   0.2038
Name: Exited, dtype: float64

In [24]:
# representative split

from sklearn.model_selection import train_test_split

# Scan candidate seeds and print any whose training split has a smaller share
# of class 0 than of class 1.
# NOTE(review): with stratify= the class shares of the full data are preserved
# in every split, so this loop is expected to print nothing for any seed.
for i in range(1000):
    train, test = train_test_split(scaled_data, stratify = scaled_data[target], random_state = i)
    # Class shares in the training split (replaces deprecated pd.value_counts).
    temp = train[target].value_counts(normalize = True)
    if temp.loc[0] < temp.loc[1]:
        print(i)

In [25]:
# check to make sure

# Class shares of the training split for the seed used in the rest of the
# notebook; they should match the overall target distribution.
train, test = train_test_split(scaled_data, stratify = scaled_data[target], random_state = 809)
train[target].value_counts(normalize = True)

0   0.7962
1   0.2038
Name: Exited, dtype: float64

In [26]:
# Stratified split with a fixed seed, then separate inputs from labels.

train, test = train_test_split(
    scaled_data,
    stratify = scaled_data[target],
    random_state = 809,
)
x_train, x_test = train[features], test[features]
y_train, y_test = train[target], test[target]

In [27]:
from sklearn.linear_model import LogisticRegression

## Logistic Regression

In [28]:
# clf = LogisticRegression(random_state=0, solver = 'newton-cg')

# clf = clf.fit(x_train, y_train)

# clf.score(x_test, y_test)

In [29]:
def plot_confusion_matrix(y_true, y_predicted):
    """Show a Bokeh heatmap of the confusion matrix with per-cell counts.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_predicted : array-like
        Predicted class labels, in the same order as ``y_true``.

    Returns
    -------
    None
        The figure is displayed with ``show``; nothing is returned.

    Notes
    -----
    Axis categories are the row/column positions of the matrix returned by
    ``sklearn.metrics.confusion_matrix`` ('0', '1', ...). They are derived
    from the matrix size, so more than two classes are drawn as well; for a
    two-class problem the axes are ['0', '1'].
    """
    
    from sklearn import metrics
    
    # Share of exact matches as a percentage, rounded to two decimals.
    accuracy = np.round(100*(y_true == y_predicted).astype(int).sum()/len(y_predicted), 2)
    
    confusion = pd.DataFrame(metrics.confusion_matrix(y_true, y_predicted))

    # Categorical axis values, one per matrix row/column position.
    class_labels = [str(position) for position in range(confusion.shape[0])]

    # Long form: one row per (True, Predicted) cell with its count in `value`.
    confusion.index.name = "True"
    confusion.columns.name = "Predicted"
    confusion = confusion.stack().rename("value").reset_index()
    # Bokeh categorical ranges need string factors.
    confusion['True'] = confusion['True'].astype(str)
    confusion['Predicted'] = confusion['Predicted'].astype(str)

    source = ColumnDataSource(confusion)

    palette = brewer['RdBu'][10]
    color_mapper = LinearColorMapper(
        palette = palette, 
    )

    p = figure(
        plot_width = 300, 
        plot_height = 300, 
        title = f'Confusion Matrix: Overall accuracy = {accuracy}%',
        x_range = class_labels, 
        y_range = class_labels,
        x_axis_label = 'Predicted',
        y_axis_label = 'True',
        tools = 'hover', 
        x_axis_location="below",
    )

    # One unit square per matrix cell, coloured by its count.
    p.rect(
        x = 'Predicted', 
        y = 'True', 
        width = 1, 
        height = 1, 
        source = source,
        line_color = 'grey', 
        fill_color = transform('value', color_mapper),
    )
    
    p.hover.tooltips = [
        ("True", "@{True}"),
        ("Predicted", "@{Predicted}"),
        ("Count", "@value"),
    ]

    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_text_font_size = "14px"
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = 1.0
    
    # Print the count on top of each cell.
    labels = LabelSet(x='Predicted', y='True', text='value',
                      render_mode='canvas', text_color = 'white',
                      x_offset = 50, y_offset = 50, source=source,)

    p.add_layout(labels)
    
    show(p)

In [30]:
# predictions = clf.predict(x_test)
# plot_confusion_matrix(y_test, predictions)

## Random Forest

In [31]:
from sklearn.ensemble import RandomForestClassifier

In [32]:
# clf = RandomForestClassifier(random_state = 0)

# clf = clf.fit(x_train, y_train)

# clf.score(x_test, y_test)

In [33]:
# y_hat = clf.predict(x_test)
# plot_confusion_matrix(y_test, y_hat)

## Gradient Boosted Tree

In [34]:
from sklearn.ensemble import GradientBoostingClassifier

In [35]:
# clf = GradientBoostingClassifier(random_state = 0)

# clf = clf.fit(x_train, y_train)

# clf.score(x_test, y_test)

In [36]:
# predictions = clf.predict(x_test)
# plot_confusion_matrix(y_test, predictions)

## AdaBoost

In [37]:
from sklearn.ensemble import AdaBoostClassifier

# clf = AdaBoostClassifier(random_state = 0)

# clf = clf.fit(x_train, y_train)

# clf.score(x_test, y_test)

In [38]:
# predictions = clf.predict(x_test)
# plot_confusion_matrix(y_test, predictions)

## Hyperparameter Tuning

In [39]:
from sklearn.model_selection import GridSearchCV

In [40]:
# params = {
#     'n_estimators': [5, 10, 50, 100],
#     'learning_rate': [0.001, 0.01, 0.1, 1, 10],
# }

In [41]:
# adaboost = AdaBoostClassifier(random_state = 0)

# clf = GridSearchCV(adaboost, params, error_score=0)
# search = clf.fit(x_train, y_train)
# best_params = search.best_params_
# best_params

In [42]:
# clf = AdaBoostClassifier(random_state = 0, **best_params)
# clf = clf.fit(x_train, y_train)
# clf.score(x_test, y_test)

In [43]:
# predictions = clf.predict(x_test)
# plot_confusion_matrix(y_test, predictions)

In [44]:
import statsmodels.formula.api as smf

# summary = pd.DataFrame(search.cv_results_)
# param_columns = [col for col in summary.columns if col.startswith('param') and (col != 'params')]

# metric_col = 'mean_test_score'
# summary = summary[param_columns + [metric_col]]
# summary = summary.dropna()
# for col in summary.columns:
#     summary[col] = pd.to_numeric(summary[col])

# formula = f"{metric_col} ~ {'*'.join(param_columns)}"

# model = smf.ols(
#     formula = formula, 
#     data = summary)

# fit_model = model.fit()

# fit_model.summary()

In [45]:
# run_cell = True
# int_step = 2
# float_delta = 0.1
# float_steps = 4

# if run_cell:
#     params = {
#         'n_estimators': [5, 10, 50, 100],
#         'learning_rate': [0.001, 0.01, 0.1, 1, 10],
#     }

#     adaboost = AdaBoostClassifier(random_state = 0)

#     print('Starting course search')
#     clf = GridSearchCV(adaboost, params)
#     search = clf.fit(x_train, y_train)
#     print(f'Best params from course search: {search.best_params_}')

#     fine_params = {}
#     for param in params:
#         if isinstance(search.best_params_[param], int):
#             min_val = search.best_params_[param] - int_step
#             max_val = search.best_params_[param] + int_step + 1
#             fine_params[param] = [i for i in range(min_val, max_val)]
#         else:
#             min_val = search.best_params_[param]*(1 - float_delta)
#             max_val = search.best_params_[param]*(1 + float_delta)
#             fine_params[param] = np.linspace(min_val, max_val, float_steps)

#     print('Starting fine search')
#     clf = GridSearchCV(adaboost, fine_params, error_score=0)
#     search = clf.fit(x_train, y_train)
#     print(f'Best params from fine search: {search.best_params_}')

#     clf = AdaBoostClassifier(random_state = 0, **search.best_params_)
#     clf = clf.fit(x_train, y_train)
#     clf.score(x_test, y_test)

#     predictions = clf.predict(x_test)
#     plot_confusion_matrix(y_test, predictions)

## Neural Networks

In [46]:
import tensorflow as tf

from keras.models import Sequential
from keras.layers import Dense, Dropout, Input

In [47]:
def plot_history(tf_history):
    """Plot training vs. validation loss and accuracy per epoch, side by side.

    Parameters
    ----------
    tf_history : object
        Object exposing a ``history`` mapping with the keys ``'loss'``,
        ``'val_loss'``, ``'accuracy'`` and ``'val_accuracy'``, each holding
        one value per epoch (e.g. the object returned by ``model.fit``).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Raises
    ------
    KeyError
        If one of the four expected keys is missing from ``tf_history.history``.
    """

    width = 12
    height = width*0.4
    fig, ax = plt.subplots(1, 2, figsize = (width, height))

    # Epoch numbers start at 1. They are read from the argument itself, not
    # from a notebook-level `history` variable, so the function plots exactly
    # the run it was given.
    index = [i for i, _ in enumerate(tf_history.history['loss'], 1)]

    ax[0].plot(index, tf_history.history['loss'], label = 'Loss')
    ax[0].plot(index, tf_history.history['val_loss'], label = 'Validation Loss')
    ax[0].legend(loc = 0)
    ax[0].set_xlabel('Epoch')
    ax[0].set_ylabel('Value')

    ax[1].plot(index, tf_history.history['accuracy'], label = 'Accuracy')
    ax[1].plot(index, tf_history.history['val_accuracy'], label = 'Validation Accuracy')
    ax[1].legend(loc = 0)
    ax[1].set_xlabel('Epoch')
    ax[1].set_ylabel('Value')
    plt.show()

In [48]:
#epoch

# tf.random.set_seed(0)

# model = Sequential()
# model.add(Dense(1, activation='sigmoid'))
# model.compile(loss = 'binary_crossentropy', 
#               optimizer = 'adam', 
#               metrics = ['accuracy'])

# X, y = train[features].values, train[target].values
# history = model.fit(X, y, 
#           epochs = 100, 
#           batch_size = 10, 
#           verbose = 0,
#           validation_split = 0.2);

# plot_history(history)

In [49]:
# predictions = (model.predict(test[features].values).flatten() > 0.5).astype(int)

# plot_confusion_matrix(y_test, predictions)

In [50]:
# deep neural net with dropout

# tf.random.set_seed(0)

# model = Sequential()
# model.add(Dense(9, activation='relu'))
# model.add(Dropout(0.3))
# model.add(Dense(6, activation='relu'))
# model.add(Dropout(0.1))
# model.add(Dense(1, activation='sigmoid'))
# model.compile(loss = 'binary_crossentropy', 
#               optimizer = 'adam', 
#               metrics = ['accuracy'])

# X, y = train[features].values, train[target].values
# history = model.fit(X, y, 
#           epochs = 100, 
#           batch_size = 10, 
#           verbose = 0,
#           validation_split = 0.2);

# plot_history(history)

In [51]:
# predictions = (model.predict(test[features].values).flatten() > 0.5).astype(int)

# plot_confusion_matrix(y_test, predictions)

## Final Model

In [52]:
from sklearn.ensemble import GradientBoostingClassifier

# Build the gradient-boosted classifier with a fixed seed and fit it on the
# training split in one step (fit returns the fitted estimator).
clf = GradientBoostingClassifier(random_state = 0).fit(x_train, y_train)

# Score on the held-out split.
clf.score(x_test, y_test)

0.8592057761732852

In [53]:
# Predict labels for the held-out split and visualise them against the truth.
predictions = clf.predict(x_test)
plot_confusion_matrix(y_test, predictions)

In [87]:
import json
import pickle

In [88]:
# Persist the fitted classifier to disk.
# NOTE: clf was fitted on features taken from scaled_data, so whatever loads
# this file must apply the same scaling; the fitted scaler is not saved here.
with open('NickRoyModel.pkl', 'wb') as f:
    pickle.dump(clf, f)

In [82]:
# Route serving the final model; derived from base_url so the host name is
# defined in exactly one place.
NickRoy = f'{base_url}/NickRoy'

In [83]:
# Pick one record and turn its feature columns into a plain dict for posting.
# NOTE(review): this row comes from `data` (unscaled), while clf was trained on
# scaled_data — confirm the endpoint applies the same scaling before predicting.
index = 100
data_dict = data.loc[index, features].to_dict()

In [84]:
data_dict

{'CreditScore': 639.0,
 'Age': 22.0,
 'Tenure': 4.0,
 'Balance': 0.0,
 'NumOfProducts': 2.0,
 'HasCrCard': 1.0,
 'IsActiveMember': 0.0,
 'EstimatedSalary': 28188.96,
 'France': 1.0,
 'Germany': 0.0,
 'Spain': 0.0,
 'Female': 0.0,
 'Male': 1.0}

In [85]:
# Post the record as a JSON body. Bound the wait and raise on a non-2xx status,
# so a server-side failure is reported here with its status code rather than
# later as an opaque JSON decoding error.
response = requests.post(NickRoy, json = data_dict, timeout = 30)
response.raise_for_status()

In [86]:
response.json()

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [70]:
# Load the JSON file that, judging by its name, presumably holds the scaler means.
# NOTE(review): the content displayed below is identical to the sample record
# posted above rather than per-feature means — verify how this file was written.
with open('scaler_means.json') as fin:
    temp = json.load(fin)

In [71]:
temp

{'CreditScore': 639.0,
 'Age': 22.0,
 'Tenure': 4.0,
 'Balance': 0.0,
 'NumOfProducts': 2.0,
 'HasCrCard': 1.0,
 'IsActiveMember': 0.0,
 'EstimatedSalary': 28188.96,
 'France': 1.0,
 'Germany': 0.0,
 'Spain': 0.0,
 'Female': 0.0,
 'Male': 1.0}