In [1]:
# Standard library imports
import joblib
import json
import pathlib
import warnings
warnings.filterwarnings("ignore")

# Third-party library imports
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier 
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

# Read data
data_filepath = pathlib.Path('train.csv')
data = pd.read_csv(data_filepath)

# Create "dummy" columns for categorical data
dummy_column_mapper = {}
for col in data.columns:
    if data[col].dtype == 'object':
        temp = pd.get_dummies(data[col], prefix=col, drop_first=True)
        data = data.drop(columns=[col])
        data[temp.columns] = temp
        dummy_column_mapper[col] = temp.columns.tolist()

# Save mapper for dummy columns
with open('dummy_column_mapper.json', 'w') as fout:
    json.dump(dummy_column_mapper, fout)

# Prepare data for model training
target = 'Exited'
features = [col for col in data.columns if col != target]
binary_columns = [col for col in features if sorted(data[col].unique().tolist()) == [0, 1]]

X = data[features].copy()
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.25, 
    random_state=42,
)

# Save column order of training data
with open('col_order.json', 'w') as fout:
    json.dump(X_train.columns.tolist(), fout)

# Fit scaler
scaler = StandardScaler()
scaler = scaler.fit(X_train)

# Save scaling information
scaler_filepath = pathlib.Path('scaler_info.json')

scaler_dict  = {}
for feature, mean, scale in zip(features, scaler.mean_, scaler.scale_):
    if feature in binary_columns:
        scaler_dict[feature] = {
            'mean': 0,
            'std': 1,
        }
    else:
        scaler_dict[feature] = {
            'mean': mean,
            'std': scale,
        }
        
with open(scaler_filepath, 'w') as fout:
    json.dump(scaler_dict, fout)
    
# Scale data
for col, col_params in scaler_dict.items():
    X_train.loc[:, col] = (X_train.loc[:, col] - col_params['mean'])/col_params['std']
    X_test.loc[:, col] = (X_test.loc[:, col] - col_params['mean'])/col_params['std']

# # Fit random forest model
# params = {
#     'criterion': ['gini', 'entropy'], 
#     'max_depth': [5, 10, 15], 
#     'n_estimators': [100, 300, 500], 
# }

# clf = GridSearchCV(RandomForestClassifier(random_state=0), params, error_score=0)
# search = clf.fit(X_train, y_train)
# best_params = search.best_params_ 

# clf = RandomForestClassifier(random_state=0, **best_params)
# clf = clf.fit(X_train.values, y_train.values) 

# # Save model
# joblib.dump(clf, 'rf_model.joblib')

In [17]:
from sklearn.model_selection import cross_val_score
scores =  cross_val_score(logistic,X_select,y,cv=5)

In [9]:
# Fit random forest model
params = {
    'criterion': ['gini', 'entropy'], 
    'max_depth': [5, 10, 15], 
    'n_estimators': [100, 300, 500], 
}

clf = GridSearchCV(RandomForestClassifier(random_state=0), params, error_score=0)
search = clf.fit(X_train, y_train)
best_params = search.best_params_ 

clf = RandomForestClassifier(random_state=0, **best_params)
clf = clf.fit(X_train.values, y_train.values) 

clf.score(X_test, y_test)

0.8680304853590052

In [15]:
# Fit logistic regression model
params = {
    'C': [1, 1.5, 2, 2.5, 3], 
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 
    'max_iter': [75, 100, 125],
    'intercept_scaling': [.5, 1, 1.5, 2]
}

clf = GridSearchCV(LogisticRegression(random_state=0), params, error_score=0)
search = clf.fit(X_train, y_train)
best_params = search.best_params_ 

clf = LogisticRegression(random_state=0, **best_params)
clf = clf.fit(X_train.values, y_train.values) 

clf.score(X_test, y_test)

0.8126754913758524

In [18]:
from sklearn.ensemble import AdaBoostClassifier 

params = {
    'n_estimators': [10, 100, 200],
    'learning_rate': [0.25, 0.5, 1.0, 2.0],
}    

clf = GridSearchCV(AdaBoostClassifier(random_state=0), params, error_score=0)
search = clf.fit(X_train, y_train)
best_params = search.best_params_ 

clf = AdaBoostClassifier(random_state=0, **best_params) 
clf = clf.fit(X_train, y_train)

clf.score(X_test, y_test)

0.8576012835940634

In [8]:
# Fit decision tree model
params = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [2, 5, 10, 15, 20], 
}

clf = GridSearchCV(DecisionTreeClassifier(random_state=0), params, error_score=0)
search = clf.fit(X_train, y_train)
best_params = search.best_params_ 

clf = DecisionTreeClassifier(random_state=0, **best_params)
clf = clf.fit(X_train.values, y_train.values) 

clf.score(X_test, y_test)

0.8547934215804252

In [None]:
from sklearn.neural_network import MLPClassifier

params = {
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'hidden_layer_sizes': [3, 5, 7],
    'learning_rate': ['contant', 'adaptive'],
}

clf = GridSearchCV(MLPClassifier(random_state=0), params, error_score=0)
search = clf.fit(X_train, y_train)
best_params = search.best_params_ 

clf = MLPClassifier(random_state=0, **best_params)
clf = clf.fit(X_train.values, y_train.values) 

clf.score(X_test, y_test)

In [6]:
from sklearn.ensemble import VotingClassifier

print(f'Fitting Random Forest Classifier')
params = {
    'criterion': ['gini', 'entropy'], 
    'max_depth': [5, 10, 15], 
    'n_estimators': [400, 500, 600],
}

clf = GridSearchCV(RandomForestClassifier(random_state=0), params, error_score=0)
search = clf.fit(X_train, y_train)
best_params = search.best_params_ 

rf_clf = RandomForestClassifier(random_state=0, **best_params)
rf_clf = rf_clf.fit(X_train, y_train) 


print(f'Fitting AdaBoost Classifier')
params = {
    'n_estimators': [10, 100, 200],
    'learning_rate': [0.5, 1.0, 1.5, 2.0],
}    

clf = GridSearchCV(AdaBoostClassifier(random_state=0), params, error_score=0)
search = clf.fit(X_train, y_train)
best_params = search.best_params_ 

ada_clf = AdaBoostClassifier(random_state=0, **best_params) 
ada_clf = ada_clf.fit(X_train, y_train) 

print(f'Fitting Neural Network Classifier')
params = {
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'hidden_layer_sizes': [6, 7, 8],
    'learning_rate': ['contant', 'adaptive'],
}

clf = GridSearchCV(MLPClassifier(random_state=0), params, error_score=0)
search = clf.fit(X_train, y_train)
best_params = search.best_params_ 

nn_clf = MLPClassifier(random_state=0, **best_params)
nn_clf = clf.fit(X_train.values, y_train.values) 

print(f'Fitting Voting Classifier')
params = {
    'voting': ['hard', 'soft'],
    'weights': [[0.34, 0.33, 0.33], [0.4, 0.3, 0.3], [0.3, 0.4, 0.3], [0.3, 0.3, 0.4], [0.4, 0.4, 0.2], [0.4, 0.2, 0.4], [0.2, 0.4, 0.4], [0.35, 0.35, 0.3]]
}    

eclf = VotingClassifier(
    estimators=[('rf', rf_clf), ('ada', ada_clf), ('nn', nn_clf)], 
)

clf = GridSearchCV(eclf, params, error_score=0)
search = clf.fit(X_train, y_train)
best_params = search.best_params_ 

eclf = VotingClassifier(
    estimators=[('rf', rf_clf), ('ada', ada_clf), ('nn', nn_clf)],
    **best_params,
)

eclf = eclf.fit(X_train, y_train)
eclf.score(X_test, y_test)

Fitting Random Forest Classifier
Fitting AdaBoost Classifier
Fitting Neural Network Classifier
Fitting Voting Classifier


0.8688327316486161

In [5]:
best_params

{'voting': 'hard', 'weights': [0.3, 0.33, 0.17, 0.2]}

In [5]:
joblib.dump(eclf, 'rf_model.joblib')

['rf_model.joblib']

In [6]:
new = pd.read_csv(data_filepath)
features = [col for col in new.columns if col != target]

raw_payload = new[features].loc[15].to_dict()
raw_payload

{'CreditScore': 667,
 'Geography': 'Germany',
 'Gender': 'Male',
 'Age': 55,
 'Tenure': 9,
 'Balance': 154393.43,
 'NumOfProducts': 1,
 'HasCrCard': 1,
 'IsActiveMember': 1,
 'EstimatedSalary': 137674.96}

In [7]:
outcome = {}
for i in range(0,20):
    features = [col for col in new.columns if col != target]

    raw_payload = new[features].loc[i].to_dict()

    with open('dummy_column_mapper.json') as fin:
        dummy_column_mapper = json.load(fin)

    with open('scaler_info.json') as fin:
        scaler_info = json.load(fin)

    with open('col_order.json') as fin:
        col_order = json.load(fin)

    payload = dict(raw_payload)
    for column, dummy_columns in dummy_column_mapper.items():
        for dummy_column in dummy_columns:
            payload[dummy_column] = 0
        if column in payload:
            column_val = payload.pop(column)
            target_column = f'{column}_{column_val}'
            payload[target_column] = 1

    for key, scaler_params in scaler_info.items():
        if key in payload:
            payload[key] = (payload[key] - scaler_params['mean'])/scaler_params['std']
        else:
            payload[key] = scaler_params['mean']

    ordered_payload = {}
    for col in col_order:
        ordered_payload[col] = payload[col]

    outcome[i+1] = int(clf.predict(np.array(list(ordered_payload.values())).reshape(1, -1)))

In [8]:
outcome

{1: 0,
 2: 0,
 3: 0,
 4: 0,
 5: 0,
 6: 1,
 7: 0,
 8: 0,
 9: 0,
 10: 1,
 11: 0,
 12: 0,
 13: 0,
 14: 0,
 15: 1,
 16: 0,
 17: 0,
 18: 0,
 19: 0,
 20: 0}

In [72]:
import requests

In [None]:
base_endpoint = 'http://597deployment.azurewebsites.net/'

In [None]:
r = requests.get(base_endpoint)
r.text

In [None]:
predict_endpoint = 'http://597deployment.azurewebsites.net/predict'

In [None]:
r = requests.post(predict_endpoint, json=raw_payload)
int(r.text)

In [None]:
new_data = pd.read_csv(data_filepath)

for ckey in new_data.index.tolist()[:100]:
    raw_payload = new_data.loc[ckey].to_dict()
    target = raw_payload.pop('HeartDisease')
    
    r = requests.post(predict_endpoint, json=raw_payload)
    prediction = int(r.text)
    if prediction == target:
        correct_statement = 'Correct!!!'
    else:
        correct_statement = ''

    print(f'Instance {ckey}: actual->{target}, prediction->{prediction}. {correct_statement}')

In [None]:
export FLASK_ENV=development