# JupyterHub Notebook

### This notebook server is hosted on the OpenShift platform, which provides a separate server for each individual user. The platform takes care of provisioning the server and allocating its related storage.

### First, install and import required libraries and watermark our file - to show what libraries and versions we're using. Then define utility functions to integrate with our Object storage and _Verta_ visualisation server.

In [None]:
import os
import dill

import verta.integrations.sklearn
from alibi.explainers import AnchorTabular
# os.environ["MODIN_ENGINE"] = "ray"


In [None]:
import matplotlib
import matplotlib.pyplot as plt

import numpy as np
# import pandas as pd
# import modin.pandas as pd
from datetime import datetime
import watermark
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from minio import Minio
from verta import Client
from minio.error import ResponseError
import os
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline


# import tools as tools
%matplotlib inline
%load_ext watermark

In [None]:
# Record the runtime environment alongside the notebook output. Per the watermark
# extension's options: -n date, -v Python/IPython versions, -m machine info,
# -g git hash, -iv versions of the imported packages.
%watermark -n -v -m -g -iv


### In this next section, on the third line, change experiment_name by appending your username to _customerchurn_, e.g., if your username is user1: 
#### experiment_name = "customerchurnuser1"

In [None]:
# Experiment name used in Verta; workshop users append their own username here.
experiment_name = "customerchurnuser29"

# Per-run identifier: the experiment name followed by a timestamp taken now.
# NOTE(review): the format contains day and year but no month (%m) — confirm
# whether that omission is intentional.
dateTimeObj = datetime.now()
timestampStr = dateTimeObj.strftime("%d%Y%H%M%S%f")
experiment_id = f"{experiment_name}{timestampStr}"


def get_s3_server(endpoint='minio-ml-workshop:9000',
                  access_key='minio',
                  secret_key='minio123',
                  secure=False):
    """Create a client for the workshop's Minio object store.

    Every connection setting is a keyword parameter whose default is the value
    that used to be hard-coded here, so ``get_s3_server()`` behaves exactly as
    before while other environments can pass their own endpoint/credentials.

    Args:
        endpoint: ``host:port`` of the Minio service.
        access_key: Access key used to authenticate.
        secret_key: Secret key used to authenticate.
        secure: Passed straight to ``Minio(secure=...)``; the default ``False``
            matches the original hard-coded value.

    Returns:
        A ``Minio`` client configured with the given settings.
    """
    # NOTE(review): the default credentials live in the notebook source;
    # acceptable for the workshop, but pass real secrets in explicitly elsewhere.
    return Minio(endpoint,
                 access_key=access_key,
                 secret_key=secret_key,
                 secure=secure)

def get_verta(host="http://172.30.21.193:3000"):
    """Create a Verta client.

    Args:
        host: URL of the Verta server. Defaults to the address that was
            previously hard-coded in this function, so existing
            ``get_verta()`` calls are unaffected.

    Returns:
        A ``verta.Client`` created for ``host``.
    """
    return Client(host)

def get_meta_store():
    """Open the Verta experiment run used to record this notebook's training.

    Selects the "ml-workshop" project, then the experiment named by the
    module-level ``experiment_name``, then the run named by the module-level
    ``experiment_id``.

    Returns:
        The experiment-run object returned by ``client.set_experiment_run``.
    """
    client = get_verta()
    # The project and experiment must be selected on the client before the
    # run; their return values are not needed here.
    client.set_project("ml-workshop")
    client.set_experiment(experiment_name)
    return client.set_experiment_run(experiment_id)




### In this next section, on the second line, insert the value you retrieved from Minio object storage earlier - representing the fully qualified name of your csv file in Minio. This is the file pushed by the data engineer in the format: full_data_csv{USERNAME}/{FILENAME}.csv. 
#### In my case this value is: full_data_csvuser29/part-00000-59149e08-583c-46a5-bfa0-0b3abecbf1a3-c000.csv (yours will be different)
### We refer to this fully qualified name in the GitHub instructions as CSV-FILE

In [None]:
# Download this user's CSV from the "data" bucket to a local temp file.
minioClient = get_s3_server()
data_file = minioClient.fget_object(
    "data",
    "full_data_csvuser29/part-00000-b6e27ca2-1458-4b44-ba29-b88aa9e90186-c000.csv",
    "/tmp/data.csv",
)
# Version id of the downloaded object; logged with the model run later on.
data_file_version = data_file.version_id

# Load the CSV and discard rows that have no Churn label.
data = pd.read_csv('/tmp/data.csv').dropna(axis=0, subset=['Churn'])
data.head(5)


In [None]:
# Convert the binary target into numbers ('Yes' -> 1, 'No' -> 0) so that it is
# easier to plot and so that a mean can be taken later.
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})

In [None]:
# Treat single-space cells as missing values, in every column, modifying
# `data` in place.
data.replace(" ", np.nan, inplace=True)


In [None]:
# Convert TotalCharges to a numeric column (blank entries were turned into NaN
# in the previous cell).
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'])

In [None]:
# Impute missing values using the mean of TotalCharges.
# NOTE(review): fillna is applied to the whole frame, so a NaN in any other
# column is also replaced by the TotalCharges mean — confirm this is intended.
mean = data['TotalCharges'].mean()
data.fillna(mean, inplace=True)
# Display the number of NaN values remaining in each column.
data.isna().sum()



In [None]:
import category_encoders as ce
import joblib

# Columns that are encoded as ordinal integers.
names = [
    'gender',
    'Partner',
    'Dependents',
    'Premium',
    'HomeEquityLoans',
    'MoneyMarketAccount',
    'PaperlessBilling',
]

# Feature frame: everything except the target and the customer identifier.
data_enc = data.drop(['Churn', 'customerID'], axis=1)

# Fit the ordinal encoder, save it to disk (it is uploaded next to the model
# later), then encode the features and show the last rows.
enc = ce.ordinal.OrdinalEncoder(cols=names)
enc.fit(data_enc)
joblib.dump(enc, 'CustomerChurnOrdinalEncoder.pkl')
labelled_set = enc.transform(data_enc)
labelled_set.tail(5)

In [None]:

# Columns that are one-hot encoded.
names = [
    'RelationshipManager',
    'PrimaryChannel',
    'CreditRating',
    'AccountType',
    'HasCreditCard',
    'DebitCard',
    'IncomeProtection',
    'WealthManagement',
]

# The encoder is fitted on the raw feature frame (target and customer id
# removed) and saved to disk so it can be uploaded next to the model.
data_ohe = data.drop(['Churn', 'customerID'], axis=1)
ohe = ce.OneHotEncoder(cols=names)
ohe.fit(data_ohe)
joblib.dump(ohe, 'CustomerChurnOneHotEncoder.pkl')

# It is then applied to the ordinal-encoded frame to build the final features.
final_set = ohe.transform(labelled_set)
labelled_set.shape

In [None]:
# Hold out 20% of the encoded rows for testing.
# NOTE(review): no random_state is passed, so the split differs on every run.
labels = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(
    final_set,
    labels,
    test_size=0.2,
)
print('Training Data Shape', X_train.shape, y_train.shape)
print('Testing Data Shape', X_test.shape, y_test.shape)

In [None]:

# Full (unsplit) target and feature set; train_and_save_model() uses these
# for cross-validation.
Y = data['Churn']
X = final_set



### In this next section, we define the method train_and_save_model() where we train and then push our model to Verta - for visualisation 

In [None]:
from sklearn.tree import DecisionTreeRegressor

def train_and_save_model():
    """Train the churn classifier, save it locally and record the run in Verta.

    Reads the module-level ``X_train``/``y_train`` for fitting, ``X``/``Y``
    for 3-fold cross-validation and ``data_file_version`` for the logged
    attributes. The fitted model is written to ``CustomerChurnPredictor.sav``.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    run = get_meta_store()

    # The Verta run is handed to fit() via `run=`; presumably this keyword is
    # provided by the verta.integrations.sklearn import — TODO confirm.
    classifier = DecisionTreeClassifier().fit(X_train, y_train, run=run)
    joblib.dump(classifier, 'CustomerChurnPredictor.sav')

    # Score with 3-fold cross-validation over the full data set.
    fold_scores = model_selection.cross_val_score(
        classifier, X, Y, cv=KFold(n_splits=3)
    )
    accuracy_pct = fold_scores.mean() * 100
    print(fold_scores)
    print('Accuracy', accuracy_pct)

    # Record the model, its score and its provenance on the Verta run.
    run.log_model(classifier, overwrite=True)
    run.log_metric('Accuracy', accuracy_pct)
    run.log_tag("DecisionTreeClassifier")
    # NOTE(review): this location is a fixed literal and does not match the
    # object path downloaded earlier in the notebook — confirm what should be logged.
    run.log_attribute("data_file_location", "data/full_data_csv/a.csv")
    run.log_attribute("data_file_version", data_file_version)

    return classifier

### In this next section, we define the method explain_model(), where we make available an *_explanation_* of the reasons the model made the decisions it did. This is very useful for auditing purposes as well as for the Application development consumers of the model - who can optionally expand and utilise these reasons for their purposes.

In [None]:
from alibi.utils.data import gen_category_map

def explain_model(model, X_train, X_test_record):
    """Fit an anchor explainer on the training data and explain one record.

    Prints the anchor, precision and coverage of the explanation produced for
    the first row of ``X_test_record``.

    Args:
        model: Fitted classifier; its ``predict_proba`` is used as the
            prediction function.
        X_train: Training features. The code reads ``.columns`` (feature
            names) and ``.values`` (data the explainer is fitted on).
        X_test_record: Frame of records; only row 0 of ``.values`` is explained.

    Returns:
        The fitted ``AnchorTabular`` explainer.
    """
    def predict_fn(x):
        return model.predict_proba(x)

    explainer = AnchorTabular(predict_fn, X_train.columns.tolist()).fit(
        X_train.values, disc_perc=[25, 50, 75]
    )

    first_record = X_test_record.values[0]
    explanation = explainer.explain(first_record)

    report = (
        ('Anchor: %s', 'anchor'),
        ('Precision: %.2f', 'precision'),
        ('Coverage: %.2f', 'coverage'),
    )
    for template, key in report:
        print(template % explanation[key])
    return explainer

In [None]:
# Train the classifier, save it locally and log the run to Verta.
model = train_and_save_model()
# Optional step, currently disabled: build the Alibi anchor explainer and
# serialise it with dill.
# explainer = explain_model(model, X_train, X_test)
# with open("CustomerChurnPredictorAlibi.dill", "wb") as x_f:
#     dill.dump(explainer, x_f)

In [None]:
# Upload the saved model and both fitted encoders to the "models" bucket,
# under a prefix named after this run's experiment_id.
minioClient = get_s3_server()
minioClient.fput_object(bucket_name='models', object_name=experiment_id  +'/CustomerChurnPredictor.sav' , file_path='./CustomerChurnPredictor.sav')
# The explainer upload stays disabled while the explainer step above is disabled.
# minioClient.fput_object(bucket_name='models', object_name=experiment_id  +'/CustomerChurnPredictorAlibi.dill' , file_path='./CustomerChurnPredictorAlibi.dill')
minioClient.fput_object(bucket_name='models', object_name=experiment_id  +'/CustomerChurnOrdinalEncoder.pkl' , file_path='./CustomerChurnOrdinalEncoder.pkl')
minioClient.fput_object(bucket_name='models', object_name=experiment_id  +'/CustomerChurnOneHotEncoder.pkl' , file_path='./CustomerChurnOneHotEncoder.pkl')


In [None]:
# Marker printed once the last cell of the notebook runs.
print('Notebook complete')