# JupyterHub Notebook

### This notebook server is hosted on the OpenShift platform, which provides a separate server for each individual user. The platform takes care of provisioning the server and allocating its storage.

### First, install and import the required libraries and watermark our file to show which libraries and versions we're using. Then define utility functions to integrate with our Object storage and the _MLflow_ tracking server.

In [1]:
%pip install dill

# from alibi.explainers import AnchorTabular
# os.environ["MODIN_ENGINE"] = "ray"


You should consider upgrading via the '/opt/app-root/bin/python3.8 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import dill



## Load Lab parameters. Before running this cell, ensure you set the s3BucketFullPath value to your merged file. This is done in hyper_parameters.py

In [3]:
from config import get_params, download_csv_files

# Per-student lab settings, in the order get_params() returns them.
(
    user_id,
    PROJECT_NAME,
    EXPERIMENT_NAME,
    experiment_name,
    s3BucketFullPath,
) = get_params()

STUDENT CONFIGURATION
User ID: "user29"
Project name: "CustomerChurn-user29"
Experiment name: "CustomerChurn-user29", "customerchurn-user29"
S3 Bucket full storage path: "full_data_csv-user29"


### Read the Merged Data

In [4]:
import matplotlib
import matplotlib.pyplot as plt

import numpy as np
# import pandas as pd
# import modin.pandas as pd
from datetime import datetime
import watermark
import numpy as np
import pandas as pd
# import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from minio import Minio
from minio.error import ResponseError
import os
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline


# import tools as tools
%matplotlib inline
%load_ext watermark

In [5]:
# Record the interpreter, machine details, git hash and the versions of the
# imported packages so the run can be reproduced later.
%watermark -n -v -m -g -iv


Python implementation: CPython
Python version       : 3.8.8
IPython version      : 7.29.0

Compiler    : GCC 8.5.0 20210514 (Red Hat 8.5.0-3)
OS          : Linux
Release     : 4.18.0-305.62.1.el8_4.x86_64
Machine     : x86_64
Processor   : x86_64
CPU cores   : 16
Architecture: 64bit

Git hash: 494cd3884479e49abb7866a1e7019393e50b3f97

pandas    : 1.3.5
watermark : 2.3.0
numpy     : 1.22.0
matplotlib: 3.5.1
sklearn   : 1.0
dill      : 0.3.6



### In this next section, we initialise our variables and our Object Storage implementation, Minio

In [6]:
# URL of the MLflow tracking server used by this notebook.
HOST = "http://mlflow:5500"

# PROJECT_NAME = "CustomerChurnUser60"
# EXPERIMENT_NAME = "CustomerChurnUser60"

# Object-store settings exported as environment variables; get_s3_server()
# reads the two AWS_* key variables from here.
# NOTE(review): the credentials are hardcoded in the notebook. That is fine for
# a throw-away workshop cluster, but they should come from the environment or a
# secret anywhere else.
os.environ['MLFLOW_S3_ENDPOINT_URL']='http://minio-ml-workshop:9000'
os.environ['AWS_ACCESS_KEY_ID']='minio'
os.environ['AWS_SECRET_ACCESS_KEY']='minio123'
os.environ['AWS_REGION']='us-east-1'
os.environ['AWS_BUCKET_NAME']='mlflow'

# Per-run identifier: experiment name followed by a timestamp.
# The format is year-month-day-time so that ids are unambiguous across months
# and sort chronologically (the previous "%d%Y..." format left the month out).
dateTimeObj = datetime.now()
timestampStr = dateTimeObj.strftime("%Y%m%d%H%M%S%f")
# experiment_name = "customerchurnuser29"
experiment_id = experiment_name + timestampStr

def get_s3_server():
    """Create a Minio client for the workshop object store.

    The access and secret keys are read from the AWS_ACCESS_KEY_ID and
    AWS_SECRET_ACCESS_KEY environment variables; the connection is made
    without TLS (secure=False).

    Returns:
        The configured Minio client.
    """
    return Minio(
        'minio-ml-workshop:9000',
        access_key=os.environ['AWS_ACCESS_KEY_ID'],
        secret_key=os.environ['AWS_SECRET_ACCESS_KEY'],
        secure=False,
    )


import mlflow

# Point the MLflow client at the tracking server given by HOST
mlflow.set_tracking_uri(HOST)

# Select the experiment that runs from this notebook are recorded under
mlflow.set_experiment(EXPERIMENT_NAME)

# Turn on scikit-learn autologging and ask MLflow to store input examples too
mlflow.sklearn.autolog(log_input_examples=True)


# Example - pushing sample training data to registry
# NOTE(review): the path is relative to the notebook's working directory, and
# no explicit mlflow.start_run() precedes this call - confirm which run the
# artifact ends up in.
mlflow.log_artifact("../../data-files/training-data.csv")


### In this next section, we pull in the merged CSV file prepared earlier by the data engineer. 

In [7]:
# Fetch the merged CSV from object storage, then load it into a DataFrame.
minioClient = get_s3_server()
# NOTE(review): presumably download_csv_files writes /tmp/data.csv, which is
# read below - confirm against config.py.
download_csv_files(minioClient, s3BucketFullPath)
# data_file = minioClient.fget_object("data", s3BucketFullPath, "/tmp/data.csv")
# data_file_version = data_file.version_id
data = pd.read_csv('/tmp/data.csv')
# Preview the first five rows as the cell output.
data.head(5)


<Object: bucket_name: data object_name: b'full_data_csv-user29/part-00000-80d2f4e4-30eb-44ef-8211-67022a858dea-c000.csv' version_id: None last_modified: 2023-06-07 03:35:47.245000+00:00 etag: 1fcb650bfc1826ac4bec4a31160e66b9 size: 928535 content_type: None is_dir: False metadata: None >


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,148,Male,0,No,No,1,Yes,No,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,45.65,45.65,Yes
1,463,Male,0,Yes,Yes,4,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,No,Electronic check,101.15,385.9,Yes
2,471,Female,1,No,No,17,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,One year,No,Mailed check,20.65,330.6,No
3,496,Male,0,No,No,22,No,No phone service,DSL,No,...,Yes,No,No,Yes,One year,Yes,Bank transfer (automatic),43.75,903.6,Yes
4,833,Female,0,Yes,Yes,70,Yes,No,DSL,Yes,...,Yes,Yes,No,Yes,One year,No,Credit card (automatic),74.1,5222.3,No


In [8]:
# Convert the Yes/No target into 1/0 so it can be averaged and used as a numeric label.
# NOTE(review): this overwrites the column in place, so the cell is not
# idempotent - running it a second time maps the already-numeric values
# through the same Yes/No dictionary. Re-load the data before re-running.
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})

In [9]:
# Treat cells holding a single space as missing values (modifies `data` in place).
data.replace(" ", np.nan, inplace=True)


In [10]:
# Parse TotalCharges as a numeric column now that blank entries have been replaced by NaN.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'])

In [11]:
# Impute missing TotalCharges with the column mean.
# Only that column is filled: a frame-wide fillna(mean) would also write the
# TotalCharges mean into any other column that happened to contain NaN
# (including the Churn label), silently corrupting it.
mean = data['TotalCharges'].mean()
data['TotalCharges'] = data['TotalCharges'].fillna(mean)
# Show the remaining missing-value count per column (expected to be all zero)
data.isna().sum()



customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [12]:
import category_encoders as ce
import joblib

# Columns that are replaced by integer codes.
names = ['gender', 'Partner', 'Dependents', 'PhoneService', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling']
# for column in names:
#     labelencoder(column)

# Feature frame: everything except the target and the customer identifier.
data_enc = data.drop(['Churn', 'customerID'], axis=1)

# Fit the ordinal encoder, then persist it locally and log it to MLflow so the
# same encoding can be re-applied outside this notebook.
enc = ce.ordinal.OrdinalEncoder(cols=names)
enc.fit(data_enc)
joblib.dump(enc, 'CustomerChurnOrdinalEncoder.pkl')
mlflow.log_artifact("CustomerChurnOrdinalEncoder.pkl")

# Apply the encoding and preview the last rows as the cell output.
labelled_set = enc.transform(data_enc)
labelled_set.tail(5)

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
7038,1,0,1,1,1,1,No,No,No internet service,No internet service,No internet service,No internet service,3,3,Month-to-month,1,Mailed check,18.85,18.85
7039,2,0,1,1,1,1,Yes,Fiber optic,No,No,No,No,1,1,Month-to-month,1,Electronic check,74.5,74.5
7040,1,0,2,1,69,1,No,DSL,Yes,Yes,No,No,1,1,One year,1,Credit card (automatic),53.65,3804.4
7041,1,0,2,2,70,1,Yes,DSL,No,Yes,Yes,Yes,2,2,Two year,1,Electronic check,84.1,5979.7
7042,2,0,1,1,3,1,No,DSL,Yes,Yes,No,Yes,1,2,Two year,2,Mailed check,71.1,213.35


In [13]:

# Columns that are expanded into one indicator column per category.
names = ['MultipleLines', 'InternetService', 'Contract', 'PaymentMethod', 'OnlineSecurity', 'OnlineBackup',
         'DeviceProtection', 'TechSupport']

# The encoder is fitted on the raw feature frame and applied to the
# ordinal-encoded one; the columns listed above are not among those changed by
# the ordinal step, so both frames hold the same values for them.
data_ohe = data.drop(['Churn', 'customerID'], axis=1)
ohe = ce.OneHotEncoder(cols=names)
ohe.fit(data_ohe)

# Persist the fitted encoder locally and log it to MLflow.
joblib.dump(ohe, 'CustomerChurnOneHotEncoder.pkl')
mlflow.log_artifact("CustomerChurnOneHotEncoder.pkl")

final_set = ohe.transform(labelled_set)
# Cell output: shape of the frame before the one-hot expansion.
labelled_set.shape

  elif pd.api.types.is_categorical(cols):


(7043, 19)

In [14]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split,
# and every metric derived from it, identical after a kernel restart.
labels = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(
    final_set, labels, test_size=0.2, random_state=42
)
print ('Training Data Shape',X_train.shape, y_train.shape)
print ('Testing Data Shape',X_test.shape, y_test.shape)

Training Data Shape (5634, 36) (5634,)
Testing Data Shape (1409, 36) (1409,)


In [15]:

# Full feature matrix and integer target, used for cross-validation.
X = final_set
Y = data['Churn'].astype(int)

# The split targets are cast to int as well.
y_train, y_test = y_train.astype(int), y_test.astype(int)

# Cell output: first ten training labels.
y_train.head(10)


3269    1
1398    0
4427    0
5756    0
2474    0
5375    0
606     0
4411    0
5106    1
1405    0
Name: Churn, dtype: int64

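### In this next section, we define the method train_and_save_model(), where we train our model, save it to a local file and print its cross-validated accuracy. The training run is tracked through the MLflow autologging enabled earlier (the model is not pushed to Verta).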

In [16]:
def train_and_save_model(random_state=42):
    """Train the churn decision tree, save it to disk and report CV accuracy.

    The tree is fitted on the notebook-level X_train / y_train, written to
    'CustomerChurnPredictor.sav' with joblib, and then scored with 3-fold
    cross-validation over the notebook-level X / Y. The fold scores and their
    mean (as a percentage) are printed.

    Args:
        random_state: Seed passed to DecisionTreeClassifier so that repeated
            runs build the same tree. Pass None for the estimator's default,
            unseeded behaviour.

    Returns:
        The DecisionTreeClassifier fitted on X_train / y_train.
    """
    model = DecisionTreeClassifier(
        max_depth=5,
        criterion='gini',
        min_samples_leaf=3,
        min_samples_split=10,
        random_state=random_state,
    )
    model = model.fit(X_train, y_train)
    joblib.dump(model, 'CustomerChurnPredictor.sav')

    # Cross-validation runs on the full X / Y, not only the training split.
    kfold = KFold(n_splits=3)
    results = model_selection.cross_val_score(model, X, Y, cv=kfold)
    print(results)
    print('Accuracy',results.mean()*100)

    return model

### In this next section, we define the method explain_model(), which makes available an *_explanation_* of the reasons the model made the decisions it did. This is very useful for auditing purposes as well as for the application developers who consume the model - they can optionally expand on and use these reasons for their own purposes. Note: the implementation below is currently commented out, so no explainer is produced in this run.

In [17]:
# from alibi.utils.data import gen_category_map

# def explain_model(model, X_train, X_test_record):
#     fnames = X_train.columns.tolist()
#     predict_fn = lambda x: model.predict_proba(x)
#     explainer = AnchorTabular(predict_fn, fnames)
#     explainer = explainer.fit(X_train.values, disc_perc=[25, 50, 75])
#     explanation = explainer.explain(X_test_record.values[0])
#     print('Anchor: %s' % explanation['anchor'])
#     print('Precision: %.2f' % explanation['precision'])
#     print('Coverage: %.2f' % explanation['coverage'])
#     return explainer

In [18]:
# Train the model, save it to disk and print the cross-validation scores.
model = train_and_save_model()
# explainer = explain_model(model, X_train, X_test)
# with open("CustomerChurnPredictorAlibi.dill", "wb") as x_f:
#     dill.dump(explainer, x_f)



[0.7802385  0.8032368  0.78909246]
Accuracy 79.08559188612233


In [19]:
# minioClient = get_s3_server()
# minioClient.fput_object(bucket_name='models', object_name=experiment_id  +'/CustomerChurnPredictor.sav' , file_path='./CustomerChurnPredictor.sav')
# # minioClient.fput_object(bucket_name='models', object_name=experiment_id  +'/CustomerChurnPredictorAlibi.dill' , file_path='./CustomerChurnPredictorAlibi.dill')
# minioClient.fput_object(bucket_name='models', object_name=experiment_id  +'/CustomerChurnOrdinalEncoder.pkl' , file_path='./CustomerChurnOrdinalEncoder.pkl')
# minioClient.fput_object(bucket_name='models', object_name=experiment_id  +'/CustomerChurnOneHotEncoder.pkl' , file_path='./CustomerChurnOneHotEncoder.pkl')


## Test Models

In [20]:
# Score one held-out customer. Selecting with a list (iloc[[1]]) yields a
# one-row DataFrame, so the model receives the same column names it was fitted
# with; wrapping a Series in a Python list would drop them.
prediction = model.predict_proba(X_test.iloc[[1]])
# Index 0 / 1 corresponds to the Churn label encoded as 0 / 1.
class_name = ['Not Churn', 'Churn']
# `prediction` holds a single row, so argmax/max over it pick the winning class.
predicted_class = np.argmax(prediction)
print('Predicted Class name: ',class_name[predicted_class])
predicted_class_prob = np.max(prediction)
print('Predicted class Certainty: ', predicted_class_prob)


Predicted Class name:  Not Churn
Predicted class Certainty:  0.903954802259887




In [21]:
# Marker that every cell above ran through without raising.
print('Notebook complete!')

Notebook complete!
