# JupyterHub Notebook

### This notebook server is hosted on the OpenShift platform, which provides a separate server for each individual user. The platform takes care of provisioning the server and allocating its related storage.

### First, install and import required libraries and watermark our file - to show what libraries and versions we're using. Then define utility functions to integrate with our Object storage and _Verta_ visualisation server.

In [None]:
import os

In [None]:
# %horus requirements --add matplotlib

%pip install matplotlib
%pip install pandas==1.1.5
%pip install scikit-learn
%pip install minio
%pip install seaborn
%pip install mlflow
%pip install ipynbname
%pip install category_encoders==2.2.2
%pip install joblib
%pip install watermark
%pip install boto3



In [None]:
import matplotlib
import matplotlib.pyplot as plt

import numpy as np
# import pandas as pd
# import modin.pandas as pd

import watermark
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime
from minio import Minio
from minio.error import ResponseError
import os
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline


import category_encoders as ce

# import tools as tools
%matplotlib inline
%load_ext watermark

In [None]:
# Print a watermark for this notebook: the environment and the versions of the libraries imported above.
%watermark -n -v -m -g -iv


## Load Lab parameters.

In [None]:
# Lab-specific settings come from the local `config` module; get_params()
# yields five values, unpacked here in the order it returns them.
from config import download_csv_files, get_params

(
    user_id,
    PROJECT_NAME,
    EXPERIMENT_NAME,
    experiment_name,
    s3BucketFullPath,
) = get_params()

In [None]:
### In this next section, we initialise our variables and our Object Storage implementation, Minio

In [None]:
# Address of the MLflow tracking server used further down in this cell.
HOST = "http://mlflow:5500"

#PROJECT_NAME = "CustomerChurnUser60"
#EXPERIMENT_NAME = "CustomerChurnUser60"

# Object-storage settings, exported as environment variables.
# get_s3_server() reads the access key and secret key back from here.
os.environ.update({
    'MLFLOW_S3_ENDPOINT_URL': 'http://minio-ml-workshop:9000',
    'AWS_ACCESS_KEY_ID': 'minio',
    'AWS_SECRET_ACCESS_KEY': 'minio123',
    'AWS_REGION': 'us-east-1',
    'AWS_BUCKET_NAME': 'mlflow',
})

# Build an identifier by suffixing the experiment name with the current time.
# NOTE(review): the format string carries day, year, time and microseconds
# but no month component - confirm whether that is intended.
dateTimeObj = datetime.now()
timestampStr = dateTimeObj.strftime("%d%Y%H%M%S%f")
experiment_id = experiment_name + timestampStr


def get_s3_server():
    """Build a Minio client for the workshop's object store.

    Credentials are read from the ``AWS_ACCESS_KEY_ID`` and
    ``AWS_SECRET_ACCESS_KEY`` environment variables, and the client is
    created with ``secure=False``.

    Returns:
        Minio: a client bound to ``minio-ml-workshop:9000``.

    Raises:
        KeyError: if either credential variable is not set.
    """
    credentials = {
        'access_key': os.environ['AWS_ACCESS_KEY_ID'],
        'secret_key': os.environ['AWS_SECRET_ACCESS_KEY'],
    }
    return Minio('minio-ml-workshop:9000', secure=False, **credentials)


import mlflow

# Point the MLflow client at the tracking server whose address is held in HOST.
mlflow.set_tracking_uri(HOST)

# Select the experiment (named by EXPERIMENT_NAME from the lab parameters) that this notebook works under.
mlflow.set_experiment(EXPERIMENT_NAME)

# Turn on MLflow autologging for scikit-learn, requesting that input examples be logged as well.
mlflow.sklearn.autolog(log_input_examples=True)

### In this next section, we pull in the merged CSV file prepared earlier by the data engineer. 

In [None]:
# Fetch the prepared CSV through the object-storage client, then load it.
# NOTE(review): presumably download_csv_files() writes /tmp/data.csv - confirm in config.
minioClient = get_s3_server()
download_csv_files(minioClient, s3BucketFullPath)

# Earlier single-object download, kept for reference:
# data_file = minioClient.fget_object("data", s3BucketFullPath, "/tmp/data.csv")
# data_file_version = data_file.version_id

data = pd.read_csv('/tmp/data.csv')

# Preview the first five rows.
data.head()


In [None]:
### Use pandas.DataFrame functions
### - _shape_ to return the dimensionality
### - _info_ to print a concise summary of the DataFrame
### - _describe_ to generate descriptive statistics of the DataFrame's columns
### - _isnull().sum()_ to sum the empty values
### - finally determine Churn and Total Charges


In [None]:
# Dimensionality of the loaded dataset.
data.shape

In [None]:
# Concise summary of the DataFrame.
data.info()

In [None]:
# Descriptive statistics for the DataFrame's columns.
data.describe()

In [None]:
# Number of null values in each column.
data.isnull().sum()

In [None]:
# Convert the binary Churn label into numbers ('Yes' -> 1, 'No' -> 0) so plotting is easier
# and the mean can be taken later. Any value not listed in the mapping becomes NaN.
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})

In [None]:
# Treat single-space strings anywhere in the frame as missing values (NaN), modifying `data` in place.
data.replace(" ", np.nan, inplace=True)

In [None]:
# Re-count missing values per column now that blank strings have been turned into NaN.
data.isna().sum()

In [None]:
# Parse TotalCharges as a numeric column; it is averaged in the next cell.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'])

In [None]:
# Impute missing TotalCharges values with the column mean.
# The fill is restricted to TotalCharges: a frame-wide fillna(mean) would also
# write this mean into every other column that happens to contain NaN,
# silently corrupting unrelated (including categorical) columns.
mean = data['TotalCharges'].mean()
data['TotalCharges'] = data['TotalCharges'].fillna(mean)
# Verify the result: TotalCharges should now report zero missing values.
data.isna().sum()

## Feature Engineering pipeline
### Use category_encoder's Ordinal encoding method which uses a single column of integers to represent the classes - then fit that to our 2 dimensional data imported earlier. Then pickle it and transform it. Then use Onehot (or dummy) coding for categorical features, producing one feature per category, each binary.


In [None]:
import category_encoders as ce
import joblib

# Columns that are replaced by a single integer code per class.
names = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'StreamingTV',
    'StreamingMovies',
    'PaperlessBilling',
    'Churn',
]

# Fit the ordinal encoder on the full frame, pickle the fitted encoder to
# enc.pkl, then apply it to produce the labelled data set.
enc = ce.OrdinalEncoder(cols=names)
enc.fit(data)
joblib.dump(enc, 'enc.pkl')
labelled_set = enc.transform(data)

# Show the last five encoded rows.
labelled_set.tail()

In [None]:

# Categorical columns that are expanded into one binary feature per category.
names = [
    'MultipleLines',
    'InternetService',
    'Contract',
    'PaymentMethod',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
]

# Fit the one-hot encoder on the ordinal-encoded frame, pickle the fitted
# encoder to ohe.pkl, then apply it to obtain the final feature set.
ohe = ce.OneHotEncoder(cols=names)
ohe.fit(labelled_set)
joblib.dump(ohe, 'ohe.pkl')
final_set = ohe.transform(labelled_set)

# Show the last five rows of the final set.
final_set.tail()

In [None]:
### Now we use scikit-learn's 'train_test_split' function to randomly split our data into training and testing sets. Then remove the _Churn_ and _customerID_ fields from our training and testing datasets and output the shape of our data.

In [None]:
# Target column, taken before the feature columns are trimmed.
labels = final_set['Churn']

# Random 80/20 split. Churn and customerID are dropped from the features up
# front, so neither the training nor the testing frame carries them.
X_train, X_test, y_train, y_test = train_test_split(
    final_set.drop(columns=['Churn', 'customerID']),
    labels,
    test_size=0.2,
)

print('Training Data Shape', X_train.shape, y_train.shape)
print('Testing Data Shape', X_test.shape, y_test.shape)

In [None]:

# Data for cross validation and GridSearch: the complete (unsplit) data set,
# separated into the target vector Y and the feature matrix X.
Y = final_set['Churn']
X = final_set.drop(['Churn', 'customerID'], axis=1)
# These are features/target of the same full data set, not a train/test
# split, so the shapes are labelled as such.
print('Feature Matrix Shape', X.shape)
print('Target Vector Shape', Y.shape)

### Create a DecisionTreeClassifier object, define the hyperparameter candidates, and then let GridSearchCV select the best model from the various inputs

In [None]:
# Base decision tree estimator for the grid search in the next cell.
DT = DecisionTreeClassifier()

# Candidate values for each hyperparameter.
# Only 'gini' is searched for the split criterion; 'entropy' is not included.
criterion = ['gini']
max_depth = [5, 10, 15]
min_samples_split = [2, 4, 6]
min_samples_leaf = [4, 5, 6, 8]

# Parameter grid: hyperparameter name -> list of candidates.
hyperparameters = {
    'max_depth': max_depth,
    'criterion': criterion,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
}

In [None]:
# 5-fold grid search of the decision tree over the candidate grid, fitted on
# the full feature matrix X and target Y.
model = GridSearchCV(estimator=DT, param_grid=hyperparameters, cv=5, verbose=0)
best_model = model.fit(X, Y)

In [None]:
# Report the outcome of the decision tree grid search.
print('Mean Cross-Validated Score: ', best_model.best_score_)
print('Best Parameters', best_model.best_params_)

# Individual settings can also be read from the winning estimator itself.
best_estimator_params = best_model.best_estimator_.get_params()
print('Best criteria:', best_estimator_params['criterion'])
print('Best depth:', best_estimator_params['max_depth'])

### Use K-Folds cross-validator to split data in train/test sets. Create a dictionary of hyperparameter candidates, train model using a DecisionTreeClassifier, assess results, print and store hyper parameters and accuracy and tag using 'DecisionTreeClassifier'


In [None]:
# 3-fold splitter used for the accuracy estimate below.
kfold = KFold(n_splits=3)

# Fixed hyperparameters for this decision tree; the same dict configures the estimator.
hyperparameters = {
    'max_depth': 5,
    'criterion': 'gini',
    'min_samples_leaf': 3,
    'min_samples_split': 10,
}

# The model pickled to dct.pkl is the one fitted on the training split.
model = DecisionTreeClassifier(**hyperparameters).fit(X_train, y_train)
joblib.dump(model, 'dct.pkl')

# The reported accuracy comes from cross-validating over the full X / Y.
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print(results)
print('Accuracy', results.mean() * 100)


### Like before, in this next section, on the third line, change experiment_name by appending your username to _customerchurn_, e.g., if your username is user1: 
#### experiment_name = "customerchurnuser1"
### Create RandomForestClassifier object, extract hyper parameters, and then the best_model

In [None]:
# Rebuild the timestamped experiment identifier for this section.
# The third line is the one to edit: append your own username to "customerchurn".
dateTimeObj = datetime.now()
timestampStr = dateTimeObj.strftime("%d%Y%H%M%S%f")
experiment_name = "customerchurnuser29"
experiment_id = experiment_name + timestampStr


# Base random forest estimator for the grid search.
RF = RandomForestClassifier()

# Candidate values for each hyperparameter.
n_estimators = [18, 22]
criterion = ['gini', 'entropy']
max_depth = [30, 40, 50]
min_samples_split = [6, 8]
min_samples_leaf = [8, 10, 12]

# Parameter grid: hyperparameter name -> list of candidates.
hyperparameters = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'criterion': criterion,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
}

# 5-fold grid search of the random forest, fitted on the full X / Y.
model = GridSearchCV(estimator=RF, param_grid=hyperparameters, cv=5, verbose=0)
best_model = model.fit(X, Y)

### Extract best scores, params, criteria and depth from our model. 

In [None]:
# Report the outcome of the random forest grid search.
print('Mean Cross-Validated Score: ', best_model.best_score_)
print('Best Parameters', best_model.best_params_)

# Individual settings can also be read from the winning estimator itself.
best_estimator_params = best_model.best_estimator_.get_params()
print('Best criteria:', best_estimator_params['criterion'])
print('Best depth:', best_estimator_params['max_depth'])
print('Best estimator:', best_estimator_params['n_estimators'])


### As above, use K-Folds cross-validator to split data in train/test sets. Create a dictionary of hyperparameter candidates, train model using a RandomForestClassifier, assess results, print and store hyper parameters and accuracy and tag using 'RandomForestClassifier'

In [None]:
# 3-fold splitter used for the accuracy estimate below.
kfold = KFold(n_splits=3)

# Fixed hyperparameters for this random forest; the same dict configures the estimator.
hyperparameters = {
    'max_depth': 40,
    'criterion': 'gini',
    'min_samples_leaf': 12,
    'min_samples_split': 8,
    'n_estimators': 22,
}

# The model pickled to rft.pkl is the one fitted on the training split.
model = RandomForestClassifier(**hyperparameters).fit(X_train, y_train)
joblib.dump(model, 'rft.pkl')

# The reported accuracy comes from cross-validating over the full X / Y.
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print(results)
print('Accuracy', results.mean() * 100)

# Disabled metadata-store logging, kept for reference:
# store = get_meta_store()
# store.log_hyperparameters(hyperparameters)
# store.log_model(model)
# store.log_metric('Accuracy',results.mean()*100)
# store.log_tag("RandomForestClassifier")
# store.log_attribute("data_file_location", "data/full_data_csv/a.csv")
# store.log_attribute("data_file_version", data_file_version)

In [None]:
# Final cell: print a completion message once execution reaches the end of the notebook.
print('Notebook complete')