# Import packages and verify versions

In [None]:
import pandas as pd
import sklearn
# Report the library versions this notebook is running against.
version_report = (
    ('The pandas version is {}.', pd.__version__),
    ('The scikit-learn version is {}.', sklearn.__version__),
)
for template, version in version_report:
    print(template.format(version))

# Load data

In [None]:

# @hidden_cell
# Connection settings for the customers file in IBM Cloud Object Storage.
# The API key is a secret and is deliberately not stored in the notebook:
# set the IBM_API_KEY_ID environment variable before running this cell.
# A key that was ever saved in plain text should be treated as compromised and rotated.
import os

credentials_1 = {
    'IAM_SERVICE_ID': 'iam-ServiceId-210be2d3-af5e-43b6-9d74-0427ae8ae81a',
    'IBM_API_KEY_ID': os.environ.get('IBM_API_KEY_ID'),  # None when the variable is unset
    'ENDPOINT': 'https://s3-api.us-geo.objectstorage.service.networklayer.com',
    'IBM_AUTH_ENDPOINT': 'https://iam.bluemix.net/oidc/token',
    'BUCKET': 'pyrlabtest-donotdelete-pr-ecn1gj2kpu5dli',
    'FILE': 'customers.csv'
}


In [None]:

# @hidden_cell
# Connection settings for the transactions file in IBM Cloud Object Storage.
# The API key is a secret and is deliberately not stored in the notebook:
# set the IBM_API_KEY_ID environment variable before running this cell.
# A key that was ever saved in plain text should be treated as compromised and rotated.
import os

credentials_2 = {
    'IAM_SERVICE_ID': 'iam-ServiceId-210be2d3-af5e-43b6-9d74-0427ae8ae81a',
    'IBM_API_KEY_ID': os.environ.get('IBM_API_KEY_ID'),  # None when the variable is unset
    'ENDPOINT': 'https://s3-api.us-geo.objectstorage.service.networklayer.com',
    'IBM_AUTH_ENDPOINT': 'https://iam.bluemix.net/oidc/token',
    'BUCKET': 'pyrlabtest-donotdelete-pr-ecn1gj2kpu5dli',
    'FILE': 'transactions.csv'
}


In [None]:
import types
from botocore.client import Config
import ibm_boto3

def __iter__(self):
    """Placeholder ``__iter__`` hook; always returns 0.

    It is bound onto the Object Storage response body when that object has
    no ``__iter__`` attribute, so that pandas accepts the body as a
    file-like object.
    """
    return 0

# @hidden_cell
# Build the IBM Cloud Object Storage client and load customers.csv into a DataFrame.
# Connection details are read from credentials_1 (defined in an earlier cell) instead of
# being repeated here as literals, so the API key is kept in one place only.
client_b6ce1d66a23747c685affa13595b2acb = ibm_boto3.client(
    service_name='s3',
    ibm_api_key_id=credentials_1['IBM_API_KEY_ID'],
    ibm_auth_endpoint=credentials_1['IBM_AUTH_ENDPOINT'],
    config=Config(signature_version='oauth'),
    endpoint_url=credentials_1['ENDPOINT'])

body = client_b6ce1d66a23747c685affa13595b2acb.get_object(
    Bucket=credentials_1['BUCKET'], Key=credentials_1['FILE'])['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"):
    body.__iter__ = types.MethodType(__iter__, body)

customers = pd.read_csv(body)
customers.head()



In [None]:
# Lift the column display limit so previews are not truncated.
pd.options.display.max_columns = None
customers.head()

In [None]:
# Load transactions.csv from Object Storage. Bucket and key are read from credentials_2
# (defined in an earlier cell) instead of being repeated here as literals.
body = client_b6ce1d66a23747c685affa13595b2acb.get_object(
    Bucket=credentials_2['BUCKET'], Key=credentials_2['FILE'])['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"):
    body.__iter__ = types.MethodType(__iter__, body)

transactions = pd.read_csv(body)
transactions.head()

In [None]:
# Report the size of each source table.
# len() counts every row, whereas Series.count() leaves out rows whose CustomerID is missing.
print('Number of rows in customers = {}'.format(len(customers)))
print('Number of rows in transactions = {}'.format(len(transactions)))

# Join Data

In [None]:
# Inner-join customers with transactions on CustomerID, then renumber the rows from 0.
transactions_by_customer = transactions.set_index('CustomerID')
matched = customers.join(transactions_by_customer, on='CustomerID', how='inner')
joined = matched.reset_index(drop=True)

In [None]:
# Count the non-null CustomerID values in the joined table and preview it.
joined_row_count = joined.CustomerID.count()
print('Number of rows in joined = {}'.format(joined_row_count))
joined.head()

# Analyze Data

In [None]:
# Summary statistics of the joined data
joined.describe()
# The summary makes it clear that the Monetary_score column contains outliers

In [None]:
# The Frequency_score column should have been inferred as numeric; if it is not,
# it presumably contains some unwanted non-numeric values
joined.dtypes.filter(items=['Frequency_score'])

# Data Cleansing

In [None]:
# Coerce Frequency_score to a numeric dtype; values that cannot be parsed become NaN
frequency_as_number = pd.to_numeric(joined['Frequency_score'], errors='coerce')
joined['Frequency_score'] = frequency_as_number

In [None]:
# Collect every row that has at least one missing value, report how many there are, and display them
invalid_rows = joined[joined.isnull().any(axis=1)]
print('Number of rows with invalid values = {}'.format(len(invalid_rows)))
invalid_rows

In [None]:
# Remove every row that contains a missing value, then report the remaining size
joined = joined.dropna(axis='index')
remaining_rows = joined.CustomerID.count()
print('Number of rows in joined = {}'.format(remaining_rows))

In [None]:
# Frequency_score is now a float data type, but it should be an integer
joined.dtypes.filter(items=['Frequency_score'])

In [None]:
# Convert Frequency_score to an integer dtype and confirm the result
frequency_as_int = joined['Frequency_score'].astype(int)
joined['Frequency_score'] = frequency_as_int
joined.dtypes.filter(items=['Frequency_score'])

In [None]:
# Keep only rows whose Monetary_score is at most 5; larger values are the outliers identified earlier
within_range = joined['Monetary_score'] <= 5
joined = joined[within_range]

In [None]:
# Preview the cleansed data
joined.head()

In [None]:
# Summary statistics of Monetary_score after the outlier rows were removed
joined['Monetary_score'].describe()

# Drop Columns not being used as Features

In [None]:
# Columns that are not used as model features
dropCols = [
    'CustomerID',
    'Invest',
    'Educ',
    'MARITAL',
    'TimeYears',
    'lasttrans',
    'current',
    'Monetary_score',
]
# Remove them from the DataFrame in place and preview the result
joined.drop(labels=dropCols, axis=1, inplace=True)
joined.head()

# Convert numeric data to integer (some numeric columns were inferred as float64)

In [None]:
# The Retire column was inferred as a float data type
joined['Retire'].dtypes

In [None]:
# Find every float64/int64 column and map it to the 'int' dtype.
joinedColNames = joined.columns.values.tolist()
numericCols = [
    col for col in joinedColNames
    if joined[col].dtypes == 'float64' or joined[col].dtypes == 'int64'
]
intList = ['int'] * len(numericCols)
# Dictionary that will be used to set the numeric columns to integer type
intDict = dict(zip(numericCols, intList))
print("Show columns of integer data type")
intDict

In [None]:
# Convert the numeric columns to integer, using the column -> dtype mapping built above
joined = joined.astype(intDict)

In [None]:
# Verify the conversion: all numeric columns should now show an integer dtype
joined.dtypes

# Encode the string data

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encode the Churn label into a new CHURN column and remove the original Churn column
le = LabelEncoder()
encoded_churn = le.fit_transform(joined['Churn'])
joined = joined.drop(['Churn'], axis=1)
joined['CHURN'] = encoded_churn
joined.head()

In [None]:
# Install the sklearn-pandas package, which will be used to encode the categorical features
!pip install sklearn-pandas

In [None]:
# Describe how each feature column is prepared: categorical columns get their own
# LabelEncoder, while Retire and Frequency_score are passed through unchanged
from sklearn_pandas import DataFrameMapper

categorical_features = ['Mortgage', 'LOC', 'GENDER', 'CHILDREN', 'WORKING', 'HighMonVal', 'AgeRange']

feature_transforms = [('Retire', None)]
feature_transforms.extend((name, LabelEncoder()) for name in categorical_features)
feature_transforms.append(('Frequency_score', None))

mapper = DataFrameMapper(feature_transforms)

# Prepare the data for machine learning

In [None]:
# Separate the CHURN label (y) from the feature columns (X)
label_column = 'CHURN'
y = joined[label_column]
X = joined.drop(label_column, axis=1)
# Show a reproducible sample of five feature rows
X.sample(n=5, random_state=2)

In [None]:
# Show a reproducible sample of five labels, displayed as a one-column DataFrame
y.to_frame().sample(n=5, random_state=10)

# Create training and test datasets

In [None]:
# Hold out 10% of the rows for testing; random_state fixes the shuffle so the split is repeatable
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, y, test_size=0.1, random_state=0)
X_train, X_test, y_train, y_test = split_parts
X_test.head(5)

In [None]:
# Report how many rows ended up in each part of the split
row_count_report = (
    ('Number of rows in X_train is {}.', X_train),
    ('Number of rows in X_test is {}.', X_test),
    ('Number of rows in y_train is {}.', y_train),
    ('Number of rows in y_test is {}.', y_test),
)
for template, dataset in row_count_report:
    print(template.format(dataset.shape[0]))

# Train the model

In [None]:
import sklearn.pipeline
from xgboost import XGBClassifier

# NOTE: this rebinds the name XGBClassifier from the imported class to an instance.
# Later cells refer to the instance through this name, so the name is kept.
XGBClassifier = XGBClassifier()
steps = [('mapper', mapper), ('XGBClassifier', XGBClassifier)]
pipeline = sklearn.pipeline.Pipeline(steps)

# Build the evaluation sets with encoders fitted on the training data only.
# Calling fit_transform on X_test would re-fit the label encoders on the test set,
# so the test features could be encoded differently from the data the model is trained on.
mapper.fit(X_train)
eval_set = [(mapper.transform(X_train), y_train), (mapper.transform(X_test), y_test)]

# Train the model
model = pipeline.fit(X_train, y_train,
                     XGBClassifier__eval_metric='error',
                     XGBClassifier__eval_set=eval_set)

# The eval_metric parameter specifies the evaluation metrics for validation data
# Here we are using a Binary classification error rate. It is calculated as #(wrong cases)/#(all cases).
# For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances,
# and the others as negative instances.

In [None]:
# Show the parameters of the fitted pipeline (mapper and classifier steps)
model.get_params()

# Check model accuracy

In [None]:
#import required modules from the scikit-learn metrics package
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score

In [None]:
# Make predictions for the test data with the fitted pipeline
y_pred = model.predict(X_test)
# Convert the numpy array of predictions to a plain Python list
predictions = y_pred.tolist()

In [None]:
# Evaluate the predictions: share of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_test, predictions)
print('Accuracy: {:.1f}%'.format(accuracy * 100.0))

# Analyze Model - Feature Importance and Trees

In [None]:
# Raw feature importances of the fitted classifier (XGBClassifier is the instance, not the class)
print(XGBClassifier.feature_importances_)

In [None]:
# Print each feature's importance next to its name.
# Names are taken from the mapper because the classifier receives the columns in the
# mapper's output order, which is not necessarily the column order of X_train.
feature_names = [feature[0] for feature in mapper.features]
for name, importance in zip(feature_names, XGBClassifier.feature_importances_):
    print('{0} importance = {1:.2f}'.format(name, importance))

In [None]:
# Plot the feature importances of the fitted classifier
from xgboost import plot_importance
plot_importance(XGBClassifier)

In [None]:
# Install the graphviz package (presumably needed for plotting individual trees; verify it is still used)
!pip install graphviz

# Investigate model

In [None]:
# Plot and display the performance evaluation
eval = model.named_steps['XGBClassifier'].evals_result()
eval_steps = range(len(eval['validation_0']['error']))

import matplotlib.pyplot as plt
%matplotlib inline
fig, ax = plt.subplots(1, 1, sharex=True, figsize=(8, 6))

ax.plot(eval_steps, [1-x for x in eval['validation_0']['error']], label='Train')
ax.plot(eval_steps, [1-x for x in eval['validation_1']['error']], label='Test')
ax.legend()
ax.set_title('Accuracy')
ax.set_xlabel('Number of iterations')

# Avoid Overfitting By Limiting Number of Trees

In [None]:
# ntree_limit caps the number of trees used for the prediction; it defaults to 0 (use all trees)
n_trees = 10
# Encode X_test with the already-fitted mapper. fit_transform would re-fit the encoders on
# the test data and overwrite the state of the mapper that the pipeline shares.
X_test_encoded = mapper.transform(X_test)
y_pred = model.named_steps['XGBClassifier'].predict(X_test_encoded, ntree_limit=n_trees)

In [None]:
# Check the accuracy of the predictions currently held in y_pred
accuracy = accuracy_score(y_test, y_pred)

print('Accuracy: {:.1f}%'.format(accuracy * 100.0))

# Avoid Overfitting By Early Stopping

In [None]:
# Validation error needs to decrease at least every <early_stopping_rounds> round(s) to continue training
# Returns the model from the last iteration (not the best one)
rounds = 10
steps = [('mapper', mapper), ('XGBClassifier', XGBClassifier)]
pipeline = sklearn.pipeline.Pipeline(steps)

# Build the evaluation sets with encoders fitted on the training data only.
# Calling fit_transform on X_test would re-fit the label encoders on the test set,
# so the test features could be encoded differently from the data the model is trained on.
mapper.fit(X_train)
eval_set = [(mapper.transform(X_train), y_train), (mapper.transform(X_test), y_test)]

model = pipeline.fit(X_train, y_train,
                     XGBClassifier__eval_metric='error',
                     XGBClassifier__early_stopping_rounds=rounds,
                     XGBClassifier__eval_set=eval_set)

In [None]:
# Show the best score and the iteration at which it was reached,
# as recorded on the classifier that was fitted with early stopping
print('Best Score: {:.3f}'.format(XGBClassifier.best_score))
print('Best Iteration: {}'.format(XGBClassifier.best_iteration))

In [None]:
# Show best number of trees (the label previously read 'Best Score', which did not match the value)
print('Best Number of Trees: {}'.format(XGBClassifier.best_ntree_limit))

In [None]:
# Check the accuracy of the trained model with early stopping.
# Predict again first: y_pred otherwise still holds the predictions that were made
# before the model was retrained with early stopping.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print('Accuracy: {:.1f}%.'.format(accuracy * 100.0))

# Plot Model Performance

In [None]:
# Print the confusion matrix for the predictions currently held in y_pred
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline  
# Plot the confusion matrix (cm) as a colour-coded grid with a colour bar
plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
# Plot the ROC curve
# roc_curve needs continuous scores to trace the curve across thresholds; hard 0/1
# predictions yield only a single operating point. Column 1 of predict_proba is the
# probability of the positive class.
y_score = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
# The model plotted here is the XGBoost pipeline, not a logistic regression
plt.title('ROC curve (XGBoost)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)

In [None]:
# Print out AUC, the fraction of the ROC plot that lies underneath the curve.
# The score is computed from the predicted probability of the positive class, because
# hard 0/1 predictions reduce the curve to a single threshold.
y_score = model.predict_proba(X_test)[:, 1]
print('AUC using XGBoost = {:.2f}'.format(roc_auc_score(y_test, y_score)))

# Save and Deploy Model

In [None]:
#Import WML API
from watson_machine_learning_client import WatsonMachineLearningAPIClient

In [None]:
# Watson Machine Learning service credentials.
# The API key and the password are secrets and are deliberately not stored in the notebook:
# set the WML_APIKEY and WML_PASSWORD environment variables before running this cell.
# Values that were ever saved in plain text should be treated as compromised and rotated.
import os

wml_credentials = {
  "apikey": os.environ.get("WML_APIKEY"),  # None when the variable is unset
  "iam_apikey_description": "Auto-generated for key 065ebbd2-9b64-4479-b0fe-e6c1d9054cd1",
  "iam_apikey_name": "wdp-writer",
  "iam_role_crn": "crn:v1:bluemix:public:iam::::serviceRole:Writer",
  "iam_serviceid_crn": "crn:v1:bluemix:public:iam-identity::a/d8ec35232337194375b709e39ae4e4e5::serviceid:ServiceId-5b9bf5c6-b42b-423b-a0ac-045ccb154d9e",
  "instance_id": "12199dc8-c416-4bce-b27e-f3cde36d59ac",
  "password": os.environ.get("WML_PASSWORD"),  # None when the variable is unset
  "url": "https://us-south.ml.cloud.ibm.com",
  "username": "065ebbd2-9b64-4479-b0fe-e6c1d9054cd1"
}

In [None]:
# Create the Watson Machine Learning client from the credentials dictionary defined above
wml_client = WatsonMachineLearningAPIClient(wml_credentials)

In [None]:
# Metadata stored alongside the model in the Watson Machine Learning repository
meta_names = wml_client.repository.ModelMetaNames
model_props = {
    meta_names.AUTHOR_NAME: "Rich Tarro",
    meta_names.AUTHOR_EMAIL: "mail@us.ibm.com",
    meta_names.NAME: "RetailChurnXGBoost",
}

# Save the fitted pipeline together with the training data it was built from
published_model = wml_client.repository.store_model(
    model=model,
    meta_props=model_props,
    training_data=X_train,
    training_target=y_train)

![IBM Logo](http://www-03.ibm.com/press/img/Large_IBM_Logo_TN.jpg)

Rich Tarro  
Analytics Technical Specialist  
email: rtarro@us.ibm.com
    
July 30, 2018