# Introduction - Customer Churn Prediction notebook
In this notebook, we illustrate how you can train a churn prediction model using scikit-learn. After training the model, you step through the instructions to deploy it using Watson Machine Learning.

This notebook is a variation of the original notebook referenced in this GitHub repo: https://github.com/elenalowery/cpd4_demo/blob/master/assets/jupyterlab/Predict_Customer_Churn_CPD4.ipynb


In [17]:
# Print a fixed marker message; it does not feed into any later computation in this notebook.
print("Validating notebook using validation data - minor change - 1")

Validating notebook using validation data - minor change - 1


In [3]:
# Install required Python modules
!pip install sklearn-pandas


Collecting sklearn-pandas
  Downloading sklearn_pandas-2.2.0-py2.py3-none-any.whl (10 kB)
Collecting scipy>=1.5.1
  Downloading scipy-1.7.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (39.3 MB)
[K     |████████████████████████████████| 39.3 MB 22.4 MB/s eta 0:00:01
Installing collected packages: scipy, sklearn-pandas
  Attempting uninstall: scipy
    Found existing installation: scipy 1.4.1
    Uninstalling scipy-1.4.1:
      Successfully uninstalled scipy-1.4.1
Successfully installed scipy-1.7.3 sklearn-pandas-2.2.0


## Step 1: Review Use Case
The analytics use case implemented in this notebook is telco churn prediction. It is a simple use case that illustrates a typical process for model development and deployment using Cloud Pak for Data.

In [4]:
# @hidden_cell
# Load a sample of the VALIDATION_DATA table into a pandas DataFrame through
# the project connection registered under the name "db2cloud".

from ibm_watson_studio_lib import access_project_or_space
wslib = access_project_or_space()

# Host, port, database and credentials are read from the project connection at
# run time rather than being written into the notebook.
db2cloud_metadata = wslib.get_connection("db2cloud")

import os, ibm_db, ibm_db_dbi as dbi, pandas as pd

# Positional fields fill DATABASE/HOSTNAME/PORT; the port falls back to 50000
# when the connection metadata does not provide one.
db2cloud_dsn = 'DATABASE={};HOSTNAME={};PORT={};PROTOCOL=TCPIP;UID={uid};PWD={pwd};SECURITY=SSL'.format(
    db2cloud_metadata['database'],
    db2cloud_metadata['host'],
    db2cloud_metadata.get('port', 50000),
    uid=db2cloud_metadata['username'],
    pwd=db2cloud_metadata['password']
)

db2cloud_connection = dbi.connect(db2cloud_dsn)

# NOTE:
#  A row limit has been applied to the query to enable sample previewing.
#  Change row_limit to adjust both the display message and the query together.
# display/HTML are imported from IPython.display; importing them from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
row_limit = 5000
display(HTML("A row limit of {0} has been applied to the query to enable sample previewing. If the data set is larger, only the first {0} rows will be loaded.".format(row_limit)))
query = 'SELECT * FROM "XCV64422"."VALIDATION_DATA" FETCH FIRST {} ROWS ONLY'.format(row_limit)

data_df_7 = pd.read_sql_query(query, con=db2cloud_connection)
data_df_7.head()

# After use, close the database connection with the following code:
# db2cloud_connection.close()


Unnamed: 0,COLUMN_0,ID,LONGDISTANCE,INTERNATIONAL,LOCAL,DROPPED,PAYMETHOD,LOCALBILLTYPE,LONGDISTANCEBILLTYPE,USAGE,RATEPLAN,GENDER,STATUS,CHILDREN,ESTINCOME,CAROWNER,AGE,CHURN
0,812,2944,8,0,22,0,CC,FreeLocal,Standard,31,1,F,M,2,80087.7,N,41.92,F
1,916,3194,1,0,29,0,CC,Budget,Standard,31,4,F,S,2,93559.1,N,57.62,F
2,339,1824,2,1,43,0,CH,Budget,Intnl_discount,46,2,M,M,0,78894.2,N,34.0,F
3,192,1451,1,0,83,0,CC,FreeLocal,Intnl_discount,84,1,F,M,2,98003.7,Y,26.273333,F
4,203,1469,59,2,38,0,CC,FreeLocal,Standard,62,4,F,S,2,42000.0,N,18.0,T


In [5]:
# Work on an independent copy called *data* so the frame loaded from the
# database stays untouched.
# Note: the generated load code may use a different variable name; this
# notebook assumes the loaded frame is data_df_7.
data = data_df_7.copy(deep=True)

In [6]:
# List all the columns of the working copy to see what is available before
# selecting the subset used for modelling.
print(data.columns)

Index(['COLUMN_0', 'ID', 'LONGDISTANCE', 'INTERNATIONAL', 'LOCAL', 'DROPPED',
       'PAYMETHOD', 'LOCALBILLTYPE', 'LONGDISTANCEBILLTYPE', 'USAGE',
       'RATEPLAN', 'GENDER', 'STATUS', 'CHILDREN', 'ESTINCOME', 'CAROWNER',
       'AGE', 'CHURN'],
      dtype='object')


In [7]:
# Keep only the columns that are relevant for churn prediction.
# COLUMN_0 is the one loaded column left out (presumably a leftover row
# index from the export -- confirm against the source table).
churn_columns = [
    'ID', 'LONGDISTANCE', 'INTERNATIONAL', 'LOCAL', 'DROPPED',
    'PAYMETHOD', 'LOCALBILLTYPE', 'LONGDISTANCEBILLTYPE', 'USAGE',
    'RATEPLAN', 'GENDER', 'STATUS', 'CHILDREN', 'ESTINCOME', 'CAROWNER',
    'AGE', 'CHURN',
]
# .copy() makes the subset an independent frame, so the later in-place
# encoding of CHURN does not trip pandas' SettingWithCopyWarning.
data = data[churn_columns].copy()
data.head()


Unnamed: 0,ID,LONGDISTANCE,INTERNATIONAL,LOCAL,DROPPED,PAYMETHOD,LOCALBILLTYPE,LONGDISTANCEBILLTYPE,USAGE,RATEPLAN,GENDER,STATUS,CHILDREN,ESTINCOME,CAROWNER,AGE,CHURN
0,2944,8,0,22,0,CC,FreeLocal,Standard,31,1,F,M,2,80087.7,N,41.92,F
1,3194,1,0,29,0,CC,Budget,Standard,31,4,F,S,2,93559.1,N,57.62,F
2,1824,2,1,43,0,CH,Budget,Intnl_discount,46,2,M,M,0,78894.2,N,34.0,F
3,1451,1,0,83,0,CC,FreeLocal,Intnl_discount,84,1,F,M,2,98003.7,Y,26.273333,F
4,1469,59,2,38,0,CC,FreeLocal,Standard,62,4,F,S,2,42000.0,N,18.0,T


## Step 2: Build the Random Forest model

In [8]:
# Imports and display/warning configuration for the modelling steps below.
import pandas as pd
import sklearn
# Show up to 999 columns when a DataFrame is displayed.
pd.options.display.max_columns = 999

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and pandas copy warnings.
warnings.filterwarnings('ignore')

from scipy.stats import chi2_contingency,ttest_ind
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_curve, roc_auc_score

import numpy as np

import urllib3, requests, json

In [9]:
# Convert CHURN to 1/0. LabelEncoder numbers classes in sorted order, so
# 'F' becomes 0 and 'T' becomes 1.
le = LabelEncoder()
# Replace the whole column instead of writing through data.loc[:, 'CHURN']:
# on recent pandas the .loc form sets values in place and can leave the
# column with its original object dtype instead of an integer dtype.
data['CHURN'] = le.fit_transform(data['CHURN'])
data.head()

Unnamed: 0,ID,LONGDISTANCE,INTERNATIONAL,LOCAL,DROPPED,PAYMETHOD,LOCALBILLTYPE,LONGDISTANCEBILLTYPE,USAGE,RATEPLAN,GENDER,STATUS,CHILDREN,ESTINCOME,CAROWNER,AGE,CHURN
0,2944,8,0,22,0,CC,FreeLocal,Standard,31,1,F,M,2,80087.7,N,41.92,0
1,3194,1,0,29,0,CC,Budget,Standard,31,4,F,S,2,93559.1,N,57.62,0
2,1824,2,1,43,0,CH,Budget,Intnl_discount,46,2,M,M,0,78894.2,N,34.0,0
3,1451,1,0,83,0,CC,FreeLocal,Intnl_discount,84,1,F,M,2,98003.7,Y,26.273333,0
4,1469,59,2,38,0,CC,FreeLocal,Standard,62,4,F,S,2,42000.0,N,18.0,1


In [10]:
# Separate the target (CHURN, as a float32 array) from the predictor columns.
y = np.asarray(data['CHURN'], dtype=np.float32)
x = data.drop(columns=['CHURN'])

In [11]:
# Display the predictor columns that remain after removing CHURN.
x.columns

Index(['ID', 'LONGDISTANCE', 'INTERNATIONAL', 'LOCAL', 'DROPPED', 'PAYMETHOD',
       'LOCALBILLTYPE', 'LONGDISTANCEBILLTYPE', 'USAGE', 'RATEPLAN', 'GENDER',
       'STATUS', 'CHILDREN', 'ESTINCOME', 'CAROWNER', 'AGE'],
      dtype='object')

In [12]:
# Encode the input features in numeric form where applicable: the listed
# categorical columns each get their own LabelEncoder, every other listed
# column is passed through unchanged (transformer None).
# NOTE(review): ID is not listed; with the mapper's default settings unlisted
# columns are presumably not forwarded to the model -- confirm.
from sklearn_pandas import DataFrameMapper

label_encoded_columns = {
    'GENDER', 'STATUS', 'CAROWNER',
    'PAYMETHOD', 'LOCALBILLTYPE', 'LONGDISTANCEBILLTYPE',
}

# Order matters: it fixes the column order of the array handed to the model.
feature_order = [
    'GENDER', 'STATUS', 'CHILDREN', 'ESTINCOME', 'CAROWNER', 'AGE',
    'LONGDISTANCE', 'INTERNATIONAL', 'LOCAL', 'DROPPED',
    'PAYMETHOD', 'LOCALBILLTYPE', 'LONGDISTANCEBILLTYPE',
    'USAGE', 'RATEPLAN',
]

mapper = DataFrameMapper(
    [
        (column, LabelEncoder() if column in label_encoded_columns else None)
        for column in feature_order
    ]
)

In [13]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the churn /
# non-churn proportions the same in both partitions, and the fixed seed makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [14]:
# Fit the model: a two-step pipeline that encodes the features with the
# mapper and then trains a random forest on the result.

import sklearn.pipeline
from sklearn.preprocessing import OneHotEncoder

# Seed the forest (same seed as the train/test split) so that re-running the
# notebook yields the same trees and therefore the same metrics.
random_forest = RandomForestClassifier(random_state=42)
# NOTE(review): the step name is misspelled ('Randon...'); it is left as is
# because code elsewhere may look the step up by this name.
steps = [('mapper', mapper),('RandonForestClassifier', random_forest)]
pipeline = sklearn.pipeline.Pipeline(steps)
# fit() returns the pipeline itself, so `model` and `pipeline` are the same object.
model = pipeline.fit(X_train, y_train)
model

Pipeline(steps=[('mapper',
                 DataFrameMapper(drop_cols=[],
                                 features=[('GENDER', LabelEncoder()),
                                           ('STATUS', LabelEncoder()),
                                           ('CHILDREN', None),
                                           ('ESTINCOME', None),
                                           ('CAROWNER', LabelEncoder()),
                                           ('AGE', None),
                                           ('LONGDISTANCE', None),
                                           ('INTERNATIONAL', None),
                                           ('LOCAL', None), ('DROPPED', None),
                                           ('PAYMETHOD', LabelEncoder()),
                                           ('LOCALBILLTYPE', LabelEncoder()),
                                           ('LONGDISTANCEBILLTYPE',
                                            LabelEncoder()),
                               

In [15]:
# Score the held-out rows with the fitted pipeline.
y_prediction = pipeline.predict(X_test)

# Compare predictions with the true labels (per-class precision, recall,
# F1 and support) and print the resulting text report.
report = sklearn.metrics.classification_report(
    y_true=y_test,
    y_pred=y_prediction,
)
print(report)

              precision    recall  f1-score   support

         0.0       0.92      0.96      0.94        57
         1.0       0.94      0.86      0.90        37

    accuracy                           0.93        94
   macro avg       0.93      0.91      0.92        94
weighted avg       0.93      0.93      0.92        94

