# Credit Card Approval


Heba El-Shimy  
IBM **Cloud** Developer Advocate


<sub>GitHub: HebaNAS</sub>  
<sub>Twitter: @heba_el_shimy</sub>

# Pipeline

### 1. Loading Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing, svm
from itertools import combinations
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder, StandardScaler
import sklearn.feature_selection
from sklearn.model_selection import train_test_split
from collections import defaultdict
from sklearn import metrics

### 2. Loading Our Dataset

In [None]:
# Insert your pandas DataFrame here and name it `applicants`; the cells below refer to its columns by integer label (e.g. 1, 13, 15)


In [None]:
# Checking that everything is correct
pd.set_option('display.max_columns', 30)
applicants.head(10)

### 3. Get some info about our Dataset and whether we have missing values

In [None]:
# After running this cell we will see that we have no missing values
applicants.info()

In [None]:
# Convert columns with numbers as values but object as datatype into numeric
cols = [1, 13]

# Set error level to coerce so any string value will be replaced with NaN
applicants[cols] = applicants[cols].apply(pd.to_numeric, errors='coerce')
applicants.head(10)

In [None]:
# Check if we have any NaN values
applicants.isnull().values.any()

In [None]:
# Handle missing values using scikit-learn's SimpleImputer
# (`sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22)
from sklearn.impute import SimpleImputer

# Define the values to replace (NaN) and the strategy for choosing the
# replacement value (the mean of each column)
imp = SimpleImputer(missing_values=np.nan, strategy="mean")

applicants[cols] = imp.fit_transform(applicants[cols])
applicants.head(10)

In [None]:
# Check if we have any NaN values
applicants.isnull().values.any()

In [None]:
applicants.info()

### 4. Descriptive analytics for our data

In [None]:
# Describe columns with numerical values, displayed with 3 decimal places.
# Use the fully-qualified option name: the short form 'precision' matches more
# than one option on newer pandas releases and is rejected as ambiguous.
pd.set_option('display.precision', 3)
applicants.describe()

In [None]:
# Find pairwise Pearson correlations between the numeric columns.
# Non-numeric (object) columns are excluded explicitly: pandas >= 2.0 no longer
# drops them silently and raises on string data instead.
applicants.select_dtypes(include='number').corr(method='pearson')

### 5. Visualize our Data to understand it better

#### Plot Relationships

In [None]:
# Create Grid for pairwise relationships, coloured by the values of column 15.
# NOTE: seaborn 0.9 renamed the `size` argument to `height`.
gr = sns.PairGrid(applicants, height=5, hue=15)
gr = gr.map_diag(plt.hist)        # histograms on the diagonal
gr = gr.map_offdiag(plt.scatter)  # scatter plots everywhere else
gr = gr.add_legend()

#### Understand Data Distribution

In [None]:
# Set up plot size
fig, ax = plt.subplots(figsize=(20,10))

# Attribute distributions
a = sns.boxplot(orient="v", palette="hls", data=applicants.iloc[:, :13], fliersize=14)

In [None]:
# Distribution of the values in column 1
histogram = sns.distplot(applicants.iloc[:, 1], hist=True)
plt.show()

### 6. Encode string values in data into numerical values

In [None]:
# Use pandas get_dummies
applicants_encoded = pd.get_dummies(applicants)
applicants_encoded.head(10)

### 7. Create Training Set and Labels 

In [None]:
# Create training data for non-preprocessed approach
X_npp = applicants_encoded.iloc[:, :-2]
pd.DataFrame(X_npp).head(10)

In [None]:
# Create training data that will undergo preprocessing.
# Take an explicit copy: X is modified in place further down (outlier removal,
# imputation) and those changes must not leak into applicants_encoded or X_npp.
X = applicants_encoded.iloc[:, :-2].copy()
X.head()

In [None]:
# Extract labels
from sklearn.preprocessing import LabelEncoder

# Split last column from original dataset as the labels column
y = applicants[15]

# Apply encoder to transform strings to numeric values 0 and 1
le = LabelEncoder().fit(y)

y_enc = le.transform(y)
pd.DataFrame(y_enc).head(10)

### 8. Detect outliers in numerical values

In [None]:
# Detect outliers using the interquartile-range (IQR) method and replace them with NaN
def find_outliers(df):
    """Replace IQR outliers in a numeric pandas Series with NaN, in place.

    A value counts as an outlier when it lies more than 1.5 * IQR below the
    first quartile or more than 1.5 * IQR above the third quartile.

    Parameters
    ----------
    df : pandas.Series
        Numeric column to clean. The Series is modified in place; pass a
        float Series so that NaN can be stored without a dtype change.

    Returns
    -------
    pandas.Series
        The same object that was passed in, with outliers set to NaN.
    """
    # nanpercentile ignores NaNs that are already present; np.percentile
    # would return NaN bounds for such a column and flag nothing.
    quartile_1, quartile_3 = np.nanpercentile(df, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5)
    upper_bound = quartile_3 + (iqr * 1.5)

    # A boolean mask avoids the label-vs-position ambiguity of indexing a
    # Series with a list of integers.
    is_outlier = (df < lower_bound) | (df > upper_bound)

    # Only write when there is something to blank out, so a clean column
    # keeps its original dtype.
    if is_outlier.any():
        # np.nan is the canonical name; the np.NaN alias was removed in NumPy 2.0
        df.loc[is_outlier] = np.nan

    return df

In [None]:
# Find outliers in first column (continuous values)
print(find_outliers(X[1]))

In [None]:
# Find outliers in first column (continuous values)
print(find_outliers(X[2]))

In [None]:
# Find outliers in first column (continuous values)
print(find_outliers(X[7]))

In [None]:
# Find outliers in first column (continuous values)
print(find_outliers(X[10]))

In [None]:
# Find outliers in first column (continuous values)
print(find_outliers(X[13]))

In [None]:
# Find outliers in first column (continuous values)
print(find_outliers(X[14]))

In [None]:
# Check for null values
X.isnull().values.any()

In [None]:
# Fill the NaNs left by outlier removal with the mean of each column.
# (`sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22.)
from sklearn.impute import SimpleImputer

suspected_cols = [1, 2, 7, 10, 13, 14]
imp = SimpleImputer(missing_values=np.nan, strategy="mean")

# Assign into X itself: writing into a temporary pd.DataFrame(X) wrapper is
# not guaranteed to update X.
X[suspected_cols] = imp.fit_transform(X[suspected_cols])
X.head(10)

In [None]:
# Check for null values
pd.DataFrame(X).isnull().values.any()

### 9. Feature Engineering

In [None]:
# Select best features
select = sklearn.feature_selection.SelectKBest(k=20)
selected_features = select.fit(X, y_enc)
indexes = selected_features.get_support(indices=True)
col_names_selected = [pd.DataFrame(X).columns[i] for i in indexes]

X_selected = pd.DataFrame(X)[col_names_selected]
pd.DataFrame(X_selected).head(10)

### 10. Split our dataset into train and test datasets

#### Split non-preprocessed data

In [None]:
X_train_npp, X_test_npp, y_train_npp, y_test_npp = train_test_split(X_npp, y_enc,\
                                                    test_size=0.3, random_state=42)
print(X_train_npp.shape, y_train_npp.shape)
print(X_test_npp.shape, y_test_npp.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_selected, y_enc,\
                                                    test_size=0.3, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

### 11. Scale our data

In [None]:
# Use StandardScaler
scaler = preprocessing.StandardScaler().fit(X_train, y_train)
X_train_scaled = scaler.transform(X_train)

pd.DataFrame(X_train_scaled, columns=pd.DataFrame(X_train).columns).head()

In [None]:
pd.DataFrame(y_train).head()

### 12. Start building a classifier

#### Logistic Regression on non-preprocessed data

In [None]:
from sklearn.linear_model import LogisticRegression

clf_lr_npp = LogisticRegression()
clf_lr_npp.fit(X_train_npp, y_train_npp)

#### Logistic Regression on preprocessed data

In [None]:
from sklearn.linear_model import LogisticRegression

clf_lr = LogisticRegression()
model = clf_lr.fit(X_train_scaled, y_train)
model

### 13. Evaluate our model

In [None]:
# Use the scaler fit on trained data to scale our test data
X_test_scaled = scaler.transform(X_test)
pd.DataFrame(X_test_scaled, columns=pd.DataFrame(X_train).columns).head()

#### Evaluate Logistic Regression on non-preprocessed data

In [None]:
y_score_lr_npp = clf_lr_npp.decision_function(X_test_npp)
y_score_lr_npp

In [None]:
# Get accuracy score
from sklearn.metrics import accuracy_score

y_pred_lr_npp = clf_lr_npp.predict(X_test_npp)
acc_lr_npp = accuracy_score(y_test_npp, y_pred_lr_npp)
print(acc_lr_npp)

In [None]:
# Get Precision vs. Recall score
from sklearn.metrics import average_precision_score

average_precision_lr_npp = average_precision_score(y_test_npp, y_score_lr_npp)

print('Average precision-recall score: {0:0.2f}'.format(
      average_precision_lr_npp))

#### Evaluate Logistic Regression on preprocessed data

In [None]:
y_score_lr = clf_lr.decision_function(X_test_scaled)
y_score_lr

In [None]:
y_pred_lr = clf_lr.predict(X_test_scaled)
acc_lr = accuracy_score(y_test, y_pred_lr)
print(acc_lr)

In [None]:
average_precision_lr = average_precision_score(y_test, y_score_lr)

print('Average precision-recall score: {0:0.2f}'.format(
      average_precision_lr))

### 14. ROC Curve and models comparisons

In [None]:
# Plot the ROC curves of both logistic regression models on one figure
plt.figure(0, figsize=(15,10)).clf()

# Model trained on the non-preprocessed data
fpr_lr_npp, tpr_lr_npp, thresh_lr_npp = metrics.roc_curve(y_test_npp, y_score_lr_npp)
auc_lr_npp = metrics.roc_auc_score(y_test_npp, y_score_lr_npp)
plt.plot(fpr_lr_npp, tpr_lr_npp, label="Logistic Regression on Non-preprocessed Data, auc=" + str(auc_lr_npp))

# Model trained on the preprocessed (selected + scaled) data
fpr_lr, tpr_lr, thresh_lr = metrics.roc_curve(y_test, y_score_lr)
auc_lr = metrics.roc_auc_score(y_test, y_score_lr)
plt.plot(fpr_lr, tpr_lr, label="Logistic Regression on Preprocessed Data, auc=" + str(auc_lr))

plt.legend(loc=0)
# roc_curve returns rates in [0, 1], not raw counts, so label the axes as rates
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

#### Bonus: Deploy model on the cloud using IBM Watson Machine Learning

We have our model, but we want to use it through multiple apps. A solution is to deploy it on the cloud as an endpoint (url) and send data collected from a web/mobile app as a REST API call with data sent in the form of a JSON request.

In [None]:
# Insert your credentials here

wml_credentials = {

}

In [None]:
from watson_machine_learning_client import WatsonMachineLearningAPIClient
client = WatsonMachineLearningAPIClient( wml_credentials )

In [None]:
# Confirm that an API key was supplied without echoing the secret itself:
# printed values are stored in the notebook output and shared with the file.
print('apikey found' if wml_credentials.get('apikey') else 'apikey missing')

In [None]:
import requests

# Exchange the API key for an IAM access token
url     = "https://iam.bluemix.net/oidc/token"
headers = { "Content-Type" : "application/x-www-form-urlencoded" }
# Pass the form fields as a dict so that requests URL-encodes them; a key that
# contains reserved characters would corrupt a hand-concatenated body.
data    = {
    "apikey": wml_credentials['apikey'],
    "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
}
IBM_cloud_IAM_uid = "bx"
IBM_cloud_IAM_pwd = "bx"
# A timeout keeps the cell from hanging forever if the endpoint is unreachable
response  = requests.post( url, headers=headers, data=data, auth=( IBM_cloud_IAM_uid, IBM_cloud_IAM_pwd ), timeout=30 )
# Surface HTTP errors directly instead of a KeyError on "access_token" below
response.raise_for_status()
iam_token = response.json()["access_token"]

In [None]:
# Save your model

model_props = {client.repository.ModelMetaNames.AUTHOR_NAME: "Your name", 
               client.repository.ModelMetaNames.NAME: "Credit Card Approval Model"}

In [None]:
# Publish model in Watson Machine Learning repository on Cloud
published_model = client.repository.store_model(model=model, meta_props=model_props, \
                                                training_data=X_train_scaled, training_target=y_train)

In [None]:
# Create model deployment

published_model_uid = client.repository.get_model_uid(published_model)
created_deployment = client.deployments.create(published_model_uid, "Deployment of Credit Card Approval Model")

In [None]:
# Get Scoring URL
scoring_endpoint = client.deployments.get_scoring_url(created_deployment)

print(scoring_endpoint)

In [None]:
import json

# Get model details and expected input
model_details = client.repository.get_details(published_model_uid)
print(json.dumps(model_details, indent=2))

### Sending data to the model
Sending new data (which may be collected from a web/mobile app) in the format the model is expecting, as shown above.
We get back a response with the predicted class (0 - Credit Card Application will be rejected)
and the probabilities of both classes (0, or Application Rejection, has a probability of 1, which is very high; 1, or Application Acceptance, has a probability of 5.096701256722081e-98, which is very low). This gives us an idea of the model's confidence in its predictions.

![postman](https://github.com/HebaNAS/IBM-Watson-Studio-Enablement/blob/master/02-CreditCardApprovalModel/imgs/API-Call.jpg?raw=true)

## References:

#### <a name="first" id="first"></a><sub>[1] https://www.sciencedirect.com/science/article/abs/pii/S0148296318301231 "Customer churn prediction in telecommunication industry using data certainty"</sub>  
#### <a name="second" id="second"></a><sub>[2] https://www.signal.co/blog/understanding-customer-churn/ "10 Stats Expose the Real Connection Between Customer Experience and Customer Churn"</sub>  
#### <a name="third" id="third"></a><sub>[3] https://www.pinterest.com/pin/456904324667676431/ "Mobile Telco Churn Infographic"</sub>  
#### <sub>[4] https://pandas.pydata.org/pandas-docs/stable/ "Pandas Documentation"</sub>  
#### <sub>[5] http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Imputer.html "Scikit-Learn Imputer"</sub>  
#### <sub>[6] https://github.com/ibm-watson-data-lab/pixiedust/wiki/Tutorial:-Extending-the-PixieDust-Visualization "PixieDust Documentation"</sub>
#### <sub>[7] https://seaborn.pydata.org/ "Seaborn Documentation"</sub>
#### <sub>[8] http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html#sklearn.preprocessing.LabelEncoder "Scikit-Learn LabelEncoder"</sub>
#### <sub>[9] http://colingorrie.github.io/outlier-detection.html "Outlier Detection Methods"</sub>
#### <sub>[10] http://scikit-learn.org/stable/auto_examples/linear_model/plot_polynomial_interpolation.html#sphx-glr-auto-examples-linear-model-plot-polynomial-interpolation-py "Scikit-Learn Polynomial"</sub>
#### <sub>[11] http://scikit-learn.org/stable/modules/feature_selection.html "Scikit-Learn Feature Selection"</sub>
#### <sub>[12] http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler "Scikit-Learn StandardScaler"</sub>
#### <sub>[13] http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC "Scikit-Learn SVC"</sub>
#### <sub>[14] http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression "Scikit-Learn Logistic Regression"</sub>
#### <sub>[15] http://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html "Scikit-Learn MLP Classifier"</sub>
#### <sub>[16] http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html#sklearn.metrics.accuracy_score "Scikit-Learn Accuracy Score"</sub>
#### <sub>[17] http://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html#sklearn.metrics.average_precision_score "Scikit-Learn Average Precision Score"</sub>
#### <sub>[18] https://www.sciencedirect.com/science/article/pii/S016786550500303X "An introduction to ROC analysis"</sub>
#### <sub>[19] https://wml-api-pyclient.mybluemix.net/ "Watson Machine Learning Client Documentation"</sub>
#### <sub>[20] https://dataplatform.ibm.com/docs/content/analyze-data/ml-deploy-notebook.html?context=analytics "IBM Watson Studio Documentation-Deploy a model from a notebook"</sub>