In [34]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'data-448'
bucket = s3.Bucket(bucket_name)

## Defining the file to be read from s3 bucket
file_key = 'Chapter6/Credit_Card.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the csv file
credit = pd.read_csv(file_content_stream)
credit = credit.drop(columns = ['ID'], axis = 1)
credit.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,20000.0,2,2,1,24,2,2,-1,-1,-2,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,120000.0,2,2,2,26,-1,2,0,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,90000.0,2,2,2,34,0,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,50000.0,2,2,1,37,0,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,50000.0,1,2,1,57,-1,0,-1,0,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [19]:
credit.columns

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default.payment.next.month'],
      dtype='object')

In [20]:
credit.shape

(30000, 24)

In [21]:
credit['default.payment.next.month'].value_counts() / credit.shape[0]

0    0.7788
1    0.2212
Name: default.payment.next.month, dtype: float64

## Splitting the data

In [22]:
## Defining the input and target variables
X = credit.drop(columns = ['default.payment.next.month'], axis = 1)
Y = credit['default.payment.next.month']

## Splitting the data into train and validation
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size = 0.2, stratify = Y)

## Splitting into validation and test
X_val, X_test, Y_val, Y_test = train_test_split(X_val, Y_val, test_size = 0.5, stratify = Y_val)

## Standardizing the data

In [23]:
## Loading minmax
scaler = MinMaxScaler()

## Transforming to 0-1 the inputs 
X_train = scaler.fit_transform(X_train)
X_val = scaler.fit_transform(X_val)
X_test = scaler.fit_transform(X_test)

## Buiding the base learners

In [29]:
## Decision tree
tree = DecisionTreeClassifier(max_depth = 3).fit(X_train, Y_train)
pred_tree = tree.predict_proba(X_val)[:, 1]

## Nearest-Neighbors
knn = KNeighborsClassifier(n_neighbors = 5).fit(X_train, Y_train)
pred_knn = knn.predict_proba(X_val)[:, 1]

## support vector machine 
svm = SVC(kernel = 'rbf', probability = True).fit(X_train, Y_train)
pred_svm = svm.predict_proba(X_val)[:, 1]

## Creating the new inputs and target

In [30]:
## Creating a data-frame with the base learners predictions
X_preds = pd.DataFrame({'tree': pred_tree, 'knn': pred_knn, 'svm': pred_svm})
X_preds.head()

Unnamed: 0,tree,knn,svm
0,0.11451,0.0,0.160942
1,0.11451,0.0,0.181479
2,0.11451,0.2,0.171468
3,0.215915,0.2,0.178212
4,0.215915,0.4,0.210031


## Building the meta-learner

In [31]:
## building the meta-learner
meta_learner = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_preds, Y_val)

## Predicting on the test datasets with base learners

In [33]:
## Predicting on the test datasets
test_pred_tree = tree.predict_proba(X_test)[:, 1]
test_pred_knn = knn.predict_proba(X_test)[:, 1]
test_pred_svm = svm.predict_proba(X_test)[:, 1]

## Creating a data-frame with base learners 
X_test_preds = pd.DataFrame({'tree': test_pred_tree, 'knn': test_pred_knn, 'svm': test_pred_svm})
X_test_preds.head()

Unnamed: 0,tree,knn,svm
0,0.11451,0.0,0.173227
1,0.11451,0.2,0.191382
2,0.11451,0.0,0.171401
3,0.11451,0.2,0.13862
4,0.11451,0.2,0.186879


In [36]:
## Changing likelihoods to labels
test_pred_tree_label = np.where(test_pred_tree < 0.25, 0, 1) 
test_pred_knn_label = np.where(test_pred_knn < 0.25, 0, 1)
test_pred_svm_label = np.where(test_pred_svm < 0.25, 0, 1)

## Classification report of decision tree
print(classification_report(Y_test, test_pred_tree_label))

## Classification report of knn
print(classification_report(Y_test, test_pred_knn_label))

## Classification report of svm
print(classification_report(Y_test, test_pred_svm_label))

              precision    recall  f1-score   support

           0       0.85      0.90      0.88      2337
           1       0.57      0.46      0.51       663

    accuracy                           0.80      3000
   macro avg       0.71      0.68      0.69      3000
weighted avg       0.79      0.80      0.80      3000

              precision    recall  f1-score   support

           0       0.85      0.78      0.81      2337
           1       0.40      0.52      0.45       663

    accuracy                           0.72      3000
   macro avg       0.63      0.65      0.63      3000
weighted avg       0.75      0.72      0.73      3000

              precision    recall  f1-score   support

           0       0.85      0.92      0.88      2337
           1       0.59      0.42      0.49       663

    accuracy                           0.81      3000
   macro avg       0.72      0.67      0.68      3000
weighted avg       0.79      0.81      0.79      3000



## Predicting on the test with meta-learner

In [37]:
## using the meta-learner to make predictions on test
meta_learner_pred = meta_learner.predict_proba(X_test_preds)[:, 1]

## Changing likelihood to labels
meta_learner_pred_label = np.where(meta_learner_pred < 0.25, 0, 1)

## Classification report of meta-learner
print(classification_report(Y_test, meta_learner_pred_label))

              precision    recall  f1-score   support

           0       0.86      0.88      0.87      2337
           1       0.55      0.49      0.52       663

    accuracy                           0.80      3000
   macro avg       0.70      0.69      0.70      3000
weighted avg       0.79      0.80      0.79      3000

