In [1]:
#Checking where your current working directory is
import os
os.getcwd()

'/home/isayapin'

In [2]:
#Change your working directory to where your 'ccdef.csv' dataset is
os.chdir('/home/isayapin/Desktop/Bootcamp_3')

In [3]:
#Check that the directory is changed correctly
os.getcwd()

'/home/isayapin/Desktop/Bootcamp_3'

In [4]:
#Importing the main libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [5]:
# If there is any import error, please run the following 4 lines without the # sign
# !pip install pandas
# !pip install numpy
# !pip install matplotlib
# !pipinstall sklearn

In [6]:
#Importing the dataset
dataset = pd.read_csv('ccdef.csv', index_col = 'ID')

In [7]:
dataset.head()

Unnamed: 0_level_0,default.payment.next.month,SEX,EDUCATION,MARRIAGE,AGE,LIMIT_BAL,PAY_1,PAY_3,PAY_5,BILL_AMT1,BILL_AMT3,BILL_AMT5,PAY_AMT1,PAY_AMT3,PAY_AMT5
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,1,Female,University,1,24,20000.0,2,-1,-2,3913.0,689.0,0.0,0.0,0.0,0.0
1,1,Female,University,0,26,120000.0,-1,0,0,2682.0,2682.0,3455.0,0.0,1000.0,0.0
2,0,Female,University,0,34,90000.0,0,0,0,29239.0,13559.0,14948.0,1518.0,1000.0,1000.0
3,0,Female,University,1,37,50000.0,0,0,0,46990.0,49291.0,28959.0,2000.0,1200.0,1069.0
4,0,Male,University,1,57,50000.0,-1,-1,0,8617.0,35835.0,19146.0,2000.0,10000.0,689.0


In [8]:
#default.payment.next.month: Default payment (1=yes, 0=no)
#SEX: {Male, Female}
#EDUCATION: {Graduate School, University, High School, Others, Unknown}
#MARRIAGE: Marital status (1=married, 0=single)
#AGE: Age in years

#LIMIT_BAL: Amount of given credit in NT dollars (includes individual and family/supplementary credit
#PAY_1: Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 
#2=payment delay for two months, ... 8=payment delay for eight months, 9=payment delay for nine months and above)
#PAY_3: Repayment status in July, 2005 (scale same as above)
#PAY_5: Repayment status in May, 2005 (scale same as above)

#BILL_AMT1: Amount of bill statement in September, 2005 (NT dollar)
#BILL_AMT3: Amount of bill statement in July, 2005 (NT dollar)
#BILL_AMT5: Amount of bill statement in May, 2005 (NT dollar)

#PAY_AMT1: Amount of previous payment in September, 2005 (NT dollar)
#PAY_AMT3: Amount of previous payment in July, 2005 (NT dollar)
#PAY_AMT5: Amount of previous payment in May, 2005 (NT dollar)

In [9]:
#Dealing with the dummy variables
#drop_first = True to avoid dummy variable trap
dataset = pd.get_dummies(dataset, columns = ['SEX', 'EDUCATION'], drop_first = True)
dataset.head()

Unnamed: 0_level_0,default.payment.next.month,MARRIAGE,AGE,LIMIT_BAL,PAY_1,PAY_3,PAY_5,BILL_AMT1,BILL_AMT3,BILL_AMT5,PAY_AMT1,PAY_AMT3,PAY_AMT5,SEX_Male,EDUCATION_High School,EDUCATION_Other,EDUCATION_University,EDUCATION_Unknown
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,1,1,24,20000.0,2,-1,-2,3913.0,689.0,0.0,0.0,0.0,0.0,0,0,0,1,0
1,1,0,26,120000.0,-1,0,0,2682.0,2682.0,3455.0,0.0,1000.0,0.0,0,0,0,1,0
2,0,0,34,90000.0,0,0,0,29239.0,13559.0,14948.0,1518.0,1000.0,1000.0,0,0,0,1,0
3,0,1,37,50000.0,0,0,0,46990.0,49291.0,28959.0,2000.0,1200.0,1069.0,0,0,0,1,0
4,0,1,57,50000.0,-1,-1,0,8617.0,35835.0,19146.0,2000.0,10000.0,689.0,1,0,0,1,0


In [10]:
#Bucketing the age groups

#'Young': 21-33 y.o.
#'Adults': 34-45 y.o.
#'Mature': 46-60 y.0.
#'Elderly': 61+ y.o.

In [11]:
dataset['AGE'].describe()

count    15000.000000
mean        35.403733
std          9.301267
min         21.000000
25%         28.000000
50%         34.000000
75%         41.000000
max         75.000000
Name: AGE, dtype: float64

In [12]:
#Splitting into age groups 
#Drop one group to avoid dummy variable trap
dataset['young'] = ((dataset['AGE'] >= 21) & (dataset['AGE'] <= 35)).astype('int')
dataset['adults'] = ((dataset['AGE'] >= 34) & (dataset['AGE'] <= 45)).astype('int')
dataset['mature'] = ((dataset['AGE'] >= 46) & (dataset['AGE'] <= 60)).astype('int')
del dataset['AGE']

In [13]:
dataset.head()

Unnamed: 0_level_0,default.payment.next.month,MARRIAGE,LIMIT_BAL,PAY_1,PAY_3,PAY_5,BILL_AMT1,BILL_AMT3,BILL_AMT5,PAY_AMT1,PAY_AMT3,PAY_AMT5,SEX_Male,EDUCATION_High School,EDUCATION_Other,EDUCATION_University,EDUCATION_Unknown,young,adults,mature
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,1,1,20000.0,2,-1,-2,3913.0,689.0,0.0,0.0,0.0,0.0,0,0,0,1,0,1,0,0
1,1,0,120000.0,-1,0,0,2682.0,2682.0,3455.0,0.0,1000.0,0.0,0,0,0,1,0,1,0,0
2,0,0,90000.0,0,0,0,29239.0,13559.0,14948.0,1518.0,1000.0,1000.0,0,0,0,1,0,1,1,0
3,0,1,50000.0,0,0,0,46990.0,49291.0,28959.0,2000.0,1200.0,1069.0,0,0,0,1,0,0,1,0
4,0,1,50000.0,-1,-1,0,8617.0,35835.0,19146.0,2000.0,10000.0,689.0,1,0,0,1,0,0,0,1


In [14]:
#Splitting data into dependent and independent variables 
X = dataset.iloc[:, 1:].values
y = dataset.iloc[:, 0].values

In [15]:
print(dataset.shape)

(15000, 20)


In [16]:
print(X.shape)
print(y.shape)

(15000, 19)
(15000,)


In [17]:
#Splitting data into training and testing (validation) sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [18]:
#Feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [19]:
#Fitting data to Support Vector Classifier (SVC)
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=0,
  shrinking=True, tol=0.001, verbose=False)

In [20]:
#Evaluating the performance of the classifier
y_pred = classifier.predict(X_test)
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
print('Accuracy = %.3f' % accuracy_score(y_pred, y_test))

Accuracy = 0.772


In [21]:
#Is this accuracy good? Let's find out
np.unique(y_pred, return_counts=True)

(array([0]), array([3000]))

In [22]:
#Look at how the classifier predicted 
print('Confusion Matrix = \n', confusion_matrix(y_pred, y_test))
print('F_1 Score = %.3f' % f1_score(y_pred, y_test))

Confusion Matrix = 
 [[2317  683]
 [   0    0]]
F_1 Score = 0.000


  'recall', 'true', average, warn_for)


In [23]:
#Time to improve our model
#Fitting data to Support Vector Classifier (SVC)
classifier = SVC(kernel = 'poly', random_state = 0)
classifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='poly', max_iter=-1, probability=False, random_state=0,
  shrinking=True, tol=0.001, verbose=False)

In [24]:
#Evaluating the performance of the classifier
y_pred = classifier.predict(X_test)
print('Accuracy = %.3f' % accuracy_score(y_pred, y_test))

Accuracy = 0.788


In [25]:
#Look at how the classifier predicted 
print('Confusion Matrix = \n', confusion_matrix(y_pred, y_test))
print('F_1 Score = %.3f' % f1_score(y_pred, y_test))

Confusion Matrix = 
 [[2268  586]
 [  49   97]]
F_1 Score = 0.234


In [26]:
#Slightly better, but can still be improved
#Fitting data to Support Vector Classifier (SVC)
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=0,
  shrinking=True, tol=0.001, verbose=False)

In [27]:
#Evaluating the performance of the classifier
y_pred = classifier.predict(X_test)
print('Accuracy = %.3f' % accuracy_score(y_pred, y_test))

Accuracy = 0.811


In [28]:
#Look at how the classifier predicted 
print('Confusion Matrix = \n', confusion_matrix(y_pred, y_test))
print('F_1 Score = %.3f' % f1_score(y_pred, y_test))

Confusion Matrix = 
 [[2238  487]
 [  79  196]]
F_1 Score = 0.409


In [29]:
#Hyperparameter tuning using Grid Search
from sklearn.model_selection import GridSearchCV
parameters = [{'C': [1, 2, 5, 10], 'gamma': ['auto', 'scale']}]

In [30]:
grid_search = GridSearchCV(estimator = classifier, param_grid = parameters, scoring = 'f1', cv = 5, n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train)



In [35]:
print('Best Parameters = %s' % grid_search.best_params_)

Best Parameters = {'C': 2, 'gamma': 'auto'}


In [36]:
#Fit the model with the improved hyperparameters
classifier = SVC(kernel = 'rbf', C = 2, gamma = 'auto', random_state = 0)
classifier.fit(X_train, y_train)

SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [37]:
#Evaluating the performance of the classifier
y_pred = classifier.predict(X_test)
print('Accuracy = %.3f' % accuracy_score(y_pred, y_test))

Accuracy = 0.812


In [38]:
#Look at how the classifier predicted 
print('Confusion Matrix = \n', confusion_matrix(y_pred, y_test))
print('F_1 Score = %.3f' % f1_score(y_pred, y_test))

Confusion Matrix = 
 [[2238  485]
 [  79  198]]
F_1 Score = 0.413
