In [71]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from collections import OrderedDict
import seaborn as sns
from pandas.tools import plotting
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV

In [72]:
train = pd.read_csv('training.csv')
test = pd.read_csv('testing.csv')

In [73]:
train.head()


Unnamed: 0,ID,LIMIT_BAL,MARRIAGE,EDUCATION,SEX,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,TARGET
0,1,30000.0,1,1,2,40,24607.0,24430.0,23881.0,0
1,2,20000.0,1,1,2,41,17040.0,16474.0,17203.0,0
2,3,20000.0,1,1,2,36,17332.0,18600.0,17947.0,0
3,4,20000.0,1,1,2,35,2003.0,2948.0,3372.0,0
4,5,170000.0,1,1,2,38,171465.0,145196.0,110163.0,0


In [74]:
train = pd.get_dummies(train,columns = ["MARRIAGE","EDUCATION","SEX"], drop_first=True)

In [75]:
train

Unnamed: 0,ID,LIMIT_BAL,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,TARGET,MARRIAGE_2,MARRIAGE_3,EDUCATION_2,EDUCATION_3,EDUCATION_4,SEX_2
0,1,30000.0,40,24607.0,24430.0,23881.0,0,0,0,0,0,0,1
1,2,20000.0,41,17040.0,16474.0,17203.0,0,0,0,0,0,0,1
2,3,20000.0,36,17332.0,18600.0,17947.0,0,0,0,0,0,0,1
3,4,20000.0,35,2003.0,2948.0,3372.0,0,0,0,0,0,0,1
4,5,170000.0,38,171465.0,145196.0,110163.0,0,0,0,0,0,0,1
5,6,20000.0,40,16500.0,17012.0,18181.0,0,0,0,0,0,0,1
6,7,200000.0,32,157222.0,160320.0,162140.0,0,0,0,0,0,0,1
7,8,370000.0,37,47269.0,43531.0,59801.0,0,0,0,0,0,0,1
8,9,160000.0,39,106643.0,89397.0,90348.0,0,0,0,0,0,0,1
9,10,30000.0,36,25826.0,27274.0,25744.0,0,0,0,0,0,0,1


### TRAIN TEST

In [6]:
X = train.drop(['ID','TARGET',''],axis =1)
Y = train['TARGET']

In [7]:
train_X, test_X, train_y, test_y = train_test_split(X,Y,test_size = 0.2, random_state = 123)

### Modelling

In [8]:
logreg = LogisticRegression(random_state=123)
logreg.fit(train_X, train_y)
logregpred = logreg.predict(test_X)

In [9]:
cm = confusion_matrix(test_y,logregpred)
cm

array([[1463,  589],
       [ 631,  642]], dtype=int64)

In [10]:
accuracy = accuracy_score(test_y,logregpred)

In [15]:
coef = pd.concat([pd.DataFrame(X.columns), pd.DataFrame(np.transpose(logreg.coef_))], axis = 1)
logreg.intercept_
coef

Unnamed: 0,0,0.1
0,LIMIT_BAL,-4.668152e-06
1,AGE,0.009333653
2,BILL_AMT1,-2.488037e-06
3,BILL_AMT2,8.40616e-07
4,BILL_AMT3,1.434783e-06
5,MARRIAGE_2,-0.0001579935
6,MARRIAGE_3,3.252801e-09
7,EDUCATION_2,2.190826e-05
8,EDUCATION_3,0.000158184
9,EDUCATION_4,5.500223e-06


In [12]:
sensitivity = cm[1,1] / (cm[1,0] + cm[1,1])
specificity = cm[0,0] / (cm[0,0] + cm[0,1])

In [16]:
print('Confusion Matrix : \n', cm)
print ('Accuracy :', accuracy)
print('Sensitivy :', sensitivity)
print('Specificity :', specificity)

Confusion Matrix : 
 [[1463  589]
 [ 631  642]]
Accuracy : 0.6330827067669172
Sensitivy : 0.5043205027494109
Specificity : 0.7129629629629629


# DECISION TREE

In [40]:
decisiontree = DecisionTreeClassifier(random_state=123)
decisiontree.fit(train_X, train_y)
decisiontreepred = decisiontree.predict(train_X)

In [41]:
accuracy1 = accuracy_score(train_y,decisiontreepred)

In [42]:
cm1 = confusion_matrix(train_y,decisiontreepred)
cm
sensitivity1 = cm1[1,1] / (cm1[1,0] + cm1[1,1])
specificity1 = cm1[0,0] / (cm1[0,0] + cm1[0,1])

In [43]:
print('Confusion Matrix : \n', cm1)
print ('Accuracy :', accuracy1)
print('Sensitivy :', sensitivity1)
print('Specificity :', specificity1)

Confusion Matrix : 
 [[7936    1]
 [   8 5355]]
Accuracy : 0.9993233082706767
Sensitivy : 0.9985082975946299
Specificity : 0.9998740078115157


overfitting terlihat dari akurasi 0.99

In [21]:
features = train_X.columns

In [22]:
export_graphviz(decisiontree, out_file='D:/dt.dot', feature_names=features, class_names=['0','1'])

In [52]:
decisiontree = DecisionTreeClassifier(random_state=123)
decisiontree.fit(train_X, train_y)
decisiontreepred = decisiontree.predict(test_X)

accuracy1 = accuracy_score(test_y,decisiontreepred)
cm1 = confusion_matrix(test_y,decisiontreepred)
cm
sensitivity1 = cm1[1,1] / (cm1[1,0] + cm1[1,1])
specificity1 = cm1[0,0] / (cm1[0,0] + cm1[0,1])
print('Confusion Matrix : \n', cm1)
print ('Accuracy :', accuracy1)
print('Sensitivy :', sensitivity1)
print('Specificity :', specificity1)

Confusion Matrix : 
 [[1716  336]
 [ 292  981]]
Accuracy : 0.8111278195488721
Sensitivy : 0.7706205813040062
Specificity : 0.8362573099415205


### DT dengan set minimum leaf

decisiontree = DecisionTreeClassifier(min_samples_split=200, min_samples_leaf=70,random_state=123)
decisiontree.fit(train_X, train_y)
decisiontreepred = decisiontree.predict(test_X)

In [91]:
accuracy1 = accuracy_score(test_y,decisiontreepred)
cm1 = confusion_matrix(test_y,decisiontreepred)
cm
sensitivity1 = cm1[1,1] / (cm1[1,0] + cm1[1,1])
specificity1 = cm1[0,0] / (cm1[0,0] + cm1[0,1])

In [92]:
print('Confusion Matrix : \n', cm1)
print ('Accuracy :', accuracy1)
print('Sensitivy :', sensitivity1)
print('Specificity :', specificity1)

Confusion Matrix : 
 [[1715  337]
 [ 241 1032]]
Accuracy : 0.8261654135338345
Sensitivy : 0.8106834249803614
Specificity : 0.8357699805068226


## Feature Engineering

In [93]:
test = pd.get_dummies(test,columns = ["MARRIAGE","EDUCATION","SEX"], drop_first=True)

ValueError: labels ['MARRIAGE' 'EDUCATION' 'SEX'] not contained in axis

In [94]:
xtesting = test.drop('ID',1)

### Probability

In [95]:
dtpred = decisiontree.predict_proba(xtesting)
dtpred = pd.DataFrame(dtpred)
dtpred.columns = ['Prob0','Prob1']
dtpred.head()


Unnamed: 0,Prob0,Prob1
0,0.952381,0.047619
1,1.0,0.0
2,1.0,0.0
3,0.971429,0.028571
4,1.0,0.0


probabilitas digunakan untuk melihat ID mana yg memiliki prob tinggi untuk telat bayar. shg yg memiliki probabilitas tinggi adalah yg akan

### Class predicition

In [101]:
dtpred1 = decisiontree.predict(xtesting)
dtpred1 = pd.DataFrame(dtpred1)
dtpred1.columns = ['Class']
dtpred1.head()

Unnamed: 0,Class
0,0
1,0
2,0
3,0
4,0


In [102]:
mypredict = pd.concat([test,dtpred,dtpred1], axis=1)

In [104]:
mypredict.head()

Unnamed: 0,ID,LIMIT_BAL,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,MARRIAGE_2,MARRIAGE_3,EDUCATION_2,EDUCATION_3,EDUCATION_4,SEX_2,Prob0,Prob1,Class
0,1,30000.0,40,24430.0,23881.0,24234.0,0,0,0,0,0,1,0.952381,0.047619,0
1,2,20000.0,41,16474.0,17203.0,18411.0,0,0,0,0,0,1,1.0,0.0,0
2,3,20000.0,36,18600.0,17947.0,18758.0,0,0,0,0,0,1,1.0,0.0,0
3,4,20000.0,35,2948.0,3372.0,0.0,0,0,0,0,0,1,0.971429,0.028571,0
4,5,170000.0,38,145196.0,110163.0,112157.0,0,0,0,0,0,1,1.0,0.0,0


In [106]:
mypredict.to_csv('C:/Users/nourma059258/Documents/astra/06-06-18 Supervised Learning/mypredict.csv', index=False, header=True)