In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, scale
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn import metrics


%matplotlib inline

In [2]:
data = pd.read_csv('train.csv')

In [3]:
def preprocessing_data(data, unique_home_ownership=None, unique_verification=None, dict_grade=None,dict_sub_grade=None, dict_emp_title=None, dict_pymnt_plan=None, dict_purpose=None):
    numeric_data = data.drop(['record_id', 'earliest_cr_line', 'issue_d', 'addr_state', 'zip_code'], axis=1)
    
    numeric_data['term'] = numeric_data['term'].map({' 36 months': 0, ' 60 months': 1})
    numeric_data['application_type'] = numeric_data['application_type'].map({'INDIVIDUAL': 0, 'JOINT': 1})
    numeric_data['initial_list_status'] = numeric_data['initial_list_status'].map({'f': 0, 'w': 1})
    numeric_data['emp_length'] = numeric_data['emp_length'].map({'< 1 year': 1, '1 year': 2, '2 years': 3,  '3 years': 4,  '4 years': 5,  '5 years': 6,  '6 years': 7,  '7 years': 8,  '8 years': 9,  '9 years': 10,  '10+ years': 11})

    numeric_data['emp_length'].fillna(0, inplace=True)
    numeric_data['emp_title'].fillna('0', inplace=True)
    numeric_data['mths_since_last_delinq'].fillna(numeric_data['mths_since_last_delinq'].notnull().min(), inplace=True)
    numeric_data['collections_12_mths_ex_med'].fillna(numeric_data['collections_12_mths_ex_med'].notnull().max(), inplace=True)
    numeric_data['revol_util'].fillna(numeric_data['revol_util'].notnull().mean(), inplace=True)
    numeric_data['tot_coll_amt'].fillna(numeric_data['tot_coll_amt'].notnull().min(), inplace=True)
    numeric_data['tot_cur_bal'].fillna(numeric_data['tot_cur_bal'].notnull().min(), inplace=True)
    numeric_data['total_rev_hi_lim'].fillna(numeric_data['total_rev_hi_lim'].notnull().min(), inplace=True)
  
    home_ownership_list = numeric_data.home_ownership.values.reshape(-1, 1)

    numeric_data = pd.concat((numeric_data,pd.get_dummies(numeric_data.home_ownership)),1)
    
    if unique_home_ownership is None:
        unique_home_ownership = numeric_data['home_ownership'].unique()
        
    for i in unique_home_ownership:
        try:
            numeric_data[i]
        except KeyError:
            numeric_data[i] = 0
    
    verification_list = numeric_data.verification_status.values.reshape(-1, 1)

    numeric_data = pd.concat((numeric_data,pd.get_dummies(numeric_data.verification_status)),1)
    
    if unique_verification is None:
        unique_verification = numeric_data['verification_status'].unique()
        
    for i in unique_verification:
        try:
            numeric_data[i]
        except KeyError:
            numeric_data[i] = 0
    
    le = LabelEncoder()

    if dict_grade is None:
        le.fit(numeric_data.grade.astype(str))
        numeric_data['grade_le'] = le.transform((numeric_data['grade'].values))
        dict_grade = dict(zip(le.classes_, le.transform(le.classes_)))
    else:
        numeric_data['grade_le'] = numeric_data['grade'].map(dict_grade).fillna(-1)  

    if dict_sub_grade is None:
        le.fit(numeric_data.sub_grade.astype(str))
        numeric_data['sub_grade_le'] = le.transform(numeric_data['sub_grade'].values)
        dict_sub_grade = dict(zip(le.classes_, le.transform(le.classes_)))
    else:
        numeric_data['sub_grade_le'] = numeric_data['sub_grade'].map(dict_sub_grade).fillna(-1)  
    
    if dict_emp_title is None:
        le.fit(numeric_data.emp_title.astype(str))
        numeric_data['emp_title_le'] = le.transform(numeric_data['emp_title'].values)
        dict_emp_title = dict(zip(le.classes_, le.transform(le.classes_)))
    else:
        numeric_data['emp_title_le'] = numeric_data['emp_title'].map(dict_emp_title).fillna(-1)        

    if dict_pymnt_plan is None:
        le.fit(numeric_data.pymnt_plan.astype(str))
        numeric_data['pymnt_plan_le'] = le.transform(numeric_data['pymnt_plan'].values)
        dict_pymnt_plan = dict(zip(le.classes_, le.transform(le.classes_)))
    else:
        numeric_data['pymnt_plan_le'] = numeric_data['pymnt_plan'].map(dict_pymnt_plan).fillna(0)  
    
    if dict_purpose is None:
        le.fit(numeric_data.purpose.astype(str))
        numeric_data['purpose_le'] = le.transform(numeric_data['purpose'].values)
        dict_purpose = dict(zip(le.classes_, le.transform(le.classes_)))
#         dict_purpose[None] = 5
    else:
        numeric_data['purpose_le'] = numeric_data['purpose'].map(dict_purpose).fillna(-1) 
#     print(dict_grade)

    numeric_data = numeric_data.drop(['grade', 'sub_grade', 'purpose', 'home_ownership','emp_title', 'pymnt_plan',  'verification_status'], axis=1)
    
    return numeric_data, unique_home_ownership, unique_verification, dict_grade, dict_sub_grade, dict_emp_title, dict_pymnt_plan, dict_purpose

In [4]:
train, test = train_test_split(data, test_size=0.3, random_state=42)

In [23]:
train

Unnamed: 0,loan_amnt,term,int_rate,installment,emp_length,annual_inc,loan_status,dti,delinq_2yrs,inq_last_6mths,...,OWN,RENT,Not Verified,Source Verified,Verified,grade_le,sub_grade_le,emp_title_le,pymnt_plan_le,purpose_le
193136,20475.0,0,13.11,690.97,1.0,75000.0,0,26.85,0.0,2.0,...,0,0,0,1,0,1,8,62644,0,1
27784,12000.0,0,15.99,421.83,11.0,73000.0,1,16.47,0.0,2.0,...,0,0,0,0,1,3,16,8070,0,2
157132,16000.0,0,8.90,508.06,11.0,200000.0,1,13.50,0.0,2.0,...,0,0,0,0,1,0,4,71617,0,2
81962,12000.0,0,15.33,417.94,11.0,100000.0,1,20.48,1.0,3.0,...,0,0,1,0,0,3,17,78835,0,2
161,10000.0,0,14.16,342.56,11.0,65000.0,0,14.00,0.0,2.0,...,0,1,0,0,1,2,11,68800,0,2
14415,25375.0,0,14.99,879.51,1.0,105000.0,1,35.20,0.0,3.0,...,0,0,0,0,1,2,14,2240,0,1
116818,19750.0,1,15.80,478.19,2.0,45000.0,1,21.79,0.0,0.0,...,0,0,0,0,1,2,12,4451,0,1
53527,28000.0,1,23.50,797.40,10.0,92000.0,1,17.19,1.0,1.0,...,0,0,0,1,0,5,25,66663,0,1
40783,20000.0,0,8.90,635.07,11.0,85000.0,1,8.37,0.0,3.0,...,0,0,0,0,1,0,4,22154,0,2
174247,19200.0,0,12.12,638.82,11.0,45000.0,1,9.63,0.0,0.0,...,0,1,0,0,1,1,7,16945,0,2


In [5]:
train, unique_home_ownership, unique_verification, dict_grade, dict_sub_grade, dict_emp_title, dict_pymnt_plan, dict_purpose = preprocessing_data(train)
test = preprocessing_data(test,unique_home_ownership, unique_verification, dict_grade, dict_sub_grade, dict_emp_title, dict_pymnt_plan, dict_purpose)[0]

In [3]:
# new_data = pd.DataFrame()

# new_data['initial_list_status'] = data['initial_list_status'].map({'f': 0, 'w': 1})
# new_data['dti'] = data['dti']
# new_data['int_rate'] = data['int_rate']
# new_data['emp_length'] = data['emp_length'].map({'< 1 year': 1, '1 year': 2, '2 years': 3,  '3 years': 4,  '4 years': 5,  '5 years': 6,  '6 years': 7,  '7 years': 8,  '8 years': 9,  '9 years': 10,  '10+ years': 11})
# new_data['emp_length'].fillna(0, inplace=True)

# new_data['tot_cur_bal'] = data['tot_cur_bal']
# new_data['tot_cur_bal'].fillna(new_data['tot_cur_bal'].notnull().min(), inplace=True)
# new_data['loan_amnt'] = data['loan_amnt']
# new_data['loan_status'] = data['loan_status']

In [6]:
# X = new_data
X_train = train.drop(['loan_status'], axis=1)
y_train = train['loan_status']

X_test = test.drop(['loan_status'], axis=1)
y_test = test['loan_status']

# X = new_data.drop(['loan_status'], axis=1)
# y = new_data['loan_status']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train = scale(X_train)
X_test = scale(X_test)

  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


In [None]:
# [[list(X_train[i][1:3]) + list(X_train[i][1:3])] for i in range(len(X_train))]
# [[list(X_train[i][0:6])] for i in range(len(X_train))]

In [27]:
from sklearn.cluster import KMeans


kmeans = KMeans(n_clusters=3, random_state=32)
preds_train = kmeans.fit_predict([list(X_train[i][0:6]) for i in range(len(X_train))])

# preds_train = np.array(preds_train).reshape(int(X_train.size / 7), 1)

# X_train = np.hstack((X_train, preds_train))

In [None]:
preds_train.__len__()

In [None]:
train_0 = np.array([np.zeros(37)])
train_1 = np.array([np.zeros(37)])
train_2 = np.array([np.zeros(37)])
# train_3 = np.array([])
train_0_y = []
train_1_y = []
train_2_y = []
# train_3_y = []

y_train = list(y_train)
i = 0
# print(list(y_train)[0])
for cluster in preds_train:
    
    if cluster == 0:
#         train_0 = np.append(train_0, list(X_train[i]))
        train_0 = np.vstack((train_0, X_train[i]))
        train_0_y.append(y_train[i])
    elif cluster == 1:
#         train_1 = np.append(train_1, list(X_train[i]))
        train_1 = np.vstack((train_1, X_train[i]))
        train_1_y.append(y_train[i])
    elif cluster == 2:
#         train_2 = np.append(train_2, list(X_train[i]))
        train_2 = np.vstack((train_2, X_train[i]))
        train_2_y.append(y_train[i])
#     elif cluster == 3:
#         train_3 = np.append(train_3, list(X_train[i]))      
#         train_3_y.append(y_train[i])

    i += 1
#     print(i)
    
train_0 = np.delete(train_0, (0), axis=0)
train_1 = np.delete(train_1, (0), axis=0)
train_2 = np.delete(train_2, (0), axis=0)
# train_0 = train_0.reshape(int(train_0.size / 6), 6)
# train_1 = train_1.reshape(int(train_1.size / 6), 6)
# train_2 = train_2.reshape(int(train_2.size / 6), 6)
# train_3 = train_3.reshape(int(train_3.size / 6), 6)

# print(train_0)

In [48]:
print(len(train_0))
print(len(train_1))
print(len(train_2))

28380
80309
31443


In [49]:
TRAIN_train_0, TEST_train_0, TRAIN_train_0_y, TEST_train_0_y = train_test_split(train_0, train_0_y, test_size=0.3, random_state=42)
TRAIN_train_1, TEST_train_1, TRAIN_train_1_y, TEST_train_1_y = train_test_split(train_1, train_1_y, test_size=0.3, random_state=42)
TRAIN_train_2, TEST_train_2, TRAIN_train_2_y, TEST_train_2_y = train_test_split(train_2, train_2_y, test_size=0.3, random_state=42)
# TRAIN_train_3, TEST_train_3, TRAIN_train_3_y, TEST_train_3_y = train_test_split(train_3, train_3_y, test_size=0.3, random_state=42)

In [52]:
import xgboost as xgb

cls_0 = xgb.XGBClassifier()
cls_0.fit(TRAIN_train_0, TRAIN_train_0_y)
preds = cls_0.predict(TEST_train_0)
print('XGB: ')
print('Accuracy score: ' + str(metrics.accuracy_score(TEST_train_0_y, preds)))

print('Recall score: ' + str(metrics.recall_score(TEST_train_0_y, preds)))

print('Precision score: ' + str(metrics.precision_score(TEST_train_0_y, preds)))

preds = cls_0.predict_proba(TEST_train_0)
preds = preds[:,1]
fpr, tpr, _ = metrics.roc_curve(TEST_train_0_y, preds)
print('Auc score: ' + str(metrics.auc(fpr, tpr)))

XGB: 
Accuracy score: 0.8221752407798919
Recall score: 0.9951345163136806
Precision score: 0.8245198008062604
Auc score: 0.7005727755725788


In [54]:
cls_1 = xgb.XGBClassifier()
cls_1.fit(TRAIN_train_1, TRAIN_train_1_y)
preds = cls_1.predict(TEST_train_1)
print('XGB: ')
print('Accuracy score: ' + str(metrics.accuracy_score(TEST_train_1_y, preds)))

print('Recall score: ' + str(metrics.recall_score(TEST_train_1_y, preds)))

print('Precision score: ' + str(metrics.precision_score(TEST_train_1_y, preds)))

preds = cls_0.predict_proba(TEST_train_1)
preds = preds[:,1]
fpr, tpr, _ = metrics.roc_curve(TEST_train_1_y, preds)
print('Auc score: ' + str(metrics.auc(fpr, tpr)))

XGB: 
Accuracy score: 0.8173743410949238
Recall score: 0.9948660600823463
Precision score: 0.8199070001256755
Auc score: 0.6832725411151053


In [55]:
cls_2 = xgb.XGBClassifier()
cls_2.fit(TRAIN_train_2, TRAIN_train_2_y)
preds = cls_2.predict(TEST_train_2)
print('XGB: ')
print('Accuracy score: ' + str(metrics.accuracy_score(TEST_train_2_y, preds)))

print('Recall score: ' + str(metrics.recall_score(TEST_train_2_y, preds)))

print('Precision score: ' + str(metrics.precision_score(TEST_train_2_y, preds)))

preds = cls_0.predict_proba(TEST_train_2)
preds = preds[:,1]
fpr, tpr, _ = metrics.roc_curve(TEST_train_2_y, preds)
print('Auc score: ' + str(metrics.auc(fpr, tpr)))

XGB: 
Accuracy score: 0.6604473656312944
Recall score: 0.8962233169129721
Precision score: 0.6797857765599701
Auc score: 0.6455048831295647


In [15]:
# cls_3 = xgb.XGBClassifier()
# cls_3.fit(TRAIN_train_3, TRAIN_train_3_y)
# preds = cls_3.predict(TEST_train_3)
# print('XGB: ')
# print('Accuracy score: ' + str(metrics.accuracy_score(TEST_train_3_y, preds)))

# print('Recall score: ' + str(metrics.recall_score(TEST_train_3_y, preds)))

# print('Precision score: ' + str(metrics.precision_score(TEST_train_3_y, preds)))

# print('Auc score: ' + str(metrics.roc_auc_score(TEST_train_3_y, preds)))

In [57]:
preds_test = kmeans.predict([list(X_test[i][0:6]) for i in range(len(X_test))])


In [59]:
print(len(preds_test))

60057


In [None]:
test_0 = np.array([])
test_1 = np.array([])
test_2 = np.array([])
# test_3 = np.array([])

test_0_y = []
test_1_y = []
test_2_y = []
# test_3_y = []

y_test = list(y_test)
i = 0
# print(list(y_train)[0])
for cluster in preds_test:
#     print(cluster)
    
    if cluster == 0:
        test_0 = np.append(test_0, list(X_test[i]))
        test_0_y.append(y_test[i])
    elif cluster == 1:
        test_1 = np.append(test_1, list(X_test[i]))
        test_1_y.append(y_test[i])
    elif cluster == 2:
        test_2 = np.append(test_2, list(X_test[i]))
        test_2_y.append(y_test[i])
#     elif cluster == 3:
#         test_3 = np.append(test_3, list(X_test[i]))      
#         test_3_y.append(y_test[i])

    i += 1
#     print(i)
    
test_0 = test_0.reshape(int(test_0.size / 37), 37)
test_1 = test_1.reshape(int(test_1.size / 37), 37)
test_2 = test_2.reshape(int(test_2.size / 37), 37)
# test_3 = test_3.reshape(int(test_3.size / 6), 6)


In [61]:
preds_1 = cls_0.predict(test_0)

preds_1 = list(preds_1)
print('XGB: ')
print('Accuracy score: ' + str(metrics.accuracy_score(test_0_y, preds_1)))

print('Recall score: ' + str(metrics.recall_score(test_0_y, preds_1)))

print('Precision score: ' + str(metrics.precision_score(test_0_y, preds_1)))

preds_1 = cls_0.predict_proba(test_0)
preds_1 = preds_1[:,1]
fpr, tpr, _ = metrics.roc_curve(test_0_y, preds_1)
print('Auc score: ' + str(metrics.auc(fpr, tpr)))

XGB: 
Accuracy score: 0.820740861618799
Recall score: 0.992544731610338
Precision score: 0.8247294953332782
Auc score: 0.6850101123676882


In [62]:
preds_2 = cls_1.predict(test_1)
preds_2 = list(preds_2)
print('XGB: ')
print('Accuracy score: ' + str(metrics.accuracy_score(test_1_y, preds_2)))

print('Recall score: ' + str(metrics.recall_score(test_1_y, preds_2)))

print('Precision score: ' + str(metrics.precision_score(test_1_y, preds_2)))

preds_2 = cls_1.predict_proba(test_1)
preds_2 = preds_2[:,1]
fpr, tpr, _ = metrics.roc_curve(test_1_y, preds_2)
print('Auc score: ' + str(metrics.auc(fpr, tpr)))

XGB: 
Accuracy score: 0.8140921251599396
Recall score: 0.9948543862783634
Precision score: 0.8166911117629804
Auc score: 0.6937795791017314


In [63]:
preds_3 = cls_2.predict(test_2)
preds_3 = list(preds_3)
print('XGB: ')
print('Accuracy score: ' + str(metrics.accuracy_score(test_2_y, preds_3)))

print('Recall score: ' + str(metrics.recall_score(test_2_y, preds_3)))

print('Precision score: ' + str(metrics.precision_score(test_2_y, preds_3)))

preds_3 = cls_2.predict_proba(test_2)
preds_3 = preds_3[:,1]
fpr, tpr, _ = metrics.roc_curve(test_2_y, preds_3)
print('Auc score: ' + str(metrics.auc(fpr, tpr)))

XGB: 
Accuracy score: 0.6631626034444196
Recall score: 0.9123670060633795
Precision score: 0.6800545749125949
Auc score: 0.6500747515448375


In [31]:
# preds_4 = cls_3.predict(test_3)
# preds_4 = list(preds_4)
# print('XGB: ')
# print('Accuracy score: ' + str(metrics.accuracy_score(test_3_y, preds_4)))

# print('Recall score: ' + str(metrics.recall_score(test_3_y, preds_4)))

# print('Precision score: ' + str(metrics.precision_score(test_3_y, preds_4)))

# print('Auc score: ' + str(metrics.roc_auc_score(test_3_y, preds_4)))

In [64]:
preds = np.append(preds_1, preds_2)
preds = np.append(preds, preds_3)

test_y = np.append(test_0_y, test_1_y)
test_y = np.append(test_y, test_2_y)

fpr, tpr, _ = metrics.roc_curve(test_y, preds)
print('Auc score: ' + str(metrics.auc(fpr, tpr)))

Auc score: 0.7068139888888431
