In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, scale
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn import metrics


%matplotlib inline

In [2]:
data = pd.read_csv('train.csv')

In [3]:
def preprocessing_data(data, unique_home_ownership=None, unique_verification=None, dict_grade=None,dict_sub_grade=None, dict_emp_title=None, dict_pymnt_plan=None, dict_purpose=None):
    numeric_data = data.drop(['record_id', 'earliest_cr_line', 'issue_d', 'addr_state', 'zip_code'], axis=1)
    
    numeric_data['term'] = numeric_data['term'].map({' 36 months': 0, ' 60 months': 1})
    numeric_data['application_type'] = numeric_data['application_type'].map({'INDIVIDUAL': 0, 'JOINT': 1})
    numeric_data['initial_list_status'] = numeric_data['initial_list_status'].map({'f': 0, 'w': 1})
    numeric_data['emp_length'] = numeric_data['emp_length'].map({'< 1 year': 1, '1 year': 2, '2 years': 3,  '3 years': 4,  '4 years': 5,  '5 years': 6,  '6 years': 7,  '7 years': 8,  '8 years': 9,  '9 years': 10,  '10+ years': 11})

    numeric_data['emp_length'].fillna(0, inplace=True)
    numeric_data['emp_title'].fillna('0', inplace=True)
    numeric_data['mths_since_last_delinq'].fillna(numeric_data['mths_since_last_delinq'].notnull().min())
    numeric_data['collections_12_mths_ex_med'].fillna(numeric_data['collections_12_mths_ex_med'].notnull().max())
    numeric_data['revol_util'].fillna(numeric_data['revol_util'].notnull().mean())
    numeric_data['tot_coll_amt'].fillna(numeric_data['tot_coll_amt'].notnull().min())
    numeric_data['tot_cur_bal'].fillna(numeric_data['tot_cur_bal'].notnull().min())
    numeric_data['total_rev_hi_lim'].fillna(numeric_data['total_rev_hi_lim'].notnull().min())
  
    home_ownership_list = numeric_data.home_ownership.values.reshape(-1, 1)

    numeric_data = pd.concat((numeric_data,pd.get_dummies(numeric_data.home_ownership)),1)
    
    if unique_home_ownership is None:
        unique_home_ownership = numeric_data['home_ownership'].unique()
        
    for i in unique_home_ownership:
        try:
            numeric_data[i]
        except KeyError:
            numeric_data[i] = 0
    
    verification_list = numeric_data.verification_status.values.reshape(-1, 1)

    numeric_data = pd.concat((numeric_data,pd.get_dummies(numeric_data.verification_status)),1)
    
    if unique_verification is None:
        unique_verification = numeric_data['verification_status'].unique()
        
    for i in unique_verification:
        try:
            numeric_data[i]
        except KeyError:
            numeric_data[i] = 0
    
    le = LabelEncoder()

    if dict_grade is None:
        le.fit(numeric_data.grade.astype(str))
        numeric_data['grade_le'] = le.transform((numeric_data['grade'].values))
        dict_grade = dict(zip(le.classes_, le.transform(le.classes_)))
    else:
        numeric_data['grade_le'] = numeric_data['grade'].map(dict_grade).fillna(-1)  

    if dict_sub_grade is None:
        le.fit(numeric_data.sub_grade.astype(str))
        numeric_data['sub_grade_le'] = le.transform(numeric_data['sub_grade'].values)
        dict_sub_grade = dict(zip(le.classes_, le.transform(le.classes_)))
    else:
        numeric_data['sub_grade_le'] = numeric_data['sub_grade'].map(dict_sub_grade).fillna(-1)  
    
    if dict_emp_title is None:
        le.fit(numeric_data.emp_title.astype(str))
        numeric_data['emp_title_le'] = le.transform(numeric_data['emp_title'].values)
        dict_emp_title = dict(zip(le.classes_, le.transform(le.classes_)))
    else:
        numeric_data['emp_title_le'] = numeric_data['emp_title'].map(dict_emp_title).fillna(-1)        

    if dict_pymnt_plan is None:
        le.fit(numeric_data.pymnt_plan.astype(str))
        numeric_data['pymnt_plan_le'] = le.transform(numeric_data['pymnt_plan'].values)
        dict_pymnt_plan = dict(zip(le.classes_, le.transform(le.classes_)))
    else:
        numeric_data['pymnt_plan_le'] = numeric_data['pymnt_plan'].map(dict_pymnt_plan).fillna(0)  
    
    if dict_purpose is None:
        le.fit(numeric_data.purpose.astype(str))
        numeric_data['purpose_le'] = le.transform(numeric_data['purpose'].values)
        dict_purpose = dict(zip(le.classes_, le.transform(le.classes_)))
#         dict_purpose[None] = 5
    else:
        numeric_data['purpose_le'] = numeric_data['purpose'].map(dict_purpose).fillna(-1) 
#     print(dict_grade)

    numeric_data = numeric_data.drop(['grade', 'sub_grade', 'purpose', 'home_ownership','emp_title', 'pymnt_plan',  'verification_status'], axis=1)
    
    return numeric_data, unique_home_ownership, unique_verification, dict_grade, dict_sub_grade, dict_emp_title, dict_pymnt_plan, dict_purpose

In [4]:
train, test = train_test_split(data, test_size=0.3, random_state=42)

In [5]:
train, unique_home_ownership, unique_verification, dict_grade, dict_sub_grade, dict_emp_title, dict_pymnt_plan, dict_purpose = preprocessing_data(train)
test = preprocessing_data(test,unique_home_ownership, unique_verification, dict_grade, dict_sub_grade, dict_emp_title, dict_pymnt_plan, dict_purpose)[0]

In [6]:
# X = new_data
X_train = train.drop(['loan_status'], axis=1)
y_train = train[['loan_status']]

X_test = test.drop(['loan_status'], axis=1)
y_test = test[['loan_status']]

# X_train = scale(X_train)
# X_test = scale(X_test)

In [7]:
from sklearn.cluster import KMeans


kmeans = KMeans(n_clusters=3, random_state=32)
preds_train = kmeans.fit_predict(X_train[['dti', 'int_rate', 'emp_length', 'loan_amnt', 'installment']])

X_train['clusters'] = preds_train
y_train['clusters'] = preds_train
# preds_train = np.array(preds_train).reshape(int(X_train.size / 7), 1)

# X_train = np.hstack((X_train, preds_train))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [8]:
train_0 = X_train[X_train['clusters'] == 0]
train_1 = X_train[X_train['clusters'] == 1]
train_2 = X_train[X_train['clusters'] == 2]

train_0_y = y_train[y_train['clusters'] == 0]
train_1_y = y_train[y_train['clusters'] == 1]
train_2_y = y_train[y_train['clusters'] == 2]

In [9]:
print(len(train_0))
print(len(train_1))
print(len(train_2))

51986
66531
21615


In [10]:
TRAIN_train_0, TEST_train_0, TRAIN_train_0_y, TEST_train_0_y = train_test_split(train_0, train_0_y, test_size=0.3, random_state=42)
TRAIN_train_1, TEST_train_1, TRAIN_train_1_y, TEST_train_1_y = train_test_split(train_1, train_1_y, test_size=0.3, random_state=42)
TRAIN_train_2, TEST_train_2, TRAIN_train_2_y, TEST_train_2_y = train_test_split(train_2, train_2_y, test_size=0.3, random_state=42)
# TRAIN_train_3, TEST_train_3, TRAIN_train_3_y, TEST_train_3_y = train_test_split(train_3, train_3_y, test_size=0.3, random_state=42)

In [12]:
import xgboost as xgb

cls_0 = xgb.XGBClassifier()
cls_0.fit(TRAIN_train_0, TRAIN_train_0_y['loan_status'])
preds = cls_0.predict(TEST_train_0)
print('XGB: ')
print('Accuracy score: ' + str(metrics.accuracy_score(TEST_train_0_y['loan_status'], preds)))

print('Recall score: ' + str(metrics.recall_score(TEST_train_0_y['loan_status'], preds)))

print('Precision score: ' + str(metrics.precision_score(TEST_train_0_y['loan_status'], preds)))

preds = cls_0.predict_proba(TEST_train_0)
preds = preds[:,1]
fpr, tpr, _ = metrics.roc_curve(TEST_train_0_y['loan_status'], preds)
print('Auc score: ' + str(metrics.auc(fpr, tpr)))

XGB: 
Accuracy score: 0.7710951526032316
Recall score: 0.9775328172332548
Precision score: 0.7786193029490617
Auc score: 0.7272584562045172


In [13]:
cls_1 = xgb.XGBClassifier()
cls_1.fit(TRAIN_train_1, TRAIN_train_1_y['loan_status'])
preds = cls_1.predict(TEST_train_1)
print('XGB: ')
print('Accuracy score: ' + str(metrics.accuracy_score(TEST_train_1_y['loan_status'], preds)))

print('Recall score: ' + str(metrics.recall_score(TEST_train_1_y['loan_status'], preds)))

print('Precision score: ' + str(metrics.precision_score(TEST_train_1_y['loan_status'], preds)))

preds = cls_0.predict_proba(TEST_train_1)
preds = preds[:,1]
fpr, tpr, _ = metrics.roc_curve(TEST_train_1_y['loan_status'], preds)
print('Auc score: ' + str(metrics.auc(fpr, tpr)))

XGB: 
Accuracy score: 0.8061623246492986
Recall score: 0.9910224438902743
Precision score: 0.8101523877478212
Auc score: 0.700189274899486


In [14]:
cls_2 = xgb.XGBClassifier()
cls_2.fit(TRAIN_train_2, TRAIN_train_2_y['loan_status'])
preds = cls_2.predict(TEST_train_2)
print('XGB: ')
print('Accuracy score: ' + str(metrics.accuracy_score(TEST_train_2_y['loan_status'], preds)))

print('Recall score: ' + str(metrics.recall_score(TEST_train_2_y['loan_status'], preds)))

print('Precision score: ' + str(metrics.precision_score(TEST_train_2_y['loan_status'], preds)))

preds = cls_0.predict_proba(TEST_train_2)
preds = preds[:,1]
fpr, tpr, _ = metrics.roc_curve(TEST_train_2_y['loan_status'], preds)
print('Auc score: ' + str(metrics.auc(fpr, tpr)))

XGB: 
Accuracy score: 0.7312259059367772
Recall score: 0.9442671771963412
Precision score: 0.7498310810810811
Auc score: 0.6979118077157518


In [15]:
# cls_3 = xgb.XGBClassifier()
# cls_3.fit(TRAIN_train_3, TRAIN_train_3_y)
# preds = cls_3.predict(TEST_train_3)
# print('XGB: ')
# print('Accuracy score: ' + str(metrics.accuracy_score(TEST_train_3_y, preds)))

# print('Recall score: ' + str(metrics.recall_score(TEST_train_3_y, preds)))

# print('Precision score: ' + str(metrics.precision_score(TEST_train_3_y, preds)))

# print('Auc score: ' + str(metrics.roc_auc_score(TEST_train_3_y, preds)))

In [15]:
preds_test = kmeans.predict(X_test[['dti', 'int_rate', 'emp_length', 'loan_amnt', 'installment']])

X_test['clusters'] = preds_test
y_test['clusters'] = preds_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [16]:
test_0 = X_test[X_test['clusters'] == 0]
test_1 = X_test[X_test['clusters'] == 1]
test_2 = X_test[X_test['clusters'] == 2]

test_0_y = y_test[y_test['clusters'] == 0]
test_1_y = y_test[y_test['clusters'] == 1]
test_2_y = y_test[y_test['clusters'] == 2]

In [17]:
test_0 = test_0[TRAIN_train_0.columns]
preds_1 = cls_0.predict(test_0)

preds_1 = list(preds_1)
print('XGB: ')
print('Accuracy score: ' + str(metrics.accuracy_score(test_0_y['loan_status'], preds_1)))

print('Recall score: ' + str(metrics.recall_score(test_0_y['loan_status'], preds_1)))

print('Precision score: ' + str(metrics.precision_score(test_0_y['loan_status'], preds_1)))

preds_1 = cls_0.predict_proba(test_0)
preds_1 = preds_1[:,1]
fpr, tpr, _ = metrics.roc_curve(test_0_y['loan_status'], preds_1)
print('Auc score: ' + str(metrics.auc(fpr, tpr)))

XGB: 
Accuracy score: 0.7722638882651457
Recall score: 0.9586598058706584
Precision score: 0.7897779276458403
Auc score: 0.7150373596924235


In [18]:
test_1 = test_1[TRAIN_train_1.columns]
preds_2 = cls_1.predict(test_1)
preds_2 = list(preds_2)
print('XGB: ')
print('Accuracy score: ' + str(metrics.accuracy_score(test_1_y['loan_status'], preds_2)))

print('Recall score: ' + str(metrics.recall_score(test_1_y['loan_status'], preds_2)))

print('Precision score: ' + str(metrics.precision_score(test_1_y['loan_status'], preds_2)))

preds_2 = cls_1.predict_proba(test_1)
preds_2 = preds_2[:,1]
fpr, tpr, _ = metrics.roc_curve(test_1_y['loan_status'], preds_2)
print('Auc score: ' + str(metrics.auc(fpr, tpr)))

XGB: 
Accuracy score: 0.8025369244135534
Recall score: 0.9842482100238663
Precision score: 0.8100424984821971
Auc score: 0.7004053719920469


In [19]:
test_2 = test_2[TRAIN_train_2.columns]
preds_3 = cls_2.predict(test_2)
preds_3 = list(preds_3)
print('XGB: ')
print('Accuracy score: ' + str(metrics.accuracy_score(test_2_y['loan_status'], preds_3)))

print('Recall score: ' + str(metrics.recall_score(test_2_y['loan_status'], preds_3)))

print('Precision score: ' + str(metrics.precision_score(test_2_y['loan_status'], preds_3)))

preds_3 = cls_2.predict_proba(test_2)
preds_3 = preds_3[:,1]
fpr, tpr, _ = metrics.roc_curve(test_2_y['loan_status'], preds_3)
print('Auc score: ' + str(metrics.auc(fpr, tpr)))

XGB: 
Accuracy score: 0.7393233499722685
Recall score: 0.9489380930863082
Precision score: 0.7580315244856215
Auc score: 0.6941674109169215


In [21]:
# preds_4 = cls_3.predict(test_3)
# preds_4 = list(preds_4)
# print('XGB: ')
# print('Accuracy score: ' + str(metrics.accuracy_score(test_3_y, preds_4)))

# print('Recall score: ' + str(metrics.recall_score(test_3_y, preds_4)))

# print('Precision score: ' + str(metrics.precision_score(test_3_y, preds_4)))

# print('Auc score: ' + str(metrics.roc_auc_score(test_3_y, preds_4)))

In [20]:
preds = np.append(preds_1, preds_2)
preds = np.append(preds, preds_3)

test_y = np.append(test_0_y['loan_status'], test_1_y['loan_status'])
test_y = np.append(test_y, test_2_y['loan_status'])

fpr, tpr, _ = metrics.roc_curve(test_y, preds)
print('Auc score: ' + str(metrics.auc(fpr, tpr)))

Auc score: 0.7079491642773359
