In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics


%matplotlib inline

In [2]:
data = pd.read_csv('train.csv')

data = data.dropna()
numeric_data = data.drop(['record_id'], axis=1)
# numeric_data = numeric_data.sort_values(['emp_length'])
numeric_data['term'] = pd.factorize(numeric_data['term'])[0]

numeric_data['emp_length'] = numeric_data['emp_length'].map({'< 1 year': 1, '1 year': 2, '2 years': 3,  '3 years': 4,  '4 years': 5,  '5 years': 6,  '6 years': 7,  '7 years': 8,  '8 years': 9,  '9 years': 10,  '10+ years': 11})

numeric_data['verification_status'] = numeric_data['verification_status'].map({'Not Verified': 0, 'Verified': 1, 'Source Verified': 2})

numeric_data['application_type'] = pd.factorize(numeric_data['application_type'])[0]

numeric_data['initial_list_status'] = pd.factorize(numeric_data['initial_list_status'])[0]

numeric_data['addr_state'] = pd.factorize(numeric_data['addr_state'])[0]
    
# numeric_data['addr_state'] = pd.factorize(numeric_data['addr_state'])[0]

numeric_data['zip_code'] = (numeric_data['zip_code'].str[:3])

In [3]:
numeric_data['emp_length'].fillna(0, inplace=True)

numeric_data['emp_title'].fillna('0', inplace=True)

numeric_data['mths_since_last_delinq'].fillna(numeric_data['mths_since_last_delinq'].notnull().min(), inplace=True)

numeric_data['collections_12_mths_ex_med'].fillna(numeric_data['collections_12_mths_ex_med'].notnull().max(), inplace=True)

numeric_data['revol_util'].fillna(numeric_data['revol_util'].notnull().mean(), inplace=True)

numeric_data['tot_coll_amt'].fillna(numeric_data['tot_coll_amt'].notnull().min(), inplace=True)

numeric_data['tot_cur_bal'].fillna(numeric_data['tot_cur_bal'].notnull().min(), inplace=True)

numeric_data['total_rev_hi_lim'].fillna(numeric_data['total_rev_hi_lim'].notnull().min(), inplace=True)

In [4]:
def month_to_decimal(month):
    month_dict = {'Jan':0, 'Feb':1/12., 'Mar':2/12., 'Apr':3/12., 'May':4/12., 'Jun':5/12., 
     'Jul':6/12., 'Aug':7/12., 'Sep':8/12., 'Oct':9/12., 'Nov':10/12., 'Dec':11/12.}
    return month_dict[month]

def convert_date(month_year):
    month_and_year = month_year.split('-')
    return float(month_and_year[1]) + month_to_decimal(month_and_year[0])
numeric_data['issue_d'] = numeric_data['issue_d'].map(convert_date)
numeric_data['earliest_cr_line'] = numeric_data['earliest_cr_line'].map(convert_date)

In [5]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, scale


le = LabelEncoder()
# print(str(numeric_data['grade'].values))
numeric_data[numeric_data['grade'] == 0] = '0'
le.fit(numeric_data.grade.astype(str))
# print(le.classes_)
# numeric_data.info()
numeric_data['grade_le'] = le.transform((numeric_data['grade'].values))

le.fit(numeric_data.sub_grade.astype(str))
numeric_data['sub_grade_le'] = le.transform(numeric_data['sub_grade'].values)

le.fit(numeric_data.emp_title.astype(str))
numeric_data['emp_title_le'] = le.transform(numeric_data['emp_title'].values)

# le.fit(numeric_data.addr_state.astype(str))
# numeric_data['addr_state_le'] = le.transform(numeric_data['addr_state'].values)

le.fit(numeric_data.pymnt_plan.astype(str))
numeric_data['pymnt_plan_le'] = le.transform(numeric_data['pymnt_plan'].values)

le.fit(numeric_data.purpose.astype(str))
numeric_data['purpose_le'] = le.transform(numeric_data['purpose'].values)

ohe = OneHotEncoder()
new_ohe_features = ohe.fit(numeric_data.home_ownership.values.reshape(-1, 1))
data['home_ownership_ohe'] = ohe.transform(numeric_data.home_ownership.values.reshape(-1, 1))

numeric_data = numeric_data.drop(['grade', 'sub_grade', 'purpose', 'emp_title', 'pymnt_plan', 'home_ownership'], axis=1)

In [31]:
X = numeric_data.drop(['loan_status'], axis=1)
y = numeric_data['loan_status']
# type(numeric_data['zip_code'].astype(int))
X = scale(X)

from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X, y = ros.fit_resample(X, y)


  after removing the cwd from sys.path.


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [28]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


forest_tree = RandomForestClassifier(n_estimators=100)
forest_tree.fit(X_train, y_train)

forest_params = {'min_samples_leaf': [3, 5]}

forest_grid = GridSearchCV(forest_tree, forest_params,
cv=5, n_jobs=-1,
verbose=True)

forest_grid.fit(X_train, y_train)

forest_grid.best_params_

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  2.6min remaining:  1.7min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.8min finished


{'min_samples_leaf': 3}

In [29]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


forest_tree = RandomForestClassifier(min_samples_leaf=3)
forest_tree.fit(X_train, y_train)

forest_params = {'n_estimators': [100, 200, 500],
                'max_depth':[3, 5, 7, 9]}

forest_grid = GridSearchCV(forest_tree, forest_params,
cv=5, n_jobs=-1,
verbose=True)

forest_grid.fit(X_train, y_train)

forest_grid.best_params_



Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 13.0min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 28.1min finished


{'max_depth': 9, 'n_estimators': 200}

In [31]:
forest_tree = RandomForestClassifier(min_samples_leaf=3, max_depth=9, n_estimators=200)
forest_tree.fit(X_train, y_train)
preds = forest_tree.predict(X_test)

print('Forest: ')

print('Recall score: ' + str(metrics.recall_score(y_test, preds)))

print('Precision score: ' + str(metrics.precision_score(y_test, preds)))

print('Auc score: ' + str(metrics.roc_auc_score(y_test, preds)))

Forest: 
Recall score: 0.6483192199127534
Precision score: 0.6764239976796591
Auc score: 0.6686581410395153


In [23]:
from sklearn.tree import DecisionTreeClassifier


tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

tree_params = {'max_depth': [3, 5, 7, 9],
                'min_samples_leaf': [3, 5, 7]}

tree_grid = GridSearchCV(tree, tree_params,
cv=5, n_jobs=-1,
verbose=True)

tree_grid.fit(X_train, y_train)

tree_grid.best_params_

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   22.8s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   43.8s finished


{'max_depth': 9, 'min_samples_leaf': 3}

In [27]:
preds = tree_grid.predict(X_test)

print('Tree: ')

print('Recall score: ' + str(metrics.recall_score(y_test, preds)))

print('Precision score: ' + str(metrics.precision_score(y_test, preds)))

print('Auc score: ' + str(metrics.roc_auc_score(y_test, preds)))

Tree: 
Recall score: 0.6201565306646138
Precision score: 0.6731269148639867
Auc score: 0.6590800451877741


In [7]:
import math


def euclideanDistance(instance1, instance2):
    distance = 0
    for x in range(len(instance1)):
        distance += (int(instance1[x]) - int(instance2[x])) ** 2
    return math.sqrt(distance)

In [8]:
import operator 


def getNeighbors(X_train, y_train, X_test, k):
    distances = []
    length = len(X_test) 
    neighbors = []
    i = 0
    for x in X_train:
        distance = euclideanDistance(X_test, x)
        distances.append((y_train[i], distance))
        i += 1
    distances.sort(key=operator.itemgetter(1))
    for x in range(k):
        neighbors.append(distances[x][0])
    return neighbors

In [9]:
def getResponse(neighbors):
    classVotes = {}
    for x in range(len(neighbors)):
        response = neighbors[x]
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
    sortedVotes = sorted(classVotes.items(), key = operator.itemgetter(1), reverse=True)
    return sortedVotes

In [None]:
predictions = []
k = 3
X = numeric_data.drop(['loan_status'], axis=1)
y = numeric_data['loan_status']
X = scale(X)

from imblearn.under_sampling import RandomUnderSampler
ros = RandomUnderSampler(random_state=0)
X, y = ros.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

i = 0
for x in X_test:
    neighbors = getNeighbors(X_train, y_train, x, k)
    if neighbors is not None:
        result = getResponse(neighbors)
        predictions.append(result[0][0])
        print('predicted: ' + str(result))
        print('actual: ' + str(y_test[i]))
        i += 1
print('Own Knn: ')

print('Recall score: ' + str(metrics.recall_score(y_test, predictions)))

print('Precision score: ' + str(metrics.precision_score(y_test, predictions)))

print('Auc score: ' + str(metrics.roc_auc_score(y_test, predictions)))