# Avant Data Challenge

In [11]:
import numpy as np
import pandas as pd
import gc

In [18]:
# Load the raw data set and report its dimensions.
print("Read data file to obtain some statistic about the features and target")
data = pd.read_csv('data.csv')
# A single-argument print(...) emits the same text on Python 2 and Python 3,
# unlike the Python 2-only `print a, b` statement form.
print("shape of data file:  {}".format(data.shape))

Read data file to obtain some statistic about the features and target
shape of data file:  (80000, 26)


In [19]:
# Remove the three "last_*" columns from the feature set.
# NOTE(review): presumably these are recorded after the loan was issued and
# would leak outcome information into the model -- confirm with the data
# dictionary.
data = data.drop(["last_credit_pull_d", "last_fico_range_high", "last_fico_range_low"], axis=1)
# Single-argument print(...) works unchanged on Python 2 and Python 3.
print("shape of data file:  {}".format(data.shape))

shape of data file:  (80000, 23)


In [20]:
# Missing values:
#   * mths_since_last_record is dropped entirely;
#   * inq_last_12m and mths_since_last_delinq are imputed with their own median.
data = data.drop(["mths_since_last_record"], axis=1)
for col in ["inq_last_12m", "mths_since_last_delinq"]:
    # Series.median() skips NaN by default, so it equals the median of the
    # non-missing values without an explicit dropna().
    data[col] = data[col].fillna(data[col].median())
# Single-argument print(...) works unchanged on Python 2 and Python 3.
print("shape of data file:  {}".format(data.shape))

shape of data file:  (80000, 22)


In [5]:
# Partition the columns by dtype: columns stored as "object" are treated as
# categorical, every other column as numerical. Column order is preserved.
categorical_features = [col for col, dtype in zip(data.columns, data.dtypes)
                        if dtype == "object"]
numerical_features = [col for col, dtype in zip(data.columns, data.dtypes)
                      if not (dtype == "object")]

# Single-argument print(...) works unchanged on Python 2 and Python 3.
print("List of categorical fetures: ")
print(categorical_features)
print("List of numerical fetures: ")
print(numerical_features)

List of categorical fetures: 
['term', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'purpose', 'addr_state', 'earliest_cr_line']
List of numerical fetures: 
['id', 'loan_amnt', 'installment', 'annual_inc', 'dti', 'fico_range_low', 'fico_range_high', 'acc_now_delinq', 'delinq_amnt', 'delinq_2yrs', 'mths_since_last_delinq', 'inq_last_6mths', 'inq_last_12m']


In [6]:
# There are two date features, stored as "<month abbreviation>-<year>" strings,
# that are split into separate integer month and year columns.
date_features = ["issue_d", "earliest_cr_line"]

# Dictionary converting a three-letter month abbreviation to its month number.
months = {"Jan" : 1, "Feb" : 2, "Mar" : 3, "Apr" : 4, "May" : 5, "Jun": 6, "Jul" : 7, "Aug" : 8, "Sep" : 9,
          "Oct" : 10, "Nov" : 11, "Dec" :12}

for col in date_features:
    print(col)
    # Split every value once into [month abbreviation, year].
    parts = [value.split("-") for value in data[col]]
    # List comprehensions are used instead of map(): on Python 3 map() returns
    # a lazy iterator, which is not a valid column value. An unknown month
    # abbreviation still raises KeyError, exactly as before.
    data[col + "_month"] = [months[part[0]] for part in parts]
    # Builtin int replaces np.int, a deprecated alias removed in NumPy 1.24.
    data[col + "_year"] = [int(part[1]) for part in parts]

# Drop the original features
data = data.drop(date_features, axis=1)

issue_d
earliest_cr_line


In [7]:
# Clean emp_length: convert the textual employment length into an integer.
print("list of the objects in emp_length:")
print(list(data["emp_length"].value_counts().index))

# Lookup from each label to a number of years: '< 1 year' becomes 0,
# '10+ years' becomes 10, and the missing-value marker 'n/a' becomes -1.
emp_dict = {'10+ years' : 10, '2 years' : 2, '3 years' : 3, '< 1 year' : 0, '1 year' : 1, 'n/a' : -1, 
            '5 years' : 5, '4 years' : 4, '8 years' : 8, '6 years' : 6, '9 years' : 9, '7 years' : 7}

# A list comprehension is used instead of map(): on Python 3 map() returns a
# lazy iterator, which is not a valid column value. A label missing from
# emp_dict still raises KeyError, so unexpected values are not silently lost.
data["emp_length"] = [emp_dict[label] for label in data["emp_length"]]

list of the objects in emp_length:
['10+ years', '2 years', '3 years', '< 1 year', '1 year', 'n/a', '5 years', '4 years', '8 years', '6 years', '9 years', '7 years']


In [8]:
# One-hot encode the categorical columns into a separate frame; `data` itself
# is left untouched by this step.
object_type = ['term', 'home_ownership', 'verification_status', 'purpose', 'addr_state']
categorical_frame = data[object_type]
data_dummies = pd.get_dummies(categorical_frame)

In [9]:
# Replace each categorical column in `data` with integer labels.
from sklearn.preprocessing import LabelEncoder
object_type = ['term', 'home_ownership', 'verification_status', 'purpose', 'addr_state']
print("convert object features to int")
for column in object_type:
    # Missing entries are replaced by the sentinel -2 before encoding, so they
    # receive a label of their own.
    filled_values = list(data[column].fillna(-2).values)
    # A fresh encoder per column; fit_transform learns the labels and encodes
    # the same values in one call.
    data[column] = LabelEncoder().fit_transform(filled_values)

convert object features to int


In [10]:
# Create target variable
# pay_months = (13 - issue month) + 12 * (issue year - 2015)
# NOTE(review): this value grows for loans issued after 2015 and becomes
# negative for loans issued before 2015. If it is meant to be "months elapsed
# since issue", the year term looks inverted -- confirm the intended reference
# date before relying on it.
data["pay_months"] = (13 - data["issue_d_month"]) + 12 * (data["issue_d_year"] - 2015)

# Fraction of the loan term that must have elapsed for a "Current" loan to be kept.
threshold = 0.5
# term is compared against its integer label here.
# NOTE(review): assumes label 0 is the 36-month term and label 1 the 60-month
# term -- confirm against the label encoding applied earlier.
is_current = data.loan_status == "Current"
index_36 = (data.term == 0) & (data.pay_months >= 36*threshold) & is_current
index_60 = (data.term == 1) & (data.pay_months >= 60*threshold) & is_current

index = index_36 | index_60
# str.format is used (not % formatting) because the message contains a literal
# "%"; single-argument print(...) works unchanged on Python 2 and Python 3.
print("Number of Current borrowers who paied more than 50% of entire loan  {}".format(index.sum()))

Number of Current borrowers who paied more than 50% of entire loan  18033


In [11]:
# Boolean masks for loans whose final outcome is known.
index_paied = data.loan_status == "Fully Paid"
index_default = data.loan_status == "Default"
index_paied_default = index_paied | index_default
# Series.sum() counts the True entries; single-argument print(...) works
# unchanged on Python 2 and Python 3.
print("Number of Fully Paid borrowers who paied entire loan  {}".format(index_paied.sum()))
print("Number of Default borrowers  {}".format(index_default.sum()))

Number of Fully Paid borrowers who paied entire loan  11534
Number of Default borrowers  6037


In [12]:
# Rows usable for training: the selected "Current" loans plus every
# "Fully Paid" or "Default" loan.
index = index | index_paied_default
# Single-argument print(...) works unchanged on Python 2 and Python 3.
print("Number of data that can be used for training  {}".format(index.sum()))

Number of data that can be used for training  35604


In [14]:
# create a train data set
# .copy() makes `train` an independent frame, so adding a column below does not
# write into a slice of `data` (avoids pandas' SettingWithCopyWarning).
train = data[index].copy()

# target is 1 for "Default" loans and 0 for every other status; a separate
# zero-initialisation is unnecessary because every row is assigned here.
train["target"] = (train["loan_status"] == "Default") * 1
train = train.drop("loan_status", axis=1)

In [15]:
# Reduce highly correlated features: replace the two FICO bounds with their
# midpoint.
fico_bounds = ["fico_range_high", "fico_range_low"]
fico_midpoint = (train[fico_bounds[0]] + train[fico_bounds[1]]) / 2.0
train["fico_range"] = fico_midpoint
train = train.drop(fico_bounds, axis=1)

In [16]:
# Write the training frame to CSV, omitting the row index.
tree_output_path = "clean_data_tree.csv"
train.to_csv(tree_output_path, index=False)

In [22]:
# make a data set for logistic regression: append the one-hot columns to
# `data`, then drop the original categorical columns they replace.
train_lr = pd.concat([data, data_dummies], axis=1).drop(object_type, axis=1)
# Single-argument print(...) works unchanged on Python 2 and Python 3.
print(train_lr.shape)

(80000, 92)


In [23]:
# Keep only the rows selected for training.
# .copy() makes the result an independent frame, so adding a column below does
# not write into a slice of the unfiltered frame (avoids pandas'
# SettingWithCopyWarning).
train_lr = train_lr[index].copy()

# target is 1 for "Default" loans and 0 for every other status; a separate
# zero-initialisation is unnecessary because every row is assigned here.
train_lr["target"] = (train_lr["loan_status"] == "Default") * 1
train_lr = train_lr.drop("loan_status", axis=1)

In [24]:
# Reduce highly correlated features: replace the two FICO bounds with their
# midpoint.
fico_bounds = ["fico_range_high", "fico_range_low"]
fico_midpoint = (train_lr[fico_bounds[0]] + train_lr[fico_bounds[1]]) / 2.0
train_lr["fico_range"] = fico_midpoint
train_lr = train_lr.drop(fico_bounds, axis=1)

In [25]:
# Write the logistic-regression frame to CSV, omitting the row index.
lr_output_path = "clean_data_LR.csv"
train_lr.to_csv(lr_output_path, index=False)