Importing necessary packages and the data

In [102]:
import pandas as pd
import numpy as np
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import KFold
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

loan = pd.read_csv('loan.csv', low_memory=False)

Let's include only the loans classified as 'Fully Paid', 'Default', or 'Charged Off'

In [103]:
ended_loans = loan[(loan['loan_status'] == 'Fully Paid') | (loan['loan_status'] == 'Charged Off') | (loan['loan_status'] == 'Default')]

In [104]:
ended_loans['TARGET'] = ended_loans['loan_status'].apply(lambda x: 0 if x == 'Fully Paid' else 1)
ended_loans = ended_loans.fillna(-99999)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Now that we created the TARGET variable with integers, we can drop the loan_status variable

In [105]:
ended_loans.drop('loan_status', axis=1, inplace=True)

let's split the dataset into test and train by random permutation

In [106]:
np.random.seed(1)
ended_loans = ended_loans.reindex(np.random.permutation(ended_loans.index))
max_row = math.floor(ended_loans.shape[0]*0.8)
train = ended_loans.iloc[:max_row]
train_test = ended_loans.iloc[max_row:]

In [107]:
predictors = train.columns[:-1]

The first line only includes variables that are not 'objects'
then I eliminated the first two columns because they are ids and they may disrupt the accuracy of the classifier

In [108]:
num_var_train = list(train.dtypes[train.dtypes !='object'].index)
num_var_train = num_var_train[2:]
num_var_train = num_var_train[:-1]
print(train[num_var_train].head())

        loan_amnt  funded_amnt  funded_amnt_inv  int_rate  installment  \
851750       8000         8000             8000      8.67       253.18   
206064      20000        20000            20000     14.33       686.77   
162015       9000         9000             8975     12.12       299.45   
188743      27150        27150            27150     10.16       579.00   
641852       8500         8500             8500      7.26       263.47   

        annual_inc    dti  delinq_2yrs  inq_last_6mths  \
851750       41000  29.19            0               0   
206064       98000  22.14            0               0   
162015       72000   9.80            0               0   
188743       85000   9.85            0               0   
641852       60000   4.66            2               1   

        mths_since_last_delinq      ...       total_bal_il  il_util  \
851750                  -99999      ...             -99999   -99999   
206064                  -99999      ...             -99999   -99

In [109]:
clf = RandomForestClassifier(n_estimators=100, max_depth=5)
clf.fit(train[num_var_train], train['TARGET'])
predictions = clf.predict(train_test[num_var_train])

error_test = roc_auc_score(predictions, train_test['TARGET'])
print("The roc_auc score is", error_test)

The roc_auc score is 0.986149489366


In [110]:
y_pred = predictions
y_true = train_test['TARGET']
accuracy = accuracy_score(y_true, y_pred)
print('the accuracy is', accuracy)

the accuracy is 0.977005389669


In [111]:
importances = clf.feature_importances_
print(importances)

[  3.10025106e-02   3.31081281e-02   2.63498612e-02   1.09621081e-02
   2.47063527e-02   6.73426846e-04   2.21622232e-03   2.71596277e-06
   1.50734690e-04   2.16961768e-05   7.78094527e-05   3.68371754e-05
   7.26325776e-05   8.47668933e-04   6.19824240e-04   1.80233191e-04
   4.10751001e-03   4.82510078e-03   6.91980588e-02   4.72254453e-02
   1.72871763e-01   8.62579855e-03   6.44143984e-03   2.23097716e-01
   1.50477643e-01   1.59272570e-01   2.76335521e-07   1.93555445e-04
   0.00000000e+00   0.00000000e+00   0.00000000e+00   4.39030986e-07
   3.48380822e-03   1.09325715e-02   5.13338557e-06   4.49522000e-06
   3.74469036e-06   0.00000000e+00   2.18420366e-06   9.26469202e-06
   0.00000000e+00   3.63971740e-07   3.85394544e-06   5.38822905e-06
   8.56740695e-06   8.16734949e-03   3.51597543e-06   2.38758082e-06
   1.29379715e-06]


In [115]:
df_importances = pd.DataFrame({
        'Variables': num_var_train,
        'Importances': importances
    })
df_importances = df_importances.sort(['Importances'], ascending=False)
print(df_importances)

     Importances                    Variables
23  2.230977e-01                   recoveries
20  1.728718e-01              total_rec_prncp
25  1.592726e-01              last_pymnt_amnt
24  1.504776e-01      collection_recovery_fee
18  6.919806e-02                  total_pymnt
19  4.722545e-02              total_pymnt_inv
1   3.310813e-02                  funded_amnt
0   3.100251e-02                    loan_amnt
2   2.634986e-02              funded_amnt_inv
4   2.470635e-02                  installment
3   1.096211e-02                     int_rate
33  1.093257e-02                  tot_cur_bal
21  8.625799e-03                total_rec_int
45  8.167349e-03             total_rev_hi_lim
22  6.441440e-03           total_rec_late_fee
17  4.825101e-03                out_prncp_inv
16  4.107510e-03                    out_prncp
32  3.483808e-03                 tot_coll_amt
6   2.216222e-03                          dti
13  8.476689e-04                    revol_bal
5   6.734268e-04                  

In [113]:
import matplotlib.pyplot as plt

