In [1]:
# Intro to Machine Learning CS 4347   Chichi Christine
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn import metrics

# In this file, I did not use StandardScaler for Random Forests because 
# it is not necessary  

In [2]:
df = pd.read_csv("train.csv")

In [None]:
#To select rows whose column value equals 'Y'
df.loc[df['LoanStatus'] == 'Y']

In [None]:
df.shape

In [None]:
sns.pairplot(df)

In [3]:
df.Gender = df.Gender.fillna('Male')
df.Married = df.Married.fillna('Yes')
df.Dependents = df.Dependents.fillna('0')
df.Self_Employed = df.Self_Employed.fillna('No')
df.LoanAmount = df.LoanAmount.fillna(df.LoanAmount.mean())
df.Loan_Amount_Term = df.Loan_Amount_Term.fillna(360.0)
df.Credit_History = df.Credit_History.fillna(1.0)
df.apply(lambda x: sum(x.isnull()),axis=0)

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
LoanStatus           0
dtype: int64

In [None]:
corr = df.corr()
sns.heatmap(corr)

In [4]:
y = df['LoanStatus']

In [5]:
df = df.drop(['Loan_ID'], axis=1)  # drop the column, not relevant

In [6]:
# Clean data
# Replace the categorical values with the numeric equivalents 
categoricalFeatures = [ 'Property_Area', 'Gender', 'Married', 
                       'Dependents', 'Education', 'Self_Employed']

# Iterate through the list of categorical features and one hot encode them.
for feature in categoricalFeatures:
    onehot = pd.get_dummies(df[feature], prefix=feature)
    df = df.drop(feature, axis=1)  # drop the column
    df = df.join(onehot)

In [7]:
y = (y == 'Y')*1.0
y.head()

0    1.0
1    0.0
2    1.0
3    1.0
4    1.0
Name: LoanStatus, dtype: float64

In [8]:
# x.shape,y.shape
x = df.drop(['LoanStatus'], axis=1)
#x = df.values
x.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes
0,5849,0.0,146.412162,360.0,1.0,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0
1,4583,1508.0,128.0,360.0,1.0,1,0,0,0,1,0,1,0,1,0,0,1,0,1,0
2,3000,0.0,66.0,360.0,1.0,0,0,1,0,1,0,1,1,0,0,0,1,0,0,1
3,2583,2358.0,120.0,360.0,1.0,0,0,1,0,1,0,1,1,0,0,0,0,1,1,0
4,6000,0.0,141.0,360.0,1.0,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0


In [9]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [10]:
# Random Forest Classification
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'gini', random_state = 0)
classifier.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [11]:
y_pred = classifier.predict(x_test)
print('Accuracy: ', metrics.accuracy_score(y_pred, y_test))

Accuracy:  0.7723577235772358


In [None]:
list(zip(x_train, classifier.feature_importances_))

In [None]:
# Get numerical feature importances
importances = list(zip(classifier.feature_importances_))
# List of tuples with variable and importance
feature_importances = [(feature, importance) for feature, importance in zip(x_train, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]

In [12]:
# Remove less relevant features and increase number of decision trees
# Removed Gender, Married, Self_Employed, Dependents
x2 = list(['Property_Area_Rural']+['Property_Area_Semiurban']+['Property_Area_Urban']
          + ['Education_Graduate']+ ['Education_Not Graduate'] 
          + ['ApplicantIncome'] + ['CoapplicantIncome']+ ['LoanAmount']
             +['Loan_Amount_Term'] +['Credit_History'])
df2 = df[x2]
xtree = df2
xtree.head()

Unnamed: 0,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Education_Graduate,Education_Not Graduate,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,0,0,1,1,0,5849,0.0,146.412162,360.0,1.0
1,1,0,0,1,0,4583,1508.0,128.0,360.0,1.0
2,0,0,1,1,0,3000,0.0,66.0,360.0,1.0
3,0,0,1,0,1,2583,2358.0,120.0,360.0,1.0
4,0,0,1,1,0,6000,0.0,141.0,360.0,1.0


In [13]:
x_train2, x_test2, y_train2, y_test2 = train_test_split(xtree, y, test_size=0.2)
cl = RandomForestClassifier(n_estimators = 200, criterion = 'gini', random_state = 0)
cl.fit(x_train2, y_train2)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [14]:
y_pred2 = cl.predict(x_test2)
print('Accuracy: ', metrics.accuracy_score(y_pred2, y_test2))

Accuracy:  0.8292682926829268
