In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
import pandas as pd
import tensorflow as tf

# Import our input dataset
# low_memory=False makes pandas parse each column in a single pass, so a column holding
# mixed values gets one consistent dtype instead of chunk-by-chunk guesses (DtypeWarning).
loan_df = pd.read_csv('Data_Source/loan.csv', low_memory=False)
loan_df.head()




  loan_df = pd.read_csv('Data_Source/loan.csv')


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,2500,2500,2500.0,36 months,13.56,84.92,C,C1,...,,,Cash,N,,,,,,
1,,,30000,30000,30000.0,60 months,18.94,777.23,D,D2,...,,,Cash,N,,,,,,
2,,,5000,5000,5000.0,36 months,17.97,180.69,D,D1,...,,,Cash,N,,,,,,
3,,,4000,4000,4000.0,36 months,18.94,146.51,D,D2,...,,,Cash,N,,,,,,
4,,,30000,30000,30000.0,60 months,16.14,731.78,C,C4,...,,,Cash,N,,,,,,


In [2]:
# find entirely null columns
# nullColumns = loan_df.columns[100*(loan_df.isnull().sum()/len(loan_df.index)) == 100]
# print(nullColumns)

In [3]:
# separate the target (loan grade) from the predictor columns
y = loan_df.loc[:, 'grade']  # or 'sub_grade'
X = loan_df.loc[
    :,
    [
        'home_ownership',
        'annual_inc',
        'loan_amnt',
        'int_rate',
        'term',
        'mths_since_last_delinq',
        'dti',
    ],
]

In [4]:
# show the rows whose dti is missing (NaN is the only value that is not equal to itself,
# which is what the equivalent query 'dti != dti' relies on)
X[X['dti'].isna()]

Unnamed: 0,home_ownership,annual_inc,loan_amnt,int_rate,term,mths_since_last_delinq,dti
428,MORTGAGE,0.0,13000,10.72,60 months,,
593,RENT,0.0,18000,16.91,60 months,,
1605,MORTGAGE,0.0,35000,16.91,60 months,,
2647,RENT,0.0,5500,13.56,36 months,,
3404,MORTGAGE,0.0,4700,10.33,36 months,,
...,...,...,...,...,...,...,...
2259701,MORTGAGE,0.0,8000,7.97,36 months,,
2259799,RENT,0.0,8000,9.93,36 months,,
2259846,RENT,0.0,28000,13.59,36 months,,
2260530,MORTGAGE,0.0,15000,18.06,36 months,,


In [5]:
# collect the index labels of the 1700ish rows where dti is null
dti_null_list = X.index[X['dti'].isna()].tolist()

In [6]:
# drop the rows where annual_inc is null (4 rows in this dataset)
# The labels are looked up from the data instead of being hard-coded, so the cell keeps
# working if the CSV changes and can be re-run safely (dropping labels that are already
# gone raises a KeyError).
annual_inc_null_idx = X.index[X['annual_inc'].isna()]
X = X.drop(index=annual_inc_null_idx)
y = y.drop(index=annual_inc_null_idx)

In [7]:
# drop rows where dti is null, then realign the labels with the surviving feature rows
# Selecting y by X's index (rather than dropping a list built in an earlier cell) keeps
# X and y in step regardless of which rows were removed before, and is safe to re-run.
X = X.dropna(axis='index', subset=['dti'])
y = y.loc[X.index]

# sanity check: features and labels must describe exactly the same rows
assert X.index.equals(y.index)

In [8]:
# replace every remaining null with 0. After the drops above this is expected to touch only
# 'mths_since_last_delinq'; leaving nulls in place hurt accuracy a lot.
# Must run after all dropna steps, otherwise rows meant to be dropped get filled instead.
X = X.fillna(value=0)

In [9]:
# one-hot encode the grade labels; dtype=float matches the feature encoding and gives the
# network numeric targets (without it, dummy columns can default to boolean)
y = pd.get_dummies(y, dtype=float)

In [10]:
# one-hot encode the categorical feature columns (home_ownership, term).
# dtype=float may not be strictly required; without it the dummy columns default to boolean.
X = pd.get_dummies(data=X, dtype=float)
X

Unnamed: 0,annual_inc,loan_amnt,int_rate,mths_since_last_delinq,dti,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,term_ 36 months,term_ 60 months
0,55000.0,2500,13.56,0.0,18.24,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,90000.0,30000,18.94,71.0,26.52,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,59280.0,5000,17.97,0.0,10.51,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,92000.0,4000,18.94,0.0,16.74,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,57250.0,30000,16.14,0.0,26.35,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2260663,58000.0,12000,14.08,0.0,20.88,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2260664,30000.0,12000,25.82,13.0,19.28,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2260665,64000.0,10000,11.99,0.0,12.96,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2260666,60000.0,12000,21.45,7.0,30.82,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [11]:
# hold out a test set; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,  # stratify=y, test_size=0.2,
)

In [12]:
# standardize the data: the scaling statistics are learned from the training split only
# and then applied unchanged to both splits
scaler = StandardScaler()

# fit() returns the fitted scaler itself, so X_scaled is the scaler object, not an array
X_scaled = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaled.transform(split) for split in (X_train, X_test)
)

In [13]:
# create layers for the neural network model. Number of layers is subjective and should be
# tested with more/fewer layers and nodes.
# The input and output widths are read from the data so the model keeps matching if the
# feature list or the label column (grade vs sub_grade) changes.
n_features = X_train_scaled.shape[1]
n_classes = y_train.shape[1]

model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(n_features,)),
    tf.keras.layers.Dense(units=25, activation='relu'),
    tf.keras.layers.Dense(units=12, activation='relu'),
    # softmax: one probability per class, summing to 1
    tf.keras.layers.Dense(units=n_classes, activation='softmax'),
])




In [14]:
# use a smaller learning rate to prevent overfitting: at the default rate overfitting was
# common and accuracy tanked within 1 epoch.
# NOTE: this value only influences training once it is handed to the optimizer.
learning_rate = tf.Variable(initial_value=0.0001, trainable=True)

# explicitly (re)assign the value held by the variable
learning_rate.assign(0.0001)

In [15]:
# train the model and check accuracy
# - categorical_crossentropy is the loss that matches a softmax output over one-hot labels;
#   binary_crossentropy would score every output unit as an independent yes/no problem.
# - the optimizer is built explicitly so the lowered learning_rate is actually used;
#   passing optimizer='adam' always trains with Adam's default rate.
optimizer = tf.keras.optimizers.Adam(learning_rate=float(learning_rate))
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

fit_model = model.fit(X_train_scaled, y_train, epochs=100)


Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100

KeyboardInterrupt: 

In [None]:
# pca will *not* apply to specific input columns like I thought before
# PCA centers the data but does not rescale it, so the features are standardized first;
# otherwise large-valued columns such as annual_inc dominate the explained variance.
pca = PCA(n_components=X.shape[1])  # keep every component to inspect the full variance spectrum
loanPCA = pca.fit_transform(StandardScaler().fit_transform(X))
pca.explained_variance_ratio_