In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import sklearn as sk
import torch as tt
import torchvision as ttv
import statsmodels.api as sm
import plotly.express as px
import plotly.io as pio
import nltk
import string
import re
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.models import Model

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lukemcguinness/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lukemcguinness/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lukemcguinness/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/lukemcguinness/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/lukemcguinness/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [6]:
# apply same logic as in analysis

df = pd.read_csv("loan_data.csv")
df['Approval'].replace(['Approved', 'Rejected'], [1,0], inplace=True)
df['Employment_Status'].replace(['employed', 'unemployed'], [1,0], inplace=True)

data = df[(df['DTI_Ratio'] >= 0) & 
            (df['Credit_Score'] >= 550) & 
            (df['Loan_Amount'] < 120000)]

def preprocess(text):
    text = text.lower() 
    text = text.strip()
    text = re.compile(r'[%s]' % re.escape(string.punctuation)).sub(' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\d',' ',text) 
    return text

def stopword(string):
    a = [i for i in string.split() if i not in stopwords.words('english')]
    return ' '.join(a)

wl = WordNetLemmatizer()
 
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
def lemmatizer(string):
    word_pos_tags = nltk.pos_tag(word_tokenize(string)) 
    a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for tag in word_pos_tags] 
    return " ".join(a)

def finalpreprocess(string):
    return lemmatizer(stopword(preprocess(string)))

data['clean_text'] = data['Text'].apply(lambda x: finalpreprocess(x))

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Approval'].replace(['Approved', 'Rejected'], [1,0], inplace=True)
  df['Approval'].replace(['Approved', 'Rejected'], [1,0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Employment_Status'].replace(['employed', 'unemployed'], [1,0], inplace=True)
  df['Emp

In [43]:
X_text = data['clean_text']
y = data['Approval']
X_train, X_test, y_train, y_test = train_test_split(X_text, y, test_size=0.2, random_state=42)

# convert to numbers and only take 500 most impportant words ** possibly increase/decrease **
tfidf_vectorizer = TfidfVectorizer(max_features=500)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray() # array to be handles by tensor/torch
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray() # array to be handles by tensor/torch

# Numeric features
numeric_cols = ['Income', 'Credit_Score', 'Loan_Amount', 'DTI_Ratio', 'Employment_Status']
X_train_num = data.loc[X_train.index][numeric_cols]
X_test_num = data.loc[X_test.index][numeric_cols]

# 0 mean and unit variance for input into NN
scaler = StandardScaler()
X_train_num_scaled = scaler.fit_transform(X_train_num)
X_test_num_scaled = scaler.transform(X_test_num)

scaler_reduced = StandardScaler()
X_train_num_scaled_reduced = scaler_reduced.fit_transform(X_train_num.drop(['Income'], axis=1))
X_num_test_scaled_reduced = scaler_reduced.transform(X_test_num.drop(['Income'], axis=1))

**PyTorch Model**

In [44]:
# X_train_num_scaled_reduced = scaler.fit_transform(X_train_num.drop(['Loan_Amount'], axis=1))# 
# ^^ use to test without loan_amount ^^

class CombinedTorchModel(nn.Module):
    # initialize 3 layers
    def __init__(self, text_dim, num_dim):
        super().__init__()
        self.text_layer = nn.Linear(text_dim, 64)
        self.num_layer = nn.Linear(num_dim, 32)
        self.combined_layer = nn.Linear(96, 1)

    # pass into model
    def forward(self, text_input, num_input):
        x_text = torch.relu(self.text_layer(text_input))
        x_num = torch.relu(self.num_layer(num_input))
        x = torch.cat((x_text, x_num), dim=1) # concatenates the outputs from the two branches
        return torch.sigmoid(self.combined_layer(x)) # output between 0 and 1
    
X_text_train_torch = torch.tensor(X_train_tfidf, dtype=torch.float32)
X_num_train_torch = torch.tensor(X_train_num_scaled_reduced, dtype=torch.float32)
# use unsqueeze(1) to match what model expects
y_train_torch = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1) 

X_text_test_torch = torch.tensor(X_test_tfidf, dtype=torch.float32)
X_num_test_torch = torch.tensor(X_num_test_scaled_reduced, dtype=torch.float32)
y_test_torch = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

In [47]:
model = CombinedTorchModel(X_train_tfidf.shape[1], X_train_num_scaled_reduced.shape[1])
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(10):
    model.train()
    optimizer.zero_grad()
    output = model(X_text_train_torch, X_num_train_torch)
    loss = criterion(output, y_train_torch)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

model.eval()
with torch.no_grad():
    y_pred_probs = model(X_text_test_torch, X_num_test_torch).numpy()
    y_pred = (y_pred_probs > 0.5).astype(int)

# Apply post-model auto-rejection rules
X_test_num_reset = X_test_num.reset_index(drop=True)
for i, row in X_test_num_reset.iterrows():
    if row['Loan_Amount'] >= 120000 or row['Credit_Score'] < 550 or row['DTI_Ratio'] > 50:
        y_pred[i] = 0

print("\nPyTorch Classification Report:")
print(classification_report(y_test, y_pred))

Epoch 1, Loss: 0.6598
Epoch 2, Loss: 0.6557
Epoch 3, Loss: 0.6518
Epoch 4, Loss: 0.6479
Epoch 5, Loss: 0.6440
Epoch 6, Loss: 0.6403
Epoch 7, Loss: 0.6365
Epoch 8, Loss: 0.6328
Epoch 9, Loss: 0.6292
Epoch 10, Loss: 0.6256

PyTorch Classification Report:
              precision    recall  f1-score   support

           0       0.67      1.00      0.81      1701
           1       0.00      0.00      0.00       820

    accuracy                           0.67      2521
   macro avg       0.34      0.50      0.40      2521
weighted avg       0.46      0.67      0.54      2521



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**TensorFlow Model**

In [18]:
input_text = Input(shape=(X_train_tfidf.shape[1],))
input_num = Input(shape=(X_train_num_scaled.shape[1],))
text_branch = Dense(64, activation='relu')(input_text)
num_branch = Dense(32, activation='relu')(input_num)
combined = Concatenate()([text_branch, num_branch])
output = Dense(1, activation='sigmoid')(combined)

model_tf = Model(inputs=[input_text, input_num], outputs=output)
model_tf.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_tf.fit([X_train_tfidf, X_train_num_scaled], y_train, epochs=10, batch_size=32, verbose=1)

y_pred_tf = (model_tf.predict([X_test_tfidf, X_test_num_scaled]) > 0.5).astype(int)

# Apply post-model auto-rejection rules
for i, row in enumerate(X_test_num):
    if row['Loan_Amount'] >= 120000 or row['Credit_Score'] < 550 or row['DTI_Ratio'] > 50:
        y_pred_tf[i] = 0

print("\nTensorFlow Classification Report:")
print(classification_report(y_test, y_pred_tf))

Epoch 1/10
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 605us/step - accuracy: 0.7298 - loss: 0.5147
Epoch 2/10
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 563us/step - accuracy: 0.9491 - loss: 0.1715
Epoch 3/10
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 524us/step - accuracy: 0.9612 - loss: 0.1167
Epoch 4/10
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 497us/step - accuracy: 0.9689 - loss: 0.0946
Epoch 5/10
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 490us/step - accuracy: 0.9761 - loss: 0.0803
Epoch 6/10
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 487us/step - accuracy: 0.9789 - loss: 0.0707
Epoch 7/10
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 491us/step - accuracy: 0.9816 - loss: 0.0690
Epoch 8/10
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 508us/step - accuracy: 0.9827 - loss: 0.0594
Epoch 9/10
[1m316/316[

TypeError: string indices must be integers, not 'str'