In [12]:
import pandas as pd
import nltk
import string
import re
import torch
import torch.nn as nn
import torch.optim as optim


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from torch.utils.data import TensorDataset, DataLoader
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.models import Model

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lukemcguinness/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lukemcguinness/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lukemcguinness/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/lukemcguinness/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/lukemcguinness/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [13]:
# apply same logic as in analysis, but let the model learn from the unapprovals - no need to filter

data = pd.read_csv("loan_data.csv")
data['Approval'].replace(['Approved', 'Rejected'], [1,0], inplace=True)
data['Employment_Status'].replace(['employed', 'unemployed'], [1,0], inplace=True)

def preprocess(text):
    text = text.lower() 
    text = text.strip()
    text = re.compile(r'[%s]' % re.escape(string.punctuation)).sub(' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\d',' ',text) 
    return text

def stopword(string):
    a = [i for i in string.split() if i not in stopwords.words('english')]
    return ' '.join(a)

wl = WordNetLemmatizer()
 
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
def lemmatizer(string):
    word_pos_tags = nltk.pos_tag(word_tokenize(string)) 
    a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for tag in word_pos_tags] 
    return " ".join(a)

def finalpreprocess(string):
    return lemmatizer(stopword(preprocess(string)))

data['clean_text'] = data['Text'].apply(lambda x: finalpreprocess(x))

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Approval'].replace(['Approved', 'Rejected'], [1,0], inplace=True)
  df['Approval'].replace(['Approved', 'Rejected'], [1,0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Employment_Status'].replace(['employed', 'unemployed'], [1,0], inplace=True)
  df['Emp

In [14]:
X_text = data['clean_text']
y = data['Approval']
X_train, X_test, y_train, y_test = train_test_split(X_text, y, test_size=0.2, random_state=42)

# convert to numbers and only take 500 most impportant words ** possibly increase/decrease **
tfidf_vectorizer = TfidfVectorizer(max_features=500)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray() # array to be handles by tensor/torch
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray() # array to be handles by tensor/torch

# Numeric features
numeric_cols = ['Income', 'Credit_Score', 'Loan_Amount', 'DTI_Ratio', 'Employment_Status']
X_train_num = data.loc[X_train.index][numeric_cols]
X_test_num = data.loc[X_test.index][numeric_cols]

# 0 mean and unit variance for input into NN
scaler = StandardScaler()
X_train_num_scaled = scaler.fit_transform(X_train_num)
X_test_num_scaled = scaler.transform(X_test_num)

# now the model is more accuracte with the income included
# scaler_reduced = StandardScaler()
# X_train_num_scaled_reduced = scaler_reduced.fit_transform(X_train_num.drop(['Income'], axis=1))
# X_num_test_scaled_reduced = scaler_reduced.transform(X_test_num.drop(['Income'], axis=1))

**PyTorch Model**

In [15]:
# X_train_num_scaled_reduced = scaler.fit_transform(X_train_num.drop(['Loan_Amount'], axis=1))# 
# ^^ use to test without loan_amount ^^

class CombinedTorchModel(nn.Module):
    # initialize 3 layers
    def __init__(self, text_dim, num_dim):
        super().__init__()
        self.text_layer = nn.Linear(text_dim, 64)
        self.num_layer = nn.Linear(num_dim, 32)
        self.combined_layer = nn.Linear(96, 1)

    # pass into model
    def forward(self, text_input, num_input):
        x_text = torch.sigmoid(self.text_layer(text_input)) # more accurate with sigmoid over relu - tanh wasn't as good
        x_num = torch.relu(self.num_layer(num_input))
        x = torch.cat((x_text, x_num), dim=1) # concatenates the outputs from the two branches
        return torch.sigmoid(self.combined_layer(x)) # output between 0 and 1
    
X_text_train_torch = torch.tensor(X_train_tfidf, dtype=torch.float32)
X_num_train_torch = torch.tensor(X_train_num_scaled, dtype=torch.float32)
# use unsqueeze(1) to match what model expects
y_train_torch = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1) 

X_text_test_torch = torch.tensor(X_test_tfidf, dtype=torch.float32)
X_num_test_torch = torch.tensor(X_test_num_scaled, dtype=torch.float32)
y_test_torch = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

In [16]:
model = CombinedTorchModel(X_train_tfidf.shape[1], X_train_num_scaled.shape[1])
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# bundle input and target tensors so we can iterate over it in each epoch
train_dataset = TensorDataset(X_text_train_torch, X_num_train_torch, y_train_torch)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True) # split and shuffle

for epoch in range(10):
    model.train()
    # pass through training set each epoch
    for text_batch, num_batch, label_batch in train_loader:
        optimizer.zero_grad()
        output = model(text_batch, num_batch)
        loss = criterion(output, label_batch)
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        val_output = model(X_text_test_torch, X_num_test_torch)
        val_loss = criterion(val_output, y_test_torch)
    print(f"Epoch {epoch+1}, Train Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}")

model.eval()
with torch.no_grad():
    y_pred_probs = model(X_text_test_torch, X_num_test_torch).numpy()
    y_pred = (y_pred_probs > 0.5).astype(int)

# Apply post-model auto-rejection rules
X_test_num_reset = X_test_num.reset_index(drop=True)
for i, row in X_test_num_reset.iterrows():
    if row['Loan_Amount'] >= 120000 or row['Credit_Score'] < 550 or row['DTI_Ratio'] > 50:
        y_pred[i] = 0

print("\nPyTorch Classification Report:")
print(classification_report(y_test, y_pred))


accuracy = accuracy_score(y_test, y_pred)
print(f"PyTorch Accuracy: {accuracy:.3f}")

Epoch 1, Train Loss: 0.1652, Val Loss: 0.1271
Epoch 2, Train Loss: 0.1307, Val Loss: 0.0851
Epoch 3, Train Loss: 0.0870, Val Loss: 0.0667
Epoch 4, Train Loss: 0.0681, Val Loss: 0.0576
Epoch 5, Train Loss: 0.0372, Val Loss: 0.0493
Epoch 6, Train Loss: 0.0590, Val Loss: 0.0438
Epoch 7, Train Loss: 0.0171, Val Loss: 0.0397
Epoch 8, Train Loss: 0.0037, Val Loss: 0.0366
Epoch 9, Train Loss: 0.0560, Val Loss: 0.0351
Epoch 10, Train Loss: 0.0206, Val Loss: 0.0318

PyTorch Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      4047
           1       0.99      0.97      0.98       753

    accuracy                           0.99      4800
   macro avg       0.99      0.98      0.99      4800
weighted avg       0.99      0.99      0.99      4800

PyTorch Accuracy: 0.994


**TensorFlow Model**

In [17]:
input_text = Input(shape=(X_train_tfidf.shape[1],))
input_num = Input(shape=(X_train_num_scaled.shape[1],))

text_branch = Dense(64, activation='sigmoid')(input_text)
text_branch = Dense(32, activation='sigmoid')(text_branch) # add another layer

num_branch = Dense(32, activation='relu')(input_num)
num_branch = Dense(16, activation='relu')(num_branch) # add another layer

combined = Concatenate()([text_branch, num_branch])
combined = Dense(16, activation='sigmoid')(combined) # add another layer
output = Dense(1, activation='sigmoid')(combined)

model_tf = Model(inputs=[input_text, input_num], outputs=output)
model_tf.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# add validation split to see if we're overfitting
model_tf.fit([X_train_tfidf, X_train_num_scaled], y_train, validation_split=0.1, epochs=10, batch_size=32, verbose=1)

y_pred_tf = (model_tf.predict([X_test_tfidf, X_test_num_scaled]) > 0.5).astype(int)

# Apply post-model auto-rejection rules
X_test_num_reset = X_test_num.reset_index(drop=True)
for i, row in X_test_num_reset.iterrows():
    if row['Loan_Amount'] >= 120000 or row['Credit_Score'] < 550 or row['DTI_Ratio'] > 50:
        y_pred_tf[i] = 0

print("\nTensorFlow Classification Report:")
print(classification_report(y_test, y_pred_tf))

accuracy_tf = accuracy_score(y_test, y_pred_tf)
print(f"TensorFlow Accuracy: {accuracy_tf:.3f}")

roc_auc = roc_auc_score(y_test, model_tf.predict([X_test_tfidf, X_test_num_scaled]))
print(f"ROC AUC: {roc_auc:.3f}")

Epoch 1/10
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 921us/step - accuracy: 0.7580 - loss: 0.4409 - val_accuracy: 0.9771 - val_loss: 0.0832
Epoch 2/10
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 680us/step - accuracy: 0.9824 - loss: 0.0712 - val_accuracy: 0.9875 - val_loss: 0.0450
Epoch 3/10
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 662us/step - accuracy: 0.9876 - loss: 0.0392 - val_accuracy: 0.9854 - val_loss: 0.0393
Epoch 4/10
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 636us/step - accuracy: 0.9907 - loss: 0.0296 - val_accuracy: 0.9859 - val_loss: 0.0307
Epoch 5/10
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 602us/step - accuracy: 0.9921 - loss: 0.0221 - val_accuracy: 0.9880 - val_loss: 0.0283
Epoch 6/10
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 726us/step - accuracy: 0.9934 - loss: 0.0188 - val_accuracy: 0.9901 - val_loss: 0.0224
Epoch 7/10
[1m5