In [4]:
!pip install transformers datasets torch scikit-learn

Collecting torch
  Downloading torch-2.7.0-cp310-cp310-win_amd64.whl.metadata (29 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy>=1.13.3->torch)
  Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Collecting MarkupSafe>=2.0 (from jinja2->torch)
  Using cached MarkupSafe-3.0.2-cp310-cp310-win_amd64.whl.metadata (4.1 kB)
Downloading torch-2.7.0-cp310-cp310-win_amd64.whl (212.5 MB)
   ---------------------------------------- 0.0/212.5 MB ? eta -:--:--
   ---------------------------------------- 0.3/212.5 MB ? eta -:--:--
   ---------------------------------------- 1.0/212.5 MB 2.6 MB/s eta 0:01:20
   ---------------------------------------- 1.6/212.5 MB 2.8 MB/s eta 0:01:16
   ----------------

In [1]:
#- Low-risk borrowers (e.g., responsible financial behavior)
#- Medium-risk borrowers (e.g., occasional payment struggles)
#- High-risk borrowers (e.g., history of defaults or financial instability)
# We’ll modify synthetic applicant statements accordingly:


In [9]:
import pandas as pd
import numpy as np

# Define possible applicant statements based on financial risk
low_risk_statements = [
    "I always pay my bills on time and have a stable income.",
    "My savings are strong, and I manage expenses carefully.",
    "I have a long history of successfully repaying loans."
]

medium_risk_statements = [
    "I sometimes struggle with payments but always catch up.",
    "My income is stable, but unexpected expenses can be challenging.",
    "I've taken multiple loans before and repaid most of them on time."
]

high_risk_statements = [
    "I have missed payments in the past and struggled with debt.",
    "I recently lost my job and have outstanding loans.",
    "My credit score dropped significantly last year due to financial hardship."
]

# Assign statements based on the default probability
np.random.seed(42)
risk_categories = np.random.choice(['low', 'medium', 'high'], 10000, p=[0.6, 0.3, 0.1])

applicant_statements = []
for risk in risk_categories:
    if risk == 'low':
        applicant_statements.append(np.random.choice(low_risk_statements))
    elif risk == 'medium':
        applicant_statements.append(np.random.choice(medium_risk_statements))
    else:
        applicant_statements.append(np.random.choice(high_risk_statements))

# Load dataset and apply new statements
df = pd.read_csv("applicants.csv")
df["applicant_statement"] = applicant_statements

# Save refined dataset
df.to_csv("applicants_updated.csv", index=False)
print("Updated dataset saved as 'applicants_updated.csv'")

Updated dataset saved as 'applicants_updated.csv'


In [10]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split

# Load updated dataset
df = pd.read_csv("applicants_updated.csv")

# Define tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Encode applicant statements
X_texts = df["applicant_statement"].tolist()
y_labels = df["default"].values

# Tokenize dataset
tokens = tokenizer(X_texts, padding=True, truncation=True, return_tensors="pt", max_length=50)
X_train_tokens, X_test_tokens, y_train, y_test = train_test_split(tokens["input_ids"], y_labels, test_size=0.2, random_state=42)

# Define dataset class
class LoanDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels
    
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, idx):
        return {"input_ids": self.inputs[idx], "labels": torch.tensor(self.labels[idx])}

# Create data loaders
train_data = LoanDataset(X_train_tokens, y_train)
test_data = LoanDataset(X_test_tokens, y_test)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32)

# Load pretrained BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Fine-tune BERT model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(3):  # Fine-tune for 3 epochs
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f"Epoch {epoch + 1}: Loss = {total_loss:.4f}")

# Save fine-tuned model
model.save_pretrained("bert_loan_default_model")
print("Fine-tuned BERT model saved!")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Loss = 125.9981
Epoch 2: Loss = 124.8095
Epoch 3: Loss = 124.6209
Fine-tuned BERT model saved!


In [11]:
def extract_bert_embedding(text):
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=50)
    with torch.no_grad():
        outputs = model(**tokens)
    return outputs.logits.numpy()  # Use BERT classifier outputs as embeddings

In [8]:
df["bert_embedding"] = df["applicant_statement"].apply(lambda x: extract_bert_embedding(str(x)))
bert_features = np.vstack(df["bert_embedding"].values)

# Merge embeddings into final dataset
X_final = np.hstack((df.drop(columns=["applicant_statement"]).to_numpy(), bert_features))

# Save final dataset with BERT features
np.save("X_final.npy", X_final)
np.save("y_final.npy", y_labels)

print("Final dataset saved with BERT embeddings!")

KeyboardInterrupt: 

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y_labels, test_size=0.2, random_state=42)

for lr in learning_rates:
    for lambda_reg in lambda_values:
        print(f"Training Logistic Regression with Fine-Tuned BERT → lr={lr}, lambda={lambda_reg}")
        w, losses = batch_gradient_descent(X_train, y_train, lr, lambda_reg, epochs=1000)
        results[(lr, lambda_reg)] = (w, losses)

In [None]:
from sklearn.base import accuracy_score
from sklearn.metrics import precision_score, recall_score, roc_auc_score


for (lr, lambda_reg), (w, _) in results.items():
    y_pred_test = predict(X_test, w)

    accuracy = accuracy_score(y_test, y_pred_test)
    precision = precision_score(y_test, y_pred_test)
    recall = recall_score(y_test, y_pred_test)
    roc_auc = roc_auc_score(y_test, y_pred_test)

    print(f"Fine-Tuned BERT Model → lr={lr}, lambda={lambda_reg}")
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, ROC-AUC: {roc_auc:.4f}\n")