In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import roc_auc_score
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd 
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import class_weight
from torch.utils.data import WeightedRandomSampler
from torch.optim.lr_scheduler import StepLR

In [40]:
# Define the Neural Network Model
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.sigmoid(self.fc2(x))
        return x

# Function to preprocess data (convert to tensors)
def preprocess_data(train_data, batch_size=32):
    # remove null value 
    train.rename(columns={'bad_flag': 'target'}, inplace = True)
    train.dropna(subset=['target'], inplace=True)
    train['emp_length'] =train['emp_length'].fillna('UNKNOWN')
    train['percent_bc_gt_75'] = train['percent_bc_gt_75'].fillna(-1)
    train['mths_since_recent_inq'] = train['mths_since_recent_inq'].fillna(-1)
    train['revol_util'] = train['revol_util'].fillna(-1)
    train['total_bc_limit'] = train['total_bc_limit'].fillna(-1)
    train['mths_since_last_major_derog'] = train['mths_since_last_major_derog'].fillna(-1)
    train['tot_hi_cred_lim'] = train['tot_hi_cred_lim'].fillna(-1)
    train['bc_util'] = train['bc_util'].fillna(-1)
    train['tot_cur_bal'] = train['tot_cur_bal'].fillna(-1)

    train['desc_clean'] = train['desc'].fillna('').apply(clean_text)

    train['desc_clean'] = train['desc'].fillna('')

    model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight version of BERT for sentence embeddings

    # Generate embeddings for each sentence
    embeddings = model.encode(train['desc_clean'].tolist(), show_progress_bar=True)


    # Check the shape of the embeddings
    print(embeddings.shape)  # Output: (num_samples, embedding_dimension)
    embedding_df = pd.DataFrame(embeddings, columns=[f"emb_{i}" for i in range(embeddings.shape[1])])

    train_with_embeddings = pd.concat([train.reset_index(drop=True), embedding_df.reset_index(drop=True)], axis=1)
    train_with_embeddings.drop(['desc'],axis = 1, inplace = True)   
    
    
    exclude_cols = ['id', 'member_id']  
    target_col = 'target'
    # Find columns with missing values
    columns_with_nan = train_with_embeddings.columns[train_with_embeddings.isnull().any()]

    # Print the column names
    print("Columns with missing values:")
    print(columns_with_nan)
    features = train_with_embeddings.drop(columns=exclude_cols+ [target_col], errors='ignore')
    target = train_with_embeddings[target_col]
    
    # Encode non-numeric columns
    non_numeric_cols = features.select_dtypes(include=['object', 'category']).columns
    label_encoder = LabelEncoder()
    for col in non_numeric_cols:
        features[col] = label_encoder.fit_transform(features[col].astype(str))
    train_data, val_data, train_labels, val_labels = train_test_split(
    features, target, test_size=0.2, random_state=78)
    
    # Scale data
    scaler = StandardScaler()
    train_data = scaler.fit_transform(train_data)
    val_data = scaler.transform(val_data)
    # Convert target to a single binary column
    train_labels = train_labels.astype(int)  # Ensure target is integer
    val_labels = val_labels.astype(int)
    X_train = torch.tensor(train_data, dtype=torch.float32)
    y_train = torch.tensor(train_labels.values, dtype=torch.float32).unsqueeze(1)
    X_val = torch.tensor(val_data, dtype=torch.float32)
    y_val = torch.tensor(val_labels.values, dtype=torch.float32).unsqueeze(1)

    train_dataset = TensorDataset(X_train, y_train)
    val_dataset = TensorDataset(X_val, y_val)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    input_size = X_train.shape[1]
    print('Finish preprocessing')

    return train_loader, val_loader, input_size, scaler

# Function to train the model with early stopping
def train_model(train_loader, val_loader, input_size, hidden_size=64, lr=0.01, patience=5, num_epochs=50):
    print('Start Training')
    # Model, Loss, Optimizer
    model = NeuralNet(input_size, hidden_size)

    criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    scheduler = StepLR(optimizer, step_size=5, gamma=0.1)  # Learning Rate Scheduler

    # Early Stopping Parameters
    patience = 5  # Number of epochs to wait for improvement
    best_roc_auc = 0.0
    epochs_no_improve = 0

    # Training Loop
    num_epochs = 50
    for epoch in range(num_epochs):
        # Training Phase
        model.train()
        train_loss = 0
        for batch_X, batch_y in train_loader:
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation Phase
        model.eval()
        val_preds = []
        val_targets = []
        val_loss = 0

        with torch.no_grad():
            for val_X, val_y in val_loader:
                val_outputs = model(val_X)
                loss = criterion(val_outputs, val_y)
                val_loss += loss.item()

                val_preds.extend(val_outputs.squeeze().cpu().numpy())
                val_targets.extend(val_y.squeeze().cpu().numpy())

        # Calculate ROC-AUC
        roc_auc = roc_auc_score(val_targets, val_preds)

        # Step the scheduler
        scheduler.step()

        # Print metrics
        print(f"Epoch [{epoch+1}/{num_epochs}], "
              f"Train Loss: {train_loss/len(train_loader):.4f}, "
              f"Val Loss: {val_loss/len(val_loader):.4f}, "
              f"ROC-AUC: {roc_auc:.4f}, "
              f"Learning Rate: {scheduler.get_last_lr()[0]:.6f}")

        # Early Stopping Check
        if roc_auc > best_roc_auc:
            best_roc_auc = roc_auc
            epochs_no_improve = 0
            torch.save(model.state_dict(), 'best_model.pth')  # Save the best model
        else:
            epochs_no_improve += 1

        if epochs_no_improve >= patience:
            print(f"Early stopping triggered after {epoch+1} epochs.")
            break

    # Load the Best Model
    model.load_state_dict(torch.load('best_model.pth'))

    # Final ROC-AUC
    model.eval()
    final_preds = []
    final_targets = []

    with torch.no_grad():
        for val_X, val_y in val_loader:
            val_outputs = model(val_X)
            final_preds.extend(val_outputs.squeeze().cpu().numpy())
            final_targets.extend(val_y.squeeze().cpu().numpy())

    final_roc_auc = roc_auc_score(final_targets, final_preds)
    print(f"Final Validation ROC-AUC: {final_roc_auc:.4f}")
    
    return model

# Function to predict on test data
def predict(model, test_data):
    model.eval()
    X_test = torch.tensor(test_data, dtype=torch.float32)
    test_loader = DataLoader(X_test, batch_size=32, shuffle=False)

    predictions = []
    with torch.no_grad():
        for batch_X in test_loader:
            outputs = model(batch_X)
            predictions.extend(outputs.squeeze().cpu().numpy())

    return np.array(predictions)

# Function to load best model
def load_best_model(input_size, hidden_size, model_path='best_model.pth'):
    model = NeuralNet(input_size, hidden_size)
    model.load_state_dict(torch.load(model_path))
    return model



# Custom cleaning function with lemmatization
def clean_text(text):
    # Initialize lemmatizer
    lemmatizer = WordNetLemmatizer()
    text = re.sub(r'Borrower added on \d{2}/\d{2}/\d{2}', '', text)  # Remove "Borrower added on XX/XX/XX"
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove non-alphabetic characters
    text = text.lower()  # Convert to lowercase
    stop_words = set(stopwords.words('english'))  # Stopwords
    custom_stopwords = ['also', 'one', 'thank', 'added', 'help']  # Custom stopwords
    stop_words.update(custom_stopwords)

    # Lemmatize words and remove stopwords
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

def preprocess_test_data(test_data, scaler, exclude_cols=['id', 'member_id'], text_column='desc'):
    # Rename columns if needed
    test_data.rename(columns={'bad_flag': 'target'}, inplace=True)

    # Handle missing values in the test data
    test_data['emp_length'] = test_data['emp_length'].fillna('UNKNOWN')
    test_data['percent_bc_gt_75'] = test_data['percent_bc_gt_75'].fillna(-1)
    test_data['mths_since_recent_inq'] = test_data['mths_since_recent_inq'].fillna(-1)
    test_data['revol_util'] = test_data['revol_util'].fillna(-1)
    test_data['total_bc_limit'] = test_data['total_bc_limit'].fillna(-1)
    test_data['mths_since_last_major_derog'] = test_data['mths_since_last_major_derog'].fillna(-1)
    test_data['tot_hi_cred_lim'] = test_data['tot_hi_cred_lim'].fillna(-1)
    test_data['bc_util'] = test_data['bc_util'].fillna(-1)
    test_data['tot_cur_bal'] = test_data['tot_cur_bal'].fillna(-1)

    # Generate embeddings for the text column
    test_data['desc_clean'] = test_data[text_column].fillna('').apply(clean_text)
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(test_data['desc_clean'].tolist(), show_progress_bar=True)
    embedding_df = pd.DataFrame(embeddings, columns=[f"emb_{i}" for i in range(embeddings.shape[1])])
    test_with_embeddings = pd.concat([test_data.reset_index(drop=True), embedding_df.reset_index(drop=True)], axis=1)

    # Drop unnecessary columns
    test_with_embeddings.drop(columns=exclude_cols + [text_column, 'target'], errors='ignore', inplace=True)

    # Encode non-numeric columns
    non_numeric_cols = test_with_embeddings.select_dtypes(include=['object', 'category']).columns
    label_encoder = LabelEncoder()
    for col in non_numeric_cols:
        test_with_embeddings[col] = label_encoder.fit_transform(test_with_embeddings[col].astype(str))

    # Scale the test data using the scaler fitted on the training data
    test_scaled = scaler.transform(test_with_embeddings)

    return test_scaled



# Wrap it all together
def main(train_data, test_data):
    # Preprocess data
    train_loader, val_loader, input_size, scaler= preprocess_data(train_data)

    model = train_model(train_loader, val_loader, input_size)

    # Load the best model
    best_model = load_best_model(input_size, hidden_size=64)

    # Preprocess test data
    test_scaled = preprocess_test_data(test_data, scaler=scaler)  # Pass the scaler used for training

    # Predict on test data
    test_predictions = predict(best_model, test_scaled)

    return test_predictions


In [41]:
train_data = pd.read_csv('./training_loan_data.csv',header = 1)
test_data = pd.read_csv('./testing_loan_data.csv')

test_predictions = main(train_data, test_data)

# Save predictions
import pandas as pd
submission = pd.DataFrame({'prediction': test_predictions})
submission.to_csv('test_predictions.csv', index=False)

print("Predictions saved to 'test_predictions.csv'")

  test_data = pd.read_csv('./testing_loan_data.csv')


Batches:   0%|          | 0/5921 [00:00<?, ?it/s]

(189457, 384)
Columns with missing values:
Index([], dtype='object')
Finish preprocessing
Start Training
Epoch [1/50], Train Loss: 0.2659, Val Loss: 0.2542, ROC-AUC: 0.6308, Learning Rate: 0.010000
Epoch [2/50], Train Loss: 0.2601, Val Loss: 0.2511, ROC-AUC: 0.6438, Learning Rate: 0.010000
Epoch [3/50], Train Loss: 0.2561, Val Loss: 0.2619, ROC-AUC: 0.6522, Learning Rate: 0.010000
Epoch [4/50], Train Loss: 0.2533, Val Loss: 0.2590, ROC-AUC: 0.6425, Learning Rate: 0.010000
Epoch [5/50], Train Loss: 0.2509, Val Loss: 0.2614, ROC-AUC: 0.6161, Learning Rate: 0.001000
Epoch [6/50], Train Loss: 0.2393, Val Loss: 0.2456, ROC-AUC: 0.6757, Learning Rate: 0.001000
Epoch [7/50], Train Loss: 0.2365, Val Loss: 0.2440, ROC-AUC: 0.6768, Learning Rate: 0.001000
Epoch [8/50], Train Loss: 0.2349, Val Loss: 0.2457, ROC-AUC: 0.6784, Learning Rate: 0.001000
Epoch [9/50], Train Loss: 0.2330, Val Loss: 0.2451, ROC-AUC: 0.6754, Learning Rate: 0.001000
Epoch [10/50], Train Loss: 0.2316, Val Loss: 0.2485, ROC-A

  model.load_state_dict(torch.load('best_model.pth'))
  model.load_state_dict(torch.load(model_path))


Batches:   0%|          | 0/3204 [00:00<?, ?it/s]

Predictions saved to 'test_predictions.csv'
