In [2]:
import pandas as pd
import numpy as np
import faiss
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import average_precision_score

In [3]:
# loading the entire test embeddings
path = "../5_embeddings/cls_embeddings_time.struct_time(tm_year=2025, tm_mon=2, tm_mday=7, tm_hour=19, tm_min=40, tm_sec=22, tm_wday=4, tm_yday=38, tm_isdst=0).npy"

cls_embeddings = np.load(path)
print(cls_embeddings.shape)


(118108, 32)


In [4]:
train_df = pd.read_csv("../2_dataset/final/train_df.csv")
train_df

Unnamed: 0.1,cls,ProductCD,card4,card6,P_emaildomain,Unnamed: 0,TransactionDT,TransactionAmt,card1,card2,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,0,4,2,1,2,-0.291883,-0.329939,0.108390,-0.145421,-0.399322,...,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
1,0,4,3,2,16,0.892993,0.871243,-0.359702,0.680504,-0.412094,...,-0.030054,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
2,0,4,2,2,1,-1.594876,-1.467121,8.134522,-0.109308,0.711822,...,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
3,0,4,3,2,19,-0.123148,-0.156138,-0.422421,1.487250,-0.265218,...,0.341765,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
4,0,4,3,2,16,1.611964,1.677853,-0.317889,-0.081355,-0.265218,...,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76765,0,4,3,2,16,-1.327921,-1.260784,-0.113217,-0.653259,-1.606254,...,1.009878,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
76766,0,4,2,2,25,0.675641,0.648266,-0.075376,-0.002802,0.756523,...,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
76767,0,4,1,1,16,0.418154,0.377752,0.150412,-1.485918,-0.226903,...,-0.227583,0.393449,0.090945,0.276274,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
76768,0,4,3,2,19,0.605578,0.576883,-0.322279,-0.182963,0.577719,...,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142


In [5]:
train_y = pd.read_csv("../2_dataset/final/train_y_df.csv")
train_y.shape


(76770, 1)

In [6]:
test_df = pd.read_csv("../2_dataset/final/test_df.csv")
test_y = pd.read_csv("../2_dataset/final/test_y_df.csv")
test_y['isFraud'] = test_y['isFraud'].astype(np.float32)

In [7]:
val_df = pd.read_csv("../2_dataset/final/val_df.csv")
val_y = pd.read_csv("../2_dataset/final/val_y_df.csv")

In [8]:
train_y.shape, train_df.shape

((76770, 1), (76770, 182))

In [9]:
test_y.shape, test_df.shape

((23622, 1), (23622, 182))

In [10]:
val_y.shape, val_df.shape

((17716, 1), (17716, 182))

## Splitting Embeddings

In [11]:
# Compute 65% of the total rows
total_rows = cls_embeddings.shape[0]
train_size = int(0.65 * total_rows)  # 65% of 118108
test_size = int(0.8 * total_rows)

# Slice the top 65%
train_embeddings = cls_embeddings[:train_size]
val_embeddings = cls_embeddings[train_size:test_size]
test_embeddings = cls_embeddings[test_size:]

print(f"Total embeddings shape: {cls_embeddings.shape}")
print(f"Train embeddings shape: {train_embeddings.shape}")
print(f"Val embeddings shape: {val_embeddings.shape}")
print(f"Test embeddings shape: {test_embeddings.shape}")

Total embeddings shape: (118108, 32)
Train embeddings shape: (76770, 32)
Val embeddings shape: (17716, 32)
Test embeddings shape: (23622, 32)


## Important features calculation using Random Forest.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, confusion_matrix, average_precision_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import numpy as np


In [None]:
X_train = pd.read_csv("/Users/adityasingh/Desktop/DESKTOP/Reranked-Reasoned-Retrieval-Augmented-Financial-Fraud-Detection_Retard/IEEE-CIS/IEEE-CIS-CORRECTED/train_df.csv")
y_train = pd.read_csv("/Users/adityasingh/Desktop/DESKTOP/Reranked-Reasoned-Retrieval-Augmented-Financial-Fraud-Detection_Retard/IEEE-CIS/IEEE-CIS-CORRECTED/train_y_df.csv")
X_test = pd.read_csv("/Users/adityasingh/Desktop/DESKTOP/Reranked-Reasoned-Retrieval-Augmented-Financial-Fraud-Detection_Retard/IEEE-CIS/IEEE-CIS-CORRECTED/test_df.csv")
y_test = pd.read_csv("/Users/adityasingh/Desktop/DESKTOP/Reranked-Reasoned-Retrieval-Augmented-Financial-Fraud-Detection_Retard/IEEE-CIS/IEEE-CIS-CORRECTED/test_y_df.csv")
X_valid = pd.read_csv("/Users/adityasingh/Desktop/DESKTOP/Reranked-Reasoned-Retrieval-Augmented-Financial-Fraud-Detection_Retard/IEEE-CIS/IEEE-CIS-CORRECTED/val_df.csv")
y_valid = pd.read_csv("/Users/adityasingh/Desktop/DESKTOP/Reranked-Reasoned-Retrieval-Augmented-Financial-Fraud-Detection_Retard/IEEE-CIS/IEEE-CIS-CORRECTED/val_y_df.csv")

# Initialize Random Forest model
rf_model = RandomForestClassifier(
    n_estimators=100,  # Number of trees
    max_depth=15,  # Limit tree depth to avoid overfitting
    class_weight="balanced",  # Handle class imbalance
    random_state=42,
    n_jobs=-1  # Use all CPU cores
)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
y_valid_pred = rf_model.predict(X_valid)
y_test_pred = rf_model.predict(X_test)

# AUC Scores
valid_auc = roc_auc_score(y_valid, rf_model.predict_proba(X_valid)[:, 1])
test_auc = roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])

# AUCPR Scores
valid_aucpr = average_precision_score(y_valid, rf_model.predict_proba(X_valid)[:, 1])
test_aucpr = average_precision_score(y_test, rf_model.predict_proba(X_test)[:, 1])

# Print results
print(f"Validation AUC: {valid_auc:.4f}")
print(f"Test AUC: {test_auc:.4f}")
print(f"Validation AUCPR: {valid_aucpr:.4f}")
print(f"Test AUCPR: {test_aucpr:.4f}")

# Classification Reports
print("Validation Classification Report:\n", classification_report(y_valid, y_valid_pred))
print("Test Classification Report:\n", classification_report(y_test, y_test_pred))

# Confusion Matrix
print("Confusion Matrix (Test Set):")
print(confusion_matrix(y_test, y_test_pred))


# Define hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 15, 20]
}

# Grid Search
grid_search = GridSearchCV(RandomForestClassifier(class_weight="balanced", random_state=42), param_grid, scoring='roc_auc', cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best hyperparameters
print("Best Parameters:", grid_search.best_params_)

# Get feature importances
importances = rf_model.feature_importances_
feature_names = np.array(X_train.columns)

# Sort and plot
sorted_idx = np.argsort(importances)[::-1][:20]  # Top 20 features
plt.figure(figsize=(10, 6))
plt.barh(feature_names[sorted_idx], importances[sorted_idx])
plt.xlabel("Feature Importance")
plt.title("Top 20 Important Features (Random Forest)")
plt.gca().invert_yaxis()
plt.show()

## Faiss Index and Similarity Search

In [12]:
def create_index(num_embeddings, dimension):
    # num_embeddings = 76770
    # dimension = 32
    index = faiss.IndexFlatL2(dimension)  # L2 similarity
    index.add(train_embeddings)  # index of pre-computed embeddings

    return index


In [13]:
def search_faiss(query_vector):
    index = create_index(76770, 32)

    # Convert PyTorch tensor to NumPy
    if isinstance(query_vector, torch.Tensor):
        query_vector = query_vector.detach().cpu().numpy()

    query_vector = query_vector.astype("float32").reshape(1, -1)
    query_vector = query_vector.astype("float32").reshape(1, -1)

    # k = 120, As best result for 120
    distances, indices = index.search(
        query_vector, k=120
    )  # by default using euclidean distance for similarity

    indices = indices.flatten()
    return distances, indices

## L2 Distance Component

In [14]:
def compute_similarity(distances, dropout=0.2):

    distances = distances.flatten()
    # Apply softmax to the negative distances
    similarities = np.exp(-distances)
    softmax_scores = similarities / np.sum(similarities)

    # Apply dropout (randomly zero out some softmax scores)
    dropout_mask = np.random.binomial(1, 1 - dropout, size=softmax_scores.shape)
    dropped_softmax_scores = softmax_scores * dropout_mask

    # Renormilizing softmax scores so that they sum to 1 again.
    final_softmax = dropped_softmax_scores / np.sum(dropped_softmax_scores)
    
    # how to weigh in the final embedding? -> weigh emebedding more if they are closer in vector space.
    return final_softmax, distances

## Mask To Drop The Dropped Out Values

In [15]:
def filter_by_mask(arr1, arr2, arr3):
    """
    Removes elements from arr2 and arr3 where corresponding indices in arr1 are zero.
    """
    mask = arr1 != 0  # Create a boolean mask where arr1 is nonzero
    return arr1[mask], arr2[mask], arr3[mask]

## Linear Trasnform, Value Component and Shap Features

In [16]:
class MLP_Wv(nn.Module):
    def __init__(self, input_dim):
        super(MLP_Wv, self).__init__()
        self.layer1 = nn.Linear(input_dim, 32)
        self.activation1 = nn.SiLU()

        self.layer2 = nn.Linear(32, 32)
        self.activation2 = nn.SiLU()
        self.dropout2 = nn.Dropout(p=0.2)

        self.layer3 = nn.Linear(32, 32)  # Fix: Output should be 32
        self.activation3 = nn.SiLU()  # Fix: Apply SiLU activation

    def forward(self, x):
        x = self.activation1(self.layer1(x))
        x = self.dropout2(self.activation2(self.layer2(x)))
        x = self.activation3(self.layer3(x))  # Fix: Apply activation & dropout
        return x

In [17]:
def compute_l1(distances):

    l1_dist = np.sqrt(distances)
    model = MLP_Wv(l1_dist.shape[0])
    # Convert input to tensor and pass it through the model
    l1_dist_tensor = torch.tensor(l1_dist, dtype=torch.float32)
    value_Wv = model(l1_dist_tensor).detach().numpy()

    return value_Wv

In [18]:
class MLP_Wy(nn.Module):
    """
    Instantiate MLP with input_dim=32 (from Wy)
    Expected output shape: (60,)
    """
    def __init__(self, input_dim):
        super(MLP_Wy, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)  
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 32)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)  # Shape (60, 1)
        return x.squeeze(1)  # Shape (60,)

In [19]:
def compute_wy(indices):

    y_i = train_y["isFraud"].loc[indices].values
    feature_target_tensor = torch.tensor(y_i, dtype=torch.long)

    # Define Wy: An embedding layer to map to 32-dim space
    embedding_dim = 32
    num_classes = 2  # Since input values are 0 or 1

    Wy = nn.Embedding(num_classes, embedding_dim)
    mlp = MLP_Wy(input_dim=embedding_dim)

    # Compute embeddings using Wy
    embeddings = Wy(feature_target_tensor)  # Shape: (60, 32)

    # Pass embeddings through MLP
    value_Wy = mlp(embeddings)  # Shape: (60,)

    return value_Wy

In [None]:
def compute_imp_features(indices):
    """
    Transform the features C14, C13, C1, and C2 using MLP_Wy
    """

    v_86 = train_df['C14'].loc[indices].values
    feature_target_tensor = torch.tensor(v_86, dtype=torch.float32).unsqueeze(-1)

    v_87 = train_df['C13'].loc[indices].values
    feature_target_tensor_1 = torch.tensor(v_87, dtype=torch.float32).unsqueeze(-1)

    v_79 = train_df['C1'].loc[indices].values
    feature_target_tensor_2 = torch.tensor(v_79, dtype=torch.float32).unsqueeze(-1)

    v_94 = train_df['C2'].loc[indices].values
    feature_target_tensor_3 = torch.tensor(v_94, dtype=torch.float32).unsqueeze(-1)

    # Convert to PyTorch tensor
    input_dim = 1  # Since each input is a single continuous value
    embedding_dim = 32
    W_feat = nn.Linear(input_dim, embedding_dim)
    mlp = MLP_Wy(input_dim=embedding_dim)

    # Process first variable
    embeddings_1 = W_feat(feature_target_tensor)  
    feat_x1 = mlp(embeddings_1)  
    feat_x1 = feat_x1.detach().numpy()
    # Process second variable
    embeddings_2 = W_feat(feature_target_tensor_1)  
    feat_x2 = mlp(embeddings_2)  
    feat_x2 = feat_x2.detach().numpy()
    # Process third variable
    embeddings_3 = W_feat(feature_target_tensor_2)  
    feat_x3 = mlp(embeddings_3)  
    feat_x3 = feat_x3.detach().numpy()
    # Process fourth variable
    embeddings_4 = W_feat(feature_target_tensor_3)  
    feat_x4 = mlp(embeddings_4)  
    feat_x4 = feat_x4.detach().numpy()


    return feat_x1, feat_x2, feat_x3, feat_x4

In [36]:
def compute_value(value_Wv, value_Wy):
    """
    Compute the dot product of value_Wv and value_Wy
    """    
    value_Wy_npy = value_Wy.detach().numpy()
    value = value_Wy_npy + value_Wv
    return value

#### Reshaping S to do S * V

In [37]:
def compute_z_in(S_x_xi, value):
    """
    z_in is the intergrated(weighted sum) result of value and similarity componenet
    """
    S_x_xi = S_x_xi.reshape(1, -1)

    numerator = np.sum(S_x_xi @ value)  # Sum the weighted contributions (scalar)
    denominator = np.sum(S_x_xi)        # Total sum of weights (scalar)
    z_in = numerator / denominator      # Weighted average as a single scalar

    return z_in


## Processing Samples

In [38]:

# test_i = test_df.iloc[10].values #shape (182,)
# query_vector = test_embeddings[10] #shape (32,)
# distances, indices = search_faiss(query_vector) # both shape (120,) and flatten
# S_x_xi, distances = compute_similarity(distances)
# S_x_xi, indices, distances = filter_by_mask(S_x_xi, indices, distances)
# value_Wv = compute_l1(distances) #shape (32,)
# value_Wy = compute_wy(indices) #shape (32,)


# feat_x_1, feat_x2, feat_x3, feat_x4 = compute_imp_features(indices=indices)
# #z_in 
# f_z_1 = compute_z_in(S_x_xi, feat_x_1)
# f_z_2 = compute_z_in(S_x_xi, feat_x2)
# f_z_3 = compute_z_in(S_x_xi, feat_x3)
# f_z_4 = compute_z_in(S_x_xi, feat_x4)

#f_z_1, f_z_2, f_z_3, f_z_4

# value = compute_value(value_Wv, value_Wy)
# value.shape

# labels = test_y['isFraud']
# labels = torch.tensor(test_y['isFraud'].values, dtype=torch.float32)
# type(labels)

# type(test_df.iloc[34].values)


#----------------Without Batch Processing----------------
# def process_samples(test_df, test_embeddings):
#     """Processes all samples and returns input tensor and labels."""
#     input_list = []
#     labels = []


#     for i in range(len(test_df)):  # Process all samples
#         test_i = torch.tensor(test_df.iloc[i].values, dtype=torch.float32)  # shape (182,)
#         query_embedding = torch.tensor(test_embeddings[i], dtype=torch.float32)  # shape (32,)

#         distances, indices = search_faiss(query_embedding)
#         S_x_xi, distances = compute_similarity(distances)
#         S_x_xi, indices, distances = filter_by_mask(S_x_xi, indices, distances)
#         value_Wv = compute_l1(distances)  # shape (32,)
#         value_Wy = compute_wy(indices)  # shape (32,)
#         value = compute_value(value_Wv, value_Wy)
#         z_in = compute_z_in(S_x_xi, value)  # Scalar (single value)

#         feat_x_1, feat_x2, feat_x3, feat_x4 = compute_imp_features(indices=indices) #v86 feature
#         f_z_in_1 = compute_z_in(S_x_xi, feat_x_1)

#         f_z_in_2 = compute_z_in(S_x_xi, feat_x2)

#         f_z_in_3 = compute_z_in(S_x_xi, feat_x3)

#         f_z_in_4 = compute_z_in(S_x_xi, feat_x4)

#         # Append only test_i (input_emb) and z_in (weighted_avg), not query_embedding
#         z_in = torch.tensor(z_in, dtype=torch.float32)
#         f_z_in_1 = torch.tensor(f_z_in_1, dtype=torch.float32)
#         f_z_in_2 = torch.tensor(f_z_in_2, dtype=torch.float32)
#         f_z_in_3 = torch.tensor(f_z_in_3, dtype=torch.float32)
#         f_z_in_4 = torch.tensor(f_z_in_4, dtype=torch.float32)

#         input_list.append((test_i, z_in, f_z_in_1, f_z_in_2, f_z_in_3, f_z_in_4))

#         labels.append(torch.tensor(test_y.iloc[i]['isFraud'], dtype=torch.float32))  # Assuming label is in df



#     return input_list, labels


In [39]:
def process_samples(train_df, train_embeddings, batch_size=1000, delay=0.5):
    """
    Processes samples in batches to prevent memory overflow.
    """
    input_list = []
    labels = []

    total_samples = len(train_df)
    num_batches = (total_samples + batch_size - 1) // batch_size  # Compute number of batches

    for batch_idx in range(num_batches):
        start_idx = batch_idx * batch_size
        end_idx = min((batch_idx + 1) * batch_size, total_samples)

        batch_inputs = []
        batch_labels = []

        for i in range(start_idx, end_idx):  # Process each sample in batch
            test_i = torch.tensor(train_df.iloc[i].values, dtype=torch.float32)  # shape (182,)
            query_embedding = torch.tensor(train_embeddings[i], dtype=torch.float32)  # shape (32,)

            distances, indices = search_faiss(query_embedding)
            S_x_xi, distances = compute_similarity(distances)
            S_x_xi, indices, distances = filter_by_mask(S_x_xi, indices, distances)
            value_Wv = compute_l1(distances)  # shape (32,)
            value_Wy = compute_wy(indices)  # shape (32,)
            value = compute_value(value_Wv, value_Wy)
            z_in = compute_z_in(S_x_xi, value)  # Scalar (single value)

            feat_x_1, feat_x2, feat_x3, feat_x4 = compute_imp_features(indices=indices)
            f_z_in_1 = compute_z_in(S_x_xi, feat_x_1)
            f_z_in_2 = compute_z_in(S_x_xi, feat_x2)
            f_z_in_3 = compute_z_in(S_x_xi, feat_x3)
            f_z_in_4 = compute_z_in(S_x_xi, feat_x4)

            # Convert to tensors
            z_in = torch.tensor(z_in, dtype=torch.float32)
            f_z_in_1 = torch.tensor(f_z_in_1, dtype=torch.float32)
            f_z_in_2 = torch.tensor(f_z_in_2, dtype=torch.float32)
            f_z_in_3 = torch.tensor(f_z_in_3, dtype=torch.float32)
            f_z_in_4 = torch.tensor(f_z_in_4, dtype=torch.float32)

            batch_inputs.append((test_i, z_in, f_z_in_1, f_z_in_2, f_z_in_3, f_z_in_4))
            batch_labels.append(torch.tensor(train_y.iloc[i]['isFraud'], dtype=torch.float32))

        input_list.extend(batch_inputs)
        labels.extend(batch_labels)
        
        time.sleep(delay)  # Add delay to avoid system overload
    
    return input_list, labels


In [40]:
def val_process_samples(val_df, val_embeddings, batch_size=1000, delay=0.5):
    """
    Processes samples in batches to prevent memory overflow.
    """
    input_list = []
    labels = []

    total_samples = len(val_df)
    num_batches = (total_samples + batch_size - 1) // batch_size  # Compute number of batches

    for batch_idx in range(num_batches):
        start_idx = batch_idx * batch_size
        end_idx = min((batch_idx + 1) * batch_size, total_samples)

        batch_inputs = []
        batch_labels = []

        for i in range(start_idx, end_idx):  # Process each sample in batch
            test_i = torch.tensor(val_df.iloc[i].values, dtype=torch.float32)  # shape (182,)
            query_embedding = torch.tensor(val_embeddings[i], dtype=torch.float32)  # shape (32,)

            distances, indices = search_faiss(query_embedding)
            S_x_xi, distances = compute_similarity(distances)
            S_x_xi, indices, distances = filter_by_mask(S_x_xi, indices, distances)
            value_Wv = compute_l1(distances)  # shape (32,)
            value_Wy = compute_wy(indices)  # shape (32,)
            value = compute_value(value_Wv, value_Wy)
            z_in = compute_z_in(S_x_xi, value)  # Scalar (single value)

            feat_x_1, feat_x2, feat_x3, feat_x4 = compute_imp_features(indices=indices) #v86 feature
            f_z_in_1 = compute_z_in(S_x_xi, feat_x_1)
            f_z_in_2 = compute_z_in(S_x_xi, feat_x2)
            f_z_in_3 = compute_z_in(S_x_xi, feat_x3)
            f_z_in_4 = compute_z_in(S_x_xi, feat_x4)

            # Convert to tensors
            z_in = torch.tensor(z_in, dtype=torch.float32)
            f_z_in_1 = torch.tensor(f_z_in_1, dtype=torch.float32)
            f_z_in_2 = torch.tensor(f_z_in_2, dtype=torch.float32)
            f_z_in_3 = torch.tensor(f_z_in_3, dtype=torch.float32)
            f_z_in_4 = torch.tensor(f_z_in_4, dtype=torch.float32)

            batch_inputs.append((test_i, z_in, f_z_in_1, f_z_in_2, f_z_in_3, f_z_in_4))
            batch_labels.append(torch.tensor(val_y.iloc[i]['isFraud'], dtype=torch.float32))

        input_list.extend(batch_inputs)
        labels.extend(batch_labels)
        
        time.sleep(delay)  # Add delay to avoid system overload
    
    return input_list, labels


## Model Training

In [41]:
class ThreeBlockModel(nn.Module):
    def __init__(self, input_emb_dim, hidden_dim=32, dropout_prob=0.2):
        super(ThreeBlockModel, self).__init__()

        self.input_dim = input_emb_dim + 4  # Adding 2 for weighted_avg and Shap features

        self.block1 = nn.Sequential(
            nn.LayerNorm(self.input_dim),
            nn.Linear(self.input_dim, hidden_dim),
            nn.SiLU(),
            nn.Dropout(dropout_prob),
            nn.Linear(hidden_dim, hidden_dim),
        )
        
        self.block2 = nn.Sequential(
            nn.LayerNorm(hidden_dim),
            nn.Linear(hidden_dim, hidden_dim),
            nn.SiLU(),
            nn.Dropout(dropout_prob),
            nn.Linear(hidden_dim, hidden_dim),
        )
        
        self.block3 = nn.Sequential(
            nn.LayerNorm(hidden_dim),
            nn.Linear(hidden_dim, hidden_dim),
            nn.SiLU(),
            nn.Dropout(dropout_prob),
            nn.Linear(hidden_dim, hidden_dim),
        )
        
        self.output_layer = nn.Sequential(
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )
    
    def forward(self, input_emb, weighted_avg, f_z_in, f_z_in_2, f_z_in_3, f_z_in_4):
        # Ensure correct shape for scalar inputs
        weighted_avg = weighted_avg.unsqueeze(-1)  
        f_z_in = f_z_in.unsqueeze(-1)  
        f_z_in_2 = f_z_in_2.unsqueeze(-1)
        f_z_in_3 = f_z_in_3.unsqueeze(-1)
        f_z_in_4 = f_z_in_4.unsqueeze(-1)

        # Concatenate all inputs
        combined = torch.cat([input_emb, weighted_avg, f_z_in, f_z_in_3, f_z_in_4], dim=-1)

        # Pass through MLP blocks
        x = self.block1(combined)
        x = self.block2(x)
        x = self.block3(x)
        x = self.output_layer(x)
        return x


In [42]:
def train_model(test_df, test_embeddings, model, optimizer, criterion, batch_size=256, epochs=25):
    
    model.train()
    
    input_list, labels = process_samples(train_df, train_embeddings)
    val_input_list, val_labels = val_process_samples(val_df, val_embeddings)


    # Create DataLoader with `f_z_in`
    train_dataset = TensorDataset(
        torch.stack([item[0] for item in input_list]),  # test_i (input_emb)
        torch.stack([item[1] for item in input_list]),  # z_in (weighted_avg)
        torch.stack([item[2] for item in input_list]),  # f_z_in (new scalar input)
        torch.stack([item[3] for item in input_list]),  # f_z_in_2 (new scalar input)
        torch.stack([item[4] for item in input_list]),  # f_z_in_3 (new scalar input)
        torch.stack([item[5] for item in input_list]),  # f_z_in_4 (new scalar input)
        torch.stack(labels)  # Labels
    )

    val_dataset = TensorDataset(
        torch.stack([item[0] for item in val_input_list]),  # test_i (input_emb)
        torch.stack([item[1] for item in val_input_list]),  # z_in (weighted_avg)
        torch.stack([item[2] for item in val_input_list]),  # f_z_in (new scalar input)
        torch.stack([item[3] for item in val_input_list]),  # f_z_in_2 (new scalar input)
        torch.stack([item[4] for item in val_input_list]),  # f_z_in_3 (new scalar input)
        torch.stack([item[5] for item in val_input_list]),  # f_z_in_4 (new scalar input)
        torch.stack(val_labels)  # Labels
    )
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)



    # Training loop
    for epoch in range(epochs):
        epoch_loss = 0  # Track loss for each epoch
        all_targets = []
        all_outputs = []

        for batch in train_dataloader:
            input_emb, z_in, f_z_in, f_z_in_2, f_z_in_3, f_z_in_4, target = batch
            target = target.unsqueeze(-1)  # Make target shape (batch_size, 1)

            optimizer.zero_grad()
            output = model(input_emb, z_in, f_z_in, f_z_in_2, f_z_in_3, f_z_in_4)  # Pass f_z_in to model

            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()  # Accumulate loss for epoch
            
            # Collect predictions & targets for AUCPR
            all_outputs.append(output.detach().cpu())  # Move to CPU to avoid memory issues
            all_targets.append(target.detach().cpu())

        # Compute AUCPR at the end of the epoch
        all_outputs = torch.cat(all_outputs).numpy()
        all_targets = torch.cat(all_targets).numpy()

        aucpr = average_precision_score(all_targets, all_outputs)

        # VALIDATION Step
        model.eval()
        val_loss = 0
        all_val_targets = []
        all_val_outputs = []

        with torch.no_grad():  # No gradient computation for validation
            for batch in val_dataloader:
                input_emb, z_in, f_z_in, f_z_in_2, f_z_in_3, f_z_in_4, target = batch
                target = target.unsqueeze(-1)

                output = model(input_emb, z_in, f_z_in, f_z_in_2, f_z_in_3, f_z_in_4)
                loss = criterion(output, target)

                val_loss += loss.item()
                
                all_val_outputs.append(output.cpu())
                all_val_targets.append(target.cpu())

        # Compute AUCPR for validation set
        all_val_outputs = torch.cat(all_val_outputs).numpy()
        all_val_targets = torch.cat(all_val_targets).numpy()
        val_aucpr = average_precision_score(all_val_targets, all_val_outputs)

        print(f"Epoch [{epoch+1}/{epochs}], "
              f"Train Loss: {epoch_loss/len(train_dataloader):.4f}, Train AUCPR: {aucpr:.4f}, "
              f"Val Loss: {val_loss/len(val_dataloader):.4f}, Val AUCPR: {val_aucpr:.4f}")

# Model, optimizer, and loss function
input_emb_dim = 182  # Assuming this based on test_df features
model = ThreeBlockModel(input_emb_dim=input_emb_dim)
optimizer = optim.AdamW(model.parameters(), lr=0.0001)
criterion = nn.BCELoss()  # Binary Cross Entropy Loss

# Call training loop (assuming test_df and test_embeddings are available)
train_model(train_df, train_embeddings, model, optimizer, criterion, epochs=100)


Epoch [1/100], Train Loss: 0.2663, Train AUCPR: 0.0416, Val Loss: 0.1347, Val AUCPR: 0.2132
Epoch [2/100], Train Loss: 0.1319, Train AUCPR: 0.2240, Val Loss: 0.1249, Val AUCPR: 0.2882
Epoch [3/100], Train Loss: 0.1227, Train AUCPR: 0.2833, Val Loss: 0.1178, Val AUCPR: 0.3243
Epoch [4/100], Train Loss: 0.1183, Train AUCPR: 0.3123, Val Loss: 0.1156, Val AUCPR: 0.3345
Epoch [5/100], Train Loss: 0.1161, Train AUCPR: 0.3262, Val Loss: 0.1143, Val AUCPR: 0.3406
Epoch [6/100], Train Loss: 0.1148, Train AUCPR: 0.3351, Val Loss: 0.1135, Val AUCPR: 0.3448
Epoch [7/100], Train Loss: 0.1139, Train AUCPR: 0.3421, Val Loss: 0.1129, Val AUCPR: 0.3487
Epoch [8/100], Train Loss: 0.1131, Train AUCPR: 0.3478, Val Loss: 0.1125, Val AUCPR: 0.3522
Epoch [9/100], Train Loss: 0.1124, Train AUCPR: 0.3530, Val Loss: 0.1121, Val AUCPR: 0.3556
Epoch [10/100], Train Loss: 0.1119, Train AUCPR: 0.3578, Val Loss: 0.1118, Val AUCPR: 0.3584
Epoch [11/100], Train Loss: 0.1113, Train AUCPR: 0.3624, Val Loss: 0.1115, Val 

## Inference

In [43]:
def test_process_samples(test_df, test_embeddings, batch_size=1000, delay=0.5):
    """
    Processes samples in batches to prevent memory overflow.
    """
    input_list = []
    labels = []

    total_samples = len(test_df)
    num_batches = (
        total_samples + batch_size - 1
    ) // batch_size  # Compute number of batches

    for batch_idx in range(num_batches):
        start_idx = batch_idx * batch_size
        end_idx = min((batch_idx + 1) * batch_size, total_samples)

        batch_inputs = []
        batch_labels = []

        for i in range(start_idx, end_idx):  # Process each sample in batch
            test_i = torch.tensor(test_df.iloc[i].values, dtype=torch.float32)  # shape (182,)
            query_embedding = torch.tensor(test_embeddings[i], dtype=torch.float32)  # shape (32,)

            distances, indices = search_faiss(query_embedding)
            similarity, distances = compute_similarity(distances)
            similarity, indices, distances = filter_by_mask(similarity, indices, distances)
            value_Wv = compute_l1(distances)  # shape (32,)
            value_Wy = compute_wy(indices)  # shape (32,)
            value = compute_value(value_Wv, value_Wy)
            z_in = compute_z_in(similarity, value)  # Scalar (single value)

            feat_x_1, feat_x2, feat_x3, feat_x4 = compute_imp_features(
                indices=indices
            )  # v86 feature
            f_z_in_1 = compute_z_in(similarity, feat_x_1)
            f_z_in_2 = compute_z_in(similarity, feat_x2)
            f_z_in_3 = compute_z_in(similarity, feat_x3)
            f_z_in_4 = compute_z_in(similarity, feat_x4)

            # Convert to tensors
            z_in = torch.tensor(z_in, dtype=torch.float32)
            f_z_in_1 = torch.tensor(f_z_in_1, dtype=torch.float32)
            f_z_in_2 = torch.tensor(f_z_in_2, dtype=torch.float32)
            f_z_in_3 = torch.tensor(f_z_in_3, dtype=torch.float32)
            f_z_in_4 = torch.tensor(f_z_in_4, dtype=torch.float32)

            batch_inputs.append((test_i, z_in, f_z_in_1, f_z_in_2, f_z_in_3, f_z_in_4))
            batch_labels.append(torch.tensor(test_y.iloc[i]["isFraud"], dtype=torch.float32))

        input_list.extend(batch_inputs)
        labels.extend(batch_labels)

        time.sleep(delay)  # Add delay to avoid system overload

    return input_list, labels


In [33]:
import torch
from torch.utils.data import DataLoader, TensorDataset

def inference(model, test_df, test_embeddings, batch_size=256):
    """
    Perform inference on test data using a trained model.
    
    Args:
        model (nn.Module): Trained model.
        test_df (pd.DataFrame): Your test dataframe.
        test_embeddings (np.ndarray or torch.Tensor): Precomputed embeddings for test.
        batch_size (int): Batch size for inference.

    Returns:
        all_outputs (np.ndarray): Model outputs (probabilities).
        all_labels (np.ndarray): Ground truth labels.
    """
    test_input_list, test_input_labels = test_process_samples(test_df=test_df, test_embeddings=test_embeddings)

    model.eval()

    test_dataset = TensorDataset(
        torch.stack([item[0] for item in test_input_list]),  # test_i (input_emb)
        torch.stack([item[1] for item in test_input_list]),  # z_in (weighted_avg)
        torch.stack([item[2] for item in test_input_list]),  # f_z_in (new scalar input)
        torch.stack([item[3] for item in test_input_list]),  # f_z_in_2 (new scalar input)
        torch.stack([item[4] for item in test_input_list]),  # f_z_in_3 (new scalar input)
        torch.stack([item[5] for item in test_input_list]),  # f_z_in_4 (new scalar input)
        torch.stack(test_input_labels)
    )
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Lists to store outputs and labels
    all_outputs = []
    all_labels = []

    # No gradient updates during inference
    with torch.no_grad():
        for batch in test_dataloader:
            input_emb, z_in, f1, f2, f3, f4, labels = batch
            outputs = model(input_emb, z_in, f1, f2, f3, f4)
            
            # Move outputs and labels to CPU, then store
            all_outputs.append(outputs.cpu())
            all_labels.append(labels.cpu())

    # Concatenate everything into a single array/tensor
    all_outputs = torch.cat(all_outputs).numpy()
    all_labels = torch.cat(all_labels).numpy()

    return all_outputs, all_labels

all_test_outputs, all_test_labels = inference(model, test_df, test_embeddings, batch_size=256)

# Optionally, compute metrics (e.g., AUCPR) on your test set:
from sklearn.metrics import average_precision_score
test_aucpr = average_precision_score(all_test_labels, all_test_outputs)
print("Test AUCPR:", test_aucpr)

# Or create a final binary prediction if desired:
binary_predictions = (all_test_outputs >= 0.5).astype(int)

Test AUCPR: 0.40405246696174973


In [44]:
import numpy as np

# Count occurrences of each unique value (0s and 1s)
unique_values, counts = np.unique(binary_predictions, return_counts=True)

# Print results
for value, count in zip(unique_values, counts):
    print(f"Value {value}: {count} occurrences")

Value 0: 23262 occurrences
Value 1: 360 occurrences
