In [1]:
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import faiss


Loading the embeddings that have been generated using the SAINT encoder as detailed in the paper. [G. Somepalli 2021] 

In [10]:
saint_embeddings_path = "../5_embeddings/cls_embeddings_time.struct_time(tm_year=2025, tm_mon=2, tm_mday=7, tm_hour=19, tm_min=40, tm_sec=22, tm_wday=4, tm_yday=38, tm_isdst=0).npy"

saint_cls_embeddings = np.load(saint_embeddings_path)
print(saint_cls_embeddings.shape)


(118108, 32)


In [11]:
train_df = pd.read_csv("../2_dataset/final/train_df.csv")
train_df.head(10)


FileNotFoundError: [Errno 2] No such file or directory: '../2_dataset/final/train_df.csv'

In [None]:
train_y = pd.read_csv("../2_dataset/final/train_y_df.csv")
test_df = pd.read_csv("../2_dataset/final/test_df.csv")

print(f"test_df.shape: {test_df.shape}\ttrain_y.shape{train_y.shape}")


(76770, 1)

In [22]:
test_y_df = pd.read_csv("../2_dataset/final/test_y_df.csv")
test_y_df['isFraud'] = test_y_df['isFraud'].astype(np.float32)
test_y_df['isFraud']


In [7]:
train_y.shape


(76770, 1)

In [None]:
val_df = pd.read_csv("../2_dataset/final/val_df.csv")
val_y_df = pd.read_csv("../2_dataset/final/val_y_df.csv")

val_df.head(10)


In [29]:
val_y_df


Unnamed: 0,isFraud
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
17711,0.0
17712,0.0
17713,0.0
17714,0.0


#### Splitting SAINT embeddings

In [None]:
print(f"Total embed shape: {saint_cls_embeddings.shape}")



In [8]:
# Compute 65% of the total rows
total_rows = saint_cls_embeddings.shape[0]
train_size = int(0.65 * total_rows)  # 65% of 118108

# Slice the top 65%
train_embeddings = saint_cls_embeddings[:train_size]  # First 65%
print(f"Train embed shape: {train_embeddings.shape}")


In [9]:
total_rows = saint_cls_embeddings.shape[0]
test_size = int(0.8 * total_rows) 

val_embeddings = saint_cls_embeddings[train_size:test_size]
print(len(val_embeddings)) # 15% of total rows.)
      
test_embeddings = saint_cls_embeddings[test_size:]
# print(f"Total embed shape: {saint_cls_embeddings.shape}")
# print(f"Train embed shape: {test_embeddings.shape}")


#### FAISS Index and Similarity Search

In [10]:
def create_index(num_embeddings, dimension):
    # num_embeddings = 76770
    # dimension = 32

    index = faiss.IndexFlatL2(dimension)  # L2 similarity
    index.add(train_embeddings)  # index of pre-computed embeddings

    # k = 120  # as best result for 120
    return index

# query_vector = np.random.random((1, dimension)).astype("float32") #random for now
# print(k)


In [11]:
def search_faiss(query_vector):

    index = create_index(76770, 32)

    # Convert PyTorch tensor to NumPy
    if isinstance(query_vector, torch.Tensor):
        query_vector = query_vector.detach().cpu().numpy()

    query_vector = query_vector.astype("float32").reshape(1, -1)

    distances, indices = index.search(query_vector, k=120) # by default using euclidean distance for similarity
    indices = indices.flatten()
    
    return distances, indices

# distances, indices = index.search(
#     query_vector, k
# )  

# print("Input Sample embedding:", query_vector)
# print("Indices of nearest neighbors:", indices)
# print("L2 norm distances", distances)


#### L2 distance component

In [12]:
def compute_similarity(distances, dropout=0.2):

    # print(f"distances.shape before: {distances.shape}")
    distances = distances.flatten()
    # print(f"distances.shape after: {distances.shape}")

    # Apply softmax to the negative distances
    similarities = np.exp(-distances)
    softmax_scores = similarities / np.sum(similarities)

    # Apply dropout (randomly zero out some softmax scores)
    dropout_mask = np.random.binomial(1, 1 - dropout, size=softmax_scores.shape)
    dropped_softmax_scores = softmax_scores * dropout_mask

    final_softmax = dropped_softmax_scores / np.sum(dropped_softmax_scores)
    # how to weigh in the final embedding?
    return final_softmax, distances


#### mask to drop the dropped out values

In [13]:
def filter_by_mask(arr1, arr2, arr3):
    """
    Removes elements from arr2 and arr3 where corresponding indices in arr1 are zero.
    """
    mask = arr1 != 0  # Create a boolean mask where arr1 is nonzero
    return arr1[mask], arr2[mask], arr3[mask]

# S_x_xi, indices, distances = filter_by_mask(S_x_xi, indices, distances)

# print("Filtered S_x_xi:", S_x_xi)
# print("Filtered indices:", indices)
# print("Filtered distances:", distances)



#### linear trasnform the value component

In [14]:
class MLP_L1(nn.Module):
    def __init__(self, input_dim):
        super(MLP_L1, self).__init__()
        self.layer1 = nn.Linear(input_dim, 32)
        self.activation1 = nn.SiLU()

        self.layer2 = nn.Linear(32, 32)
        self.activation2 = nn.SiLU()
        self.dropout2 = nn.Dropout(p=0.2)

        self.layer3 = nn.Linear(32, 32)  # Fix: Output should be 32
        self.activation3 = nn.SiLU()  # Fix: Apply SiLU activation

    def forward(self, x):
        x = self.activation1(self.layer1(x))
        x = self.dropout2(self.activation2(self.layer2(x)))
        x = self.activation3(self.layer3(x))  # Fix: Apply activation & dropout
        return x


# # Create model instance
# model = MLP_L1(l1_dist.shape[0])

# # Convert input to tensor and pass it through the model
# l1_dist_tensor = torch.tensor(l1_dist, dtype=torch.float32)

# w_v_l1 = model(l1_dist_tensor).detach().numpy()
# print(w_v_l1)

# print(w_v_l1.shape)


In [36]:
def compute_l1(distances):
    l1_dist = np.sqrt(distances)
    # print(l1_dist, l1_dist.shape)

    model = MLP_L1(l1_dist.shape[0])

    # Convert input to tensor and pass it through the model
    l1_dist_tensor = torch.tensor(l1_dist, dtype=torch.float32)

    w_v_l1 = model(l1_dist_tensor).detach().numpy()
    # print(w_v_l1)

    # print(w_v_l1.shape)

    return w_v_l1


In [15]:
# Modified MLP to output shape (60,)
class MLP_Wy(nn.Module):
    def __init__(self, input_dim):
        super(MLP_Wy, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)  # First layer
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 32)  # Output layer (1 neuron)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)  # Shape (60, 1)
        return x.squeeze(1)  # Shape (60,)

# Instantiate MLP with input_dim=32 (from Wy)
 # Expected: (60,)


In [18]:
train_y


Unnamed: 0,isFraud
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
76765,0.0
76766,0.0
76767,0.0
76768,1.0


In [16]:
def compute_wy(indices):
    y_i = train_y['isFraud'].loc[indices].values
    # print(f"y_i.shape: {y_i.shape}")


# Convert to PyTorch tensor
    feature_target_tensor = torch.tensor(y_i, dtype=torch.long)

# Define Wy: An embedding layer to map to 32-dim space
    embedding_dim = 32
    num_classes = 2  # Since input values are 0 or 1

    Wy = nn.Embedding(num_classes, embedding_dim)

    mlp = MLP_Wy(input_dim=embedding_dim)

# Compute embeddings using Wy
    embeddings = Wy(feature_target_tensor)  # Shape: (60, 32)

    # Pass embeddings through MLP
    w_y = mlp(embeddings)  # Shape: (60,)

    # print("MLP Output Shape:", w_y.shape) 

    return w_y


In [17]:
def compute_value(w_v_l1, w_y):
    # Compute the dot product of w_v_l1 and w_y
    w_y_npy = w_y.detach().numpy()
    value = w_y_npy + w_v_l1
    return value


#### Reshaping S to do S * V

In [18]:
def compute_z_in(S_x_xi, value):
    S_x_xi = S_x_xi.reshape(1, -1)
    # print(S_x_xi.shape)  # Output should be (1, 95)

    # result = (S_x_xi @ value)

    # # Summation over all elements (since it's 1D)
    # Z = np.sum(result)

    # Assuming S_x_xi is (1, 95) and value is (95, 32)
    numerator = np.sum(S_x_xi @ value)  # Sum the weighted contributions (scalar)
    denominator = np.sum(S_x_xi)        # Total sum of weights (scalar)
    z_in = numerator / denominator         # Weighted average as a single scalar

    # print("Z:", z_in)
    # print("Z Shape:", z_in.shape)  # Expected: torch.Size([])


    return z_in


In [24]:
x_test = test_df.iloc[44].values
x_test.shape


(182,)

### Model Training

In [25]:
import torch
import torch.nn as nn

class Predictor(nn.Module):
    def __init__(self, input_emb_dim, hidden_dim=32, dropout_prob=0.2):
        super(Predictor, self).__init__()
        
        self.input_dim = input_emb_dim + 1  # Adding 2 for weighted_avg and f_z_in

        self.block1 = nn.Sequential(
            nn.LayerNorm(self.input_dim),
            nn.Linear(self.input_dim, hidden_dim),
            nn.SiLU(),
            nn.Dropout(dropout_prob),
            nn.Linear(hidden_dim, hidden_dim),
        )
        
        self.block2 = nn.Sequential(
            nn.LayerNorm(hidden_dim),
            nn.Linear(hidden_dim, hidden_dim),
            nn.SiLU(),
            nn.Dropout(dropout_prob),
            nn.Linear(hidden_dim, hidden_dim),
        )
        
        self.block3 = nn.Sequential(
            nn.LayerNorm(hidden_dim),
            nn.Linear(hidden_dim, hidden_dim),
            nn.SiLU(),
            nn.Dropout(dropout_prob),
            nn.Linear(hidden_dim, hidden_dim),
        )
        
        self.output_layer = nn.Sequential(
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )
    
    def forward(self, input_emb, weighted_avg):
        # Ensure correct shape for scalar inputs
        weighted_avg = weighted_avg.unsqueeze(-1)  
    
        # Concatenate all inputs
        combined = torch.cat([input_emb, weighted_avg], dim=-1)

        # Pass through MLP blocks
        x = self.block1(combined)
        x = self.block2(x)
        x = self.block3(x)
        x = self.output_layer(x)
        return x


In [24]:
# def process_samples(test_df, test_embeddings):
#     test_i = test_df.iloc[i].values #shape (182,)
#     query_vector = test_embeddings[i] #shape (32,)
#     distances, indices = search_faiss(query_vector) # both shape (120,) and flatten
#     S_x_xi, distances = compute_similarity(distances)
#     S_x_xi, indices, distances = filter_by_mask(S_x_xi, indices, distances)
#     w_v_l1 = compute_l1(distances) #shape (32,)
#     w_y = compute_wy(indices) #shape (32,)
#     value = compute_value(w_v_l1, w_y)
#     z_in = compute_z_in(S_x_xi, value)

#     input_to_mlp = test_i * z_in


In [26]:
def process_samples(train_df, train_embeddings):
    """Processes all samples and returns input tensor and labels."""
    input_list = []
    labels = []


    for i in range(len(train_df)):  # Process all samples
        test_i = torch.tensor(train_df.iloc[i].values, dtype=torch.float32)  # shape (182,)
        query_embedding = torch.tensor(train_embeddings[i], dtype=torch.float32)  # shape (32,)

        distances, indices = search_faiss(query_embedding)
        S_x_xi, distances = compute_similarity(distances)
        S_x_xi, indices, distances = filter_by_mask(S_x_xi, indices, distances)
        w_v_l1 = compute_l1(distances)  # shape (32,)
        w_y = compute_wy(indices)  # shape (32,)
        value = compute_value(w_v_l1, w_y)
        z_in = compute_z_in(S_x_xi, value)  # Scalar (single value)


        # Append only test_i (input_emb) and z_in (weighted_avg), not query_embedding
        z_in = torch.tensor(z_in, dtype=torch.float32)

        input_list.append((test_i, z_in))

        labels.append(torch.tensor(train_y.iloc[i]['isFraud'], dtype=torch.float32))  # Assuming label is in df



    return input_list, labels


In [30]:
def val_process_samples(test_df, test_embeddings):
    """Processes all samples and returns input tensor and labels."""
    input_list = []
    labels = []


    for i in range(len(test_df)):  # Process all samples
        test_i = torch.tensor(test_df.iloc[i].values, dtype=torch.float32)  # shape (182,)
        query_embedding = torch.tensor(test_embeddings[i], dtype=torch.float32)  # shape (32,)

        distances, indices = search_faiss(query_embedding)
        S_x_xi, distances = compute_similarity(distances)
        S_x_xi, indices, distances = filter_by_mask(S_x_xi, indices, distances)
        w_v_l1 = compute_l1(distances)  # shape (32,)
        w_y = compute_wy(indices)  # shape (32,)
        value = compute_value(w_v_l1, w_y)
        z_in = compute_z_in(S_x_xi, value)  # Scalar (single value)


        # Append only test_i (input_emb) and z_in (weighted_avg), not query_embedding
        z_in = torch.tensor(z_in, dtype=torch.float32)

        input_list.append((test_i, z_in))

        labels.append(torch.tensor(val_y_df.iloc[i]['isFraud'], dtype=torch.float32))  # Assuming label is in df



    return input_list, labels


In [None]:
# input_list, labels = process_samples(train_df, train_embeddings)


In [None]:
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import average_precision_score
import torch

def train_model(test_df, test_embeddings, model, optimizer, criterion, batch_size=256, epochs=25):
    model.train()
    
    # Process data
    input_list, labels = process_samples(train_df, train_embeddings)
    val_input_list, val_labels = process_samples(val_df, val_embeddings)

    # Create DataLoader
    dataset = TensorDataset(
        torch.stack([item[0] for item in input_list]),  # test_i (input_emb)
        torch.stack([item[1] for item in input_list]),  # z_in (weighted_avg)
        torch.stack(labels)  # Labels
    )

    val_dataset = TensorDataset(
        torch.stack([item[0] for item in val_input_list]),  # test_i (input_emb)
        torch.stack([item[1] for item in val_input_list]),  # z_in (weighted_avg)
        torch.stack(val_labels)  # Labels
    )


    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Training loop for multiple epochs
    for epoch in range(epochs):
        epoch_loss = 0  # Track loss for each epoch
        all_targets = []
        all_outputs = []

        for batch in dataloader:
            input_emb, z_in, target = batch
            target = target.unsqueeze(-1)  # Make target shape (batch_size, 1)

            optimizer.zero_grad()
            output = model(input_emb, z_in)

            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()  # Accumulate loss for epoch
            
            # Collect predictions & targets for AUCPR
            all_outputs.append(output.detach().cpu())  # Move to CPU to avoid memory issues
            all_targets.append(target.detach().cpu())

        # Compute AUCPR at the end of the epoch
        all_outputs = torch.cat(all_outputs).numpy()
        all_targets = torch.cat(all_targets).numpy()
        aucpr = average_precision_score(all_targets, all_outputs)
        # print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss/len(dataloader):.4f}, AUCPR: {aucpr:.4f}")

        # Validation Step
        model.eval()
        val_loss = 0
        all_val_targets = []
        all_val_outputs = []

        with torch.no_grad():  # No gradient computation for validation
            for batch in val_dataloader:
                input_emb, z_in, target = batch
                target = target.unsqueeze(-1)

                output = model(input_emb, z_in)
                loss = criterion(output, target)

                val_loss += loss.item()
                
                all_val_outputs.append(output.cpu())
                all_val_targets.append(target.cpu())

        # Compute AUCPR for validation set
        all_val_outputs = torch.cat(all_val_outputs).numpy()
        all_val_targets = torch.cat(all_val_targets).numpy()
        val_aucpr = average_precision_score(all_val_targets, all_val_outputs)

        print(f"Epoch [{epoch+1}/{epochs}], "
              f"Train Loss: {epoch_loss/len(dataloader):.4f}, Train AUCPR: {aucpr:.4f}, "
              f"Val Loss: {val_loss/len(val_dataloader):.4f}, Val AUCPR: {val_aucpr:.4f}")


# Model, optimizer, and loss function
input_emb_dim = 182  # Assuming this based on test_df features
model = Predictor(input_emb_dim=input_emb_dim)
optimizer = optim.AdamW(model.parameters(), lr=0.0001)
criterion = nn.BCELoss()  # Binary Cross Entropy Loss

# Call training loop (assuming test_df and test_embeddings are available)
train_model(train_df, train_embeddings, model, optimizer, criterion, epochs=100)


Epoch [1/100], Train Loss: 0.3525, Train AUCPR: 0.0367, Val Loss: 0.1571, Val AUCPR: 0.0353
Epoch [2/100], Train Loss: 0.1382, Train AUCPR: 0.1674, Val Loss: 0.1625, Val AUCPR: 0.0365
Epoch [3/100], Train Loss: 0.1275, Train AUCPR: 0.2569, Val Loss: 0.1660, Val AUCPR: 0.0374
Epoch [4/100], Train Loss: 0.1210, Train AUCPR: 0.2979, Val Loss: 0.1709, Val AUCPR: 0.0378
Epoch [5/100], Train Loss: 0.1180, Train AUCPR: 0.3164, Val Loss: 0.1746, Val AUCPR: 0.0379
Epoch [6/100], Train Loss: 0.1162, Train AUCPR: 0.3284, Val Loss: 0.1772, Val AUCPR: 0.0378
Epoch [7/100], Train Loss: 0.1149, Train AUCPR: 0.3374, Val Loss: 0.1792, Val AUCPR: 0.0378
Epoch [8/100], Train Loss: 0.1140, Train AUCPR: 0.3450, Val Loss: 0.1808, Val AUCPR: 0.0377
Epoch [9/100], Train Loss: 0.1132, Train AUCPR: 0.3512, Val Loss: 0.1821, Val AUCPR: 0.0376
Epoch [10/100], Train Loss: 0.1126, Train AUCPR: 0.3571, Val Loss: 0.1831, Val AUCPR: 0.0376
Epoch [11/100], Train Loss: 0.1120, Train AUCPR: 0.3624, Val Loss: 0.1840, Val 