In [41]:
import pandas as pd
import numpy as np
import faiss
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import average_precision_score
import matplotlib.pyplot as plt


In [45]:
# loading the entire test embeddings
path = "../5_embeddings/cls_embeddings_time.struct_time(tm_year=2025, tm_mon=2, tm_mday=7, tm_hour=19, tm_min=40, tm_sec=22, tm_wday=4, tm_yday=38, tm_isdst=0).npy"

cls_embeddings = np.load(path)
print(cls_embeddings.shape)


(118108, 32)


In [46]:
train_df = pd.read_csv("../2_dataset/final/train_df.csv")
train_df

Unnamed: 0.1,cls,ProductCD,card4,card6,P_emaildomain,Unnamed: 0,TransactionDT,TransactionAmt,card1,card2,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,0,4,2,1,2,-0.291883,-0.329939,0.108390,-0.145421,-0.399322,...,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
1,0,4,3,2,16,0.892993,0.871243,-0.359702,0.680504,-0.412094,...,-0.030054,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
2,0,4,2,2,1,-1.594876,-1.467121,8.134522,-0.109308,0.711822,...,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
3,0,4,3,2,19,-0.123148,-0.156138,-0.422421,1.487250,-0.265218,...,0.341765,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
4,0,4,3,2,16,1.611964,1.677853,-0.317889,-0.081355,-0.265218,...,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76765,0,4,3,2,16,-1.327921,-1.260784,-0.113217,-0.653259,-1.606254,...,1.009878,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
76766,0,4,2,2,25,0.675641,0.648266,-0.075376,-0.002802,0.756523,...,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
76767,0,4,1,1,16,0.418154,0.377752,0.150412,-1.485918,-0.226903,...,-0.227583,0.393449,0.090945,0.276274,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
76768,0,4,3,2,19,0.605578,0.576883,-0.322279,-0.182963,0.577719,...,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142


In [91]:
train_y = pd.read_csv("../2_dataset/final/train_y_df.csv")
# train_y['isFraud'] = train_y['isFraud'].astype(np.float32)
train_y.shape


(76770, 1)

In [92]:
test_df = pd.read_csv("../2_dataset/final/test_df.csv")
test_df.shape

(23622, 182)

In [93]:
test_y_df = pd.read_csv("../2_dataset/final/test_y_df.csv")
test_y_df['isFraud'] = test_y_df['isFraud'].astype(np.float32)


In [94]:
test_y_df['isFraud']


0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
23617    0.0
23618    0.0
23619    1.0
23620    0.0
23621    0.0
Name: isFraud, Length: 23622, dtype: float32

In [95]:
val_df = pd.read_csv("../2_dataset/final/val_df.csv")
val_y_df = pd.read_csv("../2_dataset/final/val_y_df.csv")

## Data exploration

In [96]:
train_combined = pd.concat([train_df, train_y], axis=1)
for i in train_combined.columns:
    print(i)


cls
ProductCD
card4
card6
P_emaildomain
Unnamed: 0
TransactionDT
TransactionAmt
card1
card2
card3
card5
addr1
addr2
C1
C2
C3
C4
C5
C6
C7
C8
C9
C10
C11
C12
C13
C14
D1
D10
D15
V12
V13
V14
V15
V16
V17
V18
V19
V20
V21
V22
V23
V24
V25
V26
V27
V28
V29
V30
V31
V32
V33
V34
V53
V54
V55
V56
V57
V58
V59
V60
V61
V62
V63
V64
V65
V66
V67
V68
V69
V70
V71
V72
V73
V74
V75
V76
V77
V78
V79
V80
V81
V82
V83
V84
V85
V86
V87
V88
V89
V90
V91
V92
V93
V94
V95
V96
V97
V98
V99
V100
V101
V102
V103
V104
V105
V106
V107
V108
V109
V110
V111
V112
V113
V114
V115
V116
V117
V118
V119
V120
V121
V122
V123
V124
V125
V126
V127
V128
V129
V130
V131
V132
V133
V134
V135
V136
V137
V279
V280
V281
V282
V283
V284
V285
V286
V287
V288
V289
V290
V291
V292
V293
V294
V295
V296
V297
V298
V299
V300
V301
V302
V303
V304
V305
V306
V307
V308
V309
V310
V311
V312
V313
V314
V315
V316
V317
V318
V319
V320
V321
isFraud


In [97]:
correlation_matrix = train_combined.corr()

# Extract correlation with target variable 'isFraud'
target_correlation = correlation_matrix["isFraud"].sort_values(ascending=False)

# Plot the heatmap
# plt.figure(figsize=(10, 6))
# plt.barh(target_correlation.index, target_correlation.values, color="skyblue")
# plt.xlabel("Correlation Coefficient")
# plt.ylabel("Features")
# plt.title("Feature Correlation with isFraud")
# plt.grid(True)
# plt.show()

# Display the correlation matrix as a table
target_correlation = correlation_matrix["isFraud"].sort_values(ascending=False)

# Print correlation with target variable
# print("Correlation with isFraud:\n", target_correlation[0:15])

for i in target_correlation:
    print(i)


# V86          0.238604
# V87          0.233544
# V79          0.178765
# V94          0.172088


1.0
0.23860373552977954
0.2335443492578088
0.17876509632033027
0.17208818086369293
0.17140777222739775
0.17129754151082915
0.1694509533059694
0.16869624549983075
0.16856514581061696
0.16807522199501426
0.16784609669254302
0.16673519181013005
0.16494405336149928
0.1635479726062006
0.16223883339984835
0.1609492818314699
0.15539194280744686
0.1532637205184111
0.15285396198355342
0.15242327369684092
0.15207969986721587
0.15136940132184074
0.15019903227643822
0.14906566950159525
0.1471338535487183
0.14695599712549678
0.1451580952545996
0.14005471970353808
0.13982554431256586
0.1368602756038374
0.13448693076368148
0.13371257061792763
0.13296303386586886
0.1319550131385903
0.13012982907439313
0.13002389050113175
0.12737547370279503
0.12611801988193777
0.12436065686779674
0.12012175924886781
0.12004693996679176
0.11508280384875622
0.10609496545951427
0.10446228468029638
0.09977475748977936
0.09723238428976462
0.09565859643023265
0.08350499518970603
0.08319386048403277
0.08036134635849733
0.065

In [98]:
train_combined["V86"]
column_range = train_df["V86"].max() - train_df["V86"].min()

# Print the range
print(f"Range of 'V86': {column_range} low: {train_df["V86"].min()}  high: {train_df["V86"].max()}")


SyntaxError: invalid syntax (1855591777.py, line 5)

In [None]:
train_combined["V86"].unique()


In [None]:
train_combined["V94"].unique()

# V86          0.238604
# V87          0.233544
# V79          0.178765
# V94          0.172088


## Splitting Embeddings

In [None]:
# Compute 65% of the total rows
total_rows = cls_embeddings.shape[0]
train_size = int(0.65 * total_rows)  # 65% of 118108

# Slice the top 65%
train_embeddings = cls_embeddings[:train_size]  # First 65%

# print(f"Total embed shape: {cls_embeddings.shape}")
# print(f"Train embed shape: {train_embeddings.shape}")


In [None]:
total_rows = cls_embeddings.shape[0]
test_size = int(0.8 * total_rows) 

test_embeddings = cls_embeddings[test_size:]
# print(f"Total embed shape: {cls_embeddings.shape}")
# print(f"Train embed shape: {test_embeddings.shape}")


In [None]:
val_embeddings = cls_embeddings[train_size:test_size]
len(val_embeddings) # 15% of total rows.

17716

## Faiss Index and Similarity Search

In [None]:
def create_index(num_embeddings, dimension):

    # num_embeddings = 76770
    # dimension = 32

    index = faiss.IndexFlatL2(dimension)  # L2 similarity

    index.add(train_embeddings)  # index of pre-computed embeddings

    # k = 120  # as best result for 120

    return index



In [None]:
def search_faiss(query_vector):

    index = create_index(76770, 32)

    # Convert PyTorch tensor to NumPy
    if isinstance(query_vector, torch.Tensor):
        query_vector = query_vector.detach().cpu().numpy()

    query_vector = query_vector.astype("float32").reshape(1, -1)


    query_vector = query_vector.astype("float32").reshape(1, -1)

    distances, indices = index.search(query_vector, k=120) # by default using euclidean distance for similarity
    indices = indices.flatten()


    return distances, indices

# distances, indices = index.search(
#     query_vector, k
# )  

# print("Input Sample embedding:", query_vector)
# print("Indices of nearest neighbors:", indices)
# print("L2 norm distances", distances)


## L2 Distance Component

In [None]:
def compute_similarity(distances, dropout=0.2):

    # print(f"distances.shape before: {distances.shape}")
    distances = distances.flatten()
    # print(f"distances.shape after: {distances.shape}")

    # Apply softmax to the negative distances
    similarities = np.exp(-distances)
    softmax_scores = similarities / np.sum(similarities)

    # Apply dropout (randomly zero out some softmax scores)
    dropout_mask = np.random.binomial(1, 1 - dropout, size=softmax_scores.shape)
    dropped_softmax_scores = softmax_scores * dropout_mask

    final_softmax = dropped_softmax_scores / np.sum(dropped_softmax_scores)
    # how to weigh in the final embedding?
    return final_softmax, distances


## Mask To Drop The Dropped Out Values

In [None]:
def filter_by_mask(arr1, arr2, arr3):
    """
    Removes elements from arr2 and arr3 where corresponding indices in arr1 are zero.
    """
    mask = arr1 != 0  # Create a boolean mask where arr1 is nonzero
    return arr1[mask], arr2[mask], arr3[mask]

# S_x_xi, indices, distances = filter_by_mask(S_x_xi, indices, distances)

# print("Filtered S_x_xi:", S_x_xi)
# print("Filtered indices:", indices)
# print("Filtered distances:", distances)



## Linear Trasnform(with MLPs), Value Component and Shap Features

In [None]:
class MLP_L1(nn.Module):
    def __init__(self, input_dim):
        super(MLP_L1, self).__init__()
        self.layer1 = nn.Linear(input_dim, 32)
        self.activation1 = nn.SiLU()

        self.layer2 = nn.Linear(32, 32)
        self.activation2 = nn.SiLU()
        self.dropout2 = nn.Dropout(p=0.2)

        self.layer3 = nn.Linear(32, 32)  # Fix: Output should be 32
        self.activation3 = nn.SiLU()  # Fix: Apply SiLU activation

    def forward(self, x):
        x = self.activation1(self.layer1(x))
        x = self.dropout2(self.activation2(self.layer2(x)))
        x = self.activation3(self.layer3(x))  # Fix: Apply activation & dropout
        return x


# # Create model instance
# model = MLP_L1(l1_dist.shape[0])

# # Convert input to tensor and pass it through the model
# l1_dist_tensor = torch.tensor(l1_dist, dtype=torch.float32)

# w_v_l1 = model(l1_dist_tensor).detach().numpy()
# print(w_v_l1)

# print(w_v_l1.shape)


In [None]:
def compute_l1(distances):
    l1_dist = np.sqrt(distances)
    # print(l1_dist, l1_dist.shape)

    model = MLP_L1(l1_dist.shape[0])

    # Convert input to tensor and pass it through the model
    l1_dist_tensor = torch.tensor(l1_dist, dtype=torch.float32)

    w_v_l1 = model(l1_dist_tensor).detach().numpy()
    # print(w_v_l1)

    # print(w_v_l1.shape)

    return w_v_l1


In [None]:
# Modified MLP to output shape (60,)
class MLP_Wy(nn.Module):
    def __init__(self, input_dim):
        super(MLP_Wy, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)  # First layer
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 32)  # Output layer (1 neuron)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)  # Shape (60, 1)
        return x.squeeze(1)  # Shape (60,)

# Instantiate MLP with input_dim=32 (from Wy)
 # Expected: (60,)


In [None]:
def compute_wy(indices):
    y_i = train_y['isFraud'].loc[indices].values
    # print(f"y_i.shape: {y_i.shape}")


# Convert to PyTorch tensor
    feature_target_tensor = torch.tensor(y_i, dtype=torch.long)

# Define Wy: An embedding layer to map to 32-dim space
    embedding_dim = 32
    num_classes = 2  # Since input values are 0 or 1

    Wy = nn.Embedding(num_classes, embedding_dim)

    mlp = MLP_Wy(input_dim=embedding_dim)

# Compute embeddings using Wy
    embeddings = Wy(feature_target_tensor)  # Shape: (60, 32)

    # Pass embeddings through MLP
    w_y = mlp(embeddings)  # Shape: (60,)

    # print("MLP Output Shape:", w_y.shape) 

    return w_y


In [None]:
def compute_imp_features(indices):
    v_86 = train_df['C14'].loc[indices].values
    feature_target_tensor = torch.tensor(v_86, dtype=torch.float32).unsqueeze(-1)
    # print(f"V86 : {v_86}")


    v_87 = train_df['C13'].loc[indices].values
    feature_target_tensor_1 = torch.tensor(v_87, dtype=torch.float32).unsqueeze(-1)

    v_79 = train_df['C1'].loc[indices].values
    feature_target_tensor_2 = torch.tensor(v_79, dtype=torch.float32).unsqueeze(-1)

    v_94 = train_df['C2'].loc[indices].values
    feature_target_tensor_3 = torch.tensor(v_94, dtype=torch.float32).unsqueeze(-1)
    # print(feature_target_tensor, feature_target_tensor_1, feature_target_tensor_2, feature_target_tensor_3)

    # Convert to PyTorch tensor

    # Define Wx: A linear layer to project to 32-dim space
    input_dim = 1  # Since each input is a single continuous value
    embedding_dim = 32

    # Define MLP and projection layers
    W_feat = nn.Linear(input_dim, embedding_dim)

    mlp = MLP_Wy(input_dim=embedding_dim)

    # Process first variable
    embeddings_1 = W_feat(feature_target_tensor)  
    feat_x1 = mlp(embeddings_1)  
    # print("MLP Output Shape for Var 1:", feat_x1.shape)
    feat_x1 = feat_x1.detach().numpy()

    # Process second variable
    embeddings_2 = W_feat(feature_target_tensor_1)  
    feat_x2 = mlp(embeddings_2)  
    # print("MLP Output Shape for Var 2:", feat_x2.shape)
    feat_x2 = feat_x2.detach().numpy()

    # Process third variable
    embeddings_3 = W_feat(feature_target_tensor_2)  
    feat_x3 = mlp(embeddings_3)  
    # print("MLP Output Shape for Var 3:", feat_x3.shape)
    feat_x3 = feat_x3.detach().numpy()

    # Process fourth variable
    embeddings_4 = W_feat(feature_target_tensor_3)  
    feat_x4 = mlp(embeddings_4)  
    # print("MLP Output Shape for Var 3:", feat_x4.shape)
    feat_x4 = feat_x4.detach().numpy()


    return feat_x1, feat_x2, feat_x3, feat_x4


In [None]:
def compute_value(w_v_l1, w_y):
    # Compute the dot product of w_v_l1 and w_y
    w_y_npy = w_y.detach().numpy()
    value = w_y_npy + w_v_l1
    return value


#### Reshaping S to do S * V

In [99]:
def compute_z_in(S_x_xi, value):
    S_x_xi = S_x_xi.reshape(1, -1)

    # Assuming S_x_xi is (1, 95) and value is (95, 32)
    numerator = np.sum(S_x_xi @ value)  # Sum the weighted contributions (scalar)
    denominator = np.sum(S_x_xi)        # Total sum of weights (scalar)
    z_in = numerator / denominator         # Weighted average as a single scalar

    return z_in


In [100]:
x_test = test_df.iloc[44].values
x_test.shape


(182,)

## Processing Samples

#### Old Code for testing

In [101]:

# test_i = test_df.iloc[10].values #shape (182,)
# query_vector = test_embeddings[10] #shape (32,)
# distances, indices = search_faiss(query_vector) # both shape (120,) and flatten
# S_x_xi, distances = compute_similarity(distances)
# S_x_xi, indices, distances = filter_by_mask(S_x_xi, indices, distances)
# w_v_l1 = compute_l1(distances) #shape (32,)
# w_y = compute_wy(indices) #shape (32,)


# feat_x_1, feat_x2, feat_x3, feat_x4 = compute_imp_features(indices=indices)
# #z_in 
# f_z_1 = compute_z_in(S_x_xi, feat_x_1)
# f_z_2 = compute_z_in(S_x_xi, feat_x2)
# f_z_3 = compute_z_in(S_x_xi, feat_x3)
# f_z_4 = compute_z_in(S_x_xi, feat_x4)

#f_z_1, f_z_2, f_z_3, f_z_4

# value = compute_value(w_v_l1, w_y)
# value.shape

# labels = test_y_df['isFraud']
# labels = torch.tensor(test_y_df['isFraud'].values, dtype=torch.float32)
# type(labels)

# type(test_df.iloc[34].values)


#----------------Without Batch Processing----------------
# def process_samples(test_df, test_embeddings):
#     """Processes all samples and returns input tensor and labels."""
#     input_list = []
#     labels = []


#     for i in range(len(test_df)):  # Process all samples
#         test_i = torch.tensor(test_df.iloc[i].values, dtype=torch.float32)  # shape (182,)
#         query_embedding = torch.tensor(test_embeddings[i], dtype=torch.float32)  # shape (32,)

#         distances, indices = search_faiss(query_embedding)
#         S_x_xi, distances = compute_similarity(distances)
#         S_x_xi, indices, distances = filter_by_mask(S_x_xi, indices, distances)
#         w_v_l1 = compute_l1(distances)  # shape (32,)
#         w_y = compute_wy(indices)  # shape (32,)
#         value = compute_value(w_v_l1, w_y)
#         z_in = compute_z_in(S_x_xi, value)  # Scalar (single value)

#         feat_x_1, feat_x2, feat_x3, feat_x4 = compute_imp_features(indices=indices) #v86 feature
#         f_z_in_1 = compute_z_in(S_x_xi, feat_x_1)

#         f_z_in_2 = compute_z_in(S_x_xi, feat_x2)

#         f_z_in_3 = compute_z_in(S_x_xi, feat_x3)

#         f_z_in_4 = compute_z_in(S_x_xi, feat_x4)

#         # Append only test_i (input_emb) and z_in (weighted_avg), not query_embedding
#         z_in = torch.tensor(z_in, dtype=torch.float32)
#         f_z_in_1 = torch.tensor(f_z_in_1, dtype=torch.float32)
#         f_z_in_2 = torch.tensor(f_z_in_2, dtype=torch.float32)
#         f_z_in_3 = torch.tensor(f_z_in_3, dtype=torch.float32)
#         f_z_in_4 = torch.tensor(f_z_in_4, dtype=torch.float32)

#         input_list.append((test_i, z_in, f_z_in_1, f_z_in_2, f_z_in_3, f_z_in_4))

#         labels.append(torch.tensor(test_y_df.iloc[i]['isFraud'], dtype=torch.float32))  # Assuming label is in df



#     return input_list, labels


In [102]:
def process_samples(test_df, test_embeddings, batch_size=1000, delay=0.5):
    """Processes samples in batches to prevent memory overflow."""
    input_list = []
    labels = []

    total_samples = len(test_df)
    num_batches = (total_samples + batch_size - 1) // batch_size  # Compute number of batches

    for batch_idx in range(num_batches):
        start_idx = batch_idx * batch_size
        end_idx = min((batch_idx + 1) * batch_size, total_samples)

        batch_inputs = []
        batch_labels = []

        for i in range(start_idx, end_idx):  # Process each sample in batch
            test_i = torch.tensor(test_df.iloc[i].values, dtype=torch.float32)  # shape (182,)
            query_embedding = torch.tensor(test_embeddings[i], dtype=torch.float32)  # shape (32,)

            distances, indices = search_faiss(query_embedding)
            S_x_xi, distances = compute_similarity(distances)
            S_x_xi, indices, distances = filter_by_mask(S_x_xi, indices, distances)
            w_v_l1 = compute_l1(distances)  # shape (32,)
            w_y = compute_wy(indices)  # shape (32,)
            value = compute_value(w_v_l1, w_y)
            z_in = compute_z_in(S_x_xi, value)  # Scalar (single value)

            feat_x_1, feat_x2, feat_x3, feat_x4 = compute_imp_features(indices=indices) #v86 feature
            f_z_in_1 = compute_z_in(S_x_xi, feat_x_1)
            f_z_in_2 = compute_z_in(S_x_xi, feat_x2)
            f_z_in_3 = compute_z_in(S_x_xi, feat_x3)
            f_z_in_4 = compute_z_in(S_x_xi, feat_x4)

            # Convert to tensors
            z_in = torch.tensor(z_in, dtype=torch.float32)
            f_z_in_1 = torch.tensor(f_z_in_1, dtype=torch.float32)
            f_z_in_2 = torch.tensor(f_z_in_2, dtype=torch.float32)
            f_z_in_3 = torch.tensor(f_z_in_3, dtype=torch.float32)
            f_z_in_4 = torch.tensor(f_z_in_4, dtype=torch.float32)

            batch_inputs.append((test_i, z_in, f_z_in_1, f_z_in_2, f_z_in_3, f_z_in_4))
            batch_labels.append(torch.tensor(train_y.iloc[i]['isFraud'], dtype=torch.float32))

        input_list.extend(batch_inputs)
        labels.extend(batch_labels)
        
        time.sleep(delay)  # Add delay to avoid system overload
    
    return input_list, labels


In [103]:
def val_process_samples(test_df, test_embeddings, batch_size=1000, delay=0.5):
    """Processes samples in batches to prevent memory overflow."""
    input_list = []
    labels = []

    total_samples = len(test_df)
    num_batches = (total_samples + batch_size - 1) // batch_size  # Compute number of batches

    for batch_idx in range(num_batches):
        start_idx = batch_idx * batch_size
        end_idx = min((batch_idx + 1) * batch_size, total_samples)

        batch_inputs = []
        batch_labels = []

        for i in range(start_idx, end_idx):  # Process each sample in batch
            test_i = torch.tensor(test_df.iloc[i].values, dtype=torch.float32)  # shape (182,)
            query_embedding = torch.tensor(test_embeddings[i], dtype=torch.float32)  # shape (32,)

            distances, indices = search_faiss(query_embedding)
            S_x_xi, distances = compute_similarity(distances)
            S_x_xi, indices, distances = filter_by_mask(S_x_xi, indices, distances)
            w_v_l1 = compute_l1(distances)  # shape (32,)
            w_y = compute_wy(indices)  # shape (32,)
            value = compute_value(w_v_l1, w_y)
            z_in = compute_z_in(S_x_xi, value)  # Scalar (single value)

            feat_x_1, feat_x2, feat_x3, feat_x4 = compute_imp_features(indices=indices) #v86 feature
            f_z_in_1 = compute_z_in(S_x_xi, feat_x_1)
            f_z_in_2 = compute_z_in(S_x_xi, feat_x2)
            f_z_in_3 = compute_z_in(S_x_xi, feat_x3)
            f_z_in_4 = compute_z_in(S_x_xi, feat_x4)

            # Convert to tensors
            z_in = torch.tensor(z_in, dtype=torch.float32)
            f_z_in_1 = torch.tensor(f_z_in_1, dtype=torch.float32)
            f_z_in_2 = torch.tensor(f_z_in_2, dtype=torch.float32)
            f_z_in_3 = torch.tensor(f_z_in_3, dtype=torch.float32)
            f_z_in_4 = torch.tensor(f_z_in_4, dtype=torch.float32)

            batch_inputs.append((test_i, z_in, f_z_in_1, f_z_in_2, f_z_in_3, f_z_in_4))
            batch_labels.append(torch.tensor(val_y_df.iloc[i]['isFraud'], dtype=torch.float32))

        input_list.extend(batch_inputs)
        labels.extend(batch_labels)
        
        time.sleep(delay)  # Add delay to avoid system overload
    
    return input_list, labels


## Model Training

In [104]:
class ThreeBlockModel(nn.Module):
    def __init__(self, input_emb_dim, hidden_dim=32, dropout_prob=0.2):
        super(ThreeBlockModel, self).__init__()

        self.input_dim = input_emb_dim + 4  # Adding 2 for weighted_avg and f_z_in

        self.block1 = nn.Sequential(
            nn.LayerNorm(self.input_dim),
            nn.Linear(self.input_dim, hidden_dim),
            nn.SiLU(),
            nn.Dropout(dropout_prob),
            nn.Linear(hidden_dim, hidden_dim),
        )
        
        self.block2 = nn.Sequential(
            nn.LayerNorm(hidden_dim),
            nn.Linear(hidden_dim, hidden_dim),
            nn.SiLU(),
            nn.Dropout(dropout_prob),
            nn.Linear(hidden_dim, hidden_dim),
        )
        
        self.block3 = nn.Sequential(
            nn.LayerNorm(hidden_dim),
            nn.Linear(hidden_dim, hidden_dim),
            nn.SiLU(),
            nn.Dropout(dropout_prob),
            nn.Linear(hidden_dim, hidden_dim),
        )
        
        self.output_layer = nn.Sequential(
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )
    
    def forward(self, input_emb, weighted_avg, f_z_in, f_z_in_2, f_z_in_3, f_z_in_4):
        # Ensure correct shape for scalar inputs
        weighted_avg = weighted_avg.unsqueeze(-1)  
        f_z_in = f_z_in.unsqueeze(-1)  
        f_z_in_2 = f_z_in_2.unsqueeze(-1)
        f_z_in_3 = f_z_in_3.unsqueeze(-1)
        f_z_in_4 = f_z_in_4.unsqueeze(-1)

        # Concatenate all inputs
        combined = torch.cat([input_emb, weighted_avg, f_z_in, f_z_in_3, f_z_in_4], dim=-1)

        # Pass through MLP blocks
        x = self.block1(combined)
        x = self.block2(x)
        x = self.block3(x)
        x = self.output_layer(x)
        return x


In [105]:
def train_model(test_df, test_embeddings, model, optimizer, criterion, batch_size=256, epochs=25):
    model.train()
    
    # Process data
    #TODO!!!
    input_list, labels = process_samples(train_df, train_embeddings)
    val_input_list, val_labels = val_process_samples(val_df, val_embeddings)


    # Create DataLoader with `f_z_in`
    dataset = TensorDataset(
        torch.stack([item[0] for item in input_list]),  # test_i (input_emb)
        torch.stack([item[1] for item in input_list]),  # z_in (weighted_avg)
        torch.stack([item[2] for item in input_list]),  # f_z_in (new scalar input)
        torch.stack([item[3] for item in input_list]),
        torch.stack([item[4] for item in input_list]),
        torch.stack([item[5] for item in input_list]),
        torch.stack(labels)  # Labels
    )

    val_dataset = TensorDataset(
        torch.stack([item[0] for item in val_input_list]),  # test_i (input_emb)
        torch.stack([item[1] for item in val_input_list]),  # z_in (weighted_avg)
        torch.stack([item[2] for item in val_input_list]),  # f_z_in (new scalar input)
        torch.stack([item[3] for item in val_input_list]),
        torch.stack([item[4] for item in val_input_list]),
        torch.stack([item[5] for item in val_input_list]),
        torch.stack(val_labels)  # Labels
    )
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


    # # Open file to log results
    # with open(log_file, "w") as f:
    #     f.write("Epoch,Loss,AUCPR\n")  # Write header

    # Training loop for multiple epochs
    for epoch in range(epochs):
        epoch_loss = 0  # Track loss for each epoch
        all_targets = []
        all_outputs = []

        for batch in dataloader:
            input_emb, z_in, f_z_in, f_z_in_2, f_z_in_3, f_z_in_4, target = batch
            target = target.unsqueeze(-1)  # Make target shape (batch_size, 1)

            optimizer.zero_grad()
            output = model(input_emb, z_in, f_z_in, f_z_in_2, f_z_in_3, f_z_in_4)  # Pass f_z_in to model

            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()  # Accumulate loss for epoch
            
            # Collect predictions & targets for AUCPR
            all_outputs.append(output.detach().cpu())  # Move to CPU to avoid memory issues
            all_targets.append(target.detach().cpu())

        # Compute AUCPR at the end of the epoch
        all_outputs = torch.cat(all_outputs).numpy()
        all_targets = torch.cat(all_targets).numpy()

        aucpr = average_precision_score(all_targets, all_outputs)
        avg_loss = epoch_loss / len(dataloader)

        # VALIDATION Step
        model.eval()
        val_loss = 0
        all_val_targets = []
        all_val_outputs = []

        with torch.no_grad():  # No gradient computation for validation
            for batch in val_dataloader:
                input_emb, z_in, f_z_in, f_z_in_2, f_z_in_3, f_z_in_4, target = batch
                target = target.unsqueeze(-1)

                output = model(input_emb, z_in, f_z_in, f_z_in_2, f_z_in_3, f_z_in_4)
                loss = criterion(output, target)

                val_loss += loss.item()
                
                all_val_outputs.append(output.cpu())
                all_val_targets.append(target.cpu())

        # Compute AUCPR for validation set
        all_val_outputs = torch.cat(all_val_outputs).numpy()
        all_val_targets = torch.cat(all_val_targets).numpy()
        val_aucpr = average_precision_score(all_val_targets, all_val_outputs)

        print(f"Epoch [{epoch+1}/{epochs}], "
              f"Train Loss: {epoch_loss/len(dataloader):.4f}, Train AUCPR: {aucpr:.4f}, "
              f"Val Loss: {val_loss/len(val_dataloader):.4f}, Val AUCPR: {val_aucpr:.4f}")


        # print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}, AUCPR: {aucpr:.4f}")

        # # Append results to file
        # with open(log_file, "a") as f:
        #     f.write(f"{epoch+1},{avg_loss:.4f},{aucpr:.4f}\n")


# Model, optimizer, and loss function
input_emb_dim = 182  # Assuming this based on test_df features
model = ThreeBlockModel(input_emb_dim=input_emb_dim)
optimizer = optim.AdamW(model.parameters(), lr=0.0001)
criterion = nn.BCELoss()  # Binary Cross Entropy Loss

# Call training loop (assuming test_df and test_embeddings are available)
train_model(train_df, train_embeddings, model, optimizer, criterion, epochs=100)


Epoch [1/100], Train Loss: 0.2793, Train AUCPR: 0.0381, Val Loss: 0.1369, Val AUCPR: 0.1864
Epoch [2/100], Train Loss: 0.1322, Train AUCPR: 0.2211, Val Loss: 0.1265, Val AUCPR: 0.2825
Epoch [3/100], Train Loss: 0.1239, Train AUCPR: 0.2828, Val Loss: 0.1184, Val AUCPR: 0.3272
Epoch [4/100], Train Loss: 0.1186, Train AUCPR: 0.3122, Val Loss: 0.1155, Val AUCPR: 0.3371
Epoch [5/100], Train Loss: 0.1162, Train AUCPR: 0.3276, Val Loss: 0.1141, Val AUCPR: 0.3420
Epoch [6/100], Train Loss: 0.1148, Train AUCPR: 0.3378, Val Loss: 0.1132, Val AUCPR: 0.3459
Epoch [7/100], Train Loss: 0.1138, Train AUCPR: 0.3459, Val Loss: 0.1126, Val AUCPR: 0.3499
Epoch [8/100], Train Loss: 0.1130, Train AUCPR: 0.3523, Val Loss: 0.1122, Val AUCPR: 0.3532
Epoch [9/100], Train Loss: 0.1124, Train AUCPR: 0.3580, Val Loss: 0.1118, Val AUCPR: 0.3559
Epoch [10/100], Train Loss: 0.1119, Train AUCPR: 0.3627, Val Loss: 0.1115, Val AUCPR: 0.3584
Epoch [11/100], Train Loss: 0.1114, Train AUCPR: 0.3670, Val Loss: 0.1113, Val 