In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from google.colab import drive
drive.mount('/content/drive')

In [15]:
file_pathA = '/content/drive/My Drive/THESIS-UOM/DATASETS_KARAKASIDIS/POW_A_10000.csv'
file_pathB = '/content/drive/My Drive/THESIS-UOM/DATASETS_KARAKASIDIS/POW_B_1_10000.csv'
dataA = pd.read_csv(file_pathA, header=None)
dataB = pd.read_csv(file_pathB, header=None)
column_namesA = ["ID_A", "Last_Name_A", "First_Name_A", "Middle_Name_A", "Address_A", "City_A", "Age_A", "Race_A", "NL_A", "Gender_A"]
column_namesB = ["ID_B", "Last_Name_B", "First_Name_B", "Middle_Name_B", "Address_B", "City_B", "Age_B", "Race_B", "NL_B", "Gender_B"]
dataA.columns = column_namesA
dataB.columns = column_namesB
dataA = dataA.iloc[:100]
dataB = dataB.iloc[:100]

In [16]:
# Combine first 5 columns into a single text field
dataA["TextA"] = dataA.iloc[:, 1:6].astype(str).agg(' '.join, axis=1)
dataB["TextB"] = dataB.iloc[:, 1:6].astype(str).agg(' '.join, axis=1)
# Load a pre-trained embedding model
model = SentenceTransformer('all-MiniLM-L12-v2') # all-MiniLM-L12-v2, paraphrase-MiniLM-L6-v2, roberta-large-nli-stsb-mean-tokens, distilbert-base-nli-stsb-mean-tokens
def generate_embeddings_in_batches(data, text_column, batch_size=32):
    embeddings = []
    for start_idx in range(0, len(data), batch_size):
        batch_texts = data[text_column].iloc[start_idx:start_idx+batch_size].tolist()
        batch_embeddings = model.encode(batch_texts, batch_size=batch_size)
        embeddings.extend(batch_embeddings)
    return embeddings
# Generate embeddings for dataA and dataB
dataA["Embedding_A"] = generate_embeddings_in_batches(dataA, "TextA", batch_size=32)
dataB["Embedding_B"] = generate_embeddings_in_batches(dataB, "TextB", batch_size=32)

In [18]:
def normalize_embeddings(embeddings):
    mean = np.mean(embeddings, axis=0)
    std_dev = np.std(embeddings, axis=0)
    normalized_embeddings = (embeddings - mean) / (std_dev + 1e-6)  # Add small epsilon to avoid division by zero
    return normalized_embeddings
def add_dp_noise_to_embeddings(embeddings, epsilon, sensitivity):
    # Normalize the embeddings before adding noise
    normalized_embeddings = normalize_embeddings(embeddings)
    # Calculate the standard deviation for the noise
    std_dev = sensitivity / epsilon  # Sensitivity / Privacy Budget
    # Generate Gaussian noise
    noise = np.random.normal(loc=0.0, scale=std_dev, size=normalized_embeddings.shape)
    # Add noise to the normalized embeddings
    noisy_embeddings = normalized_embeddings + noise
    return noisy_embeddings
# Adds Gaussian noise for Differential Privacy to the embeddings
dataA["Embedding_A"] = dataA["Embedding_A"].apply(lambda x: add_dp_noise_to_embeddings(x, epsilon=10.0, sensitivity=1.0))
dataB["Embedding_B"] = dataB["Embedding_B"].apply(lambda x: add_dp_noise_to_embeddings(x, epsilon=10.0, sensitivity=1.0))

In [19]:
def creatingPairs_chunked(dataA, dataB, chunk_size=1000):
    pairs_list = []
    for i in range(0, len(dataA), chunk_size):
        chunkA = dataA.iloc[i:i+chunk_size]
        for j in range(0, len(dataB), chunk_size):
            chunkB = dataB.iloc[j:j+chunk_size]
            pairs = chunkA.assign(key=1).merge(chunkB.assign(key=1), on='key').drop('key', axis=1)
            pairs['Matched'] = (pairs['ID_A'] == pairs['ID_B']).astype(int)
            pairs_list.append(pairs[['Embedding_A', 'Embedding_B', 'Matched']])
    pairs = pd.concat(pairs_list, ignore_index=True)
    return pairs
# Generating record pairs
pairs = creatingPairs_chunked(dataA=dataA, dataB=dataB)
train_pairs, test_pairs = train_test_split(pairs, test_size=0.2, random_state=42)
train_pairs, valid_pairs = train_test_split(train_pairs, test_size=0.2, random_state=42)

In [20]:
class EmbeddingDataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        emb_a = torch.tensor(self.data.iloc[idx]["Embedding_A"], dtype=torch.float32)
        emb_b = torch.tensor(self.data.iloc[idx]["Embedding_B"], dtype=torch.float32)
        label = torch.tensor(self.data.iloc[idx]["Matched"], dtype=torch.float32)
        return emb_a, emb_b, label
# Create datasets
train_dataset = EmbeddingDataset(train_pairs)
valid_dataset = EmbeddingDataset(valid_pairs)
test_dataset = EmbeddingDataset(test_pairs)
# Create loaders
train_loader = DataLoader(train_dataset, shuffle=True)
valid_loader = DataLoader(valid_dataset, shuffle=False)
test_loader = DataLoader(test_dataset, shuffle=False)

In [21]:
class SiameseNN(nn.Module):
    def __init__(self, input_dim=384):
        super(SiameseNN, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
        )
    def forward(self, emb_a, emb_b):
        diff = torch.abs(emb_a - emb_b)
        return self.fc(diff)
# Initialize Siamese Model, Loss, Optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SiameseNN().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
class_weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=np.array(train_pairs["Matched"]))
pos_weight = torch.tensor(class_weights[1], dtype=torch.float32).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

In [22]:
def train_model(model, train_loader, valid_loader, criterion, optimizer, epochs):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for emb_a, emb_b, labels in train_loader:
            emb_a, emb_b, labels = emb_a.to(device), emb_b.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(emb_a, emb_b)
            loss = criterion(outputs.squeeze(), labels.squeeze())
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for emb_a, emb_b, labels in valid_loader:
                emb_a, emb_b, labels = emb_a.to(device), emb_b.to(device), labels.to(device)
                outputs = model(emb_a, emb_b)
                preds = (outputs > 0.5).float()
                correct += (preds.squeeze() == labels.squeeze()).sum().item()
                total += labels.size(0)
        print(f"Epoch {epoch+1}/{epochs}: Loss {total_loss / len(train_loader):.4f}, Accuracy {correct / total:.4f}")
# Train the Model
train_model(model, train_loader, valid_loader, criterion, optimizer, epochs=5)

Epoch 1/5: Loss 0.8177, Accuracy 0.9894
Epoch 2/5: Loss 0.6006, Accuracy 0.9894
Epoch 3/5: Loss 0.4348, Accuracy 0.9900
Epoch 4/5: Loss 0.2513, Accuracy 0.9962
Epoch 5/5: Loss 0.1075, Accuracy 0.9938


In [23]:
def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for emb_a, emb_b, labels in test_loader:
            emb_a, emb_b, labels = emb_a.to(device), emb_b.to(device), labels.to(device).unsqueeze(1)
            outputs = model(emb_a, emb_b)
            preds = (outputs > 0.5).float()
            all_preds.extend(preds.cpu().numpy().flatten())
            all_labels.extend(labels.cpu().numpy().flatten())
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1-score: {f1:.4f}")
# Test the Model
evaluate_model(model, test_loader)

Test Accuracy: 0.9910
Test Precision: 0.8889
Test Recall: 0.3200
Test F1-score: 0.4706
