In [9]:
from nltk.tokenize import word_tokenize
import contractions
import re
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize

import warnings
warnings.filterwarnings('ignore')

In [10]:
import pandas as pd
import numpy as np

# Load the Quora question-pair data, discard every row that has a missing
# value, then remove the identifier columns.
df = (
    pd.read_csv("quora.csv")
    .dropna()
    .drop(['id', 'qid1', 'qid2'], axis=1)
)

### Data Cleaning

Because we use contextual similarity models (BERT & Sentence-BERT embeddings), stopwords will not be removed, in order to preserve semantic and syntactic meaning. We can filter them out for TF-IDF.

In [11]:
# Text-cleaning helpers. Each takes one string and returns the cleaned string.

def expand_contractions(text):
    """Return ``text`` with contractions expanded by ``contractions.fix``."""
    return contractions.fix(text)


def remove_special_characters(text):
    """Drop every character that is not an ASCII letter, a digit or whitespace."""
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)


def remove_urls(text):
    """Remove each run of non-whitespace characters that begins with ``http``."""
    return re.sub(r'http\S+', '', text)


def remove_html_tags(text):
    """Parse ``text`` with BeautifulSoup's ``html.parser`` and return only its text."""
    return BeautifulSoup(text, 'html.parser').get_text()


# Order matters. Markup and URLs must be removed while their delimiters
# ('<', '>', ':', '/') are still present, and contractions must be expanded
# while the apostrophes are still present. Stripping punctuation first would
# turn "<b>" into "b", which the HTML step can no longer recognise as a tag.
# HTML runs before URL removal so that link text inside a tag such as
# <a href="http://...">text</a> is not swallowed by the URL pattern.
CLEANING_STEPS = (
    remove_html_tags,
    remove_urls,
    expand_contractions,
    remove_special_characters,
)

for column in ('question1', 'question2'):
    for step in CLEANING_STEPS:
        df[column] = df[column].apply(step)

### Common Words - Feature

In [12]:
# Materialise both cleaned question columns as plain Python lists.
q1 = list(df['question1'])
q2 = list(df['question2'])


def common_words(q1, q2):
    """Return the share of tokens that two token sets have in common.

    The result is ``len(q1 & q2) / (len(q1) + len(q2))``.

    Args:
        q1: Set of tokens from the first question (must support
            ``intersection`` and ``len``).
        q2: Set of tokens from the second question.

    Returns:
        A float; ``0.0`` when both sets are empty (there is nothing to
        compare, and the denominator would otherwise be zero).
    """
    length = len(q1) + len(q2)
    if length == 0:
        return 0.0
    return len(q1.intersection(q2)) / length



### Sentence-BERT (Sentence embeddings for contextual similarity)

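Pretrained Sentence-BERT model from HuggingFace. The code below loads `all-mpnet-base-v2`, which maps sentences to dense 768-dimensional vector embeddings that capture semantic meaning (the smaller `all-MiniLM-L6-v2` model produces 384-dimensional embeddings instead).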

In [13]:
from sentence_transformers import SentenceTransformer, util
from rapidfuzz.fuzz import ratio
import torch
from tqdm import tqdm


# Name of the SentenceTransformer checkpoint. The `transformer` parameter of
# sentence_bert_model_training defaults to the same string; this module-level
# name is not referenced by the other visible cells.
transformer = "all-mpnet-base-v2"

def compute_fuzz_features(q1, q2):
    """Build a one-column float32 tensor of fuzzy string-similarity scores.

    Each row holds ``ratio(a, b) / 100`` for the pair ``(a, b)`` taken from
    the same position of ``q1`` and ``q2``; the division rescales the score
    to [0, 1].
    """
    scaled_rows = [[ratio(left, right) / 100] for left, right in zip(q1, q2)]
    return torch.tensor(scaled_rows, dtype=torch.float32)


def _lexical_pair_features(q1_words, q2_words):
    """Return ``[common_word_feature, length_diff]`` for one pair of token sets."""
    union_count = len(q1_words | q2_words)
    # common_words divides by len(q1) + len(q2), so the call is skipped when
    # both sets are empty and the feature is defined as 0 instead.
    # NOTE(review): common_words already returns a normalised ratio; dividing
    # it again by the union size is kept to preserve the existing feature
    # values -- confirm whether this double normalisation is intended.
    if union_count > 0:
        common_word_norm = common_words(q1_words, q2_words) / union_count
    else:
        common_word_norm = 0.0

    # Relative difference in the number of distinct tokens; the trailing 1
    # keeps the denominator non-zero for two empty sets.
    word_count_diff = abs(len(q1_words) - len(q2_words))
    max_word_count = max(len(q1_words), len(q2_words), 1)
    return [common_word_norm, word_count_diff / max_word_count]


def sentence_bert_model_training(q1, q2, transformer='all-mpnet-base-v2', batch_size=1000):
    '''Encode question pairs with a SentenceTransformer and build pair features.

    Despite its name, this function trains nothing: it loads a pretrained
    model, encodes both question sequences batch by batch and derives a
    feature matrix from the embeddings and the raw text.

    Args:
        q1: Sequence of first-question strings (sliceable, e.g. list or
            NumPy array).
        q2: Sequence of second-question strings, aligned with ``q1``.
        transformer: Name of the SentenceTransformer checkpoint to load.
        batch_size: Number of pairs processed per outer batch.

    Returns:
        A tuple ``(embeddings_q1, embeddings_q2, features)`` of CPU tensors.
        With ``d`` the embedding size of the chosen model (768 for
        all-mpnet-base-v2), the embeddings have shape ``(n_samples, d)`` and
        ``features`` has shape ``(n_samples, 2 * d + 4)`` with columns, in
        order: cosine similarity (1), ``|e1 - e2|`` (d), ``e1 * e2`` (d),
        common-word feature (1), token-count difference (1), fuzz ratio (1).
    '''
    model = SentenceTransformer(transformer)

    # Use the GPU when one is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    all_embeddings_q1 = []
    all_embeddings_q2 = []
    all_features = []
    n_samples = len(q1)
    n_batches = int(np.ceil(n_samples / batch_size))

    print(f"Processing {n_samples} samples in {n_batches} batches...")
    for i in tqdm(range(n_batches), desc="Feature Extraction Batches"):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, n_samples)
        q1_batch = q1[start_idx:end_idx]
        q2_batch = q2[start_idx:end_idx]

        # Sentence embeddings for the current batch, one row per question.
        q1_embeddings = model.encode(q1_batch, convert_to_tensor=True, batch_size=92, show_progress_bar=False)
        q2_embeddings = model.encode(q2_batch, convert_to_tensor=True, batch_size=92, show_progress_bar=False)
        feat_device = q1_embeddings.device

        # Embedding-based features, computed for the whole batch at once
        # rather than pair by pair (avoids a host sync per row).
        cosine_sim = torch.nn.functional.cosine_similarity(
            q1_embeddings, q2_embeddings, dim=1
        ).unsqueeze(1)                                    # one column
        diff = torch.abs(q1_embeddings - q2_embeddings)   # same width as the embeddings
        mult = q1_embeddings * q2_embeddings              # same width as the embeddings

        # Lexical features from the lower-cased token sets of each pair.
        lexical_rows = [
            _lexical_pair_features(
                set(word_tokenize(first.lower())),
                set(word_tokenize(second.lower())),
            )
            for first, second in zip(q1_batch, q2_batch)
        ]
        lexical = torch.tensor(lexical_rows, dtype=torch.float32, device=feat_device)  # two columns

        fuzz_tensor = compute_fuzz_features(q1_batch, q2_batch).to(feat_device)        # one column

        batch_feat = torch.cat([cosine_sim, diff, mult, lexical, fuzz_tensor], dim=1)

        # Keep results on the CPU so GPU memory is only needed per batch.
        all_embeddings_q1.append(q1_embeddings.cpu())
        all_embeddings_q2.append(q2_embeddings.cpu())
        all_features.append(batch_feat.cpu())

        # Release this batch's GPU tensors before the next iteration.
        del q1_embeddings, q2_embeddings, cosine_sim, diff, mult, lexical, fuzz_tensor, batch_feat
        torch.cuda.empty_cache()

    # Concatenate all batch results along the sample axis.
    embeddings_q1 = torch.cat(all_embeddings_q1, dim=0)
    embeddings_q2 = torch.cat(all_embeddings_q2, dim=0)
    features = torch.cat(all_features, dim=0)

    # Results are returned as tensors (not NumPy arrays) so they can be fed
    # straight into the PyTorch model.
    return embeddings_q1, embeddings_q2, features



In [14]:
from torch.utils.data import Dataset

class SentencePairLSTMDataset(Dataset):
    """Dataset serving precomputed question-pair tensors together with labels.

    Indexing returns a dict with the keys ``'emb_q1'``, ``'emb_q2'``,
    ``'features'`` and ``'label'`` for the requested position.
    """

    def __init__(self, embeddings_q1, embeddings_q2, features, labels):
        """Store the given tensors and convert ``labels`` to a float32 tensor."""
        # The three inputs below are indexed along their first axis.
        self.embeddings_q1 = embeddings_q1
        self.embeddings_q2 = embeddings_q2
        self.features = features
        # Labels are stored as float32.
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of tensors."""
        return dict(
            emb_q1=self.embeddings_q1[idx],
            emb_q2=self.embeddings_q2[idx],
            features=self.features[idx],
            label=self.labels[idx],
        )

In [15]:
import torch.nn as nn

class LSTMClassifier(nn.Module):
    """Binary classifier over a question pair's embeddings and extra features.

    The two embeddings and the feature vector are concatenated, passed through
    a single LSTM step (sequence length 1), then dropout, a linear layer and a
    sigmoid, yielding one probability per sample.

    Args:
        embedding_dim: Width of each question embedding.
        feature_dim: Width of the extra feature vector.
        lstm_hidden_dim: Hidden size of the LSTM.
        dropout: Dropout probability applied to the LSTM output.

    The defaults (384 / 772) only suit a 384-dimensional embedding model;
    pass the real sizes of the tensors you feed in.
    """

    def __init__(self, embedding_dim = 384, feature_dim = 772, lstm_hidden_dim=128, dropout=0.3):
        super(LSTMClassifier, self).__init__()
        # The LSTM input is q1 embedding + q2 embedding + feature vector.
        input_dim = 2 * embedding_dim + feature_dim
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(lstm_hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, emb_q1, emb_q2, features):
        """Return duplicate probabilities of shape ``(batch_size,)``.

        Args:
            emb_q1: Tensor of shape ``(batch_size, embedding_dim)``.
            emb_q2: Tensor of shape ``(batch_size, embedding_dim)``.
            features: Tensor of shape ``(batch_size, feature_dim)``.
        """
        # (batch_size, 2 * embedding_dim + feature_dim)
        combined = torch.cat([emb_q1, emb_q2, features], dim=1)

        # The LSTM is batch_first, so add a sequence axis of length 1:
        # (batch_size, 1, input_dim).
        combined = combined.unsqueeze(1)

        # Hidden/cell states are not needed; only the per-step output is used.
        lstm_out, _ = self.lstm(combined)

        # Output of the last (and only) time step: (batch_size, lstm_hidden_dim).
        lstm_out = lstm_out[:, -1, :]

        out = self.dropout(lstm_out)

        # (batch_size, 1) raw scores.
        out = self.fc(out)

        # Squeeze only the trailing axis. A bare squeeze() would also drop the
        # batch axis when batch_size == 1, returning a 0-d tensor that no
        # longer matches the (1,) label tensor expected by BCELoss.
        out = self.sigmoid(out).squeeze(-1)

        return out

In [16]:
import pandas as pd
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

# One seed shared by the train/validation split and by PyTorch so that a
# Restart & Run All reproduces the same split, initial weights and shuffling.
SEED = 42

q1 = df['question1'].values
q2 = df['question2'].values
labels = df['is_duplicate'].values

# Hold out 20% of the pairs for validation.
q1_train, q1_val, q2_train, q2_val, labels_train, labels_val = train_test_split(
    q1, q2, labels, test_size=0.2, random_state=SEED
)

# Sentence embeddings and pair features for each split.
emb_q1_train, emb_q2_train, features_train = sentence_bert_model_training(q1_train, q2_train)
emb_q1_val, emb_q2_val, features_val = sentence_bert_model_training(q1_val, q2_val)

# Seed PyTorch before creating the loaders and the model: weight
# initialisation and the shuffling order of the training loader both draw
# from the global generator. (Some GPU kernels may still be non-deterministic.)
torch.manual_seed(SEED)

# Wrap the tensors in datasets.
train_dataset = SentencePairLSTMDataset(emb_q1_train, emb_q2_train, features_train, labels_train)
val_dataset = SentencePairLSTMDataset(emb_q1_val, emb_q2_val, features_val, labels_val)

# Mini-batch loaders; only the training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Size the model from the tensors actually produced above rather than from
# the class defaults.
embedding_dim = emb_q1_train.size(1)
feature_dim = features_train.size(1)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMClassifier(embedding_dim, feature_dim).to(device)

# Training setup
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Binary cross-entropy is the log loss we are looking to monitor.
criterion = nn.BCELoss()

# Training loop
def train_epoch(model, data_loader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``data_loader`` and return the mean loss.

    Args:
        model: Module called as ``model(emb_q1, emb_q2, features)``.
        data_loader: Iterable of dict batches with the keys ``'emb_q1'``,
            ``'emb_q2'``, ``'features'`` and ``'label'``.
        criterion: Loss called as ``criterion(outputs, labels)``; assumed to
            return the mean over the batch (the default for ``nn.BCELoss``).
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the batch tensors are moved to.

    Returns:
        The per-sample mean loss over the epoch. Batch losses are weighted by
        batch size so a short final batch does not skew the figure.

    Raises:
        ZeroDivisionError: If ``data_loader`` yields no samples.
    """
    model.train()
    loss_sum = 0.0
    n_seen = 0
    for batch in data_loader:
        emb_q1 = batch['emb_q1'].to(device)
        emb_q2 = batch['emb_q2'].to(device)
        features = batch['features'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        # Flatten to 1-D so a model that returns a 0-d tensor for a batch of
        # one sample still matches the (1,) label tensor.
        outputs = model(emb_q1, emb_q2, features).reshape(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        n_batch = labels.size(0)
        loss_sum += loss.item() * n_batch
        n_seen += n_batch
    return loss_sum / n_seen

def evaluate(model, data_loader, criterion, device):
    """Evaluate ``model`` on ``data_loader`` without updating its weights.

    Args:
        model: Module called as ``model(emb_q1, emb_q2, features)`` and
            returning one probability per sample.
        data_loader: Iterable of dict batches with the keys ``'emb_q1'``,
            ``'emb_q2'``, ``'features'`` and ``'label'``.
        criterion: Loss called as ``criterion(outputs, labels)``; assumed to
            return the mean over the batch (the default for ``nn.BCELoss``).
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_loss, accuracy)``. The loss is the per-sample mean (batch
        losses weighted by batch size); accuracy thresholds the outputs at 0.5.

    Raises:
        ZeroDivisionError: If ``data_loader`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in data_loader:
            emb_q1 = batch['emb_q1'].to(device)
            emb_q2 = batch['emb_q2'].to(device)
            features = batch['features'].to(device)
            labels = batch['label'].to(device)

            # Flatten to 1-D so a 0-d output for a batch of one sample still
            # lines up with the (1,) label tensor.
            outputs = model(emb_q1, emb_q2, features).reshape(-1)
            n_batch = labels.size(0)
            loss_sum += criterion(outputs, labels).item() * n_batch

            preds = (outputs > 0.5).float()
            correct += (preds == labels).sum().item()
            total += n_batch
    return loss_sum / total, correct / total

# Run the full schedule, printing train/validation metrics after each epoch.
# NOTE(review): in the recorded output below, validation loss bottoms out
# around epoch 5 and then rises while training loss keeps falling; consider
# early stopping or keeping the best checkpoint.
epochs = 15
for epoch in range(epochs):
    train_loss = train_epoch(
        model=model,
        data_loader=train_loader,
        criterion=criterion,
        optimizer=optimizer,
        device=device,
    )
    val_loss, val_acc = evaluate(
        model=model,
        data_loader=val_loader,
        criterion=criterion,
        device=device,
    )
    print(f'Epoch {epoch+1}/{epochs}')
    print(f'Train Loss (Log Loss): {train_loss:.4f}')
    print(f'Val Loss (Log Loss): {val_loss:.4f}, Val Accuracy: {val_acc:.4f}')


Processing 323429 samples in 324 batches...


Feature Extraction Batches: 100%|██████████| 324/324 [19:53<00:00,  3.68s/it]


Processing 80858 samples in 81 batches...


Feature Extraction Batches: 100%|██████████| 81/81 [05:01<00:00,  3.73s/it]


Epoch 1/15
Train Loss (Log Loss): 0.3183
Val Loss (Log Loss): 0.2977, Val Accuracy: 0.8650
Epoch 2/15
Train Loss (Log Loss): 0.2840
Val Loss (Log Loss): 0.2814, Val Accuracy: 0.8737
Epoch 3/15
Train Loss (Log Loss): 0.2634
Val Loss (Log Loss): 0.2702, Val Accuracy: 0.8800
Epoch 4/15
Train Loss (Log Loss): 0.2461
Val Loss (Log Loss): 0.2666, Val Accuracy: 0.8826
Epoch 5/15
Train Loss (Log Loss): 0.2291
Val Loss (Log Loss): 0.2613, Val Accuracy: 0.8858
Epoch 6/15
Train Loss (Log Loss): 0.2136
Val Loss (Log Loss): 0.2638, Val Accuracy: 0.8864
Epoch 7/15
Train Loss (Log Loss): 0.1977
Val Loss (Log Loss): 0.2615, Val Accuracy: 0.8879
Epoch 8/15
Train Loss (Log Loss): 0.1836
Val Loss (Log Loss): 0.2621, Val Accuracy: 0.8896
Epoch 9/15
Train Loss (Log Loss): 0.1703
Val Loss (Log Loss): 0.2708, Val Accuracy: 0.8898
Epoch 10/15
Train Loss (Log Loss): 0.1580
Val Loss (Log Loss): 0.2724, Val Accuracy: 0.8905
Epoch 11/15
Train Loss (Log Loss): 0.1457
Val Loss (Log Loss): 0.2805, Val Accuracy: 0.89