In [1]:
from nltk.tokenize import word_tokenize
import contractions
import re
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize

import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv("quora.csv")
df.dropna(inplace=True)

df = df.drop(['id', 'qid1', 'qid2'], axis=1)

### Data Cleaning

Because we use contexual similarity models (BERT & Sentence-BERT embeddings), stopwords will not be removed, to preserve semantic and syntactic meaning. We can filter them out for TF-IDF

In [3]:
# stop_word = {'a', 'an', 'the', 'and', 'or', 'in', 'on', 'at', 'to', 'of', 'is', 'it'}

# # Function to remove stopwords
# def remove_stopwords(text):
#     if not isinstance(text, str):
#         return ""
#     words = word_tokenize(text.lower())
#     filtered_words = [word for word in words if word not in stop_word]
#     return ' '.join(filtered_words)

# # Apply to both questions
# df['question1'] = df['question1'].apply(remove_stopwords)
# df['question2'] = df['question2'].apply(remove_stopwords)


#Contractions handling
def expand_contractions(text):
    return contractions.fix(text)

df['question1'] = df['question1'].apply(expand_contractions)
df['question2'] = df['question2'].apply(expand_contractions)


# Clean punctuation and special characters
def remove_special_characters(text):
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return cleaned_text

df['question1'] = df['question1'].apply(remove_special_characters)
df['question2'] = df['question2'].apply(remove_special_characters)

#Remove URLs and special Characters
def remove_urls(text):
    cleaned_text = re.sub(r'http\S+', '', text)
    return cleaned_text

df['question1'] = df['question1'].apply(remove_urls)
df['question2'] = df['question2'].apply(remove_urls)


#Remove html tags func
def remove_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    cleaned_text = soup.get_text()
    return cleaned_text

df['question1'] = df['question1'].apply(remove_html_tags)
df['question2'] = df['question2'].apply(remove_html_tags)

### Common Words - Feature

In [4]:
q1 = list(df.question1)
q2 = list(df.question2)


def common_words(q1, q2):
    length = len(q1) + len(q2)
    common = q1.intersection(q2)
    common_norm = len(common) / length
    return common_norm



### Sentence-BERT (Sentence embeddings for contexual similarity)

Pretrained Sentence-BERT model (all-MiniLM-L6-v2) from HuggingFace. Maps sentences to dense 384-dimensional vector embeddings that capture semantic meaning

In [5]:
from sentence_transformers import SentenceTransformer, util
from rapidfuzz.fuzz import ratio
import torch
from tqdm import tqdm


transformer = "all-mpnet-base-v2"

def compute_fuzz_features(q1, q2):
    fuzz_scores = []
    for q1, q2 in zip(q1, q2):
        fuzz_scores.append([ratio(q1, q2) / 100])  # Normalize to [0, 1]
    return torch.tensor(fuzz_scores, dtype=torch.float32)

# def sentence_bert_model_training(q1, q2):
#     '''BERT Sentence Transformer embedding'''
#     model = SentenceTransformer(transformer)

#     # Check if GPU is available and move model to GPU
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     model = model.to(device)
    
#     # Encode all questions in batches
#     q1_embeddings = model.encode(df['question1'].tolist(), convert_to_tensor=True, batch_size=92, show_progress_bar= True)
#     q2_embeddings = model.encode(df['question2'].tolist(), convert_to_tensor=True, batch_size=92, show_progress_bar= True)

#     q1_tokenized = [set(word_tokenize(q.lower())) for q in q1]
#     q2_tokenized = [set(word_tokenize(q.lower())) for q in q2]

#     features = []
#     for emb1, emb2, q1_words, q2_words in zip(q1_embeddings, q2_embeddings, q1_tokenized, q2_tokenized):

#         # Compute cosine similarity
#         cosine_sim = util.cos_sim(emb1, emb2).item()  # float
#         cosine_sim = torch.tensor([cosine_sim], device=emb1.device)

#         # Create feature vector: [cosine_sim] + abs diff + elementwise product
#         diff = torch.abs(emb1 - emb2)
#         mult = emb1 * emb2

#         # Adding common word count normalized
#         common_word_count_norm = common_words(q1_words, q2_words)
#         common_word_count_norm = torch.tensor([common_word_count_norm], device=emb1.device, dtype=torch.float32)

#         # Length difference, normalized
#         word_count_diff = abs(len(q1_words) - len(q2_words))
#         max_word_count = max(len(q1_words), len(q2_words), 1)  # Avoiding division by zero by taking the greatest length instead
#         length_diff = word_count_diff / max_word_count
#         length_diff = torch.tensor([length_diff], device=emb1.device, dtype=torch.float32)

#         # Combine all features into one vector
#         feature_vector = torch.cat([cosine_sim, diff, mult, common_word_count_norm, length_diff])
#         features.append(feature_vector)



#     # Stacking features from list to 2D tensor
#     features_tensor = torch.stack(features)

#     # Fuzz ratio = for similarity/synonimity between pairs
#     fuzz_tensor = compute_fuzz_features(q1, q2).to(device)
#     feat = torch.cat([features_tensor, fuzz_tensor], dim=1)
    
#     return feat.cpu().numpy()


def sentence_bert_model_training(q1, q2, transformer='all-mpnet-base-v2', batch_size=1000):
    '''BERT Sentence Transformer embedding with batch processing'''
    model = SentenceTransformer(transformer)

    # Check if GPU is available and move model to GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    # Initialize lists to collect features
    all_features = []
    n_samples = len(q1)
    n_batches = int(np.ceil(n_samples / batch_size))

    print(f"Processing {n_samples} samples in {n_batches} batches...")
    for i in tqdm(range(n_batches), desc="Feature Extraction Batches"):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, n_samples)
        q1_batch = q1[start_idx:end_idx]
        q2_batch = q2[start_idx:end_idx]

        # Encode questions in the current batch
        q1_embeddings = model.encode(q1_batch, convert_to_tensor=True, batch_size=92, show_progress_bar=False)
        q2_embeddings = model.encode(q2_batch, convert_to_tensor=True, batch_size=92, show_progress_bar=False)

        # Tokenize questions for lexical features
        q1_tokenized = [set(word_tokenize(q.lower())) for q in q1_batch]
        q2_tokenized = [set(word_tokenize(q.lower())) for q in q2_batch]

        batch_features = []
        for emb1, emb2, q1_words, q2_words in zip(q1_embeddings, q2_embeddings, q1_tokenized, q2_tokenized):
            # Compute cosine similarity
            cosine_sim = util.cos_sim(emb1, emb2).item()
            cosine_sim = torch.tensor([cosine_sim], device=emb1.device, dtype=torch.float32)

            # Create feature vector: [cosine_sim] + abs diff + elementwise product
            diff = torch.abs(emb1 - emb2)
            mult = emb1 * emb2

            # Compute common words count and normalize
            common_count = common_words(q1_words, q2_words)
            union_count = len(q1_words.union(q2_words))
            common_word_count_norm = common_count / union_count if union_count > 0 else 0
            common_word_count_norm = torch.tensor([common_word_count_norm], device=emb1.device, dtype=torch.float32)

            # Compute difference of word count and normalize
            word_count_diff = abs(len(q1_words) - len(q2_words))
            max_word_count = max(len(q1_words), len(q2_words), 1)
            length_diff = word_count_diff / max_word_count
            length_diff = torch.tensor([length_diff], device=emb1.device, dtype=torch.float32)

            # Combine features into one vector
            feature_vector = torch.cat([cosine_sim, diff, mult, common_word_count_norm, length_diff])
            batch_features.append(feature_vector)

        # Stack batch features and compute fuzz features
        features_tensor = torch.stack(batch_features)
        fuzz_tensor = compute_fuzz_features(q1_batch, q2_batch).to(device)
        batch_feat = torch.cat([features_tensor, fuzz_tensor], dim=1)

        # Move to CPU and append to all_features
        all_features.append(batch_feat.cpu())

        # Clear GPU memory
        del q1_embeddings, q2_embeddings, features_tensor, fuzz_tensor, batch_feat
        torch.cuda.empty_cache()

    # Concatenate all batch features on CPU
    feat = torch.cat(all_features, dim=0)
    return feat.numpy()



In [6]:
from sklearn.model_selection import train_test_split

y = df['is_duplicate'].values  # Labels (0 or 1)

# Extract features for the entire dataset
X = sentence_bert_model_training(q1, q2)

# Split the dataset into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Processing 404287 samples in 405 batches...


Feature Extraction Batches: 100%|██████████| 405/405 [24:34<00:00,  3.64s/it]


In [None]:
from sklearn.metrics import log_loss, accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMClassifier

import scipy.stats as stats

# Reduced dataset size for hyperparameter tuning to prevent crashes
subset_size = 0.3  # Using 30% of the data for hyperparameter tuning
X_train_subset, _, y_train_subset, _ = train_test_split(
    X_train, y_train, train_size=subset_size, stratify=y_train, random_state=42
)

model = LGBMClassifier(objective='binary', metric='binary_logloss')

# Define the parameter distribution
param_dist = {
    'learning_rate': stats.uniform(0.01, 0.29),  # Range: [0.01, 0.3]
    'num_leaves': [15, 31, 63, 127],  # Powers of 2 - 1, controlling tree complexity
    'n_estimators': [50, 100, 200, 300],  # Number of trees
    'max_depth': [3, 5, 7, 9, -1],  # -1 means no limit
    'min_child_weight': [1e-3, 1e-2, 1e-1, 1, 3, 5],  # Small values for fine control
    'subsample': [0.6, 0.7, 0.8, 0.9],  # Fraction of samples
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9],  # Fraction of features
    'reg_alpha': [0, 0.01, 0.1, 1.0],  # L1 regularization
    'reg_lambda': [0, 0.01, 0.1, 1.0],  # L2 regularization
    'min_child_samples': [10, 20, 30, 50]  # Minimum samples in a leaf
}

# Setup the randomized search
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_iter=10, scoring='neg_log_loss', cv=3, verbose=1)

# Fit the randomized search
random_search.fit(X_train_subset, y_train_subset)

# Get the best parameters and model
print(f"Best Parameters: {random_search.best_params_}")
best_params = random_search.best_params_

best_model = LGBMClassifier(**best_params, eval_metric='logloss', use_label_encoder=False)



Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END colsample_bytree=0.8, learning_rate=0.03686562370169114, max_depth=7, min_child_weight=3, n_estimators=200, subsample=0.7; total time= 1.6min
[CV] END colsample_bytree=0.8, learning_rate=0.03686562370169114, max_depth=7, min_child_weight=3, n_estimators=200, subsample=0.7; total time= 1.5min
[CV] END colsample_bytree=0.8, learning_rate=0.03686562370169114, max_depth=7, min_child_weight=3, n_estimators=200, subsample=0.7; total time= 1.6min
[CV] END colsample_bytree=0.7, learning_rate=0.05823806190862759, max_depth=7, min_child_weight=3, n_estimators=50, subsample=0.8; total time=  28.5s
[CV] END colsample_bytree=0.7, learning_rate=0.05823806190862759, max_depth=7, min_child_weight=3, n_estimators=50, subsample=0.8; total time=  28.6s
[CV] END colsample_bytree=0.7, learning_rate=0.05823806190862759, max_depth=7, min_child_weight=3, n_estimators=50, subsample=0.8; total time=  28.7s
[CV] END colsample_bytree=0.7, learn

In [None]:
best_model.fit(X_train, y_train, verbose = 2)


# Predict the test set
y_pred = best_model.predict(X_test)

# Predict probabilities for log loss calculation
y_pred_proba = best_model.predict_proba(X_test)

# Compute Log Loss
loss = log_loss(y_test, y_pred_proba)
print("Log Loss:", loss)
print("\n")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


best_model.save_model("lgbm_model_01.json")

#  Load the saved model for future use
# model_loaded = XGBClassifier()
# model_loaded.load_model("xgboost_model.json")

Log Loss: 0.30713916629328697


Accuracy: 0.8607558930470701
              precision    recall  f1-score   support

           0       0.91      0.87      0.89     51026
           1       0.79      0.84      0.82     29832

    accuracy                           0.86     80858
   macro avg       0.85      0.86      0.85     80858
weighted avg       0.86      0.86      0.86     80858

