NOTE: To load embeddings faster, recommend to use T4 GPU runtime.

# Libraries and data loading

In [1]:
!pip install -q sentence_transformers rank_bm25 nltk

[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import resample

from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
nltk.download('stopwords', quiet=True)

True

In [3]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
# Load the data
df_qq = pd.read_csv("/content/drive/MyDrive/Data/quora_questions.csv")

# View
display(df_qq)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
...,...,...,...,...,...,...
404285,404285,433578,379845,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0
404286,404286,18840,155606,Do you believe there is life after death?,Is it true that there is life after death?,1
404287,404287,537928,537929,What is one coin?,What's this coin?,0
404288,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0


# Data analysis

In [None]:
print(f"Total number of rows (i.e. how many question pairs?): {len(df_qq)}\n")

print(f"Does the number of unique pairs match the number of rows?: {len(df_qq) == len(df_qq.groupby(['qid1', 'qid2']).count())}\n")

print(f"Number of unique questions: {len(set(df_qq['qid1'].unique()).union(set(df_qq['qid2'].unique())))}\n")

print(f"Number of duplicate pairs: {len(df_qq[df_qq['is_duplicate'] == 1])}")

Total number of rows (i.e. how many question pairs?): 404290

Does the number of unique pairs match the number of rows?: True

Number of unique questions: 537933

Number of duplicate pairs: 149263


# Retriever Implementation and Evaluation

The dataset is composed of sets of pairs of Qora questions with a label indicating if they are to be considered duplicates or not.

An efficient retriever shall retrieve the relevant question out of all the questions.
The retreivers should be evaluated using **at least the Hit Rate and possibly the MAPS**.

In order to evaluate the retrieving methods, you should create the following data sets:

1. A base of unique question with their corresponding ``qid``;
2. A key-value dataframe matching ``qid`` pairs to a ``is_duplicate`` label.

Considering, the number of unique questions you may subsample the dataset to a manageable size.

## 1) A base of unique questions with their corresponding ``qid``


In [None]:
# Daftaframe with questions from 'qid1'
df_1 = df_qq[['qid1', 'question1']].rename(columns={'qid1': 'qid', 'question1': 'question'})

# Dataframe with questions from 'qid2'
df_2 = df_qq[['qid2', 'question2']].rename(columns={'qid2': 'qid', 'question2': 'question'})

# Union set
df_q = pd.concat((df_1, df_2), axis=0).drop_duplicates()

# Verification
print(f"Number of unique questions: {len(df_q)}")

Number of unique questions: 537933


In [None]:
# View the dataframe
display(df_q)

Unnamed: 0,qid,question
0,1,What is the step by step guide to invest in sh...
1,3,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,5,How can I increase the speed of my internet co...
3,7,Why am I mentally very lonely? How can I solve...
4,9,"Which one dissolve in water quikly sugar, salt..."
...,...,...
404283,537925,What will the CPU upgrade to the 2016 Apple Ma...
404284,537927,What does Jainism say about Gays and Homosexua...
404287,537929,What's this coin?
404288,537931,I am having little hairfall problem but I want...


## 2) A key-value dataframe matching ``qid`` pairs to a ``is_duplicate`` label

In [None]:
df_labels = df_qq[['qid1', 'qid2', 'is_duplicate']]

In [None]:
# View dataframe
display(df_labels)

Unnamed: 0,qid1,qid2,is_duplicate
0,1,2,0
1,3,4,0
2,5,6,0
3,7,8,0
4,9,10,0
...,...,...,...
404285,433578,379845,0
404286,18840,155606,1
404287,537928,537929,0
404288,537930,537931,0


## Data subsampling

Subsample the data so that you have a smaller dataset.

1. Select a set of questions;
2. Select the queries as questions from the previous set having at least 1 positive duplicate;
3. Be sure that the ``qid`` of the selected questions are not part of the set of questions that will be indexed in the k-NN for the retriever.

In [None]:
# Randomly sample 1000 unique question IDs (qid1) from pairs that are marked as duplicates.
potential_queries_indices = resample(
    df_labels.loc[df_labels['is_duplicate'] == 1, 'qid1'].unique(),
    replace=False,
    n_samples=1000
)

# Collect the corresponding qid2 values for those sampled qid1s (only from duplicate pairs).
correct_outputs = df_labels.loc[
    (df_labels['is_duplicate'] == 1) &
    (df_labels['qid1'].isin(potential_queries_indices)),
    'qid2'
].unique()

# Define the "true positive basis" as all qid2s that are duplicates, excluding the sampled qid1s themselves.
true_positive_basis = set(correct_outputs) - set(potential_queries_indices)

# Randomly sample 10,000 "negative" examples (false cases) from all qids that are NOT in the true positive set.
false_examples = resample(
    df_q.loc[~df_q['qid'].isin(true_positive_basis), 'qid'].values,
    n_samples=10000,
    replace=False
)

# Check how many false examples were sampled.
print(f"Number of false examples that were sampled: {len(false_examples)}\n")

# Merge true positives and false examples into a single array these are the IDs to build the vector database on.
all_ids = np.concatenate((list(true_positive_basis), false_examples))

# Check total number of IDs.
print(f"Total number of IDs: {len(all_ids)}")

Number of false examples that were sampled: 10000

Total number of IDs: 11662


In [None]:
# Build dataframe with the qid1 duplicate querries
questions_with_duplicates = df_q.loc[df_q['qid'].isin(potential_queries_indices)]

# View
display(questions_with_duplicates)

Unnamed: 0,qid,question
331,661,What are the pros and cons of the German Sheph...
401,800,Why do dreams look so real?
722,1440,How will I contact a good hacker?
919,1833,What are some really good and famous romantic ...
1007,2009,What are the chances of ww3?
...,...,...
402365,535838,When I told best friend how I felt she cried a...
402876,536419,Which are the best songs that have featured on...
403914,537533,Which programing language(s) is(are) widely us...
404073,537708,Are Pisces attracted to Scorpio?


In [None]:
# Build dataframe with the ids of true and false positives
df_chunks = df_q.loc[df_q['qid'].isin(all_ids)]

# View
display(df_chunks)

Unnamed: 0,qid,question
75,151,If I fire a bullet backward from an aircraft g...
135,271,Who are the Rohingya Muslims?
159,319,Why nobody answer my questions in Quora?
318,636,How can I become a billionaire?
359,716,How racist is too racist?
...,...,...
404073,537709,Are Pisces equally attracted to Scorpio?
404137,537785,How can I get the notes for Kannada literature...
404186,537838,Which is the best deals site for online shopping?
404268,537910,Should I raise my young child on 80's music?


# Evaluation functions

Define the relevant functions:

1. Match index from questions to labels (by default, if the index pair is not in the labelling table, consider that the the pair is not a duplicate);
2. Compute the metrics;
3. Evaluation loop over the relevant questions.

For hit, average precision and MAPS, info [here](https://www.evidentlyai.com/ranking-metrics/mean-average-precision-map).

In [None]:
# Compute Hit
def hits(list_labels):
    return 1 if 1 in list_labels else 0

In [None]:
# Compute average precision
def ap(list_labels):
    if sum(list_labels) == 0:
        return 0
    precision_at_k = [sum(list_labels[0:n+1])/(n+1) for n in range(0, len(list_labels))]
    return 1/sum(list_labels)*sum([p*l for p, l in zip(precision_at_k, list_labels)])

In [None]:
# Function that computes all metrics
def compute_metrics(list_labels):
    hits_list = [hits(l) for l in list_labels]
    ap_list = [ap(l) for l in list_labels]
    return {'hit_rate': sum(hits_list)/len(hits_list), 'map': sum(ap_list)/len(ap_list)}

In [None]:
# Dummy example to test
test_output = [1,0,0,1,1,0]

print(f"Test hits function (expected result = 1): {hits(test_output)}")
print(f"Test average precision funciton (expected result = 0.7): {ap(test_output)}")

Test hits function (expected result = 1): 1
Test average precision funciton (expected result = 0.7): 0.7


# Retriever implementation (First pipeline)

In this first Pipeline we are going to use a simple sklearn pipeline for seamntic search.

- Use the `sentence_transformers` package to load a model and embed the reviews;
- [`NearestNeighbors`](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html#sklearn.neighbors.NearestNeighbors) class for the search engine.

In [None]:
import logging

# Suppress HF logging
logging.getLogger("sentence_transformers").setLevel(logging.ERROR)
logging.getLogger("transformers").setLevel(logging.ERROR)

## Load model and fit KNN

In [None]:
# Load sentence_transfromers model
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cpu


In [None]:
# Compute embeddings for df_chunks and save
df_chunks = df_chunks.reset_index(drop=True)

# Encode the 'question' column into embeddings using SBERT
embeddings = sbert_model.encode(
    df_chunks['question'].tolist(),
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
    device=device
)

# Store embeddings back into the dataframe as a new column
df_chunks['embeddings'] = embeddings.tolist()

Batches:   0%|          | 0/183 [00:00<?, ?it/s]

In [None]:
display(df_chunks)

Unnamed: 0,qid,question,embeddings
0,151,If I fire a bullet backward from an aircraft g...,"[0.05991332605481148, 0.026371775195002556, -0..."
1,271,Who are the Rohingya Muslims?,"[0.02800741232931614, 0.07244265824556351, -0...."
2,319,Why nobody answer my questions in Quora?,"[0.0285966657102108, -0.061735786497592926, -0..."
3,636,How can I become a billionaire?,"[0.07048819959163666, 0.004598948173224926, -0..."
4,716,How racist is too racist?,"[-0.010313782840967178, 0.07725974172353745, -..."
...,...,...,...
11657,537709,Are Pisces equally attracted to Scorpio?,"[-0.0333033986389637, -0.03911645710468292, -0..."
11658,537785,How can I get the notes for Kannada literature...,"[-0.12579362094402313, -0.011148395948112011, ..."
11659,537838,Which is the best deals site for online shopping?,"[-0.03581434115767479, -0.05307937413454056, -..."
11660,537910,Should I raise my young child on 80's music?,"[0.056499216705560684, 0.055709559470415115, 0..."


In [None]:
# Fit the KNN only once
nbrs = NearestNeighbors(n_neighbors=5, algorithm='brute').fit(embeddings)

## `predict_duplicates` function and testing

In [None]:
def predict_duplicates(query, model, df, knn):
    """
    Function to predict the duplicated questions based on a sentence transformer model

    Args:
    - query: reference question
    - model: sentence transformer model
    - df: dataframe containing the questions and their embeddings
    - knn: k nearest neighbors model

    Returns:
    - df: dataframe containing the 5 most similar questions
    """
    # Get the embedding of the querry
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Find the 5 nearest neighbors
    distances, indices = knn.kneighbors(query_embedding.cpu().numpy().reshape(1, -1))

    # Return the lines of df_chunks associated to
    return df.iloc[indices[0]]

In [None]:
# Test the code on a random question
qq = questions_with_duplicates.sample(n=1).iloc[0]
print(f"Querry: {qq['question']} (ID: {qq['qid']})")

Querry: Why do people want to get out of jury duty? (ID: 442184)


In [None]:
# Find this random question's duplicate in qid2
dupe_id = df_labels.loc[(df_labels['is_duplicate'] == 1) & (df_labels['qid1'] == qq.qid), 'qid2'].iloc[0]  # type: ignore

# Locate it in df_chunks
dupe_in_df_chunks = df_chunks[df_chunks['qid'] == dupe_id].iloc[0]

# Print dupe question
print(f"Duplicate: {dupe_in_df_chunks['question']} (ID: {dupe_in_df_chunks['qid']})")

Duplicate: Why don't people like jury duty? (ID: 242867)


In [None]:
# Predict and verify we get the same result
predict_duplicates(qq['question'], sbert_model, df_chunks, nbrs)

Unnamed: 0,qid,question,embeddings
2796,242867,Why don't people like jury duty?,"[0.0028246294241398573, 0.004321648273617029, ..."
5467,474472,Why do people run from their responsibilities?,"[0.004954342730343342, 0.0318809449672699, 0.0..."
7328,123561,Why did you go to prison?,"[0.004907694645226002, 0.08137308061122894, -0..."
11375,513356,What are the 12 steps in a criminal trial?,"[-0.0006825905875302851, 0.12014854699373245, ..."
6714,60440,Why are mail order brides legal?,"[-0.0419638454914093, 0.06471752375364304, 0.0..."


## `get_labels` funciton

In [None]:
def get_label(qid1, qid2, df_labels):
    result = df_labels.loc[(df_labels['qid1'] == qid1) & (df_labels['qid2'] == qid2), 'is_duplicate']
    if result.empty:
        # If the pair is not found in df_labels, assume it's not a duplicate
        return 0
    else:
        return result.iloc[0]

In [None]:
# Test the function with the predictions
preds = predict_duplicates(qq['question'], sbert_model, df_chunks, nbrs)
labels = [get_label(qq['qid'], r['qid'], df_labels) for _, r in preds.iterrows()]  # type: ignore

print(f"Labels: {labels}")

Labels: [np.int64(1), 0, 0, 0, 0]


In [None]:
# Test evaluation
print(f"Test hits function with previous predictions: {hits(labels)}")
print(f"Test average precision funciton with previous predictions: {ap(labels)}")

Test hits function with previous predictions: 1
Test average precision funciton with previous predictions: 1.0


## Evaluation

In [None]:
def evaluate_queries(queries, model, df, knn):
    all_labels = []

    for id, question in tqdm(queries.iterrows()):
        query = question['question']
        results = predict_duplicates(query, model, df, knn)
        labels = [get_label(question['qid'], r['qid'], df_labels) for _, r in results.iterrows()]
        all_labels.append(labels)

    return all_labels

In [None]:
test_labels = evaluate_queries(questions_with_duplicates.iloc[:500], sbert_model, df_chunks, nbrs)

0it [00:00, ?it/s]

In [None]:
hit, map = compute_metrics(test_labels).values()

print(f"Hit rate: {hit}")
print(f"Mean average precision: {map}")

Hit rate: 0.998
Mean average precision: 0.9602472222222219


# Second pipleine (BM25)

After using distance based searchs methods, we are going to work with the BM25 algorithm. It was developped in late 70s but it is still very relevant. Some preprocessing of the text will be needed. A classical pipeline will be implemented.

Create a preprocessing function that follows these steps:

1. Stopwords removal;
2. Stemming.

In [None]:
def pre(string):
    stemmer = SnowballStemmer('english')
    stop_words = set(stopwords.words('english'))

    # Stem string
    string = ' '.join([stemmer.stem(word) for word in string.split()])

    # Remove stopwords
    string = ' '.join([word for word in string.split() if word not in stop_words])

    return string

In [None]:
# Test preprocessing function
df_pre = df_chunks['question'].apply(lambda x: pre(x))
display(pd.DataFrame({
    "before_proc": df_chunks["question"].values,
    "after_proc": df_pre.values
}))

Unnamed: 0,before_proc,after_proc
0,If I fire a bullet backward from an aircraft g...,fire bullet backward aircraft go faster bullet...
1,Who are the Rohingya Muslims?,rohingya muslims?
2,Why nobody answer my questions in Quora?,whi nobodi answer question quora?
3,How can I become a billionaire?,becom billionaire?
4,How racist is too racist?,racist racist?
...,...,...
11657,Are Pisces equally attracted to Scorpio?,pisc equal attract scorpio?
11658,How can I get the notes for Kannada literature...,get note kannada literatur upsc main exam?
11659,Which is the best deals site for online shopping?,best deal site onlin shopping?
11660,Should I raise my young child on 80's music?,rais young child 80 music?


## BM25 engine

Preprocess the reference data and index them with the [BM25Okapi](https://github.com/dorianbrown/rank_bm25) class.

In [None]:
# Create the BM25 Okapi engine
tokenized_corpus = df_pre.apply(lambda x: x.split())
bm25_model = BM25Okapi(tokenized_corpus)

## Queries


In [None]:
def predict_duplicates_bm25(query, df, model):
    # Preprocess the query
    preprocessed_query = pre(query).split()

    # Find top-n matches
    best_matches = model.get_top_n(preprocessed_query, df.index, n=5)

    # Return the row of the matches
    return df.loc[best_matches]

In [None]:
qq = questions_with_duplicates.sample(n=1).iloc[0]
print(f"Querry: {qq['question']} (ID: {qq['qid']})")

# Test predict_duplicates_bm25
matches = predict_duplicates_bm25(qq['question'], df_pre, bm25_model)
print(matches)

Querry: What are negative impact of demonetization? (ID: 234481)
8385              negat impact demonetization?
4821     rbi lost credibl post demonetization?
808                     physics, negat energy?
11466                       negat consequ tpp?
8951              whi negat freedom important?
Name: question, dtype: object


In [None]:
def evaluate_queries_bm25(queries, model, df):
    all_labels = []

    for id, question in tqdm(queries.iterrows()):
        query = question['question']
        results = predict_duplicates_bm25(query, df, model)
        labels = [get_label(question['qid'], r['qid'], df_labels) for _, r in results.iterrows()]
        all_labels.append(labels)

    return all_labels

In [None]:
# Evaluate and compute metrics
test_labels_bm25 = evaluate_queries_bm25(questions_with_duplicates.iloc[:500], bm25_model, df_chunks)

0it [00:00, ?it/s]

In [None]:
hit, map = compute_metrics(test_labels_bm25).values()

print(f"Hit rate: {hit}")
print(f"Mean average precision: {map}")

Hit rate: 0.956
Mean average precision: 0.8625638888888888


# Conclusion

Explain the results:

- How does the different methods compare?
- Can you propose an approach using metadata such as themes, entities, authors?
- How would you implement it?