In [2]:
#%pip install --upgrade transformers datasets accelerate deepspeed
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
import datasets
import tqdm 
import numpy as np
import torchmetrics
import os
import time
import psutil
import platform
import cpuinfo
!pip install GPUtil
import GPUtil
from tabulate import tabulate
import pandas as pd

[0m

### Load Data and Model

In [3]:
qqp = datasets.load_dataset('SetFit/qqp')
print('\n')
print("Sample[0]:", qqp['train'][0])
print("Sample[3]:", qqp['train'][3])

Downloading and preparing dataset json/SetFit--qqp to /root/.cache/huggingface/datasets/json/SetFit--qqp-94258451190e12bb/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/70.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/76.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.83M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/SetFit--qqp-94258451190e12bb/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]



Sample[0]: {'text1': 'How is the life of a math student? Could you describe your own experiences?', 'text2': 'Which level of prepration is enough for the exam jlpt5?', 'label': 0, 'idx': 0, 'label_text': 'not duplicate'}
Sample[3]: {'text1': 'What can one do after MBBS?', 'text2': 'What do i do after my MBBS ?', 'label': 1, 'idx': 3, 'label_text': 'duplicate'}


In [4]:
model_name = "gchhablani/bert-base-cased-finetuned-qqp"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/890 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

### Tokenize Data

In [5]:
MAX_LENGTH = 128
def preprocess_function(examples):
    result = tokenizer(
        examples['text1'], examples['text2'],
        padding='max_length', max_length=MAX_LENGTH, truncation=True
    )
    result['label'] = examples['label']
    return result

qqp_preprocessed = qqp.map(preprocess_function, batched=True)

In [5]:
print(repr(qqp_preprocessed['train'][0]['input_ids'])[:100], "...")

[101, 1731, 1110, 1103, 1297, 1104, 170, 12523, 2377, 136, 7426, 1128, 5594, 1240, 1319, 5758, 136,  ...


### Task 1: evaluation (1 points)


In [7]:
BATCH_SIZE = 256
NUM_WORKERS = 2
DEVICE = 'cuda'
os.environ["TOKENIZERS_PARALLELISM"] = "false"

val_set = qqp_preprocessed['validation']
val_loader = torch.utils.data.DataLoader(
    val_set, batch_size=BATCH_SIZE, shuffle=False, collate_fn=transformers.default_data_collator,
    num_workers = NUM_WORKERS
)

preds = []
labels = []
with torch.no_grad():
    model.to(DEVICE)
    for batch in tqdm.tqdm(val_loader):
        prediction = model(
          input_ids=batch['input_ids'].to(DEVICE),
          attention_mask=batch['attention_mask'].to(DEVICE),
          token_type_ids=batch['token_type_ids'].to(DEVICE)
        )
        labels.extend(batch['labels'].data.numpy())
        prediction = torch.softmax(prediction.logits, dim = 1).cpu().data.numpy()
        preds.extend(prediction)
    

preds = torch.tensor(preds)
target = torch.nn.functional.one_hot(torch.tensor(labels))
accuracy = torchmetrics.functional.classification.accuracy(preds = torch.tensor(preds),
                                                target = target,
                                                task = 'binary', top_k = 1).numpy()
print("Accuracy:", accuracy)

100%|██████████| 158/158 [02:13<00:00,  1.19it/s]

Accuracy: 0.90838486



  preds = torch.tensor(preds)
  accuracy = torchmetrics.functional.classification.accuracy(preds = torch.tensor(preds),


In [8]:
assert 0.9 < accuracy < 0.91

### Task 2: train the model (5 points) -- Option B

I take **Option B** and compare the following models:
- ernie `rajiv003/ernie-finetuned-qqp`
- DeBERTa `Tomor0720/deberta-base-finetuned-qqp`
- XLNet `vkk1710/xlnet-base-cased-finetuned-qqp`

#### Hardware Setup

In [7]:
# 99% stolen from https://www.thepythoncode.com/article/get-hardware-system-information-python

def get_size(bytes, suffix="B"):
    factor = 1024
    for unit in ["", "K", "M", "G", "T", "P"]:
        if bytes < factor:
            return f"{bytes:.2f}{unit}{suffix}"
        bytes /= factor

print("="*40, "System Information", "="*40)
uname = platform.uname()
print(f"System: {uname.system}")
print(f"Version: {uname.version}")
print(f"Machine: {uname.machine}")
print(f"Processor: {uname.processor}")
print(f"Python Version: {cpuinfo.get_cpu_info()['python_version']}")

print("="*40, "CPU Info", "="*40)
print("CPU:", cpuinfo.get_cpu_info().get('brand_raw'))
print("Physical cores:", psutil.cpu_count(logical=False))
print("Total cores:", psutil.cpu_count(logical=True))

print("="*40, "Memory Information", "="*40)
svmem = psutil.virtual_memory()
print(f"Total: {get_size(svmem.total)}")
print(f"Available: {get_size(svmem.available)}")
print(f"Used: {get_size(svmem.used)}")

print("="*40, "GPU Details", "="*40)
gpus = GPUtil.getGPUs()
list_gpus = []
for gpu in gpus:
    gpu_id = gpu.id
    gpu_name = gpu.name
    gpu_load = f"{gpu.load*100}%"
    gpu_free_memory = f"{gpu.memoryFree}MB"
    gpu_used_memory = f"{gpu.memoryUsed}MB"
    gpu_total_memory = f"{gpu.memoryTotal}MB"
    gpu_temperature = f"{gpu.temperature} °C"
    gpu_uuid = gpu.uuid
    list_gpus.append((
        gpu_id, gpu_name, gpu_load, gpu_free_memory, gpu_used_memory,
        gpu_total_memory, gpu_temperature, gpu_uuid
    ))

print(tabulate(list_gpus, headers=("id", "name", "load", "free memory", "used memory", "total memory",
                                   "temperature", "uuid")))

System: Linux
Version: #1 SMP Thu Apr 27 10:55:14 UTC 2023
Machine: x86_64
Processor: x86_64
Python Version: 3.10.10.final.0 (64 bit)
CPU: Intel(R) Xeon(R) CPU @ 2.00GHz
Physical cores: 1
Total cores: 2
Total: 15.63GB
Available: 13.14GB
Used: 2.17GB
  id  name                  load    free memory    used memory    total memory    temperature    uuid
----  --------------------  ------  -------------  -------------  --------------  -------------  ----------------------------------------
   0  Tesla P100-PCIE-16GB  0.0%    16280.0MB      0.0MB          16280.0MB       34.0 °C        GPU-c50c057d-272c-96e5-bf24-5eddfc8cfa57


#### Functions to Run Benchmarks

In [7]:
def preprocess_function(examples, tokenizer):
    result = tokenizer(
        examples['text1'], examples['text2'],
        padding='max_length', max_length=MAX_LENGTH, truncation=True
    )
    result['label'] = examples['label']
    return result

def benchmark_model(model, tokenizer):
    result_dict = {}
    
    print("="*40, "Preprocessing.", "="*40)
    qqp_preprocessed = qqp.map(lambda x: preprocess_function(x, tokenizer), 
                               batched=True)
    val_set = qqp_preprocessed['validation']
    val_loader = torch.utils.data.DataLoader(
        val_set, batch_size=BATCH_SIZE, shuffle=False, collate_fn=transformers.default_data_collator,
        num_workers = NUM_WORKERS
        )
    
    print("="*40, "Running Model.", "="*40)
    start_time = time.time()
    preds = []
    labels = []
    with torch.no_grad():
        model.to(DEVICE)
        for batch in tqdm.tqdm(val_loader):
            prediction = model(
            input_ids=batch['input_ids'].to(DEVICE),
            attention_mask=batch['attention_mask'].to(DEVICE),
            token_type_ids=batch['token_type_ids'].to(DEVICE)
            )
            labels.extend(batch['labels'].data.numpy())
            prediction = torch.softmax(prediction.logits, dim = 1).cpu().data.numpy()
            preds.extend(prediction)
    result_dict['total_time'] = time.time() - start_time
    
    print("="*40, "Estimating Accuracy.", "="*40)
    preds = torch.tensor(preds)
    target = torch.nn.functional.one_hot(torch.tensor(labels))
    accuracy = torchmetrics.functional.classification.accuracy(preds = torch.tensor(preds),
                                                    target = target,
                                                    task = 'binary', top_k = 1).numpy()
    result_dict['accuracy'] = accuracy
    return result_dict

#### Hyperparameters

In [3]:
BATCH_SIZE = 256
NUM_WORKERS = 2
DEVICE = 'cuda' 
MAX_LENGTH = 128
os.environ["TOKENIZERS_PARALLELISM"] = "false"

#### Models

In [10]:
model_name_1 = "rajiv003/ernie-finetuned-qqp"
tokenizer_1 = transformers.AutoTokenizer.from_pretrained(model_name_1)
model_1 = transformers.AutoModelForSequenceClassification.from_pretrained(model_name_1)
result_1 = benchmark_model(model_1, tokenizer_1)
result_1['name'] = model_name_1
result_1['size'] = '438 Mbs'

Downloading (…)okenizer_config.json:   0%|          | 0.00/377 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/790 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]



  0%|          | 0/364 [00:00<?, ?ba/s]

  0%|          | 0/391 [00:00<?, ?ba/s]

  0%|          | 0/41 [00:00<?, ?ba/s]



100%|██████████| 158/158 [02:14<00:00,  1.17it/s]




  preds = torch.tensor(preds)
  accuracy = torchmetrics.functional.classification.accuracy(preds = torch.tensor(preds),


In [11]:
model_name_2 = "Tomor0720/deberta-base-finetuned-qqp"
tokenizer_2 = transformers.AutoTokenizer.from_pretrained(model_name_2)
model_2 = transformers.AutoModelForSequenceClassification.from_pretrained(model_name_2)
result_2 = benchmark_model(model_2, tokenizer_2)
result_2['name'] = model_name_2
result_2['size'] = '557 Mbs'

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/963 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/557M [00:00<?, ?B/s]



  0%|          | 0/364 [00:00<?, ?ba/s]

  0%|          | 0/391 [00:00<?, ?ba/s]

  0%|          | 0/41 [00:00<?, ?ba/s]



100%|██████████| 158/158 [02:50<00:00,  1.08s/it]




  accuracy = torchmetrics.functional.classification.accuracy(preds = torch.tensor(preds),


In [12]:
model_name_3 = "vkk1710/xlnet-base-cased-finetuned-qqp"
tokenizer_3 = transformers.AutoTokenizer.from_pretrained(model_name_3)
model_3 = transformers.AutoModelForSequenceClassification.from_pretrained(model_name_3)
result_3 = benchmark_model(model_3, tokenizer_3)
result_3['name'] = model_name_3
result_3['size'] = '469 Mbs'

Downloading (…)okenizer_config.json:   0%|          | 0.00/516 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/469M [00:00<?, ?B/s]



  0%|          | 0/364 [00:00<?, ?ba/s]

  0%|          | 0/391 [00:00<?, ?ba/s]

  0%|          | 0/41 [00:00<?, ?ba/s]



100%|██████████| 158/158 [03:47<00:00,  1.44s/it]




  accuracy = torchmetrics.functional.classification.accuracy(preds = torch.tensor(preds),


#### Results


In [50]:
import pandas as pd
import torchinfo

result_1['params(in Mil.)'] = torchinfo.summary(model_1).total_params/1e6
result_2['params(in Mil.)'] = torchinfo.summary(model_2).total_params/1e6
result_3['params(in Mil.)'] = torchinfo.summary(model_3).total_params/1e6

results = pd.concat([pd.DataFrame(result_1, index = [0]),
           pd.DataFrame(result_2, index = [0]),
           pd.DataFrame(result_3, index = [0])], axis = 0).reset_index(drop=True)
results.index = results['name']
del results['name']
results['batchs/sec'] = 159/results['total_time']
results = results.loc[:, ['accuracy', 'batchs/sec', 'params(in Bil.)', 'size']]
results.sort_values('accuracy', ascending=False)

Unnamed: 0_level_0,accuracy,batchs/sec,params(in Bil.),size
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
rajiv003/ernie-finetuned-qqp,0.915211,1.144554,109.485314,438 Mbs
Tomor0720/deberta-base-finetuned-qqp,0.912763,0.931496,139.193858,557 Mbs
vkk1710/xlnet-base-cased-finetuned-qqp,0.908385,0.697106,117.310466,469 Mbs


As evident from the table above, all models perform comparable to base BERT specification with XLNet having almost identical performance on QQP. The best model of 3 compared is clearly the ERNIE as strictly dominates other models both in terms of accuracy, speed and size. 

### Task 3: try the full pipeline (2 points)

Finally, it is time to use your model to find duplicate questions. Please implement a function that takes a question and finds top-5 potential duplicates in the training set. For now, it is fine if your function is slow, as long as it yields correct results.

Showcase how your function works with at least 5 examples.

I will use **ERNIE** since it performs best.

0. Running the loop honestly is very time consuming ($\approx$ 27 minutes). 
1. Instead of doing it, I cheat a little bit and **sample batches** of given size $B$ from the train questions list $n$ times.
    - Gives lower scores for matches, but actually sentences seem to be more related
2. Instead of random sampling we can make procedure more intelligent by firstly making a restricted set of potential duplicates based on the common words (**Jaccard Distance**)
    - Gives higher socres for matches, but sentences are poorly related

It turns out that **random sampling is better than using Jaccard Distance**. Moreover, obviously, as the number of sample increases, the method gets closer to fair approach.

In [24]:
MAX_LENGTH = 128
BATCH_SIZE = 256
DEVICE = 'cuda'
model_name_1 = "rajiv003/ernie-finetuned-qqp"
tokenizer_1 = transformers.AutoTokenizer.from_pretrained(model_name_1)
model_1 = transformers.AutoModelForSequenceClassification.from_pretrained(model_name_1).to(DEVICE)

In [25]:
# sample of all train questions
train_questions = []
for x in tqdm.tqdm(qqp['train']):
    train_questions.extend([x['text1'], x['text2']])
train_questions = list(set(train_questions))

100%|██████████| 363846/363846 [00:28<00:00, 12753.48it/s]


In [26]:
def process_batch(k, question, model, tokenizer):
    """
        * k -- index of batch
    """
    candidates = train_questions[k*BATCH_SIZE: (k+1)*BATCH_SIZE]
    questions = [question]*len(candidates)
    pair = tokenizer(questions, candidates,  padding='max_length', 
                         max_length=MAX_LENGTH, truncation=True)
    with torch.no_grad():
        logits = model(input_ids=torch.tensor(pair['input_ids']).to(DEVICE),
                        attention_mask=torch.tensor(pair['attention_mask']).to(DEVICE),
                        token_type_ids=torch.tensor(pair['token_type_ids']).to(DEVICE)
                     ).logits
        score = torch.softmax(logits, dim = 1).cpu().numpy()
    return score


In [34]:
import pandas as pd
import multiprocess as mp
import random

def common_words(question, candidate):
    """
    This is essentially jaccard similarity
    """
    total = len(set(question.split()).union(set(candidate.split())))
    match = len(set(question.split()).intersection(set(candidate.split())))
    return match/total

def process_question(question, model, tokenizer, 
                     method: str, # fair, sampling, restricted
                     n_samples: int = 5, # number of samples for sampling methods 
                     r_size: int = 5000): # size of restricted sample
    
    population = train_questions
    scores = []
    k_max = int(len(population)/BATCH_SIZE) + 1
    if method == 'fair':
        for k in tqdm.trange(0, k_max):
            scores_batch = process_batch(k, question, model, tokenizer)
            scores.extend(scores_batch)
    elif method == 'sampling':
        assert isinstance(n_samples, int)
        assert 0 < n_samples <= k_max
        for k in tqdm.trange(n_samples):
            sample = random.sample(population, BATCH_SIZE)
            # update population by removing sample
            population = list(set(population).difference(set(sample)))
            # estimate batch
            scores_batch = process_batch(k, question, model, tokenizer)
            scores.extend(scores_batch)
    elif method == 'restricted':
        assert isinstance(r_size, int)
        assert 1 < r_size <= len(population)
        word_corr = np.array([common_words(question, x) for x in population])
        candidates = np.array(population)[np.argsort(-word_corr)[:r_size]]
        questions = [question]*len(candidates)
        pair = tokenizer(list(questions), list(candidates),  padding='max_length', 
                         max_length=MAX_LENGTH, truncation=True)
        k_max = int(len(candidates)/BATCH_SIZE) + 1
        with torch.no_grad():
            for k in tqdm.trange(k_max):
                pair = tokenizer(list(questions)[k*BATCH_SIZE: (k+1)*BATCH_SIZE], 
                                 list(candidates)[k*BATCH_SIZE: (k+1)*BATCH_SIZE],  
                                 padding='max_length', max_length=MAX_LENGTH, truncation=True)
                logits = model(input_ids=torch.tensor(pair['input_ids']).to(DEVICE),
                                attention_mask=torch.tensor(pair['attention_mask']).to(DEVICE),
                                token_type_ids=torch.tensor(pair['token_type_ids']).to(DEVICE)
                             ).logits
                scores_batch = torch.softmax(logits, dim = 1).cpu().numpy()
                scores.extend(scores_batch)
        
    scores_positive = np.array(scores)[:, 1]
    top5_candidates = np.array(train_questions)[np.argsort(-scores_positive)[:4]]
    top5_scores = scores_positive[np.argsort(-scores_positive)[:4]]
    output = pd.DataFrame(dict(zip(top5_candidates, top5_scores)), index = [0]).T.reset_index()
    output.columns = ['Candidate', 'Score']
    return output       


In [35]:
example_ids = np.random.randint(low = 0, high = len(train_questions)-1, size = 5)
examples = np.array(train_questions)[example_ids]

**Fair Approach**

In [222]:
print("="*40, examples[0], "="*40)
process_question(examples[0], model_1, tokenizer_1, 
                 method = 'fair')



100%|██████████| 1930/1930 [28:23<00:00,  1.13it/s]


Unnamed: 0,Candidate,Score
0,What aspects of English do non-native find dif...,0.990457
1,What's the most difficult language for a nativ...,0.978208
2,What is the hardest language for a native Engl...,0.970506
3,What aspects of English do non-native find the...,0.950073


**Random Sampling Approach**

In [221]:
print("="*40, examples[0], "="*40)
process_question(examples[0], model_1, tokenizer_1, 
                 method = 'sampling', n_samples = 100)



100%|██████████| 100/100 [01:45<00:00,  1.06s/it]


Unnamed: 0,Candidate,Score
0,Is English a difficult or easy language to learn?,0.007197
1,Which language should be learned first ?,0.000598
2,What is the best way to learn spoken English w...,0.000548
3,What can I do to improve my English speaking?,0.000522


**Jaccard Distance Approach**

In [218]:
print("="*40, examples[0], "="*40)
process_question(examples[0], model_1, tokenizer_1, 
                 method = 'restricted', r_size = 25_000)



100%|██████████| 98/98 [01:35<00:00,  1.03it/s]


Unnamed: 0,Candidate,Score
0,What is substitution?,0.978208
1,How would you know if you are a loner?,0.970506
2,How can you deal with the street lights while ...,0.010496
3,How does the universe and quantum physics play...,0.009481
