In [1]:
import glob
import json
import torch
import flash_attn
import pandas as pd
from tqdm.notebook import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report, \
                            roc_auc_score, confusion_matrix

# short tag for the model under test; it becomes part of every output file name
model_used = 'bge-v2-m3'

In [2]:
# Load the tokenizer and the reranker (sequence-classification model) from the
# same Hugging Face checkpoint; the weights are requested in bfloat16.
checkpoint = "BAAI/bge-reranker-v2-m3"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
reranker_model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, torch_dtype=torch.bfloat16, use_cache=False
)

In [3]:
# push model to the GPU!
# .eval() puts the module in evaluation mode; the result is bound to `_`,
# presumably so the cell does not display the model repr
_ = reranker_model.cuda().eval()

In [4]:
!nvidia-smi

Sun Apr 21 11:44:33 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          On  | 00000001:00:00.0 Off |                    0 |
| N/A   34C    P0              62W / 300W |   2616MiB / 81920MiB |     42%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                         

### Inference & Evaluation

In [5]:
# discover the reformatted test sets; sorted so the listing order is stable
pattern = 'test_datasets/test-reformatted-*.jsonl'
files = sorted(glob.glob(pattern))
files

['test_datasets/test-reformatted-common-crawl-qa.jsonl',
 'test_datasets/test-reformatted-facebook.jsonl',
 'test_datasets/test-reformatted-hansard-qa.jsonl',
 'test_datasets/test-reformatted-iium-confession.jsonl',
 'test_datasets/test-reformatted-mining-b-cari-com-my.jsonl',
 'test_datasets/test-reformatted-mining-summarization.jsonl',
 'test_datasets/test-reformatted-news.jsonl',
 'test_datasets/test-reformatted-twitter.jsonl',
 'test_datasets/test-reformatted-wikipedia-qa.jsonl']

#### 1.0 Common Crawl QA

In [6]:
# Load the Common Crawl QA test set (JSON Lines) and tag every row with its source
f = "test_datasets/test-reformatted-common-crawl-qa.jsonl"
# dataset name = what follows 'reformatted-' in the path, up to the first '.'
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (20949, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Bagaimana cara penghantaran dan kos penghantar...,Ladam merupakan barang buatan yang lazimnya di...,0,test,common-crawl-qa
1,Apakah jenama bateri kereta yang disediakan ol...,Komik adalah sejenis seni visual yang menggabu...,0,test,common-crawl-qa
2,Apakah kawasan liputan untuk servis hantar bat...,"untuk memecat Lawton, meninggalkan sekali lagi...",0,test,common-crawl-qa


In [7]:
# Run the reranker over every (query, text) pair, `batch_size` pairs at a time.
# `scores` collects one hard 0/1 prediction per row (in row order); the raw
# logits are also kept in `raw_scores` so threshold-free metrics can be
# computed from continuous scores later.
scores = []
raw_scores = []
batch_size = 10

for n in tqdm(range(0, len(test_dataset), batch_size)):
    batch = test_dataset.iloc[n: n + batch_size]
    # Tokenize, truncate and pad the whole batch in a single tokenizer call; with
    # a fast tokenizer this is faster than encode_plus() per pair followed by pad().
    # max_length=8192 is the checkpoint's model_max_length:
    # https://huggingface.co/BAAI/bge-reranker-v2-m3/blob/main/tokenizer_config.json#:~:text=%22model_max_length%22%3A%208192%2C
    padded = tokenizer(
        batch['query'].tolist(),
        batch['text'].tolist(),
        max_length=8192,
        truncation=True,
        padding=True,
        return_tensors='pt',
    ).to('cuda')
    with torch.no_grad():
        # flatten the logits to one value per pair; cast from bfloat16 to float32
        logits = reranker_model(**padded).logits.view(-1).float().cpu().tolist()
    raw_scores.extend(logits)
    # logit > 0 is equivalent to sigmoid(logit) > 0.5, i.e. predicted class 1
    scores.extend([1 if logit > 0 else 0 for logit in logits])

  0%|          | 0/2095 [00:00<?, ?it/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [8]:
# Score the hard 0/1 predictions in `scores` against the gold labels.
# NOTE(review): `scores` contains thresholded labels, not continuous scores, so
# the ROC-AUC below is computed from hard predictions; in that binary case it
# equals the mean of the two per-class recalls (the macro-average recall).
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
for metric_name, metric_value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(metric_name, metric_value)

print("\nClassification Report:")
report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
print(report)

print("\nConfusion Matrix:")
matrix = confusion_matrix(true_labels, scores)
print(matrix)

Accuracy: 0.7861473101341353
ROC-AUC: 0.7861371013939278

Classification Report:
              precision    recall  f1-score   support

     class 0    0.70043   1.00000   0.82383     10475
     class 1    1.00000   0.57227   0.72796     10474

    accuracy                        0.78615     20949
   macro avg    0.85022   0.78614   0.77589     20949
weighted avg    0.85021   0.78615   0.77590     20949


Confusion Matrix:
[[10475     0]
 [ 4480  5994]]


In [9]:
# put the 0/1 predictions next to the original rows (joined on the row index)
predictions = pd.DataFrame({'predicted_label': scores})
inferenced_df = pd.concat([test_dataset, predictions], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Bagaimana cara penghantaran dan kos penghantar...,Ladam merupakan barang buatan yang lazimnya di...,0,test,common-crawl-qa,0
1,Apakah jenama bateri kereta yang disediakan ol...,Komik adalah sejenis seni visual yang menggabu...,0,test,common-crawl-qa,0
2,Apakah kawasan liputan untuk servis hantar bat...,"untuk memecat Lawton, meninggalkan sekali lagi...",0,test,common-crawl-qa,0


In [10]:
# Export the predictions as JSON Lines (one record per line). The target path is
# bound to `file_output` and printed so the output location shows up in the notebook.
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)

inferenced_df.to_json(
    file_output,
    orient='records',
    lines=True
)

#### 2.0 Facebook

In [11]:
# Load the Facebook test set (JSON Lines) and tag every row with its source
f = "test_datasets/test-reformatted-facebook.jsonl"
# dataset name = what follows 'reformatted-' in the path, up to the first '.'
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (45000, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Sekapur sirih road dah,Abi Ahmad Furqan memang Barisan Nasional dah l...,0,test,facebook
1,Menangkan calon pakatan harapan di segi besar,Ismail sabri dan muhiadin,0,test,facebook
2,Norlelawati Sukiman amin pra,25k terbaik,0,test,facebook


In [12]:
# Run the reranker over every (query, text) pair, `batch_size` pairs at a time.
# `scores` collects one hard 0/1 prediction per row (in row order); the raw
# logits are also kept in `raw_scores` so threshold-free metrics can be
# computed from continuous scores later.
scores = []
raw_scores = []
batch_size = 10

for n in tqdm(range(0, len(test_dataset), batch_size)):
    batch = test_dataset.iloc[n: n + batch_size]
    # Tokenize, truncate and pad the whole batch in a single tokenizer call; with
    # a fast tokenizer this is faster than encode_plus() per pair followed by pad().
    padded = tokenizer(
        batch['query'].tolist(),
        batch['text'].tolist(),
        max_length=8192,
        truncation=True,
        padding=True,
        return_tensors='pt',
    ).to('cuda')
    with torch.no_grad():
        # flatten the logits to one value per pair; cast to float32 before export
        logits = reranker_model(**padded).logits.view(-1).float().cpu().tolist()
    raw_scores.extend(logits)
    # logit > 0 is equivalent to sigmoid(logit) > 0.5, i.e. predicted class 1
    scores.extend([1 if logit > 0 else 0 for logit in logits])

  0%|          | 0/4500 [00:00<?, ?it/s]

In [13]:
# Score the hard 0/1 predictions in `scores` against the gold labels.
# NOTE(review): `scores` contains thresholded labels, not continuous scores, so
# the ROC-AUC below is computed from hard predictions; in that binary case it
# equals the mean of the two per-class recalls (the macro-average recall).
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
for metric_name, metric_value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(metric_name, metric_value)

print("\nClassification Report:")
report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
print(report)

print("\nConfusion Matrix:")
matrix = confusion_matrix(true_labels, scores)
print(matrix)

Accuracy: 0.9551333333333333
ROC-AUC: 0.9515461351179084

Classification Report:
              precision    recall  f1-score   support

     class 0    0.92388   0.99888   0.95992     24205
     class 1    0.99857   0.90421   0.94905     20795

    accuracy                        0.95513     45000
   macro avg    0.96122   0.95155   0.95448     45000
weighted avg    0.95839   0.95513   0.95490     45000


Confusion Matrix:
[[24178    27]
 [ 1992 18803]]


In [14]:
# put the 0/1 predictions next to the original rows (joined on the row index)
predictions = pd.DataFrame({'predicted_label': scores})
inferenced_df = pd.concat([test_dataset, predictions], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Sekapur sirih road dah,Abi Ahmad Furqan memang Barisan Nasional dah l...,0,test,facebook,0
1,Menangkan calon pakatan harapan di segi besar,Ismail sabri dan muhiadin,0,test,facebook,0
2,Norlelawati Sukiman amin pra,25k terbaik,0,test,facebook,0


In [15]:
# Export the predictions as JSON Lines (one record per line); the path is
# printed so the output location shows up in the notebook.
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)
inferenced_df.to_json(path_or_buf=file_output, orient='records', lines=True)

inferenced_output/bge-v2-m3-inferenced-facebook.jsonl


#### 3.0 Hansard QA

In [16]:
# Load the Hansard QA test set (JSON Lines) and tag every row with its source
f = "test_datasets/test-reformatted-hansard-qa.jsonl"
# dataset name = what follows 'reformatted-' in the path, up to the first '.'
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (12712, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Siapakah wakil rakyat untuk kawasan Kota Belud?,\nii DR.07.12.2021 \n\n \n58. YB. Dr. Kelvin Y...,1,test,hansard-qa
1,Berapa jumlah unit yang telah dibina oleh PR1M...,from this seller User Review Submit A Review O...,0,test,hansard-qa
2,Apakah langkah kerajaan yang hendak dikenakan ...,\n8 ...,1,test,hansard-qa


In [17]:
# Run the reranker over every (query, text) pair, `batch_size` pairs at a time.
# `scores` collects one hard 0/1 prediction per row (in row order); the raw
# logits are also kept in `raw_scores` so threshold-free metrics can be
# computed from continuous scores later.
scores = []
raw_scores = []
batch_size = 10

for n in tqdm(range(0, len(test_dataset), batch_size)):
    batch = test_dataset.iloc[n: n + batch_size]
    # Tokenize, truncate and pad the whole batch in a single tokenizer call; with
    # a fast tokenizer this is faster than encode_plus() per pair followed by pad().
    padded = tokenizer(
        batch['query'].tolist(),
        batch['text'].tolist(),
        max_length=8192,
        truncation=True,
        padding=True,
        return_tensors='pt',
    ).to('cuda')
    with torch.no_grad():
        # flatten the logits to one value per pair; cast to float32 before export
        logits = reranker_model(**padded).logits.view(-1).float().cpu().tolist()
    raw_scores.extend(logits)
    # logit > 0 is equivalent to sigmoid(logit) > 0.5, i.e. predicted class 1
    scores.extend([1 if logit > 0 else 0 for logit in logits])

  0%|          | 0/1272 [00:00<?, ?it/s]

In [18]:
# Score the hard 0/1 predictions in `scores` against the gold labels.
# NOTE(review): `scores` contains thresholded labels, not continuous scores, so
# the ROC-AUC below is computed from hard predictions; in that binary case it
# equals the mean of the two per-class recalls (the macro-average recall).
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
for metric_name, metric_value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(metric_name, metric_value)

print("\nClassification Report:")
report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
print(report)

print("\nConfusion Matrix:")
matrix = confusion_matrix(true_labels, scores)
print(matrix)

Accuracy: 0.9789962240402769
ROC-AUC: 0.978996224040277

Classification Report:
              precision    recall  f1-score   support

     class 0    0.95969   1.00000   0.97943      6356
     class 1    1.00000   0.95799   0.97855      6356

    accuracy                        0.97900     12712
   macro avg    0.97984   0.97900   0.97899     12712
weighted avg    0.97984   0.97900   0.97899     12712


Confusion Matrix:
[[6356    0]
 [ 267 6089]]


In [19]:
# put the 0/1 predictions next to the original rows (joined on the row index)
predictions = pd.DataFrame({'predicted_label': scores})
inferenced_df = pd.concat([test_dataset, predictions], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Siapakah wakil rakyat untuk kawasan Kota Belud?,\nii DR.07.12.2021 \n\n \n58. YB. Dr. Kelvin Y...,1,test,hansard-qa,1
1,Berapa jumlah unit yang telah dibina oleh PR1M...,from this seller User Review Submit A Review O...,0,test,hansard-qa,0
2,Apakah langkah kerajaan yang hendak dikenakan ...,\n8 ...,1,test,hansard-qa,1


In [20]:
# Export the predictions as JSON Lines (one record per line); the path is
# printed so the output location shows up in the notebook.
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)
inferenced_df.to_json(path_or_buf=file_output, orient='records', lines=True)

inferenced_output/bge-v2-m3-inferenced-hansard-qa.jsonl


#### 4.0 IIUM Confession

In [21]:
# Load the IIUM Confession test set (JSON Lines) and tag every row with its source
f = "test_datasets/test-reformatted-iium-confession.jsonl"
# dataset name = what follows 'reformatted-' in the path, up to the first '.'
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (45000, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,"Means, dia berkurung dalam bilik, tarik muka d...","""usia 20 an macam ni,memang kita perlukan sora...",0,test,iium-confession
1,Saya selalu follow seorang perunding imej feme...,Aku sentiasa ikut seorang consultant image yan...,1,test,iium-confession
2,Dan aku juga kasihan akan diriku sendiri. Di r...,And I also pity myself. In that house I was st...,1,test,iium-confession


In [22]:
# Run the reranker over every (query, text) pair, `batch_size` pairs at a time.
# `scores` collects one hard 0/1 prediction per row (in row order); the raw
# logits are also kept in `raw_scores` so threshold-free metrics can be
# computed from continuous scores later.
scores = []
raw_scores = []
batch_size = 10

for n in tqdm(range(0, len(test_dataset), batch_size)):
    batch = test_dataset.iloc[n: n + batch_size]
    # Tokenize, truncate and pad the whole batch in a single tokenizer call; with
    # a fast tokenizer this is faster than encode_plus() per pair followed by pad().
    padded = tokenizer(
        batch['query'].tolist(),
        batch['text'].tolist(),
        max_length=8192,
        truncation=True,
        padding=True,
        return_tensors='pt',
    ).to('cuda')
    with torch.no_grad():
        # flatten the logits to one value per pair; cast to float32 before export
        logits = reranker_model(**padded).logits.view(-1).float().cpu().tolist()
    raw_scores.extend(logits)
    # logit > 0 is equivalent to sigmoid(logit) > 0.5, i.e. predicted class 1
    scores.extend([1 if logit > 0 else 0 for logit in logits])

  0%|          | 0/4500 [00:00<?, ?it/s]

In [23]:
# Score the hard 0/1 predictions in `scores` against the gold labels.
# NOTE(review): `scores` contains thresholded labels, not continuous scores, so
# the ROC-AUC below is computed from hard predictions; in that binary case it
# equals the mean of the two per-class recalls (the macro-average recall).
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
for metric_name, metric_value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(metric_name, metric_value)

print("\nClassification Report:")
report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
print(report)

print("\nConfusion Matrix:")
matrix = confusion_matrix(true_labels, scores)
print(matrix)

Accuracy: 0.9987555555555555
ROC-AUC: 0.9988962618752713

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99708   0.99936   0.99822     15743
     class 1    0.99966   0.99843   0.99904     29257

    accuracy                        0.99876     45000
   macro avg    0.99837   0.99890   0.99863     45000
weighted avg    0.99876   0.99876   0.99876     45000


Confusion Matrix:
[[15733    10]
 [   46 29211]]


In [24]:
# put the 0/1 predictions next to the original rows (joined on the row index)
predictions = pd.DataFrame({'predicted_label': scores})
inferenced_df = pd.concat([test_dataset, predictions], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,"Means, dia berkurung dalam bilik, tarik muka d...","""usia 20 an macam ni,memang kita perlukan sora...",0,test,iium-confession,0
1,Saya selalu follow seorang perunding imej feme...,Aku sentiasa ikut seorang consultant image yan...,1,test,iium-confession,1
2,Dan aku juga kasihan akan diriku sendiri. Di r...,And I also pity myself. In that house I was st...,1,test,iium-confession,1


In [25]:
# Export the predictions as JSON Lines (one record per line); the path is
# printed so the output location shows up in the notebook.
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)
inferenced_df.to_json(path_or_buf=file_output, orient='records', lines=True)

inferenced_output/bge-v2-m3-inferenced-iium-confession.jsonl


#### 5.0 B Cari

In [26]:
# Load the B Cari test set (JSON Lines) and tag every row with its source
f = "test_datasets/test-reformatted-mining-b-cari-com-my.jsonl"
# dataset name = what follows 'reformatted-' in the path, up to the first '.'
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (44910, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Hello. I am a student. Every time I need to wr...,Hello. Im a student. If I need to write an ess...,1,test,mining-b-cari-com-my
1,betul ke boleh tapis? dan filter dalam tu mest...,Is it true that it can be filtered? And the fi...,1,test,mining-b-cari-com-my
2,Jangan dia main campak masuk tong sampah bila ...,mmmeeeeowwwwwwwwwwwwwwww,0,test,mining-b-cari-com-my


In [27]:
# Run the reranker over every (query, text) pair, `batch_size` pairs at a time.
# `scores` collects one hard 0/1 prediction per row (in row order); the raw
# logits are also kept in `raw_scores` so threshold-free metrics can be
# computed from continuous scores later.
scores = []
raw_scores = []
batch_size = 10

for n in tqdm(range(0, len(test_dataset), batch_size)):
    batch = test_dataset.iloc[n: n + batch_size]
    # Tokenize, truncate and pad the whole batch in a single tokenizer call; with
    # a fast tokenizer this is faster than encode_plus() per pair followed by pad().
    padded = tokenizer(
        batch['query'].tolist(),
        batch['text'].tolist(),
        max_length=8192,
        truncation=True,
        padding=True,
        return_tensors='pt',
    ).to('cuda')
    with torch.no_grad():
        # flatten the logits to one value per pair; cast to float32 before export
        logits = reranker_model(**padded).logits.view(-1).float().cpu().tolist()
    raw_scores.extend(logits)
    # logit > 0 is equivalent to sigmoid(logit) > 0.5, i.e. predicted class 1
    scores.extend([1 if logit > 0 else 0 for logit in logits])

  0%|          | 0/4491 [00:00<?, ?it/s]

In [28]:
# Score the hard 0/1 predictions in `scores` against the gold labels.
# NOTE(review): `scores` contains thresholded labels, not continuous scores, so
# the ROC-AUC below is computed from hard predictions; in that binary case it
# equals the mean of the two per-class recalls (the macro-average recall).
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
for metric_name, metric_value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(metric_name, metric_value)

print("\nClassification Report:")
report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
print(report)

print("\nConfusion Matrix:")
matrix = confusion_matrix(true_labels, scores)
print(matrix)

Accuracy: 0.9515920730349589
ROC-AUC: 0.9548284894989513

Classification Report:
              precision    recall  f1-score   support

     class 0    0.90848   0.99491   0.94973     20642
     class 1    0.99529   0.91474   0.95332     24268

    accuracy                        0.95159     44910
   macro avg    0.95188   0.95483   0.95153     44910
weighted avg    0.95539   0.95159   0.95167     44910


Confusion Matrix:
[[20537   105]
 [ 2069 22199]]


In [29]:
# put the 0/1 predictions next to the original rows (joined on the row index)
predictions = pd.DataFrame({'predicted_label': scores})
inferenced_df = pd.concat([test_dataset, predictions], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Hello. I am a student. Every time I need to wr...,Hello. Im a student. If I need to write an ess...,1,test,mining-b-cari-com-my,1
1,betul ke boleh tapis? dan filter dalam tu mest...,Is it true that it can be filtered? And the fi...,1,test,mining-b-cari-com-my,1
2,Jangan dia main campak masuk tong sampah bila ...,mmmeeeeowwwwwwwwwwwwwwww,0,test,mining-b-cari-com-my,0


In [30]:
# Export the predictions as JSON Lines (one record per line); the path is
# printed so the output location shows up in the notebook.
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)
inferenced_df.to_json(path_or_buf=file_output, orient='records', lines=True)

inferenced_output/bge-v2-m3-inferenced-mining-b-cari-com-my.jsonl


#### 6.0 Summarization

In [13]:
# Load the Summarization test set (JSON Lines) and tag every row with its source
f = "test_datasets/test-reformatted-mining-summarization.jsonl"
# dataset name = what follows 'reformatted-' in the path, up to the first '.'
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (44123, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Sektor pertanian dijangka pulih pada 2019 deng...,KUALA LUMPUR: Sektor pertanian dijangka pulih ...,1,test,mining-summarization
1,"Malaysia Today, a Malaysian news site, has bee...","Tolong lah gais, aku nasihatkan tolong kunci p...",0,test,mining-summarization
2,"In the Malaysian General Election (GE15), no p...",PUTRAJAYA: Tiada mana-mana parti memperoleh 50...,1,test,mining-summarization


In [14]:
# Run the reranker over every (query, text) pair, `batch_size` pairs at a time.
# `scores` collects one hard 0/1 prediction per row (in row order); the raw
# logits are also kept in `raw_scores` so threshold-free metrics can be
# computed from continuous scores later.
scores = []
raw_scores = []
batch_size = 5

for n in tqdm(range(0, len(test_dataset), batch_size)):
    batch = test_dataset.iloc[n: n + batch_size]
    # Tokenize, truncate and pad the whole batch in a single tokenizer call; with
    # a fast tokenizer this is faster than encode_plus() per pair followed by pad().
    padded = tokenizer(
        batch['query'].tolist(),
        batch['text'].tolist(),
        max_length=8192,
        truncation=True,
        padding=True,
        return_tensors='pt',
    ).to('cuda')
    with torch.no_grad():
        # flatten the logits to one value per pair; cast to float32 before export
        logits = reranker_model(**padded).logits.view(-1).float().cpu().tolist()
    raw_scores.extend(logits)
    # logit > 0 is equivalent to sigmoid(logit) > 0.5, i.e. predicted class 1
    scores.extend([1 if logit > 0 else 0 for logit in logits])

  0%|          | 0/8825 [00:00<?, ?it/s]

In [15]:
# Score the hard 0/1 predictions in `scores` against the gold labels.
# NOTE(review): `scores` contains thresholded labels, not continuous scores, so
# the ROC-AUC below is computed from hard predictions; in that binary case it
# equals the mean of the two per-class recalls (the macro-average recall).
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
for metric_name, metric_value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(metric_name, metric_value)

print("\nClassification Report:")
report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
print(report)

print("\nConfusion Matrix:")
matrix = confusion_matrix(true_labels, scores)
print(matrix)

Accuracy: 0.9827754232486459
ROC-AUC: 0.971730851293664

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99075   0.98847   0.98961     36614
     class 1    0.94442   0.95499   0.94968      7509

    accuracy                        0.98278     44123
   macro avg    0.96758   0.97173   0.96964     44123
weighted avg    0.98286   0.98278   0.98281     44123


Confusion Matrix:
[[36192   422]
 [  338  7171]]


In [16]:
# put the 0/1 predictions next to the original rows (joined on the row index)
predictions = pd.DataFrame({'predicted_label': scores})
inferenced_df = pd.concat([test_dataset, predictions], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Sektor pertanian dijangka pulih pada 2019 deng...,KUALA LUMPUR: Sektor pertanian dijangka pulih ...,1,test,mining-summarization,1
1,"Malaysia Today, a Malaysian news site, has bee...","Tolong lah gais, aku nasihatkan tolong kunci p...",0,test,mining-summarization,1
2,"In the Malaysian General Election (GE15), no p...",PUTRAJAYA: Tiada mana-mana parti memperoleh 50...,1,test,mining-summarization,1


In [17]:
# Export the predictions as JSON Lines (one record per line); the path is
# printed so the output location shows up in the notebook.
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)
inferenced_df.to_json(path_or_buf=file_output, orient='records', lines=True)

inferenced_output/bge-v2-m3-inferenced-mining-summarization.jsonl


#### 7.0 News

In [5]:
# Load the News test set (JSON Lines) and tag every row with its source
f = "test_datasets/test-reformatted-news.jsonl"
# dataset name = what follows 'reformatted-' in the path, up to the first '.'
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (45000, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,"Bridgerton: sex, gossip and race in Regency Lo...",THE appointment of Najib Razak as chairman of ...,0,test,news
1,PAS minta putus dengan PKR kerana Selena Gomez,\n(Focus Malaysia) – Legal practitioner Khairu...,0,test,news
2,Harga MSM dijangka terus meningkat,"LONDON, Feb 9 — Lewis Hamilton has signed a o...",0,test,news


In [6]:
# Run the reranker over every (query, text) pair, `batch_size` pairs at a time.
# `scores` collects one hard 0/1 prediction per row (in row order); the raw
# logits are also kept in `raw_scores` so threshold-free metrics can be
# computed from continuous scores later.
scores = []
raw_scores = []
batch_size = 5

for n in tqdm(range(0, len(test_dataset), batch_size)):
    batch = test_dataset.iloc[n: n + batch_size]
    # Tokenize, truncate and pad the whole batch in a single tokenizer call; with
    # a fast tokenizer this is faster than encode_plus() per pair followed by pad().
    padded = tokenizer(
        batch['query'].tolist(),
        batch['text'].tolist(),
        max_length=8192,
        truncation=True,
        padding=True,
        return_tensors='pt',
    ).to('cuda')
    with torch.no_grad():
        # flatten the logits to one value per pair; cast to float32 before export
        logits = reranker_model(**padded).logits.view(-1).float().cpu().tolist()
    raw_scores.extend(logits)
    # logit > 0 is equivalent to sigmoid(logit) > 0.5, i.e. predicted class 1
    scores.extend([1 if logit > 0 else 0 for logit in logits])

  0%|          | 0/9000 [00:00<?, ?it/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [7]:
# Score the hard 0/1 predictions in `scores` against the gold labels.
# NOTE(review): `scores` contains thresholded labels, not continuous scores, so
# the ROC-AUC below is computed from hard predictions; in that binary case it
# equals the mean of the two per-class recalls (the macro-average recall).
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
for metric_name, metric_value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(metric_name, metric_value)

print("\nClassification Report:")
report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
print(report)

print("\nConfusion Matrix:")
matrix = confusion_matrix(true_labels, scores)
print(matrix)

Accuracy: 0.9872888888888889
ROC-AUC: 0.9727067713541389

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99011   0.99466   0.99238     37446
     class 1    0.97291   0.95075   0.96170      7554

    accuracy                        0.98729     45000
   macro avg    0.98151   0.97271   0.97704     45000
weighted avg    0.98722   0.98729   0.98723     45000


Confusion Matrix:
[[37246   200]
 [  372  7182]]


In [8]:
# put the 0/1 predictions next to the original rows (joined on the row index)
predictions = pd.DataFrame({'predicted_label': scores})
inferenced_df = pd.concat([test_dataset, predictions], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,"Bridgerton: sex, gossip and race in Regency Lo...",THE appointment of Najib Razak as chairman of ...,0,test,news,0
1,PAS minta putus dengan PKR kerana Selena Gomez,\n(Focus Malaysia) – Legal practitioner Khairu...,0,test,news,0
2,Harga MSM dijangka terus meningkat,"LONDON, Feb 9 — Lewis Hamilton has signed a o...",0,test,news,0


In [9]:
# Export the predictions as JSON Lines (one record per line); the path is
# printed so the output location shows up in the notebook.
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)
inferenced_df.to_json(path_or_buf=file_output, orient='records', lines=True)

inferenced_output/bge-v2-m3-inferenced-news.jsonl


#### 8.0 Twitter

In [8]:
# Load the Twitter test set (JSON Lines) and tag every row with its source
f = "test_datasets/test-reformatted-twitter.jsonl"
# dataset name = what follows 'reformatted-' in the path, up to the first '.'
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (45000, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,@nickgle_ Sugarbun tapi nadai manuk tadi.,@nickgle_ Sugarbun tapi xda ayam td,1,test,twitter
1,Semua laki laki sama aja.\nTapi kalo kamu sama...,@hardliffee Jngan terlalu larut tidurnya yaa,0,test,twitter
2,11:16am PLUS@E1 : Kerja penyelenggaraan KM 23...,Maintenance work KM 231.0 - KM 233.0 South fro...,1,test,twitter


In [9]:
# Run the reranker over every (query, text) pair, `batch_size` pairs at a time.
# `scores` collects one hard 0/1 prediction per row (in row order); the raw
# logits are also kept in `raw_scores` so threshold-free metrics can be
# computed from continuous scores later.
scores = []
raw_scores = []
batch_size = 10

for n in tqdm(range(0, len(test_dataset), batch_size)):
    batch = test_dataset.iloc[n: n + batch_size]
    # Tokenize, truncate and pad the whole batch in a single tokenizer call; with
    # a fast tokenizer this is faster than encode_plus() per pair followed by pad().
    padded = tokenizer(
        batch['query'].tolist(),
        batch['text'].tolist(),
        max_length=8192,
        truncation=True,
        padding=True,
        return_tensors='pt',
    ).to('cuda')
    with torch.no_grad():
        # flatten the logits to one value per pair; cast to float32 before export
        logits = reranker_model(**padded).logits.view(-1).float().cpu().tolist()
    raw_scores.extend(logits)
    # logit > 0 is equivalent to sigmoid(logit) > 0.5, i.e. predicted class 1
    scores.extend([1 if logit > 0 else 0 for logit in logits])

  0%|          | 0/4500 [00:00<?, ?it/s]

In [10]:
# Score the hard 0/1 predictions in `scores` against the gold labels.
# NOTE(review): `scores` contains thresholded labels, not continuous scores, so
# the ROC-AUC below is computed from hard predictions; in that binary case it
# equals the mean of the two per-class recalls (the macro-average recall).
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
for metric_name, metric_value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(metric_name, metric_value)

print("\nClassification Report:")
report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
print(report)

print("\nConfusion Matrix:")
matrix = confusion_matrix(true_labels, scores)
print(matrix)

Accuracy: 0.9542666666666667
ROC-AUC: 0.9604078491727588

Classification Report:
              precision    recall  f1-score   support

     class 0    0.90232   1.00000   0.94865     19010
     class 1    1.00000   0.92082   0.95878     25990

    accuracy                        0.95427     45000
   macro avg    0.95116   0.96041   0.95371     45000
weighted avg    0.95873   0.95427   0.95450     45000


Confusion Matrix:
[[19010     0]
 [ 2058 23932]]


In [11]:
# put the 0/1 predictions next to the original rows (joined on the row index)
predictions = pd.DataFrame({'predicted_label': scores})
inferenced_df = pd.concat([test_dataset, predictions], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,@nickgle_ Sugarbun tapi nadai manuk tadi.,@nickgle_ Sugarbun tapi xda ayam td,1,test,twitter,0
1,Semua laki laki sama aja.\nTapi kalo kamu sama...,@hardliffee Jngan terlalu larut tidurnya yaa,0,test,twitter,0
2,11:16am PLUS@E1 : Kerja penyelenggaraan KM 23...,Maintenance work KM 231.0 - KM 233.0 South fro...,1,test,twitter,1


In [12]:
# Export the predictions as JSON Lines (one record per line); the path is
# printed so the output location shows up in the notebook.
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)
inferenced_df.to_json(path_or_buf=file_output, orient='records', lines=True)

inferenced_output/bge-v2-m3-inferenced-twitter.jsonl


#### 9.0 Wikipedia QA

In [6]:
# Load the Wikipedia QA test set (JSON Lines) and tag every row with its source
f = "test_datasets/test-reformatted-wikipedia-qa.jsonl"
# dataset name = what follows 'reformatted-' in the path, up to the first '.'
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (32903, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Apakah penyakit yang diderita oleh Sharielda?,\nDR. 26.7.2022 103 \n\n \nkemungkinan daripad...,0,test,wikipedia-qa
1,Siapakah komposer muda yang Chrisye dekati unt...,"Guruh, dan beliau juga akan berjumpa dengannya...",1,test,wikipedia-qa
2,Siapakah yang menfailkan petisyen untuk menunt...,\n56 DR. 10.7.2019 \n\n \n\n \n\nTuan Yang d...,0,test,wikipedia-qa


In [9]:
# Run the reranker over every (query, text) pair, `batch_size` pairs at a time.
# `scores` collects one hard 0/1 prediction per row (in row order); the raw
# logits are also kept in `raw_scores` so threshold-free metrics can be
# computed from continuous scores later.
scores = []
raw_scores = []
batch_size = 10

for n in tqdm(range(0, len(test_dataset), batch_size)):
    batch = test_dataset.iloc[n: n + batch_size]
    # Tokenize, truncate and pad the whole batch in a single tokenizer call; with
    # a fast tokenizer this is faster than encode_plus() per pair followed by pad().
    padded = tokenizer(
        batch['query'].tolist(),
        batch['text'].tolist(),
        max_length=8192,
        truncation=True,
        padding=True,
        return_tensors='pt',
    ).to('cuda')
    with torch.no_grad():
        # flatten the logits to one value per pair; cast to float32 before export
        logits = reranker_model(**padded).logits.view(-1).float().cpu().tolist()
    raw_scores.extend(logits)
    # logit > 0 is equivalent to sigmoid(logit) > 0.5, i.e. predicted class 1
    scores.extend([1 if logit > 0 else 0 for logit in logits])

  0%|          | 0/3291 [00:00<?, ?it/s]

In [10]:
# Score the hard 0/1 predictions in `scores` against the gold labels.
# NOTE(review): `scores` contains thresholded labels, not continuous scores, so
# the ROC-AUC below is computed from hard predictions; in that binary case it
# equals the mean of the two per-class recalls (the macro-average recall).
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
for metric_name, metric_value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(metric_name, metric_value)

print("\nClassification Report:")
report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
print(report)

print("\nConfusion Matrix:")
matrix = confusion_matrix(true_labels, scores)
print(matrix)

Accuracy: 0.9920675926207336
ROC-AUC: 0.9920673515287824

Classification Report:
              precision    recall  f1-score   support

     class 0    0.98438   1.00000   0.99213     16452
     class 1    1.00000   0.98413   0.99200     16451

    accuracy                        0.99207     32903
   macro avg    0.99219   0.99207   0.99207     32903
weighted avg    0.99219   0.99207   0.99207     32903


Confusion Matrix:
[[16452     0]
 [  261 16190]]


In [11]:
# put the 0/1 predictions next to the original rows (joined on the row index)
predictions = pd.DataFrame({'predicted_label': scores})
inferenced_df = pd.concat([test_dataset, predictions], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Apakah penyakit yang diderita oleh Sharielda?,\nDR. 26.7.2022 103 \n\n \nkemungkinan daripad...,0,test,wikipedia-qa,0
1,Siapakah komposer muda yang Chrisye dekati unt...,"Guruh, dan beliau juga akan berjumpa dengannya...",1,test,wikipedia-qa,1
2,Siapakah yang menfailkan petisyen untuk menunt...,\n56 DR. 10.7.2019 \n\n \n\n \n\nTuan Yang d...,0,test,wikipedia-qa,0


In [12]:
# Export the predictions as JSON Lines (one record per line); the path is
# printed so the output location shows up in the notebook.
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)
inferenced_df.to_json(path_or_buf=file_output, orient='records', lines=True)

inferenced_output/bge-v2-m3-inferenced-wikipedia-qa.jsonl
