In [1]:
import glob
import json
import os

import torch
import flash_attn
import pandas as pd
from tqdm.notebook import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report, \
                            roc_auc_score, confusion_matrix

# tag for the model under test; it is interpolated into the exported file names
model_used = 'bge-large'

In [2]:
# load reranker model
# A single checkpoint id feeds both the tokenizer and the classifier.
reranker_id = "BAAI/bge-reranker-large"
tokenizer = AutoTokenizer.from_pretrained(reranker_id)

# keyword arguments forwarded to the model load: bfloat16 dtype, use_cache off
model_kwargs = dict(torch_dtype=torch.bfloat16, use_cache=False)
reranker_model = AutoModelForSequenceClassification.from_pretrained(
    reranker_id, **model_kwargs
)

In [3]:
# move the model onto the GPU, then switch it to evaluation mode
reranker_model.cuda()
_ = reranker_model.eval()  # bind the result so the cell does not echo the model repr

In [4]:
# shell escape: print the GPU status table (memory in use, utilisation)
!nvidia-smi

Sat Apr 20 14:35:46 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          On  | 00000001:00:00.0 Off |                    0 |
| N/A   33C    P0              64W / 300W |   2578MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                         

### Inference & Evaluation

In [5]:
# collect the reformatted test splits found on disk, ordered by path
files = glob.glob('test_datasets/test-reformatted-*.jsonl')
files.sort()
files

['test_datasets/test-reformatted-common-crawl-qa.jsonl',
 'test_datasets/test-reformatted-facebook.jsonl',
 'test_datasets/test-reformatted-hansard-qa.jsonl',
 'test_datasets/test-reformatted-iium-confession.jsonl',
 'test_datasets/test-reformatted-mining-b-cari-com-my.jsonl',
 'test_datasets/test-reformatted-mining-summarization.jsonl',
 'test_datasets/test-reformatted-news.jsonl',
 'test_datasets/test-reformatted-twitter.jsonl',
 'test_datasets/test-reformatted-wikipedia-qa.jsonl']

#### 1.0 Common Crawl QA

In [6]:
# Source file for this section; the dataset tag is the part of the path
# between 'reformatted-' and the first '.'.
f = "test_datasets/test-reformatted-common-crawl-qa.jsonl"
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

# Read the JSON-lines file and tag every row with the dataset it came from.
test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (20949, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Bagaimana cara penghantaran dan kos penghantar...,Ladam merupakan barang buatan yang lazimnya di...,0,test,common-crawl-qa
1,Apakah jenama bateri kereta yang disediakan ol...,Komik adalah sejenis seni visual yang menggabu...,0,test,common-crawl-qa
2,Apakah kawasan liputan untuk servis hantar bat...,"untuk memecat Lawton, meninggalkan sekali lagi...",0,test,common-crawl-qa


In [8]:
# Score every (query, text) pair with the reranker and threshold the logits.
batch_size = 10
scores = []       # hard 0/1 predictions, one per row of test_dataset
raw_scores = []   # un-thresholded logits, kept for threshold-free metrics

queries = test_dataset['query'].tolist()
texts = test_dataset['text'].tolist()

with torch.no_grad():
    for n in tqdm(range(0, len(test_dataset), batch_size)):
        # One tokenizer call encodes, truncates and pads the whole batch; it
        # replaces the deprecated per-row encode_plus followed by pad.
        padded = tokenizer(
            queries[n: n + batch_size],
            texts[n: n + batch_size],
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors='pt'
        ).to('cuda')
        # Flatten the logits (the loop assumes one per pair) and cast to
        # float32 before the numpy conversion.
        output = reranker_model(**padded).logits.view(-1).float().cpu().numpy().tolist()
        raw_scores.extend(output)
        # logit > 0 is the same decision as sigmoid(logit) > 0.5
        scores.extend([1 if score > 0 else 0 for score in output])

  0%|          | 0/2095 [00:00<?, ?it/s]

In [9]:
# Evaluate predictions (macro)
# NOTE(review): `scores` holds thresholded 0/1 predictions, so the ROC-AUC
# printed here is computed from hard labels rather than a ranking score.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
cls_report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
conf_mat = confusion_matrix(true_labels, scores)

for heading, value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(heading, value)
print("\nClassification Report:")
print(cls_report)
print("\nConfusion Matrix:")
print(conf_mat)

Accuracy: 0.709771349467755
ROC-AUC: 0.7097575266496925

Classification Report:
              precision    recall  f1-score   support

     class 0    0.63285   0.99933   0.77495     10475
     class 1    0.99841   0.42018   0.59145     10474

    accuracy                        0.70977     20949
   macro avg    0.81563   0.70976   0.68320     20949
weighted avg    0.81562   0.70977   0.68320     20949


Confusion Matrix:
[[10468     7]
 [ 6073  4401]]


In [10]:
# Attach the predictions to the evaluated rows.
# assign() pairs `scores` with the rows by position and raises if the lengths
# differ; concat(axis=1) aligns on index labels and would silently fill NaN
# when `scores` is stale or the frame's index is not 0..n-1.
inferenced_df = test_dataset.assign(predicted_label=scores)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Bagaimana cara penghantaran dan kos penghantar...,Ladam merupakan barang buatan yang lazimnya di...,0,test,common-crawl-qa,0
1,Apakah jenama bateri kereta yang disediakan ol...,Komik adalah sejenis seni visual yang menggabu...,0,test,common-crawl-qa,0
2,Apakah kawasan liputan untuk servis hantar bat...,"untuk memecat Lawton, meninggalkan sekali lagi...",0,test,common-crawl-qa,0


In [13]:
# Export the inferenced dataframe as JSON lines.
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)

# to_json does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(file_output), exist_ok=True)
inferenced_df.to_json(
    file_output,
    orient='records',
    lines=True
)

#### 2.0 Facebook

In [14]:
# Source file for this section; the dataset tag is the part of the path
# between 'reformatted-' and the first '.'.
f = "test_datasets/test-reformatted-facebook.jsonl"
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

# Read the JSON-lines file and tag every row with the dataset it came from.
test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (45000, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Sekapur sirih road dah,Abi Ahmad Furqan memang Barisan Nasional dah l...,0,test,facebook
1,Menangkan calon pakatan harapan di segi besar,Ismail sabri dan muhiadin,0,test,facebook
2,Norlelawati Sukiman amin pra,25k terbaik,0,test,facebook


In [15]:
# Score every (query, text) pair with the reranker and threshold the logits.
batch_size = 10
scores = []       # hard 0/1 predictions, one per row of test_dataset
raw_scores = []   # un-thresholded logits, kept for threshold-free metrics

queries = test_dataset['query'].tolist()
texts = test_dataset['text'].tolist()

with torch.no_grad():
    for n in tqdm(range(0, len(test_dataset), batch_size)):
        # One tokenizer call encodes, truncates and pads the whole batch; it
        # replaces the deprecated per-row encode_plus followed by pad.
        padded = tokenizer(
            queries[n: n + batch_size],
            texts[n: n + batch_size],
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors='pt'
        ).to('cuda')
        # Flatten the logits (the loop assumes one per pair) and cast to
        # float32 before the numpy conversion.
        output = reranker_model(**padded).logits.view(-1).float().cpu().numpy().tolist()
        raw_scores.extend(output)
        # logit > 0 is the same decision as sigmoid(logit) > 0.5
        scores.extend([1 if score > 0 else 0 for score in output])

  0%|          | 0/4500 [00:00<?, ?it/s]

In [16]:
# Evaluate predictions (macro)
# NOTE(review): `scores` holds thresholded 0/1 predictions, so the ROC-AUC
# printed here is computed from hard labels rather than a ranking score.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
cls_report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
conf_mat = confusion_matrix(true_labels, scores)

for heading, value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(heading, value)
print("\nClassification Report:")
print(cls_report)
print("\nConfusion Matrix:")
print(conf_mat)

Accuracy: 0.9667777777777777
ROC-AUC: 0.9643146852302846

Classification Report:
              precision    recall  f1-score   support

     class 0    0.94449   0.99682   0.96995     24205
     class 1    0.99604   0.93181   0.96286     20795

    accuracy                        0.96678     45000
   macro avg    0.97027   0.96431   0.96640     45000
weighted avg    0.96831   0.96678   0.96667     45000


Confusion Matrix:
[[24128    77]
 [ 1418 19377]]


In [17]:
# Attach the predictions to the evaluated rows.
# assign() pairs `scores` with the rows by position and raises if the lengths
# differ; concat(axis=1) aligns on index labels and would silently fill NaN
# when `scores` is stale or the frame's index is not 0..n-1.
inferenced_df = test_dataset.assign(predicted_label=scores)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Sekapur sirih road dah,Abi Ahmad Furqan memang Barisan Nasional dah l...,0,test,facebook,0
1,Menangkan calon pakatan harapan di segi besar,Ismail sabri dan muhiadin,0,test,facebook,0
2,Norlelawati Sukiman amin pra,25k terbaik,0,test,facebook,0


In [19]:
# Export the inferenced dataframe as JSON lines.
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)

# to_json does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(file_output), exist_ok=True)
inferenced_df.to_json(
    file_output,
    orient='records',
    lines=True
)

inferenced_output/bge-large-inferenced-facebook.jsonl


#### 3.0 Hansard QA

In [20]:
# Source file for this section; the dataset tag is the part of the path
# between 'reformatted-' and the first '.'.
f = "test_datasets/test-reformatted-hansard-qa.jsonl"
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

# Read the JSON-lines file and tag every row with the dataset it came from.
test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (12712, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Siapakah wakil rakyat untuk kawasan Kota Belud?,\nii DR.07.12.2021 \n\n \n58. YB. Dr. Kelvin Y...,1,test,hansard-qa
1,Berapa jumlah unit yang telah dibina oleh PR1M...,from this seller User Review Submit A Review O...,0,test,hansard-qa
2,Apakah langkah kerajaan yang hendak dikenakan ...,\n8 ...,1,test,hansard-qa


In [21]:
# Score every (query, text) pair with the reranker and threshold the logits.
batch_size = 10
scores = []       # hard 0/1 predictions, one per row of test_dataset
raw_scores = []   # un-thresholded logits, kept for threshold-free metrics

queries = test_dataset['query'].tolist()
texts = test_dataset['text'].tolist()

with torch.no_grad():
    for n in tqdm(range(0, len(test_dataset), batch_size)):
        # One tokenizer call encodes, truncates and pads the whole batch; it
        # replaces the deprecated per-row encode_plus followed by pad.
        padded = tokenizer(
            queries[n: n + batch_size],
            texts[n: n + batch_size],
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors='pt'
        ).to('cuda')
        # Flatten the logits (the loop assumes one per pair) and cast to
        # float32 before the numpy conversion.
        output = reranker_model(**padded).logits.view(-1).float().cpu().numpy().tolist()
        raw_scores.extend(output)
        # logit > 0 is the same decision as sigmoid(logit) > 0.5
        scores.extend([1 if score > 0 else 0 for score in output])

  0%|          | 0/1272 [00:00<?, ?it/s]

In [22]:
# Evaluate predictions (macro)
# NOTE(review): `scores` holds thresholded 0/1 predictions, so the ROC-AUC
# printed here is computed from hard labels rather than a ranking score.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
cls_report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
conf_mat = confusion_matrix(true_labels, scores)

for heading, value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(heading, value)
print("\nClassification Report:")
print(cls_report)
print("\nConfusion Matrix:")
print(conf_mat)

Accuracy: 0.8254405286343612
ROC-AUC: 0.8254405286343613

Classification Report:
              precision    recall  f1-score   support

     class 0    0.74128   0.99984   0.85136      6356
     class 1    0.99976   0.65104   0.78857      6356

    accuracy                        0.82544     12712
   macro avg    0.87052   0.82544   0.81996     12712
weighted avg    0.87052   0.82544   0.81996     12712


Confusion Matrix:
[[6355    1]
 [2218 4138]]


In [23]:
# Attach the predictions to the evaluated rows.
# assign() pairs `scores` with the rows by position and raises if the lengths
# differ; concat(axis=1) aligns on index labels and would silently fill NaN
# when `scores` is stale or the frame's index is not 0..n-1.
inferenced_df = test_dataset.assign(predicted_label=scores)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Siapakah wakil rakyat untuk kawasan Kota Belud?,\nii DR.07.12.2021 \n\n \n58. YB. Dr. Kelvin Y...,1,test,hansard-qa,1
1,Berapa jumlah unit yang telah dibina oleh PR1M...,from this seller User Review Submit A Review O...,0,test,hansard-qa,0
2,Apakah langkah kerajaan yang hendak dikenakan ...,\n8 ...,1,test,hansard-qa,0


In [24]:
# Export the inferenced dataframe as JSON lines.
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)

# to_json does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(file_output), exist_ok=True)
inferenced_df.to_json(
    file_output,
    orient='records',
    lines=True
)

inferenced_output/bge-large-inferenced-hansard-qa.jsonl


#### 4.0 IIUM Confession

In [25]:
# Source file for this section; the dataset tag is the part of the path
# between 'reformatted-' and the first '.'.
f = "test_datasets/test-reformatted-iium-confession.jsonl"
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

# Read the JSON-lines file and tag every row with the dataset it came from.
test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (45000, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,"Means, dia berkurung dalam bilik, tarik muka d...","""usia 20 an macam ni,memang kita perlukan sora...",0,test,iium-confession
1,Saya selalu follow seorang perunding imej feme...,Aku sentiasa ikut seorang consultant image yan...,1,test,iium-confession
2,Dan aku juga kasihan akan diriku sendiri. Di r...,And I also pity myself. In that house I was st...,1,test,iium-confession


In [26]:
# Score every (query, text) pair with the reranker and threshold the logits.
batch_size = 10
scores = []       # hard 0/1 predictions, one per row of test_dataset
raw_scores = []   # un-thresholded logits, kept for threshold-free metrics

queries = test_dataset['query'].tolist()
texts = test_dataset['text'].tolist()

with torch.no_grad():
    for n in tqdm(range(0, len(test_dataset), batch_size)):
        # One tokenizer call encodes, truncates and pads the whole batch; it
        # replaces the deprecated per-row encode_plus followed by pad.
        padded = tokenizer(
            queries[n: n + batch_size],
            texts[n: n + batch_size],
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors='pt'
        ).to('cuda')
        # Flatten the logits (the loop assumes one per pair) and cast to
        # float32 before the numpy conversion.
        output = reranker_model(**padded).logits.view(-1).float().cpu().numpy().tolist()
        raw_scores.extend(output)
        # logit > 0 is the same decision as sigmoid(logit) > 0.5
        scores.extend([1 if score > 0 else 0 for score in output])

  0%|          | 0/4500 [00:00<?, ?it/s]

In [27]:
# Evaluate predictions (macro)
# NOTE(review): `scores` holds thresholded 0/1 predictions, so the ROC-AUC
# printed here is computed from hard labels rather than a ranking score.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
cls_report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
conf_mat = confusion_matrix(true_labels, scores)

for heading, value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(heading, value)
print("\nClassification Report:")
print(cls_report)
print("\nConfusion Matrix:")
print(conf_mat)

Accuracy: 0.9958444444444444
ROC-AUC: 0.9963640770090726

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99011   0.99809   0.99408     15743
     class 1    0.99897   0.99463   0.99680     29257

    accuracy                        0.99584     45000
   macro avg    0.99454   0.99636   0.99544     45000
weighted avg    0.99587   0.99584   0.99585     45000


Confusion Matrix:
[[15713    30]
 [  157 29100]]


In [28]:
# Attach the predictions to the evaluated rows.
# assign() pairs `scores` with the rows by position and raises if the lengths
# differ; concat(axis=1) aligns on index labels and would silently fill NaN
# when `scores` is stale or the frame's index is not 0..n-1.
inferenced_df = test_dataset.assign(predicted_label=scores)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,"Means, dia berkurung dalam bilik, tarik muka d...","""usia 20 an macam ni,memang kita perlukan sora...",0,test,iium-confession,0
1,Saya selalu follow seorang perunding imej feme...,Aku sentiasa ikut seorang consultant image yan...,1,test,iium-confession,1
2,Dan aku juga kasihan akan diriku sendiri. Di r...,And I also pity myself. In that house I was st...,1,test,iium-confession,1


In [29]:
# Export the inferenced dataframe as JSON lines.
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)

# to_json does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(file_output), exist_ok=True)
inferenced_df.to_json(
    file_output,
    orient='records',
    lines=True
)

inferenced_output/bge-large-inferenced-iium-confession.jsonl


#### 5.0 B Cari

In [30]:
# Source file for this section; the dataset tag is the part of the path
# between 'reformatted-' and the first '.'.
f = "test_datasets/test-reformatted-mining-b-cari-com-my.jsonl"
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

# Read the JSON-lines file and tag every row with the dataset it came from.
test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (44910, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Hello. I am a student. Every time I need to wr...,Hello. Im a student. If I need to write an ess...,1,test,mining-b-cari-com-my
1,betul ke boleh tapis? dan filter dalam tu mest...,Is it true that it can be filtered? And the fi...,1,test,mining-b-cari-com-my
2,Jangan dia main campak masuk tong sampah bila ...,mmmeeeeowwwwwwwwwwwwwwww,0,test,mining-b-cari-com-my


In [31]:
# Score every (query, text) pair with the reranker and threshold the logits.
batch_size = 10
scores = []       # hard 0/1 predictions, one per row of test_dataset
raw_scores = []   # un-thresholded logits, kept for threshold-free metrics

queries = test_dataset['query'].tolist()
texts = test_dataset['text'].tolist()

with torch.no_grad():
    for n in tqdm(range(0, len(test_dataset), batch_size)):
        # One tokenizer call encodes, truncates and pads the whole batch; it
        # replaces the deprecated per-row encode_plus followed by pad.
        padded = tokenizer(
            queries[n: n + batch_size],
            texts[n: n + batch_size],
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors='pt'
        ).to('cuda')
        # Flatten the logits (the loop assumes one per pair) and cast to
        # float32 before the numpy conversion.
        output = reranker_model(**padded).logits.view(-1).float().cpu().numpy().tolist()
        raw_scores.extend(output)
        # logit > 0 is the same decision as sigmoid(logit) > 0.5
        scores.extend([1 if score > 0 else 0 for score in output])

  0%|          | 0/4491 [00:00<?, ?it/s]

In [32]:
# Evaluate predictions (macro)
# NOTE(review): `scores` holds thresholded 0/1 predictions, so the ROC-AUC
# printed here is computed from hard labels rather than a ranking score.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
cls_report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
conf_mat = confusion_matrix(true_labels, scores)

for heading, value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(heading, value)
print("\nClassification Report:")
print(cls_report)
print("\nConfusion Matrix:")
print(conf_mat)

Accuracy: 0.9723669561344912
ROC-AUC: 0.9742322941729378

Classification Report:
              precision    recall  f1-score   support

     class 0    0.94553   0.99734   0.97074     20642
     class 1    0.99762   0.95113   0.97382     24268

    accuracy                        0.97237     44910
   macro avg    0.97158   0.97423   0.97228     44910
weighted avg    0.97368   0.97237   0.97241     44910


Confusion Matrix:
[[20587    55]
 [ 1186 23082]]


In [33]:
# Attach the predictions to the evaluated rows.
# assign() pairs `scores` with the rows by position and raises if the lengths
# differ; concat(axis=1) aligns on index labels and would silently fill NaN
# when `scores` is stale or the frame's index is not 0..n-1.
inferenced_df = test_dataset.assign(predicted_label=scores)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Hello. I am a student. Every time I need to wr...,Hello. Im a student. If I need to write an ess...,1,test,mining-b-cari-com-my,1
1,betul ke boleh tapis? dan filter dalam tu mest...,Is it true that it can be filtered? And the fi...,1,test,mining-b-cari-com-my,1
2,Jangan dia main campak masuk tong sampah bila ...,mmmeeeeowwwwwwwwwwwwwwww,0,test,mining-b-cari-com-my,0


In [34]:
# Export the inferenced dataframe as JSON lines.
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)

# to_json does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(file_output), exist_ok=True)
inferenced_df.to_json(
    file_output,
    orient='records',
    lines=True
)

inferenced_output/bge-large-inferenced-mining-b-cari-com-my.jsonl


#### 6.0 Summarization

In [35]:
# Source file for this section; the dataset tag is the part of the path
# between 'reformatted-' and the first '.'.
f = "test_datasets/test-reformatted-mining-summarization.jsonl"
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

# Read the JSON-lines file and tag every row with the dataset it came from.
test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (44123, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Sektor pertanian dijangka pulih pada 2019 deng...,KUALA LUMPUR: Sektor pertanian dijangka pulih ...,1,test,mining-summarization
1,"Malaysia Today, a Malaysian news site, has bee...","Tolong lah gais, aku nasihatkan tolong kunci p...",0,test,mining-summarization
2,"In the Malaysian General Election (GE15), no p...",PUTRAJAYA: Tiada mana-mana parti memperoleh 50...,1,test,mining-summarization


In [36]:
# Score every (query, text) pair with the reranker and threshold the logits.
batch_size = 10
scores = []       # hard 0/1 predictions, one per row of test_dataset
raw_scores = []   # un-thresholded logits, kept for threshold-free metrics

queries = test_dataset['query'].tolist()
texts = test_dataset['text'].tolist()

with torch.no_grad():
    for n in tqdm(range(0, len(test_dataset), batch_size)):
        # One tokenizer call encodes, truncates and pads the whole batch; it
        # replaces the deprecated per-row encode_plus followed by pad.
        padded = tokenizer(
            queries[n: n + batch_size],
            texts[n: n + batch_size],
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors='pt'
        ).to('cuda')
        # Flatten the logits (the loop assumes one per pair) and cast to
        # float32 before the numpy conversion.
        output = reranker_model(**padded).logits.view(-1).float().cpu().numpy().tolist()
        raw_scores.extend(output)
        # logit > 0 is the same decision as sigmoid(logit) > 0.5
        scores.extend([1 if score > 0 else 0 for score in output])

  0%|          | 0/4413 [00:00<?, ?it/s]

In [37]:
# Evaluate predictions (macro)
# NOTE(review): `scores` holds thresholded 0/1 predictions, so the ROC-AUC
# printed here is computed from hard labels rather than a ranking score.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
cls_report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
conf_mat = confusion_matrix(true_labels, scores)

for heading, value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(heading, value)
print("\nClassification Report:")
print(cls_report)
print("\nConfusion Matrix:")
print(conf_mat)

Accuracy: 0.9526550778505541
ROC-AUC: 0.9305571647265575

Classification Report:
              precision    recall  f1-score   support

     class 0    0.97857   0.96406   0.97126     36614
     class 1    0.83656   0.89706   0.86575      7509

    accuracy                        0.95266     44123
   macro avg    0.90757   0.93056   0.91851     44123
weighted avg    0.95440   0.95266   0.95330     44123


Confusion Matrix:
[[35298  1316]
 [  773  6736]]


In [38]:
# Attach the predictions to the evaluated rows.
# assign() pairs `scores` with the rows by position and raises if the lengths
# differ; concat(axis=1) aligns on index labels and would silently fill NaN
# when `scores` is stale or the frame's index is not 0..n-1.
inferenced_df = test_dataset.assign(predicted_label=scores)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Sektor pertanian dijangka pulih pada 2019 deng...,KUALA LUMPUR: Sektor pertanian dijangka pulih ...,1,test,mining-summarization,1
1,"Malaysia Today, a Malaysian news site, has bee...","Tolong lah gais, aku nasihatkan tolong kunci p...",0,test,mining-summarization,0
2,"In the Malaysian General Election (GE15), no p...",PUTRAJAYA: Tiada mana-mana parti memperoleh 50...,1,test,mining-summarization,1


In [39]:
# Export the inferenced dataframe as JSON lines.
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)

# to_json does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(file_output), exist_ok=True)
inferenced_df.to_json(
    file_output,
    orient='records',
    lines=True
)

inferenced_output/bge-large-inferenced-mining-summarization.jsonl


#### 7.0 News

In [40]:
# Source file for this section; the dataset tag is the part of the path
# between 'reformatted-' and the first '.'.
f = "test_datasets/test-reformatted-news.jsonl"
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

# Read the JSON-lines file and tag every row with the dataset it came from.
test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (45000, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,"Bridgerton: sex, gossip and race in Regency Lo...",THE appointment of Najib Razak as chairman of ...,0,test,news
1,PAS minta putus dengan PKR kerana Selena Gomez,\n(Focus Malaysia) – Legal practitioner Khairu...,0,test,news
2,Harga MSM dijangka terus meningkat,"LONDON, Feb 9 — Lewis Hamilton has signed a o...",0,test,news


In [41]:
# Score every (query, text) pair with the reranker and threshold the logits.
batch_size = 10
scores = []       # hard 0/1 predictions, one per row of test_dataset
raw_scores = []   # un-thresholded logits, kept for threshold-free metrics

queries = test_dataset['query'].tolist()
texts = test_dataset['text'].tolist()

with torch.no_grad():
    for n in tqdm(range(0, len(test_dataset), batch_size)):
        # One tokenizer call encodes, truncates and pads the whole batch; it
        # replaces the deprecated per-row encode_plus followed by pad.
        padded = tokenizer(
            queries[n: n + batch_size],
            texts[n: n + batch_size],
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors='pt'
        ).to('cuda')
        # Flatten the logits (the loop assumes one per pair) and cast to
        # float32 before the numpy conversion.
        output = reranker_model(**padded).logits.view(-1).float().cpu().numpy().tolist()
        raw_scores.extend(output)
        # logit > 0 is the same decision as sigmoid(logit) > 0.5
        scores.extend([1 if score > 0 else 0 for score in output])

  0%|          | 0/4500 [00:00<?, ?it/s]

In [42]:
# Evaluate predictions (macro)
# NOTE(review): `scores` holds thresholded 0/1 predictions, so the ROC-AUC
# printed here is computed from hard labels rather than a ranking score.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
cls_report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
conf_mat = confusion_matrix(true_labels, scores)

for heading, value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(heading, value)
print("\nClassification Report:")
print(cls_report)
print("\nConfusion Matrix:")
print(conf_mat)

Accuracy: 0.9737777777777777
ROC-AUC: 0.9414455730734652

Classification Report:
              precision    recall  f1-score   support

     class 0    0.97862   0.99012   0.98434     37446
     class 1    0.94799   0.89277   0.91955      7554

    accuracy                        0.97378     45000
   macro avg    0.96330   0.94145   0.95194     45000
weighted avg    0.97348   0.97378   0.97346     45000


Confusion Matrix:
[[37076   370]
 [  810  6744]]


In [43]:
# Attach the predictions to the evaluated rows.
# assign() pairs `scores` with the rows by position and raises if the lengths
# differ; concat(axis=1) aligns on index labels and would silently fill NaN
# when `scores` is stale or the frame's index is not 0..n-1.
inferenced_df = test_dataset.assign(predicted_label=scores)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,"Bridgerton: sex, gossip and race in Regency Lo...",THE appointment of Najib Razak as chairman of ...,0,test,news,0
1,PAS minta putus dengan PKR kerana Selena Gomez,\n(Focus Malaysia) – Legal practitioner Khairu...,0,test,news,0
2,Harga MSM dijangka terus meningkat,"LONDON, Feb 9 — Lewis Hamilton has signed a o...",0,test,news,0


In [44]:
# Export the inferenced dataframe as JSON lines.
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)

# to_json does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(file_output), exist_ok=True)
inferenced_df.to_json(
    file_output,
    orient='records',
    lines=True
)

inferenced_output/bge-large-inferenced-news.jsonl


#### 8.0 Twitter

In [45]:
# Source file for this section; the dataset tag is the part of the path
# between 'reformatted-' and the first '.'.
f = "test_datasets/test-reformatted-twitter.jsonl"
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

# Read the JSON-lines file and tag every row with the dataset it came from.
test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (45000, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,@nickgle_ Sugarbun tapi nadai manuk tadi.,@nickgle_ Sugarbun tapi xda ayam td,1,test,twitter
1,Semua laki laki sama aja.\nTapi kalo kamu sama...,@hardliffee Jngan terlalu larut tidurnya yaa,0,test,twitter
2,11:16am PLUS@E1 : Kerja penyelenggaraan KM 23...,Maintenance work KM 231.0 - KM 233.0 South fro...,1,test,twitter


In [46]:
# Score every (query, text) pair with the reranker and threshold the logits.
batch_size = 10
scores = []       # hard 0/1 predictions, one per row of test_dataset
raw_scores = []   # un-thresholded logits, kept for threshold-free metrics

queries = test_dataset['query'].tolist()
texts = test_dataset['text'].tolist()

with torch.no_grad():
    for n in tqdm(range(0, len(test_dataset), batch_size)):
        # One tokenizer call encodes, truncates and pads the whole batch; it
        # replaces the deprecated per-row encode_plus followed by pad.
        padded = tokenizer(
            queries[n: n + batch_size],
            texts[n: n + batch_size],
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors='pt'
        ).to('cuda')
        # Flatten the logits (the loop assumes one per pair) and cast to
        # float32 before the numpy conversion.
        output = reranker_model(**padded).logits.view(-1).float().cpu().numpy().tolist()
        raw_scores.extend(output)
        # logit > 0 is the same decision as sigmoid(logit) > 0.5
        scores.extend([1 if score > 0 else 0 for score in output])

  0%|          | 0/4500 [00:00<?, ?it/s]

In [47]:
# Evaluate predictions (macro)
# NOTE(review): `scores` holds thresholded 0/1 predictions, so the ROC-AUC
# printed here is computed from hard labels rather than a ranking score.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
cls_report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
conf_mat = confusion_matrix(true_labels, scores)

for heading, value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(heading, value)
print("\nClassification Report:")
print(cls_report)
print("\nConfusion Matrix:")
print(conf_mat)

Accuracy: 0.9765333333333334
ROC-AUC: 0.9796844940361678

Classification Report:
              precision    recall  f1-score   support

     class 0    0.94737   1.00000   0.97298     19010
     class 1    1.00000   0.95937   0.97926     25990

    accuracy                        0.97653     45000
   macro avg    0.97369   0.97968   0.97612     45000
weighted avg    0.97777   0.97653   0.97661     45000


Confusion Matrix:
[[19010     0]
 [ 1056 24934]]


In [48]:
# Attach the predictions to the evaluated rows.
# assign() pairs `scores` with the rows by position and raises if the lengths
# differ; concat(axis=1) aligns on index labels and would silently fill NaN
# when `scores` is stale or the frame's index is not 0..n-1.
inferenced_df = test_dataset.assign(predicted_label=scores)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,@nickgle_ Sugarbun tapi nadai manuk tadi.,@nickgle_ Sugarbun tapi xda ayam td,1,test,twitter,0
1,Semua laki laki sama aja.\nTapi kalo kamu sama...,@hardliffee Jngan terlalu larut tidurnya yaa,0,test,twitter,0
2,11:16am PLUS@E1 : Kerja penyelenggaraan KM 23...,Maintenance work KM 231.0 - KM 233.0 South fro...,1,test,twitter,1


In [49]:
# Export the inferenced dataframe as JSON lines.
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)

# to_json does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(file_output), exist_ok=True)
inferenced_df.to_json(
    file_output,
    orient='records',
    lines=True
)

inferenced_output/bge-large-inferenced-twitter.jsonl


#### 9.0 Wikipedia QA

In [50]:
# Source file for this section; the dataset tag is the part of the path
# between 'reformatted-' and the first '.'.
f = "test_datasets/test-reformatted-wikipedia-qa.jsonl"
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

# Read the JSON-lines file and tag every row with the dataset it came from.
test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (32903, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Apakah penyakit yang diderita oleh Sharielda?,\nDR. 26.7.2022 103 \n\n \nkemungkinan daripad...,0,test,wikipedia-qa
1,Siapakah komposer muda yang Chrisye dekati unt...,"Guruh, dan beliau juga akan berjumpa dengannya...",1,test,wikipedia-qa
2,Siapakah yang menfailkan petisyen untuk menunt...,\n56 DR. 10.7.2019 \n\n \n\n \n\nTuan Yang d...,0,test,wikipedia-qa


In [51]:
# Score every (query, text) pair with the reranker and threshold the logits.
batch_size = 10
scores = []       # hard 0/1 predictions, one per row of test_dataset
raw_scores = []   # un-thresholded logits, kept for threshold-free metrics

queries = test_dataset['query'].tolist()
texts = test_dataset['text'].tolist()

with torch.no_grad():
    for n in tqdm(range(0, len(test_dataset), batch_size)):
        # One tokenizer call encodes, truncates and pads the whole batch; it
        # replaces the deprecated per-row encode_plus followed by pad.
        padded = tokenizer(
            queries[n: n + batch_size],
            texts[n: n + batch_size],
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors='pt'
        ).to('cuda')
        # Flatten the logits (the loop assumes one per pair) and cast to
        # float32 before the numpy conversion.
        output = reranker_model(**padded).logits.view(-1).float().cpu().numpy().tolist()
        raw_scores.extend(output)
        # logit > 0 is the same decision as sigmoid(logit) > 0.5
        scores.extend([1 if score > 0 else 0 for score in output])

  0%|          | 0/3291 [00:00<?, ?it/s]

In [52]:
# Evaluate predictions (macro)
# NOTE(review): `scores` holds thresholded 0/1 predictions, so the ROC-AUC
# printed here is computed from hard labels rather than a ranking score.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
cls_report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
conf_mat = confusion_matrix(true_labels, scores)

for heading, value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(heading, value)
print("\nClassification Report:")
print(cls_report)
print("\nConfusion Matrix:")
print(conf_mat)

Accuracy: 0.8710148010819682
ROC-AUC: 0.8710109122031797

Classification Report:
              precision    recall  f1-score   support

     class 0    0.79542   0.99897   0.88565     16452
     class 1    0.99861   0.74306   0.85208     16451

    accuracy                        0.87101     32903
   macro avg    0.89702   0.87101   0.86887     32903
weighted avg    0.89701   0.87101   0.86887     32903


Confusion Matrix:
[[16435    17]
 [ 4227 12224]]


In [53]:
# Attach the predictions to the evaluated rows.
# assign() pairs `scores` with the rows by position and raises if the lengths
# differ; concat(axis=1) aligns on index labels and would silently fill NaN
# when `scores` is stale or the frame's index is not 0..n-1.
inferenced_df = test_dataset.assign(predicted_label=scores)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Apakah penyakit yang diderita oleh Sharielda?,\nDR. 26.7.2022 103 \n\n \nkemungkinan daripad...,0,test,wikipedia-qa,0
1,Siapakah komposer muda yang Chrisye dekati unt...,"Guruh, dan beliau juga akan berjumpa dengannya...",1,test,wikipedia-qa,1
2,Siapakah yang menfailkan petisyen untuk menunt...,\n56 DR. 10.7.2019 \n\n \n\n \n\nTuan Yang d...,0,test,wikipedia-qa,0


In [54]:
# Export the inferenced dataframe as JSON lines.
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)

# to_json does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(file_output), exist_ok=True)
inferenced_df.to_json(
    file_output,
    orient='records',
    lines=True
)

inferenced_output/bge-large-inferenced-wikipedia-qa.jsonl
