In [2]:
import glob
import json
import torch
import flash_attn
import pandas as pd
from tqdm.notebook import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report, \
                            roc_auc_score, confusion_matrix

model_used = '474M'

In [2]:
# load reranker model
tokenizer = AutoTokenizer.from_pretrained("mesolitica/reranker-malaysian-mistral-474M-32k")
reranker_model = AutoModelForSequenceClassification.from_pretrained(
    "mesolitica/reranker-malaysian-mistral-64M-32k",
    use_flash_attention_2 = True,
    torch_dtype = torch.bfloat16,
    use_cache = False
)

The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


In [3]:
# push model to the GPU!
_ = reranker_model.cuda()

In [8]:
!nvidia-smi

Sun Apr 14 11:53:29 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          On  | 00000001:00:00.0 Off |                    0 |
| N/A   35C    P0              64W / 300W |   3745MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

### Inference & Evaluation

#### 1.0 Common Crawl QA

In [9]:
f = "test_datasets/test-reformatted-common-crawl-qa.jsonl"
f_name = f.split('reformatted-')[-1].split('.')[0]

test_dataset = pd.read_json(f, lines=True)
test_dataset['dataset_source'] = f_name

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (20949, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Bagaimana cara penghantaran dan kos penghantar...,Ladam merupakan barang buatan yang lazimnya di...,0,test,common-crawl-qa
1,Apakah jenama bateri kereta yang disediakan ol...,Komik adalah sejenis seni visual yang menggabu...,0,test,common-crawl-qa
2,Apakah kawasan liputan untuk servis hantar bat...,"untuk memecat Lawton, meninggalkan sekali lagi...",0,test,common-crawl-qa


In [10]:
# initialize a list to store all the scores
scores = []
batch_size = 10

for n in tqdm(range(0, len(test_dataset), batch_size)):
    batch = []
    for q, t in zip(test_dataset['query'][n: n + batch_size], test_dataset['text'][n: n + batch_size]):
        input_ids = tokenizer.encode_plus(q, t)
        input_ids.pop('token_type_ids')
        batch.append(input_ids)
    padded = tokenizer.pad(batch, return_tensors = 'pt').to('cuda')
    scores.extend(reranker_model(**padded).logits.cpu().detach().float().numpy().argmax(axis = 1).tolist())

  0%|          | 0/2095 [00:00<?, ?it/s]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [4]:
# Evaluate predictions (macro)
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
print("Accuracy:", accuracy)
print("ROC-AUC:", roc_auc)
print("\nClassification Report:")
print(classification_report(
    true_labels,
    scores,
    target_names=["class 0", "class 1"],
    digits=5
))
print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, scores))

Accuracy: 0.9942718029500215
ROC-AUC: 0.9942718302805037

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99484   0.99370   0.99427     10475
     class 1    0.99371   0.99484   0.99427     10474

    accuracy                        0.99427     20949
   macro avg    0.99427   0.99427   0.99427     20949
weighted avg    0.99427   0.99427   0.99427     20949


Confusion Matrix:
[[10409    66]
 [   54 10420]]


In [12]:
# save and export inferenced dataframe
inferenced_df = pd.concat([test_dataset, pd.DataFrame(scores, columns=['predicted_label'])], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Bagaimana cara penghantaran dan kos penghantar...,Ladam merupakan barang buatan yang lazimnya di...,0,test,common-crawl-qa,0
1,Apakah jenama bateri kereta yang disediakan ol...,Komik adalah sejenis seni visual yang menggabu...,0,test,common-crawl-qa,0
2,Apakah kawasan liputan untuk servis hantar bat...,"untuk memecat Lawton, meninggalkan sekali lagi...",0,test,common-crawl-qa,0


In [13]:
inferenced_df.to_json(
    f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl',
    orient='records',
    lines=True
)

#### 2.0 Facebook

In [14]:
f = "test_datasets/test-reformatted-facebook.jsonl"
f_name = f.split('reformatted-')[-1].split('.')[0]

test_dataset = pd.read_json(f, lines=True)
test_dataset['dataset_source'] = f_name

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (45000, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Sekapur sirih road dah,Abi Ahmad Furqan memang Barisan Nasional dah l...,0,test,facebook
1,Menangkan calon pakatan harapan di segi besar,Ismail sabri dan muhiadin,0,test,facebook
2,Norlelawati Sukiman amin pra,25k terbaik,0,test,facebook


In [15]:
# initialize a list to store all the scores
scores = []
batch_size = 10

for n in tqdm(range(0, len(test_dataset), batch_size)):
    batch = []
    for q, t in zip(test_dataset['query'][n: n + batch_size], test_dataset['text'][n: n + batch_size]):
        input_ids = tokenizer.encode_plus(q, t)
        input_ids.pop('token_type_ids')
        batch.append(input_ids)
    padded = tokenizer.pad(batch, return_tensors = 'pt').to('cuda')
    scores.extend(reranker_model(**padded).logits.cpu().detach().float().numpy().argmax(axis = 1).tolist())

  0%|          | 0/4500 [00:00<?, ?it/s]

In [6]:
# Evaluate predictions (macro)
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
print("Accuracy:", accuracy)
print("ROC-AUC:", roc_auc)
print("\nClassification Report:")
print(classification_report(
    true_labels,
    scores,
    target_names=["class 0", "class 1"],
    digits=5
))
print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, scores))

Accuracy: 0.9860444444444444
ROC-AUC: 0.9865396949267048

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99397   0.98000   0.98694     24205
     class 1    0.97710   0.99308   0.98502     20795

    accuracy                        0.98604     45000
   macro avg    0.98553   0.98654   0.98598     45000
weighted avg    0.98617   0.98604   0.98605     45000


Confusion Matrix:
[[23721   484]
 [  144 20651]]


In [17]:
# save and export inferenced dataframe
inferenced_df = pd.concat([test_dataset, pd.DataFrame(scores, columns=['predicted_label'])], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Sekapur sirih road dah,Abi Ahmad Furqan memang Barisan Nasional dah l...,0,test,facebook,0
1,Menangkan calon pakatan harapan di segi besar,Ismail sabri dan muhiadin,0,test,facebook,0
2,Norlelawati Sukiman amin pra,25k terbaik,0,test,facebook,0


In [18]:
inferenced_df.to_json(
    f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl',
    orient='records',
    lines=True
)

#### 3.0 Hansard QA

In [19]:
f = "test_datasets/test-reformatted-hansard-qa.jsonl"
f_name = f.split('reformatted-')[-1].split('.')[0]

test_dataset = pd.read_json(f, lines=True)
test_dataset['dataset_source'] = f_name

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (12712, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Siapakah wakil rakyat untuk kawasan Kota Belud?,\nii DR.07.12.2021 \n\n \n58. YB. Dr. Kelvin Y...,1,test,hansard-qa
1,Berapa jumlah unit yang telah dibina oleh PR1M...,from this seller User Review Submit A Review O...,0,test,hansard-qa
2,Apakah langkah kerajaan yang hendak dikenakan ...,\n8 ...,1,test,hansard-qa


In [20]:
# initialize a list to store all the scores
scores = []
batch_size = 10

for n in tqdm(range(0, len(test_dataset), batch_size)):
    batch = []
    for q, t in zip(test_dataset['query'][n: n + batch_size], test_dataset['text'][n: n + batch_size]):
        input_ids = tokenizer.encode_plus(q, t)
        input_ids.pop('token_type_ids')
        batch.append(input_ids)
    padded = tokenizer.pad(batch, return_tensors = 'pt').to('cuda')
    scores.extend(reranker_model(**padded).logits.cpu().detach().float().numpy().argmax(axis = 1).tolist())

  0%|          | 0/1272 [00:00<?, ?it/s]

In [8]:
# Evaluate predictions (macro)
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
print("Accuracy:", accuracy)
print("ROC-AUC:", roc_auc)
print("\nClassification Report:")
print(classification_report(
    true_labels,
    scores,
    target_names=["class 0", "class 1"],
    digits=5
))
print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, scores))

Accuracy: 0.9938640654499685
ROC-AUC: 0.9938640654499684

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99746   0.99025   0.99384      6356
     class 1    0.99032   0.99748   0.99389      6356

    accuracy                        0.99386     12712
   macro avg    0.99389   0.99386   0.99386     12712
weighted avg    0.99389   0.99386   0.99386     12712


Confusion Matrix:
[[6294   62]
 [  16 6340]]


In [22]:
# save and export inferenced dataframe
inferenced_df = pd.concat([test_dataset, pd.DataFrame(scores, columns=['predicted_label'])], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Siapakah wakil rakyat untuk kawasan Kota Belud?,\nii DR.07.12.2021 \n\n \n58. YB. Dr. Kelvin Y...,1,test,hansard-qa,1
1,Berapa jumlah unit yang telah dibina oleh PR1M...,from this seller User Review Submit A Review O...,0,test,hansard-qa,0
2,Apakah langkah kerajaan yang hendak dikenakan ...,\n8 ...,1,test,hansard-qa,1


In [23]:
inferenced_df.to_json(
    f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl',
    orient='records',
    lines=True
)

#### 4.0 IIUM Confession

In [24]:
f = "test_datasets/test-reformatted-iium-confession.jsonl"
f_name = f.split('reformatted-')[-1].split('.')[0]

test_dataset = pd.read_json(f, lines=True)
test_dataset['dataset_source'] = f_name

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (45000, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,"Means, dia berkurung dalam bilik, tarik muka d...","""usia 20 an macam ni,memang kita perlukan sora...",0,test,iium-confession
1,Saya selalu follow seorang perunding imej feme...,Aku sentiasa ikut seorang consultant image yan...,1,test,iium-confession
2,Dan aku juga kasihan akan diriku sendiri. Di r...,And I also pity myself. In that house I was st...,1,test,iium-confession


In [25]:
# initialize a list to store all the scores
scores = []
batch_size = 10

for n in tqdm(range(0, len(test_dataset), batch_size)):
    batch = []
    for q, t in zip(test_dataset['query'][n: n + batch_size], test_dataset['text'][n: n + batch_size]):
        input_ids = tokenizer.encode_plus(q, t)
        input_ids.pop('token_type_ids')
        batch.append(input_ids)
    padded = tokenizer.pad(batch, return_tensors = 'pt').to('cuda')
    scores.extend(reranker_model(**padded).logits.cpu().detach().float().numpy().argmax(axis = 1).tolist())

  0%|          | 0/4500 [00:00<?, ?it/s]

In [10]:
# Evaluate predictions (macro)
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
print("Accuracy:", accuracy)
print("ROC-AUC:", roc_auc)
print("\nClassification Report:")
print(classification_report(
    true_labels,
    scores,
    target_names=["class 0", "class 1"],
    digits=5
))
print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, scores))

Accuracy: 0.9984
ROC-AUC: 0.9981973866551858

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99790   0.99752   0.99771     15743
     class 1    0.99867   0.99887   0.99877     29257

    accuracy                        0.99840     45000
   macro avg    0.99829   0.99820   0.99824     45000
weighted avg    0.99840   0.99840   0.99840     45000


Confusion Matrix:
[[15704    39]
 [   33 29224]]


In [27]:
# save and export inferenced dataframe
inferenced_df = pd.concat([test_dataset, pd.DataFrame(scores, columns=['predicted_label'])], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,"Means, dia berkurung dalam bilik, tarik muka d...","""usia 20 an macam ni,memang kita perlukan sora...",0,test,iium-confession,0
1,Saya selalu follow seorang perunding imej feme...,Aku sentiasa ikut seorang consultant image yan...,1,test,iium-confession,1
2,Dan aku juga kasihan akan diriku sendiri. Di r...,And I also pity myself. In that house I was st...,1,test,iium-confession,1


In [28]:
inferenced_df.to_json(
    f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl',
    orient='records',
    lines=True
)

#### 5.0 B Cari

In [29]:
f = "test_datasets/test-reformatted-mining-b-cari-com-my.jsonl"
f_name = f.split('reformatted-')[-1].split('.')[0]

test_dataset = pd.read_json(f, lines=True)
test_dataset['dataset_source'] = f_name

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (44910, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Hello. I am a student. Every time I need to wr...,Hello. Im a student. If I need to write an ess...,1,test,mining-b-cari-com-my
1,betul ke boleh tapis? dan filter dalam tu mest...,Is it true that it can be filtered? And the fi...,1,test,mining-b-cari-com-my
2,Jangan dia main campak masuk tong sampah bila ...,mmmeeeeowwwwwwwwwwwwwwww,0,test,mining-b-cari-com-my


In [30]:
# initialize a list to store all the scores
scores = []
batch_size = 10

for n in tqdm(range(0, len(test_dataset), batch_size)):
    batch = []
    for q, t in zip(test_dataset['query'][n: n + batch_size], test_dataset['text'][n: n + batch_size]):
        input_ids = tokenizer.encode_plus(q, t)
        input_ids.pop('token_type_ids')
        batch.append(input_ids)
    padded = tokenizer.pad(batch, return_tensors = 'pt').to('cuda')
    scores.extend(reranker_model(**padded).logits.cpu().detach().float().numpy().argmax(axis = 1).tolist())

  0%|          | 0/4491 [00:00<?, ?it/s]

In [12]:
# Evaluate predictions (macro)
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
print("Accuracy:", accuracy)
print("ROC-AUC:", roc_auc)
print("\nClassification Report:")
print(classification_report(
    true_labels,
    scores,
    target_names=["class 0", "class 1"],
    digits=5
))
print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, scores))

Accuracy: 0.9915163660654642
ROC-AUC: 0.9910426827596315

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99633   0.98518   0.99072     20642
     class 1    0.98751   0.99691   0.99219     24268

    accuracy                        0.99152     44910
   macro avg    0.99192   0.99104   0.99145     44910
weighted avg    0.99156   0.99152   0.99151     44910


Confusion Matrix:
[[20336   306]
 [   75 24193]]


In [32]:
# save and export inferenced dataframe
inferenced_df = pd.concat([test_dataset, pd.DataFrame(scores, columns=['predicted_label'])], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Hello. I am a student. Every time I need to wr...,Hello. Im a student. If I need to write an ess...,1,test,mining-b-cari-com-my,1
1,betul ke boleh tapis? dan filter dalam tu mest...,Is it true that it can be filtered? And the fi...,1,test,mining-b-cari-com-my,1
2,Jangan dia main campak masuk tong sampah bila ...,mmmeeeeowwwwwwwwwwwwwwww,0,test,mining-b-cari-com-my,0


In [33]:
inferenced_df.to_json(
    f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl',
    orient='records',
    lines=True
)

#### 6.0 Summarization

In [4]:
f = "test_datasets/test-reformatted-mining-summarization.jsonl"
f_name = f.split('reformatted-')[-1].split('.')[0]

test_dataset = pd.read_json(f, lines=True)
test_dataset['dataset_source'] = f_name

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (44123, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Sektor pertanian dijangka pulih pada 2019 deng...,KUALA LUMPUR: Sektor pertanian dijangka pulih ...,1,test,mining-summarization
1,"Malaysia Today, a Malaysian news site, has bee...","Tolong lah gais, aku nasihatkan tolong kunci p...",0,test,mining-summarization
2,"In the Malaysian General Election (GE15), no p...",PUTRAJAYA: Tiada mana-mana parti memperoleh 50...,1,test,mining-summarization


In [5]:
# initialize a list to store all the scores
scores = []
batch_size = 2

for n in tqdm(range(0, len(test_dataset), batch_size)):
    batch = []
    for q, t in zip(test_dataset['query'][n: n + batch_size], test_dataset['text'][n: n + batch_size]):
        input_ids = tokenizer.encode_plus(q, t)
        input_ids.pop('token_type_ids')
        batch.append(input_ids)
    padded = tokenizer.pad(batch, return_tensors = 'pt').to('cuda')
    scores.extend(reranker_model(**padded).logits.cpu().detach().float().numpy().argmax(axis = 1).tolist())

  0%|          | 0/22062 [00:00<?, ?it/s]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [14]:
# Evaluate predictions (macro)
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
print("Accuracy:", accuracy)
print("ROC-AUC:", roc_auc)
print("\nClassification Report:")
print(classification_report(
    true_labels,
    scores,
    target_names=["class 0", "class 1"],
    digits=5
))
print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, scores))

Accuracy: 0.9954898805611586
ROC-AUC: 0.9908778354014366

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99670   0.99787   0.99728     36614
     class 1    0.98955   0.98389   0.98671      7509

    accuracy                        0.99549     44123
   macro avg    0.99313   0.99088   0.99200     44123
weighted avg    0.99548   0.99549   0.99548     44123


Confusion Matrix:
[[36536    78]
 [  121  7388]]


In [7]:
# save and export inferenced dataframe
inferenced_df = pd.concat([test_dataset, pd.DataFrame(scores, columns=['predicted_label'])], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Sektor pertanian dijangka pulih pada 2019 deng...,KUALA LUMPUR: Sektor pertanian dijangka pulih ...,1,test,mining-summarization,1
1,"Malaysia Today, a Malaysian news site, has bee...","Tolong lah gais, aku nasihatkan tolong kunci p...",0,test,mining-summarization,0
2,"In the Malaysian General Election (GE15), no p...",PUTRAJAYA: Tiada mana-mana parti memperoleh 50...,1,test,mining-summarization,1


In [8]:
inferenced_df.to_json(
    f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl',
    orient='records',
    lines=True
)

#### 7.0 News

In [44]:
f = "test_datasets/test-reformatted-news.jsonl"
f_name = f.split('reformatted-')[-1].split('.')[0]

test_dataset = pd.read_json(f, lines=True)
test_dataset['dataset_source'] = f_name

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (45000, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,"Bridgerton: sex, gossip and race in Regency Lo...",THE appointment of Najib Razak as chairman of ...,0,test,news
1,PAS minta putus dengan PKR kerana Selena Gomez,\n(Focus Malaysia) – Legal practitioner Khairu...,0,test,news
2,Harga MSM dijangka terus meningkat,"LONDON, Feb 9 — Lewis Hamilton has signed a o...",0,test,news


In [45]:
# initialize a list to store all the scores
scores = []
batch_size = 5

for n in tqdm(range(0, len(test_dataset), batch_size)):
    batch = []
    for q, t in zip(test_dataset['query'][n: n + batch_size], test_dataset['text'][n: n + batch_size]):
        input_ids = tokenizer.encode_plus(q, t)
        input_ids.pop('token_type_ids')
        batch.append(input_ids)
    padded = tokenizer.pad(batch, return_tensors = 'pt').to('cuda')
    scores.extend(reranker_model(**padded).logits.cpu().detach().float().numpy().argmax(axis = 1).tolist())

  0%|          | 0/9000 [00:00<?, ?it/s]

In [16]:
# Evaluate predictions (macro)
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
print("Accuracy:", accuracy)
print("ROC-AUC:", roc_auc)
print("\nClassification Report:")
print(classification_report(
    true_labels,
    scores,
    target_names=["class 0", "class 1"],
    digits=5
))
print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, scores))

Accuracy: 0.9903777777777778
ROC-AUC: 0.9859228477782166

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99579   0.99263   0.99421     37446
     class 1    0.96403   0.97922   0.97156      7554

    accuracy                        0.99038     45000
   macro avg    0.97991   0.98592   0.98289     45000
weighted avg    0.99046   0.99038   0.99041     45000


Confusion Matrix:
[[37170   276]
 [  157  7397]]


In [47]:
# save and export inferenced dataframe
inferenced_df = pd.concat([test_dataset, pd.DataFrame(scores, columns=['predicted_label'])], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,"Bridgerton: sex, gossip and race in Regency Lo...",THE appointment of Najib Razak as chairman of ...,0,test,news,0
1,PAS minta putus dengan PKR kerana Selena Gomez,\n(Focus Malaysia) – Legal practitioner Khairu...,0,test,news,0
2,Harga MSM dijangka terus meningkat,"LONDON, Feb 9 — Lewis Hamilton has signed a o...",0,test,news,0


In [48]:
inferenced_df.to_json(
    f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl',
    orient='records',
    lines=True
)

#### 8.0 Twitter

In [34]:
f = "test_datasets/test-reformatted-twitter.jsonl"
f_name = f.split('reformatted-')[-1].split('.')[0]

test_dataset = pd.read_json(f, lines=True)
test_dataset['dataset_source'] = f_name

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (45000, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,@nickgle_ Sugarbun tapi nadai manuk tadi.,@nickgle_ Sugarbun tapi xda ayam td,1,test,twitter
1,Semua laki laki sama aja.\nTapi kalo kamu sama...,@hardliffee Jngan terlalu larut tidurnya yaa,0,test,twitter
2,11:16am PLUS@E1 : Kerja penyelenggaraan KM 23...,Maintenance work KM 231.0 - KM 233.0 South fro...,1,test,twitter


In [35]:
# initialize a list to store all the scores
scores = []
batch_size = 10

for n in tqdm(range(0, len(test_dataset), batch_size)):
    batch = []
    for q, t in zip(test_dataset['query'][n: n + batch_size], test_dataset['text'][n: n + batch_size]):
        input_ids = tokenizer.encode_plus(q, t)
        input_ids.pop('token_type_ids')
        batch.append(input_ids)
    padded = tokenizer.pad(batch, return_tensors = 'pt').to('cuda')
    scores.extend(reranker_model(**padded).logits.cpu().detach().float().numpy().argmax(axis = 1).tolist())

  0%|          | 0/4500 [00:00<?, ?it/s]

In [18]:
# Evaluate predictions (macro)
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
print("Accuracy:", accuracy)
print("ROC-AUC:", roc_auc)
print("\nClassification Report:")
print(classification_report(
    true_labels,
    scores,
    target_names=["class 0", "class 1"],
    digits=5
))
print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, scores))

Accuracy: 0.9949333333333333
ROC-AUC: 0.9944199191248039

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99688   0.99111   0.99399     19010
     class 1    0.99352   0.99773   0.99562     25990

    accuracy                        0.99493     45000
   macro avg    0.99520   0.99442   0.99480     45000
weighted avg    0.99494   0.99493   0.99493     45000


Confusion Matrix:
[[18841   169]
 [   59 25931]]


In [37]:
# save and export inferenced dataframe
inferenced_df = pd.concat([test_dataset, pd.DataFrame(scores, columns=['predicted_label'])], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,@nickgle_ Sugarbun tapi nadai manuk tadi.,@nickgle_ Sugarbun tapi xda ayam td,1,test,twitter,1
1,Semua laki laki sama aja.\nTapi kalo kamu sama...,@hardliffee Jngan terlalu larut tidurnya yaa,0,test,twitter,0
2,11:16am PLUS@E1 : Kerja penyelenggaraan KM 23...,Maintenance work KM 231.0 - KM 233.0 South fro...,1,test,twitter,1


In [38]:
inferenced_df.to_json(
    f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl',
    orient='records',
    lines=True
)

#### 9.0 Wikipedia QA

In [39]:
f = "test_datasets/test-reformatted-wikipedia-qa.jsonl"
f_name = f.split('reformatted-')[-1].split('.')[0]

test_dataset = pd.read_json(f, lines=True)
test_dataset['dataset_source'] = f_name

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (32903, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Apakah penyakit yang diderita oleh Sharielda?,\nDR. 26.7.2022 103 \n\n \nkemungkinan daripad...,0,test,wikipedia-qa
1,Siapakah komposer muda yang Chrisye dekati unt...,"Guruh, dan beliau juga akan berjumpa dengannya...",1,test,wikipedia-qa
2,Siapakah yang menfailkan petisyen untuk menunt...,\n56 DR. 10.7.2019 \n\n \n\n \n\nTuan Yang d...,0,test,wikipedia-qa


In [40]:
# initialize a list to store all the scores
scores = []
batch_size = 10

for n in tqdm(range(0, len(test_dataset), batch_size)):
    batch = []
    for q, t in zip(test_dataset['query'][n: n + batch_size], test_dataset['text'][n: n + batch_size]):
        input_ids = tokenizer.encode_plus(q, t)
        input_ids.pop('token_type_ids')
        batch.append(input_ids)
    padded = tokenizer.pad(batch, return_tensors = 'pt').to('cuda')
    scores.extend(reranker_model(**padded).logits.cpu().detach().float().numpy().argmax(axis = 1).tolist())

  0%|          | 0/3291 [00:00<?, ?it/s]

In [42]:
# save and export inferenced dataframe
inferenced_df = pd.concat([test_dataset, pd.DataFrame(scores, columns=['predicted_label'])], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Apakah penyakit yang diderita oleh Sharielda?,\nDR. 26.7.2022 103 \n\n \nkemungkinan daripad...,0,test,wikipedia-qa,0
1,Siapakah komposer muda yang Chrisye dekati unt...,"Guruh, dan beliau juga akan berjumpa dengannya...",1,test,wikipedia-qa,1
2,Siapakah yang menfailkan petisyen untuk menunt...,\n56 DR. 10.7.2019 \n\n \n\n \n\nTuan Yang d...,0,test,wikipedia-qa,0


In [43]:
inferenced_df.to_json(
    f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl',
    orient='records',
    lines=True
)

In [19]:
f = "test_datasets/test-reformatted-wikipedia-qa.jsonl"
f_name = f.split('reformatted-')[-1].split('.')[0]
test_dataset = pd.read_json(f"inferenced_output/{model_used}-inferenced-{f_name}.jsonl", lines=True, orient='records')
scores = test_dataset["predicted_label"]

test_dataset["label"].value_counts()

label
0    16452
1    16451
Name: count, dtype: int64

In [20]:
# Evaluate predictions (macro)
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
print("Accuracy:", accuracy)
print("ROC-AUC:", roc_auc)
print("\nClassification Report:")
print(classification_report(
    true_labels,
    scores,
    target_names=["class 0", "class 1"],
    digits=5
))
print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, scores))

Accuracy: 0.9956842841078322
ROC-AUC: 0.9956843450677736

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99768   0.99368   0.99568     16452
     class 1    0.99370   0.99769   0.99569     16451

    accuracy                        0.99568     32903
   macro avg    0.99569   0.99568   0.99568     32903
weighted avg    0.99569   0.99568   0.99568     32903


Confusion Matrix:
[[16348   104]
 [   38 16413]]
