In [1]:
import glob
import json
from pathlib import Path

import torch
import flash_attn
import pandas as pd
from tqdm.notebook import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report, \
                            roc_auc_score, confusion_matrix

# short model tag; reused as the prefix of every exported predictions file
model_used = "191M"

In [2]:
# load the reranker tokenizer and its sequence-classification model
reranker_checkpoint = "mesolitica/reranker-malaysian-mistral-191M-32k"

tokenizer = AutoTokenizer.from_pretrained(reranker_checkpoint)
reranker_model = AutoModelForSequenceClassification.from_pretrained(
    reranker_checkpoint,
    # `use_flash_attention_2=True` is deprecated; this is the supported spelling
    attn_implementation = "flash_attention_2",
    torch_dtype = torch.bfloat16,
    use_cache = False
)

The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


In [3]:
# move the model onto the GPU, then put it in evaluation mode
reranker_model.cuda()
_ = reranker_model.eval()

In [4]:
# print the nvidia-smi report (GPU model, driver/CUDA version, current memory usage)
!nvidia-smi

Fri Apr 26 15:28:26 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          On  | 00000001:00:00.0 Off |                    0 |
| N/A   47C    P0              68W / 300W |   6139MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                         

### Inference & Evaluation

#### 1.0 Common Crawl QA

In [7]:
# Read the Common Crawl QA test file and tag every row with a short source name
# (the text after 'reformatted-' in the file name, up to the first '.').
f = "test_datasets/test-reformatted-common-crawl-qa.jsonl"
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (20949, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Bagaimana cara penghantaran dan kos penghantar...,Ladam merupakan barang buatan yang lazimnya di...,0,test,common-crawl-qa
1,Apakah jenama bateri kereta yang disediakan ol...,Komik adalah sejenis seni visual yang menggabu...,0,test,common-crawl-qa
2,Apakah kawasan liputan untuk servis hantar bat...,"untuk memecat Lawton, meninggalkan sekali lagi...",0,test,common-crawl-qa


In [8]:
# predicted class id (argmax over the logits) for every row of test_dataset
scores = []
batch_size = 10

# plain Python lists: the tokenizer's batch API expects lists of strings
queries = test_dataset['query'].tolist()
texts = test_dataset['text'].tolist()

with torch.no_grad():
    for n in tqdm(range(0, len(queries), batch_size)):
        # encode and pad the (query, text) pairs of this batch in a single call;
        # token_type_ids are never requested, so nothing has to be popped afterwards
        padded = tokenizer(
            queries[n: n + batch_size],
            texts[n: n + batch_size],
            padding = True,
            return_tensors = 'pt',
            return_token_type_ids = False,
        ).to('cuda')
        logits = reranker_model(**padded).logits
        # cast to float32 before handing the logits to NumPy (the model runs in bfloat16)
        scores.extend(logits.float().cpu().numpy().argmax(axis = 1).tolist())

  0%|          | 0/2095 [00:00<?, ?it/s]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [9]:
# Compare the predicted class ids against the gold labels.
# NOTE(review): `scores` holds hard 0/1 predictions, not probabilities, so the
# ROC-AUC below reflects a single decision threshold only.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
for metric_label, metric_value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(metric_label, metric_value)

print("\nClassification Report:")
report_text = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
print(report_text)

print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, scores))

Accuracy: 0.9948446226550194
ROC-AUC: 0.9948445360554125

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99306   0.99666   0.99485     10475
     class 1    0.99665   0.99303   0.99484     10474

    accuracy                        0.99484     20949
   macro avg    0.99485   0.99484   0.99484     20949
weighted avg    0.99485   0.99484   0.99484     20949


Confusion Matrix:
[[10440    35]
 [   73 10401]]


In [10]:
# attach the predictions to the test rows and preview the result
# (pd.concat(axis=1) aligns on index labels and would silently pad with NaN if
#  `scores` were stale or mis-sized, so check the length and assign directly)
assert len(scores) == len(test_dataset), \
    "scores is out of sync with test_dataset; re-run the inference cell"
inferenced_df = test_dataset.assign(predicted_label=scores)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Bagaimana cara penghantaran dan kos penghantar...,Ladam merupakan barang buatan yang lazimnya di...,0,test,common-crawl-qa,0
1,Apakah jenama bateri kereta yang disediakan ol...,Komik adalah sejenis seni visual yang menggabu...,0,test,common-crawl-qa,0
2,Apakah kawasan liputan untuk servis hantar bat...,"untuk memecat Lawton, meninggalkan sekali lagi...",0,test,common-crawl-qa,0


In [11]:
# export the predictions as JSON Lines; create the output folder first because
# to_json fails when the target directory does not exist
output_path = Path('inferenced_output') / f'{model_used}-inferenced-{f_name}.jsonl'
output_path.parent.mkdir(parents=True, exist_ok=True)
inferenced_df.to_json(output_path, orient='records', lines=True)

#### 2.0 Facebook

In [12]:
# Read the Facebook test file and tag every row with a short source name
# (the text after 'reformatted-' in the file name, up to the first '.').
f = "test_datasets/test-reformatted-facebook.jsonl"
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (45000, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Sekapur sirih road dah,Abi Ahmad Furqan memang Barisan Nasional dah l...,0,test,facebook
1,Menangkan calon pakatan harapan di segi besar,Ismail sabri dan muhiadin,0,test,facebook
2,Norlelawati Sukiman amin pra,25k terbaik,0,test,facebook


In [13]:
# predicted class id (argmax over the logits) for every row of test_dataset
scores = []
batch_size = 10

# plain Python lists: the tokenizer's batch API expects lists of strings
queries = test_dataset['query'].tolist()
texts = test_dataset['text'].tolist()

with torch.no_grad():
    for n in tqdm(range(0, len(queries), batch_size)):
        # encode and pad the (query, text) pairs of this batch in a single call;
        # token_type_ids are never requested, so nothing has to be popped afterwards
        padded = tokenizer(
            queries[n: n + batch_size],
            texts[n: n + batch_size],
            padding = True,
            return_tensors = 'pt',
            return_token_type_ids = False,
        ).to('cuda')
        logits = reranker_model(**padded).logits
        # cast to float32 before handing the logits to NumPy (the model runs in bfloat16)
        scores.extend(logits.float().cpu().numpy().argmax(axis = 1).tolist())

  0%|          | 0/4500 [00:00<?, ?it/s]

In [14]:
# Compare the predicted class ids against the gold labels.
# NOTE(review): `scores` holds hard 0/1 predictions, not probabilities, so the
# ROC-AUC below reflects a single decision threshold only.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
for metric_label, metric_value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(metric_label, metric_value)

print("\nClassification Report:")
report_text = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
print(report_text)

print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, scores))

Accuracy: 0.9878
ROC-AUC: 0.9880259330131705

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99222   0.98504   0.98862     24205
     class 1    0.98274   0.99101   0.98686     20795

    accuracy                        0.98780     45000
   macro avg    0.98748   0.98803   0.98774     45000
weighted avg    0.98784   0.98780   0.98780     45000


Confusion Matrix:
[[23843   362]
 [  187 20608]]


In [15]:
# attach the predictions to the test rows and preview the result
# (pd.concat(axis=1) aligns on index labels and would silently pad with NaN if
#  `scores` were stale or mis-sized, so check the length and assign directly)
assert len(scores) == len(test_dataset), \
    "scores is out of sync with test_dataset; re-run the inference cell"
inferenced_df = test_dataset.assign(predicted_label=scores)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Sekapur sirih road dah,Abi Ahmad Furqan memang Barisan Nasional dah l...,0,test,facebook,0
1,Menangkan calon pakatan harapan di segi besar,Ismail sabri dan muhiadin,0,test,facebook,0
2,Norlelawati Sukiman amin pra,25k terbaik,0,test,facebook,0


In [16]:
# export the predictions as JSON Lines; create the output folder first because
# to_json fails when the target directory does not exist
output_path = Path('inferenced_output') / f'{model_used}-inferenced-{f_name}.jsonl'
output_path.parent.mkdir(parents=True, exist_ok=True)
inferenced_df.to_json(output_path, orient='records', lines=True)

#### 3.0 Hansard QA

In [17]:
# Read the Hansard QA test file and tag every row with a short source name
# (the text after 'reformatted-' in the file name, up to the first '.').
f = "test_datasets/test-reformatted-hansard-qa.jsonl"
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (12712, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Siapakah wakil rakyat untuk kawasan Kota Belud?,\nii DR.07.12.2021 \n\n \n58. YB. Dr. Kelvin Y...,1,test,hansard-qa
1,Berapa jumlah unit yang telah dibina oleh PR1M...,from this seller User Review Submit A Review O...,0,test,hansard-qa
2,Apakah langkah kerajaan yang hendak dikenakan ...,\n8 ...,1,test,hansard-qa


In [18]:
# predicted class id (argmax over the logits) for every row of test_dataset
scores = []
batch_size = 10

# plain Python lists: the tokenizer's batch API expects lists of strings
queries = test_dataset['query'].tolist()
texts = test_dataset['text'].tolist()

with torch.no_grad():
    for n in tqdm(range(0, len(queries), batch_size)):
        # encode and pad the (query, text) pairs of this batch in a single call;
        # token_type_ids are never requested, so nothing has to be popped afterwards
        padded = tokenizer(
            queries[n: n + batch_size],
            texts[n: n + batch_size],
            padding = True,
            return_tensors = 'pt',
            return_token_type_ids = False,
        ).to('cuda')
        logits = reranker_model(**padded).logits
        # cast to float32 before handing the logits to NumPy (the model runs in bfloat16)
        scores.extend(logits.float().cpu().numpy().argmax(axis = 1).tolist())

  0%|          | 0/1272 [00:00<?, ?it/s]

In [19]:
# Compare the predicted class ids against the gold labels.
# NOTE(review): `scores` holds hard 0/1 predictions, not probabilities, so the
# ROC-AUC below reflects a single decision threshold only.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
for metric_label, metric_value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(metric_label, metric_value)

print("\nClassification Report:")
report_text = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
print(report_text)

print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, scores))

Accuracy: 0.99441472624292
ROC-AUC: 0.99441472624292

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99558   0.99323   0.99441      6356
     class 1    0.99325   0.99559   0.99442      6356

    accuracy                        0.99441     12712
   macro avg    0.99442   0.99441   0.99441     12712
weighted avg    0.99442   0.99441   0.99441     12712


Confusion Matrix:
[[6313   43]
 [  28 6328]]


In [20]:
# attach the predictions to the test rows and preview the result
# (pd.concat(axis=1) aligns on index labels and would silently pad with NaN if
#  `scores` were stale or mis-sized, so check the length and assign directly)
assert len(scores) == len(test_dataset), \
    "scores is out of sync with test_dataset; re-run the inference cell"
inferenced_df = test_dataset.assign(predicted_label=scores)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Siapakah wakil rakyat untuk kawasan Kota Belud?,\nii DR.07.12.2021 \n\n \n58. YB. Dr. Kelvin Y...,1,test,hansard-qa,1
1,Berapa jumlah unit yang telah dibina oleh PR1M...,from this seller User Review Submit A Review O...,0,test,hansard-qa,0
2,Apakah langkah kerajaan yang hendak dikenakan ...,\n8 ...,1,test,hansard-qa,1


In [21]:
# export the predictions as JSON Lines; create the output folder first because
# to_json fails when the target directory does not exist
output_path = Path('inferenced_output') / f'{model_used}-inferenced-{f_name}.jsonl'
output_path.parent.mkdir(parents=True, exist_ok=True)
inferenced_df.to_json(output_path, orient='records', lines=True)

#### 4.0 IIUM Confession

In [22]:
# Read the IIUM Confession test file and tag every row with a short source name
# (the text after 'reformatted-' in the file name, up to the first '.').
f = "test_datasets/test-reformatted-iium-confession.jsonl"
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (45000, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,"Means, dia berkurung dalam bilik, tarik muka d...","""usia 20 an macam ni,memang kita perlukan sora...",0,test,iium-confession
1,Saya selalu follow seorang perunding imej feme...,Aku sentiasa ikut seorang consultant image yan...,1,test,iium-confession
2,Dan aku juga kasihan akan diriku sendiri. Di r...,And I also pity myself. In that house I was st...,1,test,iium-confession


In [23]:
# predicted class id (argmax over the logits) for every row of test_dataset
scores = []
batch_size = 10

# plain Python lists: the tokenizer's batch API expects lists of strings
queries = test_dataset['query'].tolist()
texts = test_dataset['text'].tolist()

with torch.no_grad():
    for n in tqdm(range(0, len(queries), batch_size)):
        # encode and pad the (query, text) pairs of this batch in a single call;
        # token_type_ids are never requested, so nothing has to be popped afterwards
        padded = tokenizer(
            queries[n: n + batch_size],
            texts[n: n + batch_size],
            padding = True,
            return_tensors = 'pt',
            return_token_type_ids = False,
        ).to('cuda')
        logits = reranker_model(**padded).logits
        # cast to float32 before handing the logits to NumPy (the model runs in bfloat16)
        scores.extend(logits.float().cpu().numpy().argmax(axis = 1).tolist())

  0%|          | 0/4500 [00:00<?, ?it/s]

In [24]:
# Compare the predicted class ids against the gold labels.
# NOTE(review): `scores` holds hard 0/1 predictions, not probabilities, so the
# ROC-AUC below reflects a single decision threshold only.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
for metric_label, metric_value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(metric_label, metric_value)

print("\nClassification Report:")
report_text = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
print(report_text)

print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, scores))

Accuracy: 0.9990444444444444
ROC-AUC: 0.9988837074061083

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99892   0.99835   0.99863     15743
     class 1    0.99911   0.99942   0.99927     29257

    accuracy                        0.99904     45000
   macro avg    0.99902   0.99888   0.99895     45000
weighted avg    0.99904   0.99904   0.99904     45000


Confusion Matrix:
[[15717    26]
 [   17 29240]]


In [25]:
# attach the predictions to the test rows and preview the result
# (pd.concat(axis=1) aligns on index labels and would silently pad with NaN if
#  `scores` were stale or mis-sized, so check the length and assign directly)
assert len(scores) == len(test_dataset), \
    "scores is out of sync with test_dataset; re-run the inference cell"
inferenced_df = test_dataset.assign(predicted_label=scores)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,"Means, dia berkurung dalam bilik, tarik muka d...","""usia 20 an macam ni,memang kita perlukan sora...",0,test,iium-confession,0
1,Saya selalu follow seorang perunding imej feme...,Aku sentiasa ikut seorang consultant image yan...,1,test,iium-confession,1
2,Dan aku juga kasihan akan diriku sendiri. Di r...,And I also pity myself. In that house I was st...,1,test,iium-confession,1


In [26]:
# export the predictions as JSON Lines; create the output folder first because
# to_json fails when the target directory does not exist
output_path = Path('inferenced_output') / f'{model_used}-inferenced-{f_name}.jsonl'
output_path.parent.mkdir(parents=True, exist_ok=True)
inferenced_df.to_json(output_path, orient='records', lines=True)

#### 5.0 B Cari

In [27]:
# Read the B Cari test file and tag every row with a short source name
# (the text after 'reformatted-' in the file name, up to the first '.').
f = "test_datasets/test-reformatted-mining-b-cari-com-my.jsonl"
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (44910, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Hello. I am a student. Every time I need to wr...,Hello. Im a student. If I need to write an ess...,1,test,mining-b-cari-com-my
1,betul ke boleh tapis? dan filter dalam tu mest...,Is it true that it can be filtered? And the fi...,1,test,mining-b-cari-com-my
2,Jangan dia main campak masuk tong sampah bila ...,mmmeeeeowwwwwwwwwwwwwwww,0,test,mining-b-cari-com-my


In [28]:
# predicted class id (argmax over the logits) for every row of test_dataset
scores = []
batch_size = 10

# plain Python lists: the tokenizer's batch API expects lists of strings
queries = test_dataset['query'].tolist()
texts = test_dataset['text'].tolist()

with torch.no_grad():
    for n in tqdm(range(0, len(queries), batch_size)):
        # encode and pad the (query, text) pairs of this batch in a single call;
        # token_type_ids are never requested, so nothing has to be popped afterwards
        padded = tokenizer(
            queries[n: n + batch_size],
            texts[n: n + batch_size],
            padding = True,
            return_tensors = 'pt',
            return_token_type_ids = False,
        ).to('cuda')
        logits = reranker_model(**padded).logits
        # cast to float32 before handing the logits to NumPy (the model runs in bfloat16)
        scores.extend(logits.float().cpu().numpy().argmax(axis = 1).tolist())

  0%|          | 0/4491 [00:00<?, ?it/s]

In [29]:
# Compare the predicted class ids against the gold labels.
# NOTE(review): `scores` holds hard 0/1 predictions, not probabilities, so the
# ROC-AUC below reflects a single decision threshold only.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
for metric_label, metric_value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(metric_label, metric_value)

print("\nClassification Report:")
report_text = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
print(report_text)

print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, scores))

Accuracy: 0.9936539746158984
ROC-AUC: 0.9933933731983294

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99600   0.99017   0.99308     20642
     class 1    0.99168   0.99662   0.99414     24268

    accuracy                        0.99365     44910
   macro avg    0.99384   0.99339   0.99361     44910
weighted avg    0.99367   0.99365   0.99365     44910


Confusion Matrix:
[[20439   203]
 [   82 24186]]


In [30]:
# attach the predictions to the test rows and preview the result
# (pd.concat(axis=1) aligns on index labels and would silently pad with NaN if
#  `scores` were stale or mis-sized, so check the length and assign directly)
assert len(scores) == len(test_dataset), \
    "scores is out of sync with test_dataset; re-run the inference cell"
inferenced_df = test_dataset.assign(predicted_label=scores)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Hello. I am a student. Every time I need to wr...,Hello. Im a student. If I need to write an ess...,1,test,mining-b-cari-com-my,1
1,betul ke boleh tapis? dan filter dalam tu mest...,Is it true that it can be filtered? And the fi...,1,test,mining-b-cari-com-my,1
2,Jangan dia main campak masuk tong sampah bila ...,mmmeeeeowwwwwwwwwwwwwwww,0,test,mining-b-cari-com-my,0


In [31]:
# export the predictions as JSON Lines; create the output folder first because
# to_json fails when the target directory does not exist
output_path = Path('inferenced_output') / f'{model_used}-inferenced-{f_name}.jsonl'
output_path.parent.mkdir(parents=True, exist_ok=True)
inferenced_df.to_json(output_path, orient='records', lines=True)

#### 6.0 Summarization

In [32]:
# Read the Summarization test file and tag every row with a short source name
# (the text after 'reformatted-' in the file name, up to the first '.').
f = "test_datasets/test-reformatted-mining-summarization.jsonl"
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (44123, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Sektor pertanian dijangka pulih pada 2019 deng...,KUALA LUMPUR: Sektor pertanian dijangka pulih ...,1,test,mining-summarization
1,"Malaysia Today, a Malaysian news site, has bee...","Tolong lah gais, aku nasihatkan tolong kunci p...",0,test,mining-summarization
2,"In the Malaysian General Election (GE15), no p...",PUTRAJAYA: Tiada mana-mana parti memperoleh 50...,1,test,mining-summarization


In [33]:
# predicted class id (argmax over the logits) for every row of test_dataset
scores = []
batch_size = 2

# plain Python lists: the tokenizer's batch API expects lists of strings
queries = test_dataset['query'].tolist()
texts = test_dataset['text'].tolist()

with torch.no_grad():
    for n in tqdm(range(0, len(queries), batch_size)):
        # encode and pad the (query, text) pairs of this batch in a single call;
        # token_type_ids are never requested, so nothing has to be popped afterwards
        padded = tokenizer(
            queries[n: n + batch_size],
            texts[n: n + batch_size],
            padding = True,
            return_tensors = 'pt',
            return_token_type_ids = False,
        ).to('cuda')
        logits = reranker_model(**padded).logits
        # cast to float32 before handing the logits to NumPy (the model runs in bfloat16)
        scores.extend(logits.float().cpu().numpy().argmax(axis = 1).tolist())

  0%|          | 0/22062 [00:00<?, ?it/s]

In [6]:
# Compare the predicted class ids against the gold labels.
# NOTE(review): `scores` holds hard 0/1 predictions, not probabilities, so the
# ROC-AUC below reflects a single decision threshold only.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
for metric_label, metric_value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(metric_label, metric_value)

print("\nClassification Report:")
report_text = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
print(report_text)

print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, scores))

Accuracy: 0.9968950434014007
ROC-AUC: 0.9953237993834212

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99855   0.99771   0.99813     36614
     class 1    0.98886   0.99294   0.99090      7509

    accuracy                        0.99690     44123
   macro avg    0.99371   0.99532   0.99451     44123
weighted avg    0.99690   0.99690   0.99690     44123


Confusion Matrix:
[[36530    84]
 [   53  7456]]


In [7]:
# attach the predictions to the test rows and preview the result
# (pd.concat(axis=1) aligns on index labels and would silently pad with NaN if
#  `scores` were stale or mis-sized, so check the length and assign directly)
assert len(scores) == len(test_dataset), \
    "scores is out of sync with test_dataset; re-run the inference cell"
inferenced_df = test_dataset.assign(predicted_label=scores)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Sektor pertanian dijangka pulih pada 2019 deng...,KUALA LUMPUR: Sektor pertanian dijangka pulih ...,1,test,mining-summarization,1
1,"Malaysia Today, a Malaysian news site, has bee...","Tolong lah gais, aku nasihatkan tolong kunci p...",0,test,mining-summarization,0
2,"In the Malaysian General Election (GE15), no p...",PUTRAJAYA: Tiada mana-mana parti memperoleh 50...,1,test,mining-summarization,1


In [36]:
# export the predictions as JSON Lines; create the output folder first because
# to_json fails when the target directory does not exist
output_path = Path('inferenced_output') / f'{model_used}-inferenced-{f_name}.jsonl'
output_path.parent.mkdir(parents=True, exist_ok=True)
inferenced_df.to_json(output_path, orient='records', lines=True)

#### 7.0 News

In [8]:
# Read the News test file and tag every row with a short source name
# (the text after 'reformatted-' in the file name, up to the first '.').
f = "test_datasets/test-reformatted-news.jsonl"
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (45000, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,"Bridgerton: sex, gossip and race in Regency Lo...",THE appointment of Najib Razak as chairman of ...,0,test,news
1,PAS minta putus dengan PKR kerana Selena Gomez,\n(Focus Malaysia) – Legal practitioner Khairu...,0,test,news
2,Harga MSM dijangka terus meningkat,"LONDON, Feb 9 — Lewis Hamilton has signed a o...",0,test,news


In [9]:
# predicted class id (argmax over the logits) for every row of test_dataset
scores = []
batch_size = 10

# plain Python lists: the tokenizer's batch API expects lists of strings
queries = test_dataset['query'].tolist()
texts = test_dataset['text'].tolist()

with torch.no_grad():
    for n in tqdm(range(0, len(queries), batch_size)):
        # encode and pad the (query, text) pairs of this batch in a single call;
        # token_type_ids are never requested, so nothing has to be popped afterwards
        padded = tokenizer(
            queries[n: n + batch_size],
            texts[n: n + batch_size],
            padding = True,
            return_tensors = 'pt',
            return_token_type_ids = False,
        ).to('cuda')
        logits = reranker_model(**padded).logits
        # cast to float32 before handing the logits to NumPy (the model runs in bfloat16)
        scores.extend(logits.float().cpu().numpy().argmax(axis = 1).tolist())

  0%|          | 0/4500 [00:00<?, ?it/s]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [10]:
# Compare the predicted class ids against the gold labels.
# NOTE(review): `scores` holds hard 0/1 predictions, not probabilities, so the
# ROC-AUC below reflects a single decision threshold only.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
for metric_label, metric_value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(metric_label, metric_value)

print("\nClassification Report:")
report_text = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
print(report_text)

print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, scores))

Accuracy: 0.9908888888888889
ROC-AUC: 0.9871281948096867

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99625   0.99279   0.99452     37446
     class 1    0.96486   0.98147   0.97309      7554

    accuracy                        0.99089     45000
   macro avg    0.98056   0.98713   0.98380     45000
weighted avg    0.99098   0.99089   0.99092     45000


Confusion Matrix:
[[37176   270]
 [  140  7414]]


In [11]:
# attach the predictions to the test rows and preview the result
# (pd.concat(axis=1) aligns on index labels and would silently pad with NaN if
#  `scores` were stale or mis-sized, so check the length and assign directly)
assert len(scores) == len(test_dataset), \
    "scores is out of sync with test_dataset; re-run the inference cell"
inferenced_df = test_dataset.assign(predicted_label=scores)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,"Bridgerton: sex, gossip and race in Regency Lo...",THE appointment of Najib Razak as chairman of ...,0,test,news,0
1,PAS minta putus dengan PKR kerana Selena Gomez,\n(Focus Malaysia) – Legal practitioner Khairu...,0,test,news,0
2,Harga MSM dijangka terus meningkat,"LONDON, Feb 9 — Lewis Hamilton has signed a o...",0,test,news,0


In [12]:
# export the predictions as JSON Lines; create the output folder first because
# to_json fails when the target directory does not exist
output_path = Path('inferenced_output') / f'{model_used}-inferenced-{f_name}.jsonl'
output_path.parent.mkdir(parents=True, exist_ok=True)
inferenced_df.to_json(output_path, orient='records', lines=True)

#### 8.0 Twitter

In [13]:
# Read the Twitter test file and tag every row with a short source name
# (the text after 'reformatted-' in the file name, up to the first '.').
f = "test_datasets/test-reformatted-twitter.jsonl"
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (45000, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,@nickgle_ Sugarbun tapi nadai manuk tadi.,@nickgle_ Sugarbun tapi xda ayam td,1,test,twitter
1,Semua laki laki sama aja.\nTapi kalo kamu sama...,@hardliffee Jngan terlalu larut tidurnya yaa,0,test,twitter
2,11:16am PLUS@E1 : Kerja penyelenggaraan KM 23...,Maintenance work KM 231.0 - KM 233.0 South fro...,1,test,twitter


In [14]:
# predicted class id (argmax over the logits) for every row of test_dataset
scores = []
batch_size = 10

# plain Python lists: the tokenizer's batch API expects lists of strings
queries = test_dataset['query'].tolist()
texts = test_dataset['text'].tolist()

with torch.no_grad():
    for n in tqdm(range(0, len(queries), batch_size)):
        # encode and pad the (query, text) pairs of this batch in a single call;
        # token_type_ids are never requested, so nothing has to be popped afterwards
        padded = tokenizer(
            queries[n: n + batch_size],
            texts[n: n + batch_size],
            padding = True,
            return_tensors = 'pt',
            return_token_type_ids = False,
        ).to('cuda')
        logits = reranker_model(**padded).logits
        # cast to float32 before handing the logits to NumPy (the model runs in bfloat16)
        scores.extend(logits.float().cpu().numpy().argmax(axis = 1).tolist())

  0%|          | 0/4500 [00:00<?, ?it/s]

In [15]:
# Compare the predicted class ids against the gold labels.
# NOTE(review): `scores` holds hard 0/1 predictions, not probabilities, so the
# ROC-AUC below reflects a single decision threshold only.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
for metric_label, metric_value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(metric_label, metric_value)

print("\nClassification Report:")
report_text = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
print(report_text)

print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, scores))

Accuracy: 0.9959333333333333
ROC-AUC: 0.9956246980437384

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99673   0.99363   0.99518     19010
     class 1    0.99535   0.99761   0.99648     25990

    accuracy                        0.99593     45000
   macro avg    0.99604   0.99562   0.99583     45000
weighted avg    0.99594   0.99593   0.99593     45000


Confusion Matrix:
[[18889   121]
 [   62 25928]]


In [16]:
# attach the predictions to the test rows and preview the result
# (pd.concat(axis=1) aligns on index labels and would silently pad with NaN if
#  `scores` were stale or mis-sized, so check the length and assign directly)
assert len(scores) == len(test_dataset), \
    "scores is out of sync with test_dataset; re-run the inference cell"
inferenced_df = test_dataset.assign(predicted_label=scores)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,@nickgle_ Sugarbun tapi nadai manuk tadi.,@nickgle_ Sugarbun tapi xda ayam td,1,test,twitter,1
1,Semua laki laki sama aja.\nTapi kalo kamu sama...,@hardliffee Jngan terlalu larut tidurnya yaa,0,test,twitter,0
2,11:16am PLUS@E1 : Kerja penyelenggaraan KM 23...,Maintenance work KM 231.0 - KM 233.0 South fro...,1,test,twitter,1


In [17]:
# export the predictions as JSON Lines; create the output folder first because
# to_json fails when the target directory does not exist
output_path = Path('inferenced_output') / f'{model_used}-inferenced-{f_name}.jsonl'
output_path.parent.mkdir(parents=True, exist_ok=True)
inferenced_df.to_json(output_path, orient='records', lines=True)

#### 9.0 Wikipedia QA

In [18]:
# Read the Wikipedia QA test file and tag every row with a short source name
# (the text after 'reformatted-' in the file name, up to the first '.').
f = "test_datasets/test-reformatted-wikipedia-qa.jsonl"
f_name = f.rsplit('reformatted-', 1)[-1].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (32903, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Apakah penyakit yang diderita oleh Sharielda?,\nDR. 26.7.2022 103 \n\n \nkemungkinan daripad...,0,test,wikipedia-qa
1,Siapakah komposer muda yang Chrisye dekati unt...,"Guruh, dan beliau juga akan berjumpa dengannya...",1,test,wikipedia-qa
2,Siapakah yang menfailkan petisyen untuk menunt...,\n56 DR. 10.7.2019 \n\n \n\n \n\nTuan Yang d...,0,test,wikipedia-qa


In [19]:
# predicted class id (argmax over the logits) for every row of test_dataset
scores = []
batch_size = 10

# plain Python lists: the tokenizer's batch API expects lists of strings
queries = test_dataset['query'].tolist()
texts = test_dataset['text'].tolist()

with torch.no_grad():
    for n in tqdm(range(0, len(queries), batch_size)):
        # encode and pad the (query, text) pairs of this batch in a single call;
        # token_type_ids are never requested, so nothing has to be popped afterwards
        padded = tokenizer(
            queries[n: n + batch_size],
            texts[n: n + batch_size],
            padding = True,
            return_tensors = 'pt',
            return_token_type_ids = False,
        ).to('cuda')
        logits = reranker_model(**padded).logits
        # cast to float32 before handing the logits to NumPy (the model runs in bfloat16)
        scores.extend(logits.float().cpu().numpy().argmax(axis = 1).tolist())

  0%|          | 0/3291 [00:00<?, ?it/s]

In [20]:
# Compare the predicted class ids against the gold labels.
# NOTE(review): `scores` holds hard 0/1 predictions, not probabilities, so the
# ROC-AUC below reflects a single decision threshold only.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
for metric_label, metric_value in (("Accuracy:", accuracy), ("ROC-AUC:", roc_auc)):
    print(metric_label, metric_value)

print("\nClassification Report:")
report_text = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
print(report_text)

print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, scores))

Accuracy: 0.9976597878612893
ROC-AUC: 0.997659822035875

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99878   0.99654   0.99766     16452
     class 1    0.99654   0.99878   0.99766     16451

    accuracy                        0.99766     32903
   macro avg    0.99766   0.99766   0.99766     32903
weighted avg    0.99766   0.99766   0.99766     32903


Confusion Matrix:
[[16395    57]
 [   20 16431]]


In [21]:
# attach the predictions to the test rows and preview the result
# (pd.concat(axis=1) aligns on index labels and would silently pad with NaN if
#  `scores` were stale or mis-sized, so check the length and assign directly)
assert len(scores) == len(test_dataset), \
    "scores is out of sync with test_dataset; re-run the inference cell"
inferenced_df = test_dataset.assign(predicted_label=scores)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Apakah penyakit yang diderita oleh Sharielda?,\nDR. 26.7.2022 103 \n\n \nkemungkinan daripad...,0,test,wikipedia-qa,0
1,Siapakah komposer muda yang Chrisye dekati unt...,"Guruh, dan beliau juga akan berjumpa dengannya...",1,test,wikipedia-qa,1
2,Siapakah yang menfailkan petisyen untuk menunt...,\n56 DR. 10.7.2019 \n\n \n\n \n\nTuan Yang d...,0,test,wikipedia-qa,0


In [22]:
# export the predictions as JSON Lines; create the output folder first because
# to_json fails when the target directory does not exist
output_path = Path('inferenced_output') / f'{model_used}-inferenced-{f_name}.jsonl'
output_path.parent.mkdir(parents=True, exist_ok=True)
inferenced_df.to_json(output_path, orient='records', lines=True)