In [1]:
import glob
import json
import torch
import flash_attn
import pandas as pd
from typing import Tuple
from tqdm.notebook import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.metrics import accuracy_score, classification_report, \
                            roc_auc_score, confusion_matrix

# Short tag for the model under test; it is embedded in every output file name.
model_used = "bge-v2-gemma"

In [2]:
# Fetch the tokenizer that ships with the reranker checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="BAAI/bge-reranker-v2-gemma"
)

In [3]:
# Load the reranker weights in bfloat16 with the FlashAttention-2 attention kernels.
# `attn_implementation` is the supported replacement for the deprecated
# `use_flash_attention_2=True` flag (see the deprecation warning this cell used to emit).
# The model is created on CPU here and must be moved to the GPU before inference.
reranker_model = AutoModelForCausalLM.from_pretrained(
    "BAAI/bge-reranker-v2-gemma",
    attn_implementation = "flash_attention_2",
    torch_dtype = torch.bfloat16,
    use_cache = False
)

The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Move the weights onto the GPU, then put the module in evaluation mode.
# The result is bound to `_` so the cell does not print the model repr.
reranker_model.cuda()
_ = reranker_model.eval()

In [5]:
# Inspect the GPU (memory in use, utilisation) now that the model has been moved onto it
!nvidia-smi

Sat Apr 27 06:58:37 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          On  | 00000001:00:00.0 Off |                    0 |
| N/A   31C    P0              62W / 300W |   5284MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                         

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [11]:
# Vocabulary id of the first token of 'Yes'; the inference cells read the logit at this id.
yes_token_ids = tokenizer('Yes', add_special_tokens=False)['input_ids']
yes_loc = yes_token_ids[0]

def get_inputs(
    df: pd.DataFrame,
    tokenizer,
    n: int,
    prompt: str = None,
    batch_size: int = 10,
    max_length: int = 1024,
    device: str = 'cuda'
):
    """
    Build one padded, model-ready batch of (query, passage) pairs for the LLM-based reranker.

    Each row is encoded as ``<bos> A: {query} \\n B: {passage} \\n {prompt}`` and the
    per-row encodings are padded together into a single batch. It was modified slightly
    for our use case from the below link.

    Unlike the other Reranker models in our benchmarking exercise, `BAAI/bge-reranker-v2-gemma` is an LLM-based
    Reranker. Hence, its inference method is slightly different. More can be found here:
    https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/llm_reranker#for-llm-based-reranker-1

    Parameters
    ----------
    df : pd.DataFrame
        Frame with at least a ``query`` and a ``text`` column. It is only read, never modified.
    tokenizer
        Tokenizer of the reranker; must provide ``__call__``, ``prepare_for_model``,
        ``pad`` and ``bos_token_id``.
    n : int
        Positional offset of the first row of the batch.
    prompt : str, optional
        Instruction appended after every pair. Defaults to the Yes/No relevance prompt.
    batch_size : int
        Maximum number of rows taken starting at ``n`` (the final batch may be shorter).
    max_length : int
        Token budget for the query + passage part. The query alone is capped at 3/4 of
        it; the separator and prompt are appended on top of this budget.
    device : str
        Device the padded batch is moved to. Defaults to ``'cuda'``.

    Returns
    -------
    The padded batch returned by ``tokenizer.pad(...)``, moved to ``device``.
    """
    if prompt is None:
        prompt = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'."

    sep = "\n"

    prompt_inputs = tokenizer(prompt,
                              return_tensors=None,
                              add_special_tokens=False)['input_ids']

    sep_inputs = tokenizer(sep,
                           return_tensors=None,
                           add_special_tokens=False)['input_ids']

    # Take only the rows of this batch, by position. The frame is never written to,
    # so no defensive copy of the full dataset is needed on every call.
    rows = df.iloc[n: n + batch_size]

    batch = []
    for q, t in zip(rows['query'], rows['text']):

        # The query may use at most 3/4 of the budget so the passage always keeps some room.
        query_inputs = tokenizer(f'A: {q}',
                                 return_tensors=None,
                                 add_special_tokens=False,
                                 max_length=max_length * 3 // 4,
                                 truncation=True)

        passage_inputs = tokenizer(f'B: {t}',
                                   return_tensors=None,
                                   add_special_tokens=False,
                                   max_length=max_length,
                                   truncation=True)

        # Join <bos> + query with separator + passage; if the pair exceeds the budget,
        # only the passage side is truncated.
        item = tokenizer.prepare_for_model(
            [tokenizer.bos_token_id] + query_inputs['input_ids'],
            sep_inputs + passage_inputs['input_ids'],
            truncation='only_second',
            max_length=max_length,
            padding=False,
            return_attention_mask=False,
            return_token_type_ids=False,
            add_special_tokens=False
        )

        # The instruction prompt goes last, so the model's next token is its answer.
        item['input_ids'] = item['input_ids'] + sep_inputs + prompt_inputs
        item['attention_mask'] = [1] * len(item['input_ids'])
        batch.append(item)

    padded = tokenizer.pad(
        batch,
        padding=True,
        max_length=max_length + len(sep_inputs) + len(prompt_inputs),
        pad_to_multiple_of=8,
        return_tensors='pt',
    )
    return padded.to(device)

### Inference & Evaluation

In [12]:
# Discover every reformatted test split on disk, in sorted order
pattern = 'test_datasets/test-reformatted-*.jsonl'
files = glob.glob(pattern)
files.sort()
files

['test_datasets/test-reformatted-common-crawl-qa.jsonl',
 'test_datasets/test-reformatted-facebook.jsonl',
 'test_datasets/test-reformatted-hansard-qa.jsonl',
 'test_datasets/test-reformatted-iium-confession.jsonl',
 'test_datasets/test-reformatted-mining-b-cari-com-my.jsonl',
 'test_datasets/test-reformatted-mining-summarization.jsonl',
 'test_datasets/test-reformatted-news.jsonl',
 'test_datasets/test-reformatted-twitter.jsonl',
 'test_datasets/test-reformatted-wikipedia-qa.jsonl']

#### 1.0 Common Crawl QA

In [13]:
# Read the Common Crawl QA test split and tag every row with its source name
f = "test_datasets/test-reformatted-common-crawl-qa.jsonl"
# Keep what follows 'reformatted-' and drop the extension
f_name = f.rpartition('reformatted-')[2].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (20949, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Bagaimana cara penghantaran dan kos penghantar...,Ladam merupakan barang buatan yang lazimnya di...,0,test,common-crawl-qa
1,Apakah jenama bateri kereta yang disediakan ol...,Komik adalah sejenis seni visual yang menggabu...,0,test,common-crawl-qa
2,Apakah kawasan liputan untuk servis hantar bat...,"untuk memecat Lawton, meninggalkan sekali lagi...",0,test,common-crawl-qa


In [14]:
# Run the reranker over the dataset in mini-batches and collect hard 0/1 predictions.
scores = []
max_length = 1024
# Defined here so the cell works on a fresh kernel (it was previously an undefined
# global); 10 is the same value as the default batch size of `get_inputs`.
batch_size = 10

with torch.no_grad():
    for n in tqdm(range(0, len(test_dataset), batch_size)):
        padded = get_inputs(test_dataset, tokenizer, n,
                            batch_size=batch_size, max_length=max_length)
        # Logit of the 'Yes' token at the last sequence position.
        # NOTE(review): index -1 is only the last real token if the tokenizer pads on
        # the left -- confirm `tokenizer.padding_side`.
        yes_logits = reranker_model(**padded, return_dict=True).logits[:, -1, yes_loc]
        output = yes_logits.view(-1).float().cpu().tolist()
        # A positive logit counts as a positive (label 1) prediction.
        scores.extend([1 if score > 0 else 0 for score in output])

  0%|          | 0/2095 [00:00<?, ?it/s]



In [15]:
# Compare the predicted labels with the gold labels.
# NOTE: `scores` holds thresholded 0/1 labels, so the ROC-AUC below is computed from
# binary predictions rather than from continuous scores.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
cls_report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
conf_matrix = confusion_matrix(true_labels, scores)

print("Accuracy:", accuracy)
print("ROC-AUC:", roc_auc)
print("\nClassification Report:")
print(cls_report)
print("\nConfusion Matrix:")
print(conf_matrix)

Accuracy: 0.9313093703756743
ROC-AUC: 0.9313116283393861

Classification Report:
              precision    recall  f1-score   support

     class 0    0.97638   0.88401   0.92790     10475
     class 1    0.89403   0.97861   0.93441     10474

    accuracy                        0.93131     20949
   macro avg    0.93520   0.93131   0.93116     20949
weighted avg    0.93521   0.93131   0.93116     20949


Confusion Matrix:
[[ 9260  1215]
 [  224 10250]]


In [16]:
# Attach the predictions as a new column next to the original fields
predicted = pd.Series(scores, name='predicted_label')
inferenced_df = pd.concat([test_dataset, predicted], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Bagaimana cara penghantaran dan kos penghantar...,Ladam merupakan barang buatan yang lazimnya di...,0,test,common-crawl-qa,0
1,Apakah jenama bateri kereta yang disediakan ol...,Komik adalah sejenis seni visual yang menggabu...,0,test,common-crawl-qa,0
2,Apakah kawasan liputan untuk servis hantar bat...,"untuk memecat Lawton, meninggalkan sekali lagi...",0,test,common-crawl-qa,0


In [17]:
# Persist the predictions as JSON Lines (one record per line). The destination is
# bound to `file_output` and echoed, the same way the other dataset sections do it.
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)

inferenced_df.to_json(
    file_output,
    orient='records',
    lines=True
)

#### 2.0 Facebook

In [18]:
# Read the Facebook test split and tag every row with its source name
f = "test_datasets/test-reformatted-facebook.jsonl"
# Keep what follows 'reformatted-' and drop the extension
f_name = f.rpartition('reformatted-')[2].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (45000, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Sekapur sirih road dah,Abi Ahmad Furqan memang Barisan Nasional dah l...,0,test,facebook
1,Menangkan calon pakatan harapan di segi besar,Ismail sabri dan muhiadin,0,test,facebook
2,Norlelawati Sukiman amin pra,25k terbaik,0,test,facebook


In [19]:
# Run the reranker over the dataset in mini-batches and collect hard 0/1 predictions.
scores = []
max_length = 1024
# Defined here so the cell works on a fresh kernel (it was previously an undefined
# global); 10 is the same value as the default batch size of `get_inputs`.
batch_size = 10

with torch.no_grad():
    for n in tqdm(range(0, len(test_dataset), batch_size)):
        padded = get_inputs(test_dataset, tokenizer, n,
                            batch_size=batch_size, max_length=max_length)
        # Logit of the 'Yes' token at the last sequence position.
        # NOTE(review): index -1 is only the last real token if the tokenizer pads on
        # the left -- confirm `tokenizer.padding_side`.
        yes_logits = reranker_model(**padded, return_dict=True).logits[:, -1, yes_loc]
        output = yes_logits.view(-1).float().cpu().tolist()
        # A positive logit counts as a positive (label 1) prediction.
        scores.extend([1 if score > 0 else 0 for score in output])

  0%|          | 0/4500 [00:00<?, ?it/s]



In [20]:
# Compare the predicted labels with the gold labels.
# NOTE: `scores` holds thresholded 0/1 labels, so the ROC-AUC below is computed from
# binary predictions rather than from continuous scores.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
cls_report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
conf_matrix = confusion_matrix(true_labels, scores)

print("Accuracy:", accuracy)
print("ROC-AUC:", roc_auc)
print("\nClassification Report:")
print(cls_report)
print("\nConfusion Matrix:")
print(conf_matrix)

Accuracy: 0.8295555555555556
ROC-AUC: 0.840840154767234

Classification Report:
              precision    recall  f1-score   support

     class 0    0.98744   0.69192   0.81368     24205
     class 1    0.73405   0.98976   0.84294     20795

    accuracy                        0.82956     45000
   macro avg    0.86075   0.84084   0.82831     45000
weighted avg    0.87035   0.82956   0.82720     45000


Confusion Matrix:
[[16748  7457]
 [  213 20582]]


In [21]:
# Attach the predictions as a new column next to the original fields
predicted = pd.Series(scores, name='predicted_label')
inferenced_df = pd.concat([test_dataset, predicted], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Sekapur sirih road dah,Abi Ahmad Furqan memang Barisan Nasional dah l...,0,test,facebook,1
1,Menangkan calon pakatan harapan di segi besar,Ismail sabri dan muhiadin,0,test,facebook,1
2,Norlelawati Sukiman amin pra,25k terbaik,0,test,facebook,0


In [22]:
# Write the predictions as JSON Lines (one record per line) and echo the destination
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)

inferenced_df.to_json(path_or_buf=file_output, orient='records', lines=True)

inferenced_output/bge-v2-gemma-inferenced-facebook.jsonl


#### 3.0 Hansard QA

In [23]:
# Read the Hansard QA test split and tag every row with its source name
f = "test_datasets/test-reformatted-hansard-qa.jsonl"
# Keep what follows 'reformatted-' and drop the extension
f_name = f.rpartition('reformatted-')[2].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (12712, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Siapakah wakil rakyat untuk kawasan Kota Belud?,\nii DR.07.12.2021 \n\n \n58. YB. Dr. Kelvin Y...,1,test,hansard-qa
1,Berapa jumlah unit yang telah dibina oleh PR1M...,from this seller User Review Submit A Review O...,0,test,hansard-qa
2,Apakah langkah kerajaan yang hendak dikenakan ...,\n8 ...,1,test,hansard-qa


In [24]:
# Run the reranker over the dataset in mini-batches and collect hard 0/1 predictions.
scores = []
max_length = 1024
# Defined here so the cell works on a fresh kernel (it was previously an undefined
# global); 10 is the same value as the default batch size of `get_inputs`.
batch_size = 10

with torch.no_grad():
    for n in tqdm(range(0, len(test_dataset), batch_size)):
        padded = get_inputs(test_dataset, tokenizer, n,
                            batch_size=batch_size, max_length=max_length)
        # Logit of the 'Yes' token at the last sequence position.
        # NOTE(review): index -1 is only the last real token if the tokenizer pads on
        # the left -- confirm `tokenizer.padding_side`.
        yes_logits = reranker_model(**padded, return_dict=True).logits[:, -1, yes_loc]
        output = yes_logits.view(-1).float().cpu().tolist()
        # A positive logit counts as a positive (label 1) prediction.
        scores.extend([1 if score > 0 else 0 for score in output])

  0%|          | 0/1272 [00:00<?, ?it/s]



In [25]:
# Compare the predicted labels with the gold labels.
# NOTE: `scores` holds thresholded 0/1 labels, so the ROC-AUC below is computed from
# binary predictions rather than from continuous scores.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
cls_report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
conf_matrix = confusion_matrix(true_labels, scores)

print("Accuracy:", accuracy)
print("ROC-AUC:", roc_auc)
print("\nClassification Report:")
print(cls_report)
print("\nConfusion Matrix:")
print(conf_matrix)

Accuracy: 0.9201541850220264
ROC-AUC: 0.9201541850220265

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99981   0.84047   0.91324      6356
     class 1    0.86240   0.99984   0.92605      6356

    accuracy                        0.92015     12712
   macro avg    0.93110   0.92015   0.91964     12712
weighted avg    0.93110   0.92015   0.91964     12712


Confusion Matrix:
[[5342 1014]
 [   1 6355]]


In [26]:
# Attach the predictions as a new column next to the original fields
predicted = pd.Series(scores, name='predicted_label')
inferenced_df = pd.concat([test_dataset, predicted], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Siapakah wakil rakyat untuk kawasan Kota Belud?,\nii DR.07.12.2021 \n\n \n58. YB. Dr. Kelvin Y...,1,test,hansard-qa,1
1,Berapa jumlah unit yang telah dibina oleh PR1M...,from this seller User Review Submit A Review O...,0,test,hansard-qa,0
2,Apakah langkah kerajaan yang hendak dikenakan ...,\n8 ...,1,test,hansard-qa,1


In [27]:
# Write the predictions as JSON Lines (one record per line) and echo the destination
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)

inferenced_df.to_json(path_or_buf=file_output, orient='records', lines=True)

inferenced_output/bge-v2-gemma-inferenced-hansard-qa.jsonl


#### 4.0 IIUM Confession

In [28]:
# Read the IIUM Confession test split and tag every row with its source name
f = "test_datasets/test-reformatted-iium-confession.jsonl"
# Keep what follows 'reformatted-' and drop the extension
f_name = f.rpartition('reformatted-')[2].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (45000, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,"Means, dia berkurung dalam bilik, tarik muka d...","""usia 20 an macam ni,memang kita perlukan sora...",0,test,iium-confession
1,Saya selalu follow seorang perunding imej feme...,Aku sentiasa ikut seorang consultant image yan...,1,test,iium-confession
2,Dan aku juga kasihan akan diriku sendiri. Di r...,And I also pity myself. In that house I was st...,1,test,iium-confession


In [29]:
# Run the reranker over the dataset in mini-batches and collect hard 0/1 predictions.
scores = []
max_length = 1024
# Defined here so the cell works on a fresh kernel (it was previously an undefined
# global); 10 is the same value as the default batch size of `get_inputs`.
batch_size = 10

with torch.no_grad():
    for n in tqdm(range(0, len(test_dataset), batch_size)):
        padded = get_inputs(test_dataset, tokenizer, n,
                            batch_size=batch_size, max_length=max_length)
        # Logit of the 'Yes' token at the last sequence position.
        # NOTE(review): index -1 is only the last real token if the tokenizer pads on
        # the left -- confirm `tokenizer.padding_side`.
        yes_logits = reranker_model(**padded, return_dict=True).logits[:, -1, yes_loc]
        output = yes_logits.view(-1).float().cpu().tolist()
        # A positive logit counts as a positive (label 1) prediction.
        scores.extend([1 if score > 0 else 0 for score in output])

  0%|          | 0/4500 [00:00<?, ?it/s]



In [30]:
# Compare the predicted labels with the gold labels.
# NOTE: `scores` holds thresholded 0/1 labels, so the ROC-AUC below is computed from
# binary predictions rather than from continuous scores.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
cls_report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
conf_matrix = confusion_matrix(true_labels, scores)

print("Accuracy:", accuracy)
print("ROC-AUC:", roc_auc)
print("\nClassification Report:")
print(cls_report)
print("\nConfusion Matrix:")
print(conf_matrix)

Accuracy: 0.8511111111111112
ROC-AUC: 0.787236353080879

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99978   0.57454   0.72973     15743
     class 1    0.81370   0.99993   0.89726     29257

    accuracy                        0.85111     45000
   macro avg    0.90674   0.78724   0.81349     45000
weighted avg    0.87880   0.85111   0.83865     45000


Confusion Matrix:
[[ 9045  6698]
 [    2 29255]]


In [31]:
# Attach the predictions as a new column next to the original fields
predicted = pd.Series(scores, name='predicted_label')
inferenced_df = pd.concat([test_dataset, predicted], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,"Means, dia berkurung dalam bilik, tarik muka d...","""usia 20 an macam ni,memang kita perlukan sora...",0,test,iium-confession,0
1,Saya selalu follow seorang perunding imej feme...,Aku sentiasa ikut seorang consultant image yan...,1,test,iium-confession,1
2,Dan aku juga kasihan akan diriku sendiri. Di r...,And I also pity myself. In that house I was st...,1,test,iium-confession,1


In [32]:
# Write the predictions as JSON Lines (one record per line) and echo the destination
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)

inferenced_df.to_json(path_or_buf=file_output, orient='records', lines=True)

inferenced_output/bge-v2-gemma-inferenced-iium-confession.jsonl


#### 5.0 B Cari

In [33]:
# Read the B Cari test split and tag every row with its source name
f = "test_datasets/test-reformatted-mining-b-cari-com-my.jsonl"
# Keep what follows 'reformatted-' and drop the extension
f_name = f.rpartition('reformatted-')[2].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (44910, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Hello. I am a student. Every time I need to wr...,Hello. Im a student. If I need to write an ess...,1,test,mining-b-cari-com-my
1,betul ke boleh tapis? dan filter dalam tu mest...,Is it true that it can be filtered? And the fi...,1,test,mining-b-cari-com-my
2,Jangan dia main campak masuk tong sampah bila ...,mmmeeeeowwwwwwwwwwwwwwww,0,test,mining-b-cari-com-my


In [34]:
# Run the reranker over the dataset in mini-batches and collect hard 0/1 predictions.
scores = []
max_length = 1024
# Defined here so the cell works on a fresh kernel (it was previously an undefined
# global); 10 is the same value as the default batch size of `get_inputs`.
batch_size = 10

with torch.no_grad():
    for n in tqdm(range(0, len(test_dataset), batch_size)):
        padded = get_inputs(test_dataset, tokenizer, n,
                            batch_size=batch_size, max_length=max_length)
        # Logit of the 'Yes' token at the last sequence position.
        # NOTE(review): index -1 is only the last real token if the tokenizer pads on
        # the left -- confirm `tokenizer.padding_side`.
        yes_logits = reranker_model(**padded, return_dict=True).logits[:, -1, yes_loc]
        output = yes_logits.view(-1).float().cpu().tolist()
        # A positive logit counts as a positive (label 1) prediction.
        scores.extend([1 if score > 0 else 0 for score in output])

  0%|          | 0/4491 [00:00<?, ?it/s]



In [35]:
# Compare the predicted labels with the gold labels.
# NOTE: `scores` holds thresholded 0/1 labels, so the ROC-AUC below is computed from
# binary predictions rather than from continuous scores.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
cls_report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
conf_matrix = confusion_matrix(true_labels, scores)

print("Accuracy:", accuracy)
print("ROC-AUC:", roc_auc)
print("\nClassification Report:")
print(cls_report)
print("\nConfusion Matrix:")
print(conf_matrix)

Accuracy: 0.7222444889779559
ROC-AUC: 0.6983159218555284

Classification Report:
              precision    recall  f1-score   support

     class 0    0.98469   0.40195   0.57087     20642
     class 1    0.66163   0.99468   0.79467     24268

    accuracy                        0.72224     44910
   macro avg    0.82316   0.69832   0.68277     44910
weighted avg    0.81012   0.72224   0.69181     44910


Confusion Matrix:
[[ 8297 12345]
 [  129 24139]]


In [36]:
# Attach the predictions as a new column next to the original fields
predicted = pd.Series(scores, name='predicted_label')
inferenced_df = pd.concat([test_dataset, predicted], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Hello. I am a student. Every time I need to wr...,Hello. Im a student. If I need to write an ess...,1,test,mining-b-cari-com-my,1
1,betul ke boleh tapis? dan filter dalam tu mest...,Is it true that it can be filtered? And the fi...,1,test,mining-b-cari-com-my,1
2,Jangan dia main campak masuk tong sampah bila ...,mmmeeeeowwwwwwwwwwwwwwww,0,test,mining-b-cari-com-my,0


In [37]:
# Write the predictions as JSON Lines (one record per line) and echo the destination
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)

inferenced_df.to_json(path_or_buf=file_output, orient='records', lines=True)

inferenced_output/bge-v2-gemma-inferenced-mining-b-cari-com-my.jsonl


#### 6.0 Summarization

In [53]:
# Read the Summarization test split and tag every row with its source name
f = "test_datasets/test-reformatted-mining-summarization.jsonl"
# Keep what follows 'reformatted-' and drop the extension
f_name = f.rpartition('reformatted-')[2].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (44123, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Sektor pertanian dijangka pulih pada 2019 deng...,KUALA LUMPUR: Sektor pertanian dijangka pulih ...,1,test,mining-summarization
1,"Malaysia Today, a Malaysian news site, has bee...","Tolong lah gais, aku nasihatkan tolong kunci p...",0,test,mining-summarization
2,"In the Malaysian General Election (GE15), no p...",PUTRAJAYA: Tiada mana-mana parti memperoleh 50...,1,test,mining-summarization


In [54]:
# Run the reranker over the dataset in mini-batches and collect hard 0/1 predictions.
scores = []
max_length = 1024
# Defined here so the cell works on a fresh kernel (it was previously an undefined
# global); 10 is the same value as the default batch size of `get_inputs`.
batch_size = 10

with torch.no_grad():
    for n in tqdm(range(0, len(test_dataset), batch_size)):
        padded = get_inputs(test_dataset, tokenizer, n,
                            batch_size=batch_size, max_length=max_length)
        # Logit of the 'Yes' token at the last sequence position.
        # NOTE(review): index -1 is only the last real token if the tokenizer pads on
        # the left -- confirm `tokenizer.padding_side`.
        yes_logits = reranker_model(**padded, return_dict=True).logits[:, -1, yes_loc]
        output = yes_logits.view(-1).float().cpu().tolist()
        # A positive logit counts as a positive (label 1) prediction.
        scores.extend([1 if score > 0 else 0 for score in output])

  0%|          | 0/4413 [00:00<?, ?it/s]



In [55]:
# Compare the predicted labels with the gold labels.
# NOTE: `scores` holds thresholded 0/1 labels, so the ROC-AUC below is computed from
# binary predictions rather than from continuous scores.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
cls_report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
conf_matrix = confusion_matrix(true_labels, scores)

print("Accuracy:", accuracy)
print("ROC-AUC:", roc_auc)
print("\nClassification Report:")
print(cls_report)
print("\nConfusion Matrix:")
print(conf_matrix)

Accuracy: 0.630963443102237
ROC-AUC: 0.7773750467411285

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99975   0.55542   0.71411     36614
     class 1    0.31553   0.99933   0.47963      7509

    accuracy                        0.63096     44123
   macro avg    0.65764   0.77738   0.59687     44123
weighted avg    0.88331   0.63096   0.67420     44123


Confusion Matrix:
[[20336 16278]
 [    5  7504]]


In [56]:
# Attach the predictions as a new column next to the original fields
predicted = pd.Series(scores, name='predicted_label')
inferenced_df = pd.concat([test_dataset, predicted], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Sektor pertanian dijangka pulih pada 2019 deng...,KUALA LUMPUR: Sektor pertanian dijangka pulih ...,1,test,mining-summarization,1
1,"Malaysia Today, a Malaysian news site, has bee...","Tolong lah gais, aku nasihatkan tolong kunci p...",0,test,mining-summarization,1
2,"In the Malaysian General Election (GE15), no p...",PUTRAJAYA: Tiada mana-mana parti memperoleh 50...,1,test,mining-summarization,1


In [57]:
# Write the predictions as JSON Lines (one record per line) and echo the destination
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)

inferenced_df.to_json(path_or_buf=file_output, orient='records', lines=True)

inferenced_output/bge-v2-gemma-inferenced-mining-summarization.jsonl


#### 7.0 News

In [48]:
# Read the News test split and tag every row with its source name
f = "test_datasets/test-reformatted-news.jsonl"
# Keep what follows 'reformatted-' and drop the extension
f_name = f.rpartition('reformatted-')[2].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (45000, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,"Bridgerton: sex, gossip and race in Regency Lo...",THE appointment of Najib Razak as chairman of ...,0,test,news
1,PAS minta putus dengan PKR kerana Selena Gomez,\n(Focus Malaysia) – Legal practitioner Khairu...,0,test,news
2,Harga MSM dijangka terus meningkat,"LONDON, Feb 9 — Lewis Hamilton has signed a o...",0,test,news


In [49]:
# Run the reranker over the dataset in mini-batches and collect hard 0/1 predictions.
scores = []
max_length = 1024
# Defined here so the cell works on a fresh kernel (it was previously an undefined
# global); 10 is the same value as the default batch size of `get_inputs`.
batch_size = 10

with torch.no_grad():
    for n in tqdm(range(0, len(test_dataset), batch_size)):
        padded = get_inputs(test_dataset, tokenizer, n,
                            batch_size=batch_size, max_length=max_length)
        # Logit of the 'Yes' token at the last sequence position.
        # NOTE(review): index -1 is only the last real token if the tokenizer pads on
        # the left -- confirm `tokenizer.padding_side`.
        yes_logits = reranker_model(**padded, return_dict=True).logits[:, -1, yes_loc]
        output = yes_logits.view(-1).float().cpu().tolist()
        # A positive logit counts as a positive (label 1) prediction.
        scores.extend([1 if score > 0 else 0 for score in output])

  0%|          | 0/4500 [00:00<?, ?it/s]



In [50]:
# Compare the predicted labels with the gold labels.
# NOTE: `scores` holds thresholded 0/1 labels, so the ROC-AUC below is computed from
# binary predictions rather than from continuous scores.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
cls_report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
conf_matrix = confusion_matrix(true_labels, scores)

print("Accuracy:", accuracy)
print("ROC-AUC:", roc_auc)
print("\nClassification Report:")
print(cls_report)
print("\nConfusion Matrix:")
print(conf_matrix)

Accuracy: 0.8775333333333334
ROC-AUC: 0.9240363470498392

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99859   0.85403   0.92067     37446
     class 1    0.57873   0.99404   0.73155      7554

    accuracy                        0.87753     45000
   macro avg    0.78866   0.92404   0.82611     45000
weighted avg    0.92811   0.87753   0.88892     45000


Confusion Matrix:
[[31980  5466]
 [   45  7509]]


In [51]:
# Attach the predictions as a new column next to the original fields
predicted = pd.Series(scores, name='predicted_label')
inferenced_df = pd.concat([test_dataset, predicted], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,"Bridgerton: sex, gossip and race in Regency Lo...",THE appointment of Najib Razak as chairman of ...,0,test,news,0
1,PAS minta putus dengan PKR kerana Selena Gomez,\n(Focus Malaysia) – Legal practitioner Khairu...,0,test,news,0
2,Harga MSM dijangka terus meningkat,"LONDON, Feb 9 — Lewis Hamilton has signed a o...",0,test,news,0


In [52]:
# Write the predictions as JSON Lines (one record per line) and echo the destination
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)

inferenced_df.to_json(path_or_buf=file_output, orient='records', lines=True)

inferenced_output/bge-v2-gemma-inferenced-news.jsonl


#### 8.0 Twitter

In [38]:
# Read the Twitter test split and tag every row with its source name
f = "test_datasets/test-reformatted-twitter.jsonl"
# Keep what follows 'reformatted-' and drop the extension
f_name = f.rpartition('reformatted-')[2].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (45000, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,@nickgle_ Sugarbun tapi nadai manuk tadi.,@nickgle_ Sugarbun tapi xda ayam td,1,test,twitter
1,Semua laki laki sama aja.\nTapi kalo kamu sama...,@hardliffee Jngan terlalu larut tidurnya yaa,0,test,twitter
2,11:16am PLUS@E1 : Kerja penyelenggaraan KM 23...,Maintenance work KM 231.0 - KM 233.0 South fro...,1,test,twitter


In [39]:
# Run the reranker over the dataset in mini-batches and collect hard 0/1 predictions.
scores = []
max_length = 1024
# Defined here so the cell works on a fresh kernel (it was previously an undefined
# global); 10 is the same value as the default batch size of `get_inputs`.
batch_size = 10

with torch.no_grad():
    for n in tqdm(range(0, len(test_dataset), batch_size)):
        padded = get_inputs(test_dataset, tokenizer, n,
                            batch_size=batch_size, max_length=max_length)
        # Logit of the 'Yes' token at the last sequence position.
        # NOTE(review): index -1 is only the last real token if the tokenizer pads on
        # the left -- confirm `tokenizer.padding_side`.
        yes_logits = reranker_model(**padded, return_dict=True).logits[:, -1, yes_loc]
        output = yes_logits.view(-1).float().cpu().tolist()
        # A positive logit counts as a positive (label 1) prediction.
        scores.extend([1 if score > 0 else 0 for score in output])

  0%|          | 0/4500 [00:00<?, ?it/s]



In [40]:
# Compare the predicted labels with the gold labels.
# NOTE: `scores` holds thresholded 0/1 labels, so the ROC-AUC below is computed from
# binary predictions rather than from continuous scores.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
cls_report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
conf_matrix = confusion_matrix(true_labels, scores)

print("Accuracy:", accuracy)
print("ROC-AUC:", roc_auc)
print("\nClassification Report:")
print(cls_report)
print("\nConfusion Matrix:")
print(conf_matrix)

Accuracy: 0.8801333333333333
ROC-AUC: 0.8588266154242548

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99283   0.72146   0.83567     19010
     class 1    0.83021   0.99619   0.90566     25990

    accuracy                        0.88013     45000
   macro avg    0.91152   0.85883   0.87066     45000
weighted avg    0.89891   0.88013   0.87609     45000


Confusion Matrix:
[[13715  5295]
 [   99 25891]]


In [41]:
# Attach the predictions as a new column next to the original fields
predicted = pd.Series(scores, name='predicted_label')
inferenced_df = pd.concat([test_dataset, predicted], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,@nickgle_ Sugarbun tapi nadai manuk tadi.,@nickgle_ Sugarbun tapi xda ayam td,1,test,twitter,1
1,Semua laki laki sama aja.\nTapi kalo kamu sama...,@hardliffee Jngan terlalu larut tidurnya yaa,0,test,twitter,1
2,11:16am PLUS@E1 : Kerja penyelenggaraan KM 23...,Maintenance work KM 231.0 - KM 233.0 South fro...,1,test,twitter,1


In [42]:
# Write the predictions as JSON Lines (one record per line) and echo the destination
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)

inferenced_df.to_json(path_or_buf=file_output, orient='records', lines=True)

inferenced_output/bge-v2-gemma-inferenced-twitter.jsonl


#### 9.0 Wikipedia QA

In [43]:
# Read the Wikipedia QA test split and tag every row with its source name
f = "test_datasets/test-reformatted-wikipedia-qa.jsonl"
# Keep what follows 'reformatted-' and drop the extension
f_name = f.rpartition('reformatted-')[2].partition('.')[0]

test_dataset = pd.read_json(f, lines=True).assign(dataset_source=f_name)

print('Shape:', test_dataset.shape)
test_dataset.head(3)

Shape: (32903, 5)


Unnamed: 0,query,text,label,split,dataset_source
0,Apakah penyakit yang diderita oleh Sharielda?,\nDR. 26.7.2022 103 \n\n \nkemungkinan daripad...,0,test,wikipedia-qa
1,Siapakah komposer muda yang Chrisye dekati unt...,"Guruh, dan beliau juga akan berjumpa dengannya...",1,test,wikipedia-qa
2,Siapakah yang menfailkan petisyen untuk menunt...,\n56 DR. 10.7.2019 \n\n \n\n \n\nTuan Yang d...,0,test,wikipedia-qa


In [44]:
# Run the reranker over the dataset in mini-batches and collect hard 0/1 predictions.
scores = []
max_length = 1024
# Defined here so the cell works on a fresh kernel (it was previously an undefined
# global); 10 is the same value as the default batch size of `get_inputs`.
batch_size = 10

with torch.no_grad():
    for n in tqdm(range(0, len(test_dataset), batch_size)):
        padded = get_inputs(test_dataset, tokenizer, n,
                            batch_size=batch_size, max_length=max_length)
        # Logit of the 'Yes' token at the last sequence position.
        # NOTE(review): index -1 is only the last real token if the tokenizer pads on
        # the left -- confirm `tokenizer.padding_side`.
        yes_logits = reranker_model(**padded, return_dict=True).logits[:, -1, yes_loc]
        output = yes_logits.view(-1).float().cpu().tolist()
        # A positive logit counts as a positive (label 1) prediction.
        scores.extend([1 if score > 0 else 0 for score in output])

  0%|          | 0/3291 [00:00<?, ?it/s]



In [45]:
# Compare the predicted labels with the gold labels.
# NOTE: `scores` holds thresholded 0/1 labels, so the ROC-AUC below is computed from
# binary predictions rather than from continuous scores.
true_labels = test_dataset["label"]
accuracy = accuracy_score(true_labels, scores)
roc_auc = roc_auc_score(true_labels, scores)
cls_report = classification_report(
    true_labels, scores, target_names=["class 0", "class 1"], digits=5
)
conf_matrix = confusion_matrix(true_labels, scores)

print("Accuracy:", accuracy)
print("ROC-AUC:", roc_auc)
print("\nClassification Report:")
print(cls_report)
print("\nConfusion Matrix:")
print(conf_matrix)

Accuracy: 0.7811749688478254
ROC-AUC: 0.7811815915451412

Classification Report:
              precision    recall  f1-score   support

     class 0    0.99838   0.56327   0.72021     16452
     class 1    0.69582   0.99909   0.82032     16451

    accuracy                        0.78117     32903
   macro avg    0.84710   0.78118   0.77027     32903
weighted avg    0.84711   0.78117   0.77027     32903


Confusion Matrix:
[[ 9267  7185]
 [   15 16436]]


In [46]:
# Attach the predictions as a new column next to the original fields
predicted = pd.Series(scores, name='predicted_label')
inferenced_df = pd.concat([test_dataset, predicted], axis=1)
inferenced_df.head(3)

Unnamed: 0,query,text,label,split,dataset_source,predicted_label
0,Apakah penyakit yang diderita oleh Sharielda?,\nDR. 26.7.2022 103 \n\n \nkemungkinan daripad...,0,test,wikipedia-qa,1
1,Siapakah komposer muda yang Chrisye dekati unt...,"Guruh, dan beliau juga akan berjumpa dengannya...",1,test,wikipedia-qa,1
2,Siapakah yang menfailkan petisyen untuk menunt...,\n56 DR. 10.7.2019 \n\n \n\n \n\nTuan Yang d...,0,test,wikipedia-qa,1


In [47]:
# Write the predictions as JSON Lines (one record per line) and echo the destination
file_output = f'inferenced_output/{model_used}-inferenced-{f_name}.jsonl'
print(file_output)

inferenced_df.to_json(path_or_buf=file_output, orient='records', lines=True)

inferenced_output/bge-v2-gemma-inferenced-wikipedia-qa.jsonl
