In [1]:
import os
import pandas as pd
import pprint

from pathlib import Path
from transformers import pipeline

from src import sentiment, helper, utility
from src.config import MBertConfig, SharedConfig
from src.metrics import evaluate_pipe

# Set TOKENIZERS_PARALLELISM to "false" for this process before any of the
# work below runs.
# NOTE(review): presumably this is here to turn off the Hugging Face
# `tokenizers` thread pool and silence its fork-related warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

if __name__ == "__main__":
    # Driver: load the evaluation texts/labels, fine-tune mBERT, wrap the
    # resulting model in a text-classification pipeline and print its metrics.
    # Variable names are kept stable because, in a notebook, they live in the
    # kernel namespace once this cell has run.

    # helper.jsonToCSV("miko.json", "miko.csv")

    # load_split_dataset returns five values; only the third (bound to
    # test_ds) is used here. It is re-loaded so the evaluation below works on
    # the un-mapped rows.
    jsonl = helper.read_jsonl_as_string(Path("miko.jsonl"))
    _, _, test_ds, _, _ = utility.load_split_dataset(jsonl)  # load again, un-mapped

    texts = helper.to_list_str(test_ds[SharedConfig.TEXT_COL])
    labels = list(test_ds[SharedConfig.LABEL_COL])

    mBertTrainer = sentiment.train(MBertConfig, require_translation=False)

    mBertPipe = pipeline(
        "text-classification",
        model=mBertTrainer.model,
        tokenizer=mBertTrainer.tokenizer,
        # NOTE(review): newer transformers releases document `top_k=None` as
        # the replacement for `return_all_scores=True`. Left unchanged because
        # the score ordering evaluate_pipe expects is not visible from here —
        # confirm before migrating.
        return_all_scores=True,
        # Run inference on the device the fine-tuned model already occupies.
        # Previously the device was derived from SharedConfig.USE_FP16, a
        # precision flag that may not reflect where the model actually is
        # (e.g. FP16 disabled on a GPU machine would push inference to CPU).
        # Assumes the model is a PyTorch transformers model exposing `.device`
        # (the run log reports BertForSequenceClassification).
        device=mBertTrainer.model.device,
    )

    metrics = evaluate_pipe(
        mBertPipe,
        texts,
        labels,
        id2label=mBertTrainer.model.config.id2label,
    )
    print("\r\nMetrics:")
    pprint.pprint(metrics)

    # Ad-hoc inference example (disabled). It now refers to mBertTrainer; the
    # earlier draft used an undefined name `trainer`.
    #sample_texts = [
    #    "Maganda ang serbisyo at mabilis ang delivery!",  # Tagalog positive
    #    "Sobrang pangit ng karanasan ko.",                # Tagalog negative
    #    "It was okay, nothing special.",                  # English neutral-ish
    #]
    #sentiment.infer(sample_texts, mBertTrainer.tokenizer, mBertTrainer.model)


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|████████████████████████████████████████████████████████████████████████████| 69/69 [00:00<00:00, 19077.59 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 4545.69 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 9054.32 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro
1,No log,1.095798,0.2,0.121212,0.083333,0.222222



Test metrics:
{'epoch': 1.0,
 'eval_accuracy': 0.47619047619047616,
 'eval_f1_macro': 0.4423280423280423,
 'eval_loss': 1.0944788455963135,
 'eval_precision_macro': 0.4351851851851852,
 'eval_recall_macro': 0.46296296296296297,
 'eval_runtime': 0.1238,
 'eval_samples_per_second': 169.675,
 'eval_steps_per_second': 8.08}

Metrics:
{'accuracy': 0.47619047619047616,
 'f1_macro': 0.4423280423280423,
 'report': '              precision    recall  f1-score   support\n'
           '\n'
           '           0      0.250     0.167     0.200         6\n'
           '           1      0.500     0.667     0.571         6\n'
           '           2      0.556     0.556     0.556         9\n'
           '\n'
           '    accuracy                          0.476        21\n'
           '   macro avg      0.435     0.463     0.442        21\n'
           'weighted avg      0.452     0.476     0.459        21\n'}


