In [1]:
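# One-time environment setup: uncomment to run. `%pip` installs into the
# environment of the running kernel (a bare `!pip` may target another one).
# %pip install datasets transformers[torch]
# %pip install accelerate -U
# %pip install pandas scikit-learn beautifulsoup4 requests sinling matplotlib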

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
import torch
from torch import nn
from transformers import Trainer, AutoModelForSequenceClassification
from sklearn.metrics import f1_score
from transformers import TrainingArguments
from transformers import pipeline
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
from sinling import SinhalaTokenizer
from sinling import SinhalaStemmer
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Checkpoint identifier passed to `from_pretrained` for both the tokenizer
# and the sequence-classification model further down.
model_name = "nlpc-uom/SinBERT-small"

In [4]:
def load_stopwords(path='../dataset/stop words.txt'):
    """Read a stop-word list from a UTF-8 text file, one entry per line.

    Args:
        path: Location of the stop-word file. Defaults to the path this
            notebook originally hard-coded, so existing zero-argument
            calls behave as before.

    Returns:
        list[str]: The stop words in file order, each stripped of
        surrounding whitespace. Blank (or whitespace-only) lines are
        skipped, so the result never contains empty strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    # 'utf-8-sig' decodes exactly like 'utf-8' but also drops a leading BOM,
    # which would otherwise end up glued to the first stop word.
    with open(path, 'r', encoding='utf-8-sig') as f:
        # A file object is its own line iterator; no manual readline loop needed.
        stripped_lines = (line.strip() for line in f)
        return [word for word in stripped_lines if word]

In [5]:
# sinling helpers, instantiated once at module level and reused by
# `preprocess` on every call.
sinhala_tokenizer = SinhalaTokenizer()
stemmer = SinhalaStemmer()

def preprocess(sentence: str):
    """Tokenize a sentence and rebuild it from its stemmed tokens.

    Every token produced by the module-level ``sinhala_tokenizer`` is passed
    to ``stemmer.stem``; only the first element of each result is kept, and
    the kept pieces are joined with single spaces.
    """
    stems = []
    for word in sinhala_tokenizer.tokenize(sentence):
        stem_result = stemmer.stem(word)
        stems.append(stem_result[0])
    return ' '.join(stems)

In [6]:
# Fixed seed so the train/validation/test partition -- and every number
# derived from it further down -- is identical on each Restart & Run All.
SPLIT_SEED = 42

dataset = pd.read_csv('dataset/upgraded-sinhala-news-categories.csv')
# Replace each comment with its stemmed, space-joined form.
dataset['comments'] = dataset['comments'].apply(preprocess)
dataset = Dataset.from_pandas(dataset)

# Hold out 20% of the rows, then halve that hold-out into validation and test.
dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
test_valid = dataset['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)

dataset['validation'] = test_valid['train']
dataset['test'] = test_valid['test']

# Pandas copy of the training split, kept for inspection here and reused by
# later cells (e.g. to derive class frequencies).
dataset_df = dataset['train'].to_pandas()
display(dataset_df.head())

dataset

Unnamed: 0,comments,labels
0,ශ්‍ර ලංකා හ බංග්ලාදේශ අතර අ ඩකා පැවත්වෙ ආසියා ...,3
1,එල්.ටී.ටී.ඊ. සංවිධාන ත්‍රස්තවාද ක්‍රියාකාරක ද...,6
2,වසර 6 දොර වැඩකටයුත සඳහ අසතුට කාල වැයකර .,0
3,"ඔ , 2008 වසර පැව පළ අදියර සි අනිද්ද ඇරඹෙ 8 ව අ...",3
4,බෙද ගියේ .,3


DatasetDict({
    train: Dataset({
        features: ['comments', 'labels'],
        num_rows: 5779
    })
    test: Dataset({
        features: ['comments', 'labels'],
        num_rows: 723
    })
    validation: Dataset({
        features: ['comments', 'labels'],
        num_rows: 722
    })
})

In [7]:
# Show the column schema of the training split.
features = dataset['train'].features
features

{'comments': Value(dtype='string', id=None),
 'labels': Value(dtype='int64', id=None)}

In [8]:
# Category names in label-id order: a name's position in this tuple is its
# integer id (presumably matching the dataset's `labels` column -- verify
# against the CSV if the category list ever changes).
_CATEGORY_NAMES = (
    'governmental_activities',
    'business',
    'technology',
    'sport',
    'entertainment',
    'international',
    'public_safety_incidents',
    'local',
    'health',
    'religion',
    'weather',
    'education',
    'security',
    'transport',
)

# Forward (id -> name) and reverse (name -> id) lookups.
id2label = dict(enumerate(_CATEGORY_NAMES))
label2id = {name: idx for idx, name in id2label.items()}


In [9]:
# Load the tokenizer associated with `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [10]:
def tokenize_text(examples):
    """Tokenize the ``comments`` field of a batch, truncated to 512 tokens.

    Returns whatever the module-level ``tokenizer`` produces for the batch.
    """
    comments = examples['comments']
    encoded = tokenizer(comments, max_length=512, truncation=True)
    return encoded

# Apply `tokenize_text` to every split, batch-wise.
dataset = dataset.map(tokenize_text, batched=True)

Map: 100%|████████████████████████████████████████████████████████████████| 5779/5779 [00:00<00:00, 8623.10 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 723/723 [00:00<00:00, 8182.08 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 722/722 [00:00<00:00, 8775.53 examples/s]


In [11]:
# Class weights for the loss: weight_c = 1 - (share of training rows with
# label c), so rarer classes weigh more.
#
# The counts are re-indexed onto every label id so the vector always has one
# slot per class, in id order. Without this, a class that happens to be
# missing from the random training split would shorten the vector and shift
# the remaining weights onto the wrong classes.
label_counts = dataset_df['labels'].value_counts().reindex(
    range(len(id2label)), fill_value=0
)
label_share = label_counts / len(dataset_df)

# Use the GPU when there is one, but do not crash on CPU-only machines.
weights_device = 'cuda' if torch.cuda.is_available() else 'cpu'
class_weights = torch.tensor(
    (1 - label_share).to_numpy(), dtype=torch.float32, device=weights_device
)
class_weights

tensor([0.8562, 0.8252, 0.8801, 0.7749, 0.9365, 0.8259, 0.9578, 0.9685, 0.9936,
        0.9979, 0.9945, 0.9934, 0.9988, 0.9967], device='cuda:0')

In [12]:
class WeightedLossTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute class-weighted cross-entropy for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch mapping unpacked into ``model(**inputs)``; must
                contain a ``"labels"`` entry.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted (and ignored) because newer
                Transformers releases pass it to ``compute_loss``; without
                the parameter those releases raise a ``TypeError``.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        # Feed inputs to the model and extract logits
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Extract labels
        labels = inputs.get("labels")
        # Keep the weights on the same device as the logits; this is a no-op
        # when they already match.
        weights = class_weights.to(logits.device)
        # Define loss function with class weights
        loss_func = nn.CrossEntropyLoss(weight=weights)
        # Compute loss
        loss = loss_func(logits, labels)
        return (loss, outputs) if return_outputs else loss
    

# Load the checkpoint with a 14-way classification head and pass the
# id <-> name label maps through to `from_pretrained`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=14,
    id2label=id2label,
    label2id=label2id,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at nlpc-uom/SinBERT-small and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
def compute_metrics(pred):
    """Return the weighted F1 score for one evaluation pass.

    ``pred`` must expose ``label_ids`` and ``predictions``; the predicted
    class is the arg-max of ``predictions`` over its last axis.
    """
    predicted_ids = pred.predictions.argmax(-1)
    score = f1_score(pred.label_ids, predicted_ids, average="weighted")
    return {"f1": score}

In [14]:
batch_size = 32
# Log the training loss roughly once per epoch. Clamp to 1 so a dataset
# smaller than one batch does not produce an invalid logging interval of 0.
logging_steps = max(1, len(dataset["train"]) // batch_size)
output_dir = "sinbert-saved"
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=10,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    # Checkpoint on the same schedule as evaluation so the best epoch can be
    # restored afterwards; otherwise the trainer keeps whatever the final
    # epoch produced, even if validation quality peaked earlier.
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # the key returned by compute_metrics
    greater_is_better=True,
    save_total_limit=2,  # bound disk usage from the per-epoch checkpoints
    logging_steps=logging_steps,
    # Mixed precision speeds up training but needs CUDA; fall back to full
    # precision instead of failing on CPU-only machines.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
)



In [15]:
# Wire the model, training arguments, train/validation splits, metric
# function and tokenizer into the class-weighted trainer.
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [16]:
# Run training with the arguments and datasets given to `trainer`.
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,1.2185,0.639845,0.804084
2,0.6281,0.492114,0.838144
3,0.4514,0.481754,0.854392
4,0.3521,0.439619,0.864225
5,0.267,0.429452,0.865154
6,0.2091,0.430903,0.876376
7,0.1715,0.443859,0.864795
8,0.1445,0.453036,0.875353
9,0.1175,0.453787,0.86896
10,0.1109,0.458048,0.872169


TrainOutput(global_step=1810, training_loss=0.365644266197036, metrics={'train_runtime': 1111.3278, 'train_samples_per_second': 52.001, 'train_steps_per_second': 1.629, 'total_flos': 7647547903324368.0, 'train_loss': 0.365644266197036, 'epoch': 10.0})