In [None]:
import pandas as pd
import huggingface_hub
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer, 
    TrainingArguments,
    AutoConfig
)
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score

In [None]:
df = pd.read_csv('../data/train_dataset.csv')
df_val = pd.read_csv('../data/valid_dataset.csv')

In [None]:
def numerize(df):
    df['label'] = 0
    df.loc[df['genre'] == "giao-duc", "label"] = 1
    df.loc[df['genre'] == "xe", "label"] = 2
    df.loc[df['genre'] == "suc-khoe", "label"] = 3
    df.loc[df['genre'] == "cong-nghe-game", "label"] = 4

numerize(df)
numerize(df_val)

In [None]:
df_train = Dataset.from_pandas(df)
df_val = Dataset.from_pandas(df_val)

dataset = {
    'train': df_train,
    'validation': df_val,
}

In [None]:
# vinai/phobert-base
model_name = "vinai/phobert-base"

# tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, add_eos_token=True)

In [None]:
def tokenize(examples):
    tokenized_inputs = tokenizer(examples['title'], padding=True, truncation=True )
    return tokenized_inputs

dataset['train'] = dataset['train'].map(tokenize, batched=True, batch_size=None)
dataset['validation'] = dataset['validation'].map(tokenize, batched=True, batch_size=None)

In [None]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [None]:
labels = list(set(df["label"]))
id2label = {k:v for k,v in enumerate(labels)}
label2id = {v:k for k,v in enumerate(labels)}
num_labels = len(labels)

config = (AutoConfig
          .from_pretrained(model_name, num_labels=num_labels,
                           label2id=label2id, id2label=id2label))

# model
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
training_args = TrainingArguments(
                    output_dir="./results",
                    num_train_epochs=10,
                    learning_rate=2e-5,
                    per_device_train_batch_size=16,
                    per_device_eval_batch_size=16,
                    weight_decay=0.01,
                    logging_steps=len(dataset["train"]) // 16,
                    evaluation_strategy='epoch',
                    eval_steps=10,
                    report_to='none'  
            )

trainer = Trainer(
                  model=model,
                  args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=dataset["train"],
                  eval_dataset=dataset["validation"],
                  data_collator=data_collator,
                  tokenizer=tokenizer
            )
trainer.train()

 30%|██▉       | 186/630 [01:46<03:50,  1.93it/s]

{'loss': 0.1786, 'grad_norm': 10.731210708618164, 'learning_rate': 1.4095238095238097e-05, 'epoch': 2.95}


                                                 
 30%|███       | 189/630 [01:48<03:38,  2.02it/s]

{'eval_loss': 0.25506022572517395, 'eval_accuracy': 0.92, 'eval_f1': 0.9225972273340695, 'eval_runtime': 0.5718, 'eval_samples_per_second': 174.871, 'eval_steps_per_second': 12.241, 'epoch': 3.0}


 39%|███▉      | 248/630 [02:18<03:09,  2.01it/s]

{'loss': 0.0987, 'grad_norm': 0.2759944796562195, 'learning_rate': 1.2126984126984127e-05, 'epoch': 3.94}


                                                 
 40%|████      | 252/630 [02:20<02:54,  2.16it/s]

{'eval_loss': 0.30083468556404114, 'eval_accuracy': 0.93, 'eval_f1': 0.9315246753823344, 'eval_runtime': 0.5646, 'eval_samples_per_second': 177.113, 'eval_steps_per_second': 12.398, 'epoch': 4.0}


 49%|████▉     | 310/630 [02:50<02:39,  2.01it/s]

{'loss': 0.06, 'grad_norm': 0.5198515057563782, 'learning_rate': 1.015873015873016e-05, 'epoch': 4.92}


                                                 
 50%|█████     | 315/630 [02:53<02:25,  2.16it/s]

{'eval_loss': 0.3532783091068268, 'eval_accuracy': 0.93, 'eval_f1': 0.9306171314231739, 'eval_runtime': 0.5603, 'eval_samples_per_second': 178.473, 'eval_steps_per_second': 12.493, 'epoch': 5.0}


 59%|█████▉    | 372/630 [03:22<02:15,  1.91it/s]

{'loss': 0.0329, 'grad_norm': 17.85242462158203, 'learning_rate': 8.190476190476192e-06, 'epoch': 5.9}


                                                 
 60%|██████    | 378/630 [03:25<01:56,  2.16it/s]

{'eval_loss': 0.32787489891052246, 'eval_accuracy': 0.94, 'eval_f1': 0.9409094970070581, 'eval_runtime': 0.5754, 'eval_samples_per_second': 173.782, 'eval_steps_per_second': 12.165, 'epoch': 6.0}


 69%|██████▉   | 434/630 [03:53<01:37,  2.01it/s]

{'loss': 0.0424, 'grad_norm': 0.09025076776742935, 'learning_rate': 6.222222222222223e-06, 'epoch': 6.89}


                                                 
 70%|███████   | 441/630 [03:57<01:26,  2.18it/s]

{'eval_loss': 0.373673677444458, 'eval_accuracy': 0.93, 'eval_f1': 0.9320572450805009, 'eval_runtime': 0.5606, 'eval_samples_per_second': 178.367, 'eval_steps_per_second': 12.486, 'epoch': 7.0}


 79%|███████▊  | 496/630 [04:24<01:06,  2.01it/s]

{'loss': 0.0198, 'grad_norm': 0.11602314561605453, 'learning_rate': 4.2539682539682546e-06, 'epoch': 7.87}


                                                 
 80%|████████  | 504/630 [04:31<01:33,  1.35it/s]

{'eval_loss': 0.35336875915527344, 'eval_accuracy': 0.94, 'eval_f1': 0.9409094970070581, 'eval_runtime': 0.6038, 'eval_samples_per_second': 165.605, 'eval_steps_per_second': 11.592, 'epoch': 8.0}


 89%|████████▊ | 558/630 [04:59<00:36,  1.97it/s]

{'loss': 0.0174, 'grad_norm': 0.06966102868318558, 'learning_rate': 2.285714285714286e-06, 'epoch': 8.86}


                                                 
 90%|█████████ | 567/630 [05:04<00:29,  2.14it/s]

{'eval_loss': 0.3715101182460785, 'eval_accuracy': 0.93, 'eval_f1': 0.9320572450805009, 'eval_runtime': 0.6003, 'eval_samples_per_second': 166.587, 'eval_steps_per_second': 11.661, 'epoch': 9.0}


 98%|█████████▊| 620/630 [05:31<00:05,  1.97it/s]

{'loss': 0.0098, 'grad_norm': 0.07069993764162064, 'learning_rate': 3.174603174603175e-07, 'epoch': 9.84}


                                                 
100%|██████████| 630/630 [05:36<00:00,  1.87it/s]

{'eval_loss': 0.374647855758667, 'eval_accuracy': 0.93, 'eval_f1': 0.9320572450805009, 'eval_runtime': 0.588, 'eval_samples_per_second': 170.074, 'eval_steps_per_second': 11.905, 'epoch': 10.0}
{'train_runtime': 336.7096, 'train_samples_per_second': 29.699, 'train_steps_per_second': 1.871, 'train_loss': 0.2042935320782283, 'epoch': 10.0}





TrainOutput(global_step=630, training_loss=0.2042935320782283, metrics={'train_runtime': 336.7096, 'train_samples_per_second': 29.699, 'train_steps_per_second': 1.871, 'train_loss': 0.2042935320782283, 'epoch': 10.0})

In [None]:
import torch
import torch.nn.functional as F
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score

In [None]:
tokenizer = AutoTokenizer.from_pretrained("mob2711/phoBERT_finetune_news_classification")
model = AutoModelForSequenceClassification.from_pretrained("mob2711/phoBERT_finetune_news_classification")

def tokenize(text):
    encoded_text = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    return encoded_text

test = pd.read_csv('../data/test_dataset.csv')

In [None]:
numerize(test)

In [None]:
def predict(text_data):
    encoded_data = tokenize(text_data)
    with torch.no_grad():
        outputs = model(**encoded_data)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
    return predictions.cpu().numpy()

def predict_prob(text_data):
    encoded_data = tokenize(text_data)
    with torch.no_grad():
        outputs = model(**encoded_data)
        logits = outputs.logits
        probabilities = F.softmax(logits, dim=-1)
        predictions = torch.argmax(logits, dim=-1)
    return predictions.cpu().numpy(), probabilities.cpu().numpy()

In [None]:
predictions = predict(test)

f1 = f1_score(test['label'], predictions, average='weighted')

recall = recall_score(test['label'], predictions, average='weighted')

precision = precision_score(test['label'], predictions, average='weighted')

accuracy = accuracy_score(test['label'], predictions)

print('accuracy: ' + str(accuracy))
print('recall: ' + str(recall))
print('precision: ' + str(precision))
print('f1: ' + str(f1))