In [2]:
import evaluate
import numpy as np
import pandas as pd
from datasets import load_dataset
from torch.nn.functional import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
# Read the train and test splits from their local JSON files into one DatasetDict
ds = load_dataset(
    'json',
    data_files={
        'train': 'data/news/news_train.json',
        'test': 'data/news/news_eval.json',
    },
)
ds

DatasetDict({
    train: Dataset({
        features: ['Summary', 'Date', 'label'],
        num_rows: 3931
    })
    test: Dataset({
        features: ['Summary', 'Date', 'label'],
        num_rows: 360
    })
})

In [25]:
# Labels arrive as strings: drop any ' ' and '.' characters, then cast to int
ds = ds.map(lambda row: {'label': int(row['label'].replace(" ", "").replace(".", ""))})
# Rename to 'text' / 'labels' and discard every other column
ds = ds.rename_columns({'Summary': 'text', 'label': 'labels'})
ds = ds.select_columns(['text', 'labels'])
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 3931
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 360
    })
})

In [18]:
# Normalise the down-sampled training file in place: keep only the
# 'summary' and 'label' columns and write the result back to the same path.
ds_tr = load_dataset('json', data_files='data/news/news_train_downsample.jsonl')['train']
# This cell overwrites its own input, so on any later run the file already
# has the lower-case column; renaming a column that is absent raises, hence
# the guard that keeps the cell re-runnable.
if 'Summary' in ds_tr.column_names:
    ds_tr = ds_tr.rename_columns({'Summary': 'summary'})
ds_tr = ds_tr.select_columns(['summary', 'label'])
ds_tr.to_json('data/news/news_train_downsample.jsonl')
ds_tr

Generating train split: 2442 examples [00:00, 386698.77 examples/s]
Creating json from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 189.97ba/s]


Dataset({
    features: ['summary', 'label'],
    num_rows: 2442
})

In [21]:
# Normalise the eval file in place: keep only the 'summary' and 'label'
# columns and write the result back to the same path.
# NOTE(review): the recorded run of this cell reports 3931 rows, the same
# count as the train split of 'news_train.json', while 'news_eval.json' has
# 360 -- confirm this file really holds the intended eval data.
ds_te = load_dataset('json', data_files='data/news/news_eval.jsonl')['train']
# This cell overwrites its own input, so on any later run the file already
# has the lower-case column; renaming a column that is absent raises, hence
# the guard that keeps the cell re-runnable.
if 'Summary' in ds_te.column_names:
    ds_te = ds_te.rename_columns({'Summary': 'summary'})
ds_te = ds_te.select_columns(['summary', 'label'])
ds_te.to_json('data/news/news_eval.jsonl')
ds_te

Generating train split: 3931 examples [00:00, 365227.03 examples/s]
Creating json from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 171.90ba/s]


Dataset({
    features: ['summary', 'label'],
    num_rows: 3931
})

# **1 - BERT**

In [8]:
# Short tag for this model; used to build the training output directory
model_name = "bert"
# Checkpoint identifier handed to `from_pretrained` for both tokenizer and model
model_path = "google-bert/bert-base-uncased"

In [9]:
# Load the tokenizer that belongs to the checkpoint named by `model_path`
tokenizer = AutoTokenizer.from_pretrained(model_path)

def tokenize(example):
    """Tokenize the news text of one example or one batch of examples.

    The text is read from the ``'text'`` column when it is present and from
    ``'Summary'`` otherwise, so the function works whether or not the
    dataset columns have already been renamed.

    Args:
        example: Mapping from column name to value (or to a list of values
            when called through ``map(..., batched=True)``).

    Returns:
        The result of calling the module-level ``tokenizer`` on the text
        with ``padding="max_length"`` and ``truncation=True``.

    Raises:
        KeyError: If neither ``'text'`` nor ``'Summary'`` is present.
    """
    # The dataset preparation cell renames 'Summary' to 'text'; fall back to
    # the raw name so un-renamed data still tokenizes.
    text_column = 'text' if 'text' in example else 'Summary'
    return tokenizer(example[text_column], padding="max_length", truncation=True)



In [13]:
# Tokenize every split in batches
tok_ds = ds.map(tokenize, batched=True)

# Format columns
# The label column may already be named 'labels' (the dataset preparation
# cell renames it); renaming a column that is absent raises, so only rename
# while the raw 'label' name is still there.
if 'label' in tok_ds['train'].column_names:
    tok_ds = tok_ds.rename_columns({'label': 'labels'})
# Keep only the label column and the tokenizer outputs, returned as torch tensors
tok_ds = tok_ds.select_columns(['labels', 'input_ids', 'token_type_ids', 'attention_mask'])
tok_ds = tok_ds.with_format('torch')

tok_ds

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3931
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 360
    })
})

In [14]:
# Load the model from the same checkpoint as the tokenizer.
# `num_labels=3` requests a sequence-classification head with three output classes.
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=3)



KeyboardInterrupt: 

In [None]:
# Load the F1 metric once; `compute_metrics` below reuses this object
metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute the macro-averaged F1 score for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` holds one score per
            class along its last axis; ``labels`` holds the reference class
            ids.

    Returns:
        The dictionary returned by ``metric.compute`` for the predicted
        class ids against ``labels``.
    """
    logits, labels = eval_pred
    # F1 is computed on hard class ids, not probabilities, so take the
    # highest-scoring class per example. argmax works directly on the array;
    # no softmax is needed because it would not change which class wins.
    predictions = np.argmax(logits, axis=-1)
    # The model has three classes, so an explicit multi-class averaging mode
    # is required; "macro" weights every class equally.
    return metric.compute(predictions=predictions, references=labels, average="macro")

In [None]:
# Training configuration: write outputs under a per-model directory and
# evaluate at the end of every epoch.
training_args = TrainingArguments(
    eval_strategy="epoch",
    output_dir=f"models/news_classifier/{model_name}",
)

# Wire the model, tokenized splits and metric function into a Trainer.
# NOTE(review): newer transformers releases rename `tokenizer=` to
# `processing_class=` -- confirm against the installed version.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=tok_ds['test'],
    train_dataset=tok_ds['train'],
)