In [1]:
import numpy as np
import pandas as pd
import torch
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, 
                          TrainingArguments, Trainer, DataCollatorWithPadding)
from datasets import Dataset

In [None]:
from data.data import train_set, test_set, sample_dataset, dataset

# Report split sizes. The "no overlap" flag only checks that the train and test
# sizes add up to the full dataset size; it does not compare the rows themselves.
n_train, n_test = len(train_set), len(test_set)
sizes_add_up = n_train + n_test == len(dataset)
print(
    f"Size of test set: {n_test}, size of train set: {n_train}, "
    f"no overlap: {sizes_add_up}, "
    f"size of sample (validation) set: {len(sample_dataset)}"
)

Size of test set: 600, size of train set: 2400, no overlap: True, size of sample (validation) set: 150


In [17]:
# Convert each raw split to a DataFrame and then to a Hugging Face Dataset,
# train first and test second.
train_ds, test_ds = (
    Dataset.from_pandas(pd.DataFrame(data=split)) for split in (train_set, test_set)
)

train_ds

Dataset({
    features: ['sentence', 'label'],
    num_rows: 2400
})

In [18]:
# Checkpoint id of the base model; the same id is passed to both
# AutoTokenizer.from_pretrained and AutoModelForSequenceClassification.from_pretrained.
model_name = "google-bert/bert-base-uncased"

In [19]:
# Tokenizer loaded from the same checkpoint id as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# preprocess_function tokenizes without passing a padding argument; this collator,
# built from the same tokenizer, is what the Trainer receives as data_collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def preprocess_function(example):
    """Tokenize the "sentence" field with truncation enabled.

    Used as the callback for ``Dataset.map(..., batched=True)``.

    Args:
        example: Mapping with a "sentence" entry (a batch of sentences when
            mapped with ``batched=True``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those sentences.
    """
    sentences = example["sentence"]
    return tokenizer(sentences, truncation=True)



In [20]:
# Tokenize both splits with the same batched preprocessing, train first.
tokenized_train, tokenized_test = (
    split.map(preprocess_function, batched=True) for split in (train_ds, test_ds)
)

tokenized_train

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Dataset({
    features: ['sentence', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2400
})

In [21]:
# Class-id -> label-name mapping; the inverse is derived from it so the two
# dictionaries cannot drift apart.
id2label = {0: "Liberal", 1: "Neutral", 2: "Conservative"}
label2id = {label: class_id for class_id, label in id2label.items()}

In [22]:
# The classifier weights are not in the base checkpoint and are newly initialised
# when the model is built (see the warning printed by this cell). Seed torch's
# global RNG first so that starting point is repeatable on a fresh kernel.
SEED = 42
torch.manual_seed(SEED)

# num_labels is derived from id2label so the head size always matches the label map.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(id2label), id2label=id2label, label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Freeze the base model except its pooler: a base-model parameter stays trainable
# only when "pooler" appears in its name. The loop visits base_model parameters
# only, so anything outside base_model keeps its current requires_grad setting.
for name, param in model.base_model.named_parameters():
    param.requires_grad = "pooler" in name

In [34]:
import evaluate

# Accuracy metric object, loaded once and reused by compute_metrics.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the ``accuracy`` metric.

    Args:
        eval_pred: Pair that unpacks into (predictions, labels). The predictions
            are reduced to class ids with an argmax over axis 1.

    Returns:
        The result of ``accuracy.compute`` for the predicted ids against the labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)


In [32]:
# Hyperparameters forwarded to TrainingArguments.
LR = 2e-4  # learning_rate
BATCH_SIZE = 16  # used for both per_device_train_batch_size and per_device_eval_batch_size
NUM_EPOCHS = 4  # num_train_epochs

In [35]:
# Training configuration: evaluation, checkpointing and logging all run once per epoch.
# NOTE(review): tokenized_test is the per-epoch eval set while
# load_best_model_at_end=True, so the test split takes part in checkpoint
# selection -- confirm this is intended.
training_args = TrainingArguments(
    output_dir="pid-ft-bert",
    # optimisation
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # per-epoch bookkeeping
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # publishing
    push_to_hub=True,
)

# NOTE(review): newer transformers releases may warn about the `tokenizer`
# argument -- check against the installed version before changing it.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/600 [00:00<?, ?it/s]

{'loss': 0.8728, 'grad_norm': 5.015448570251465, 'learning_rate': 0.00015000000000000001, 'epoch': 1.0}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.94815593957901, 'eval_accuracy': 0.55, 'eval_runtime': 104.3562, 'eval_samples_per_second': 5.75, 'eval_steps_per_second': 0.364, 'epoch': 1.0}
{'loss': 0.8941, 'grad_norm': 1.6851606369018555, 'learning_rate': 0.0001, 'epoch': 2.0}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.9471595287322998, 'eval_accuracy': 0.5233333333333333, 'eval_runtime': 95.6029, 'eval_samples_per_second': 6.276, 'eval_steps_per_second': 0.397, 'epoch': 2.0}
{'loss': 0.8707, 'grad_norm': 2.7509310245513916, 'learning_rate': 5e-05, 'epoch': 3.0}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.945142388343811, 'eval_accuracy': 0.5133333333333333, 'eval_runtime': 110.756, 'eval_samples_per_second': 5.417, 'eval_steps_per_second': 0.343, 'epoch': 3.0}
{'loss': 0.863, 'grad_norm': 1.6474816799163818, 'learning_rate': 0.0, 'epoch': 4.0}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.9578546285629272, 'eval_accuracy': 0.49333333333333335, 'eval_runtime': 104.7229, 'eval_samples_per_second': 5.729, 'eval_steps_per_second': 0.363, 'epoch': 4.0}


No files have been modified since last commit. Skipping to prevent empty commit.


{'train_runtime': 2906.5365, 'train_samples_per_second': 3.303, 'train_steps_per_second': 0.206, 'train_loss': 0.8751188913981119, 'epoch': 4.0}


TrainOutput(global_step=600, training_loss=0.8751188913981119, metrics={'train_runtime': 2906.5365, 'train_samples_per_second': 3.303, 'train_steps_per_second': 0.206, 'total_flos': 372067039550304.0, 'train_loss': 0.8751188913981119, 'epoch': 4.0})

In [None]:
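# Manual upload is intentionally left disabled: TrainingArguments(push_to_hub=True)
# in the training cell already requests Hub uploads. Uncomment only to push by hand.
# model.push_to_hub("pid-ft-bert")
# tokenizer.push_to_hub("pid-ft-bert")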

In [37]:
# evaluate on same sample dataset
# Both objects are loaded from the Hub repo "lhz1/pid-ft-bert" rather than taken
# from the in-memory `model` / `trainer` objects.
# NOTE(review): presumably this repo holds the weights uploaded by the training
# run; whether that upload matches the best checkpoint is not visible here -- confirm.
infer_tokenizer = AutoTokenizer.from_pretrained("lhz1/pid-ft-bert")
ft_model = AutoModelForSequenceClassification.from_pretrained("lhz1/pid-ft-bert")


def run_model(dataset=None, model=None, tokenizer=None):
    """Classify every example one at a time and score the predictions.

    Called with no arguments it behaves as before and uses the notebook-level
    ``sample_dataset``, ``ft_model`` and ``infer_tokenizer``.

    Args:
        dataset: Iterable of mappings with a "sentence" entry and a "label"
            entry holding the expected class id. Defaults to ``sample_dataset``.
        model: Callable taking the tokenizer output as keyword arguments and
            returning an object with ``.logits``; ``model.config.id2label`` must
            map class ids to label names. Defaults to ``ft_model``. The model's
            train/eval mode is not changed here.
        tokenizer: Callable accepting ``(text, truncation=..., return_tensors=...)``.
            Defaults to ``infer_tokenizer``.

    Returns:
        ``(predictions, accuracy)``: the predicted label names in dataset order
        and the fraction of examples whose predicted id equals the label
        (``0.0`` when the dataset is empty).
    """
    dataset = sample_dataset if dataset is None else dataset
    model = ft_model if model is None else model
    tokenizer = infer_tokenizer if tokenizer is None else tokenizer

    predictions = []
    accurate = 0
    # Gradients are never needed for inference, so disable them once for the whole loop.
    with torch.no_grad():
        for example in dataset:
            # truncation=True mirrors the training-time preprocessing, so an
            # over-length sentence is clipped instead of being fed in at full length.
            inputs = tokenizer(example["sentence"], truncation=True, return_tensors="pt")
            logits = model(**inputs).logits

            # One example per forward pass, so the argmax holds a single class id.
            predicted_class_id = logits.argmax(dim=-1).item()
            accurate += int(predicted_class_id == example["label"])
            predictions.append(model.config.id2label[predicted_class_id])

    total = len(predictions)
    # Guard the empty case instead of raising ZeroDivisionError.
    return predictions, (accurate / total if total else 0.0)

# run_model returns a pair: (list of predicted label names, accuracy over sample_dataset).
preds = run_model()

In [41]:
# Unpack the run_model result: the accuracy is second, the predicted label names first.
acc = preds[1]
pred_labels = pd.DataFrame(preds[0])

# How often each label was predicted.
pred_labels.value_counts()

0           
Conservative    82
Liberal         36
Neutral         32
Name: count, dtype: int64