Loading a pre-trained BERT model

In [102]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EvalPrediction
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import torch
import pandas as pd
import numpy as np

In [103]:
model_name = "bert-large-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=13)
if torch.cuda.is_available():
    model = model.cuda()
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json: 100%|██████████| 571/571 [00:00<00:00, 53.0kB/s]
Downloading model.safetensors: 100%|██████████| 1.34G/1.34G [00:04<00:00, 313MB/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 14.8kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.28MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 5.13MB/s]


In [104]:
train_data = pd.read_csv('data/edu_train.csv')
dev_data = pd.read_csv('data/edu_dev.csv')
test_data = pd.read_csv('data/edu_test.csv')

In [105]:
def tokenize_text(text):
    return tokenizer(text, padding=True, truncation=True, add_special_tokens=True ,return_tensors="pt")

In [106]:
train_texts = train_data['source_article'].tolist()
train_labels = train_data['updated_label'].tolist()
train_encodings = tokenizer(train_texts, padding=True, truncation=True, return_tensors="pt")

dev_texts = dev_data['source_article'].tolist()
dev_labels = dev_data['updated_label'].tolist()
dev_encodings = tokenizer(dev_texts, padding=True, truncation=True, return_tensors="pt")

test_texts = test_data['source_article'].tolist()
test_labels = test_data['updated_label'].tolist()
test_encodings = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt")


In [107]:
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
dev_labels_encoded = label_encoder.fit_transform(dev_labels)
test_labels_encoded = label_encoder.fit_transform(test_labels)

# Ensure that tensors have the same number of samples

train_input_ids = torch.tensor(train_encodings.input_ids)
train_attention_mask = torch.tensor(train_encodings.attention_mask)
train_labels_tensor = torch.tensor(train_labels_encoded)


#if torch.cuda.is_available():
 #   train_input_ids = train_input_ids.cuda()
  #  train_attention_mask = train_attention_mask.cuda()
   # train_labels_tensor = train_labels_tensor.cuda()

dev_input_ids = torch.tensor(dev_encodings.input_ids)
dev_attention_mask = torch.tensor(dev_encodings.attention_mask)
dev_labels_tensor = torch.tensor(dev_labels_encoded)

test_input_ids = torch.tensor(test_encodings.input_ids)
test_attention_mask = torch.tensor(test_encodings.attention_mask)
test_labels_tensor = torch.tensor(test_labels_encoded)

  train_input_ids = torch.tensor(train_encodings.input_ids)
  train_attention_mask = torch.tensor(train_encodings.attention_mask)
  dev_input_ids = torch.tensor(dev_encodings.input_ids)
  dev_attention_mask = torch.tensor(dev_encodings.attention_mask)
  test_input_ids = torch.tensor(test_encodings.input_ids)
  test_attention_mask = torch.tensor(test_encodings.attention_mask)


In [110]:
def compute_metrics(pred: EvalPrediction):
    predictions, labels = pred.predictions, pred.label_ids
    predictions = np.argmax(predictions, axis=1)
    
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='micro')
    precision = precision_score(labels, predictions, average='macro')
    recall = recall_score(labels, predictions, average='macro')

    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
    

In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_dir='./logs',
    logging_steps=50,
    save_steps=1000,
    save_total_limit=2,
    load_best_model_at_end=True
)

In [111]:
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    data_collator=lambda data: {
        'input_ids': torch.stack([item[0] for item in data]),
        'attention_mask': torch.stack([item[1] for item in data]),
        'labels': torch.stack([item[2] for item in data])
    },
    train_dataset=torch.utils.data.TensorDataset(train_input_ids, train_attention_mask, train_labels_tensor),
    eval_dataset=torch.utils.data.TensorDataset(dev_input_ids, dev_attention_mask, dev_labels_tensor),

    )    

In [112]:
# Fine-tune the model on your training data
trainer.train()



Step,Training Loss,Validation Loss


TrainOutput(global_step=348, training_loss=1.7492202999948085, metrics={'train_runtime': 501.104, 'train_samples_per_second': 11.07, 'train_steps_per_second': 0.694, 'total_flos': 2403063409853484.0, 'train_loss': 1.7492202999948085, 'epoch': 3.0})

In [113]:
# Evaluate the model on the test set

results = trainer.evaluate(eval_dataset=torch.utils.data.TensorDataset(test_input_ids, test_attention_mask, test_labels_tensor))

  _warn_prf(average, modifier, msg_start, len(result))


In [114]:
print(results)

{'eval_loss': 1.3919514417648315, 'eval_accuracy': 0.6, 'eval_f1': 0.6, 'eval_precision': 0.5407530055220289, 'eval_recall': 0.5318547332671499, 'eval_runtime': 13.2662, 'eval_samples_per_second': 22.614, 'eval_steps_per_second': 1.432, 'epoch': 3.0}


## TESTING

In [119]:
# Your own sentence
your_sentence = "I felt nauseated both times I ate pizza from Georgio’s, so I must be allergic to something in pizza. "


# Tokenize and preprocess your sentence
input_ids = tokenizer.encode(your_sentence, add_special_tokens=True, return_tensors="pt").to(model.device)

model.eval()
with torch.no_grad():
    outputs = model(input_ids)
    logits = outputs.logits

# Get the predicted cla.ss (label) for your sentence
predicted_class = torch.argmax(logits, dim=1).item()

print(predicted_class)
print(label_encoder.classes_[predicted_class])

9
false causality
