Loading a pre-trained BERT model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [None]:
# Checkpoint identifier shared by the tokenizer and the classifier.
model_name = "bert-base-uncased"

# Load the matching tokenizer first, then the sequence-classification model
# configured for 13 output labels.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=13,
)

In [None]:
# Read the train / dev / test splits from CSV into pandas DataFrames.
train_data, dev_data, test_data = map(
    pd.read_csv,
    ('data/edu_train.csv', 'data/edu_dev.csv', 'data/edu_test.csv'),
)

In [None]:
def tokenize_text(text):
    """Tokenize ``text`` with the notebook-level ``tokenizer``.

    The call pads, truncates, adds special tokens and asks for PyTorch
    tensors (``return_tensors="pt"``).

    Args:
        text: Input forwarded unchanged to ``tokenizer`` as its first argument.

    Returns:
        Whatever ``tokenizer`` returns for ``text`` with the options above.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "add_special_tokens": True,
        "return_tensors": "pt",
    }
    return tokenizer(text, **tokenizer_options)

In [None]:
def _texts_and_labels(frame):
    """Return the 'masked_articles' and 'updated_label' columns as two lists."""
    return frame['masked_articles'].tolist(), frame['updated_label'].tolist()


# For every split: pull out the raw texts and labels, then tokenize the texts
# into padded, truncated PyTorch tensors.
train_texts, train_labels = _texts_and_labels(train_data)
train_encodings = tokenizer(
    train_texts, padding=True, truncation=True, return_tensors="pt"
)

dev_texts, dev_labels = _texts_and_labels(dev_data)
dev_encodings = tokenizer(
    dev_texts, padding=True, truncation=True, return_tensors="pt"
)

test_texts, test_labels = _texts_and_labels(test_data)
test_encodings = tokenizer(
    test_texts, padding=True, truncation=True, return_tensors="pt"
)


In [None]:
# Learn the label -> integer mapping from the training labels only and reuse
# that same mapping for dev and test. Calling fit_transform() on every split
# would re-fit the encoder each time, so the same class could receive a
# different id in different splits and make evaluation meaningless.
# transform() raises ValueError if dev/test contain a label unseen in train.
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
dev_labels_encoded = label_encoder.transform(dev_labels)
test_labels_encoded = label_encoder.transform(test_labels)

# Build the per-split tensors. torch.as_tensor() reuses data that is already
# a tensor (the encodings above are created with return_tensors="pt") instead
# of copying it again, and avoids the warning torch.tensor() emits for that.
# Labels are stored as torch.long class ids.
train_input_ids = torch.as_tensor(train_encodings.input_ids)
train_attention_mask = torch.as_tensor(train_encodings.attention_mask)
train_labels_tensor = torch.as_tensor(train_labels_encoded, dtype=torch.long)

dev_input_ids = torch.as_tensor(dev_encodings.input_ids)
dev_attention_mask = torch.as_tensor(dev_encodings.attention_mask)
dev_labels_tensor = torch.as_tensor(dev_labels_encoded, dtype=torch.long)

test_input_ids = torch.as_tensor(test_encodings.input_ids)
test_attention_mask = torch.as_tensor(test_encodings.attention_mask)
test_labels_tensor = torch.as_tensor(test_labels_encoded, dtype=torch.long)

print(train_labels_tensor)

In [None]:
# Hyper-parameters and bookkeeping options handed to the Trainer.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Training length and batch sizes.
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate every 500 steps.
    evaluation_strategy="steps",
    eval_steps=500,
    # Checkpoint every 1000 steps, keep at most two checkpoints, and reload
    # the best one when training finishes.
    save_steps=1000,
    save_total_limit=2,
    load_best_model_at_end=True,
)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import EvalPrediction

def compute_metrics(pred: EvalPrediction):
    """Compute classification metrics for a ``Trainer`` evaluation pass.

    ``pred.predictions`` holds the raw model outputs (logits, one score per
    class for every sample), not class ids, so they are reduced with an
    argmax over the last axis before being compared with the labels.

    Args:
        pred: Evaluation output exposing ``predictions`` and ``label_ids``.

    Returns:
        A dict with ``accuracy``, micro-averaged ``f1`` and macro-averaged
        ``precision`` and ``recall``.
    """
    predictions, labels = pred.predictions, pred.label_ids

    # When the model returns several outputs they arrive as a tuple whose
    # first element is the logits.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Turn per-class scores into predicted class ids.
    predicted_ids = predictions.argmax(axis=-1)

    accuracy = accuracy_score(labels, predicted_ids)
    f1 = f1_score(labels, predicted_ids, average='micro')
    precision = precision_score(labels, predicted_ids, average='macro')
    recall = recall_score(labels, predicted_ids, average='macro')

    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
    

In [None]:
def _collate_tensor_rows(data):
    """Stack (input_ids, attention_mask, label) rows into one batch dict."""
    return {
        'input_ids': torch.stack([row[0] for row in data]),
        'attention_mask': torch.stack([row[1] for row in data]),
        'labels': torch.stack([row[2] for row in data]),
    }


# Each dataset row is the tuple (input_ids, attention_mask, label).
train_dataset = torch.utils.data.TensorDataset(
    train_input_ids, train_attention_mask, train_labels_tensor
)
eval_dataset = torch.utils.data.TensorDataset(
    dev_input_ids, dev_attention_mask, dev_labels_tensor
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    data_collator=_collate_tensor_rows,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
    

In [None]:
# Fine-tune the model on your training data
#trainer.train()

In [None]:
# Evaluate the model on the test set

#results = trainer.evaluate(eval_dataset=torch.utils.data.TensorDataset(test_input_ids, test_attention_mask, test_labels_tensor))

In [None]:
# `results` only exists once the (currently commented-out) evaluation call has
# been run; report that explicitly instead of failing with a NameError.
try:
    print(results)
except NameError:
    print("No evaluation results available: run trainer.evaluate(...) first.")

## TESTING

In [None]:
# Your own sentence
your_sentence = "Your input sentence goes here."

# Tokenize and preprocess your sentence. truncation=True caps the input at the
# tokenizer's maximum length, so an overly long sentence is shortened instead
# of producing more tokens than the model accepts.
input_ids = tokenizer.encode(
    your_sentence,
    add_special_tokens=True,
    truncation=True,
    return_tensors="pt",
)


In [None]:
# Put the model in evaluation mode for inference.
model.eval()

# Gradients are not needed for prediction. The inputs are moved to the
# model's device first: constructing a Trainer may have placed the model on an
# accelerator, and feeding it CPU tensors would then raise a device-mismatch
# error.
with torch.no_grad():
    outputs = model(input_ids.to(model.device))
    logits = outputs.logits

# Get the predicted class (label) for your sentence
predicted_class = torch.argmax(logits, dim=1).item()

print(predicted_class)