In [1]:
from transformers import BertTokenizerFast
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report, accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('AB_tagged_train.csv')
df = df[['A_tagged', 'label_binary']]
df

Unnamed: 0,A_tagged,label_binary
0,"90 percent of <MISC>Americans</MISC> ""support ...",1
1,Last year was one of the deadliest years ever ...,0
2,<PER>Bernie</PER> <PER>Sanders</PER>'s plan is...,0
3,Voter ID is supported by an overwhelming major...,1
4,"Says <PER>Barack</PER> <PER>Obama</PER> ""robbe...",0
...,...,...
18364,18 million illegal immigrants got their govern...,0
18365,Says restoring <LOC>Georgia</LOC> pre-k to a 1...,1
18366,There is clear legal authority to handcuff and...,0
18367,Says <PER>George</PER> <PER>Washington</PER> s...,0


In [3]:

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

special_tokens = ['[PER_]', '[/PER_]', '[MISC_]', '[/MISC_]', '[LOC_]', '[/LOC_]', '[ORG_]', '[/ORG_]']
# add to tokenizer
tokenizer.add_tokens(special_tokens, special_tokens=True)

# assume df is your DataFrame
X = df["A_tagged"].tolist()
y = df["label_binary"].tolist()

# train/test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

train_encodings = tokenizer(X_train, truncation=True, padding=True)
val_encodings = tokenizer(X_val, truncation=True, padding=True)


In [4]:
# creating custom pytorch dataset (correct format for bert)
class check(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings # tokenized input
        self.labels = labels

    def __len__(self): # to stop iterations
        return len(self.labels)

    def __getitem__(self, index): # gets data at index i
        item = {}
        for key in self.encodings: 
            item[key] = torch.tensor(self.encodings[key][index])
        item["labels"] = torch.tensor(self.labels[index]) 
        return item # "input_ids": tensor([...]), "attention_mask": tensor([...]), "labels": tensor(0)


In [5]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.resize_token_embeddings(len(tokenizer))

# wrap tokenized data in the custom dataset
train_dataset = check(train_encodings, y_train)
val_dataset = check(val_encodings, y_val)

# set up training arguments
training_args = TrainingArguments(
    output_dir="./results", # save model files here
    evaluation_strategy="epoch", # evaluate at end of each epoch
    logging_strategy="epoch", # log at end of each epoch
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2, # number of epochs
    weight_decay=0.01, # regularization to prevent overfitting
    logging_dir="./logs", # for tensorboard/logs
)

# check if GPU is available
print("GPU available:", torch.cuda.is_available())
print("Device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

# initialize Huggingface Trainer
trainer = Trainer( # connects model, args, and data into one Trainer object
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train() # train the model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


GPU available: True
Device: NVIDIA GeForce RTX 4060


Epoch,Training Loss,Validation Loss
1,0.5944,0.561927
2,0.4723,0.589311


TrainOutput(global_step=1838, training_loss=0.5333720675230804, metrics={'train_runtime': 5180.0663, 'train_samples_per_second': 5.674, 'train_steps_per_second': 0.355, 'total_flos': 5165291405516400.0, 'train_loss': 0.5333720675230804, 'epoch': 2.0})

In [6]:
# load test data
df_test = pd.read_csv("AB_tagged_test.csv")
X_test = df_test["A_tagged"].tolist()
y_true = df_test["label_binary"].tolist()

# tokenize
test_encodings = tokenizer(X_test, truncation=True, padding=True)
test_dataset = check(test_encodings, y_true)

# predict
outputs = trainer.predict(test_dataset)
y_pred = outputs.predictions.argmax(axis=1)

# report results
print("Accuracy:", accuracy_score(y_true, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_true, y_pred))

Accuracy: 0.7060104529616724

Classification Report:

              precision    recall  f1-score   support

           0       0.79      0.67      0.72      1323
           1       0.63      0.76      0.69       973

    accuracy                           0.71      2296
   macro avg       0.71      0.71      0.70      2296
weighted avg       0.72      0.71      0.71      2296

