In [1]:
!pip install transformers
!pip install datasets
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import torch
import numpy as np
import pandas as pd
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, EarlyStoppingCallback)
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset


In [3]:
def _read_jsonl(path):
    """Load a JSON-lines file (one JSON record per line) into a DataFrame."""
    return pd.read_json(path, lines=True)


# Training records, and the held-out records used for the final evaluation.
train_import = _read_jsonl('/content/drive/MyDrive/NLP Project Files/Maddy/data/train.jsonl')
test_import = _read_jsonl('/content/drive/MyDrive/NLP Project Files/Maddy/data/validation.jsonl')

In [4]:
# Show which columns the raw training frame provides.
train_import.columns

Index(['uuid', 'postId', 'postText', 'postPlatform', 'targetParagraphs',
       'targetTitle', 'targetDescription', 'targetKeywords', 'targetMedia',
       'targetUrl', 'provenance', 'spoiler', 'spoilerPositions', 'tags'],
      dtype='object')

In [5]:
# Collapse each row's list of target paragraphs into one string, using '. '
# as the separator between consecutive paragraphs.
paragraph_joiner = '. '.join
train_import['processedParagpraphs'] = train_import['targetParagraphs'].map(paragraph_joiner)
test_import['processedParagpraphs'] = test_import['targetParagraphs'].map(paragraph_joiner)

In [6]:
# 'postText' holds a list per row; keep only its first entry as the post text.
train_import['processedText'] = [post[0] for post in train_import['postText']]
test_import['processedText'] = [post[0] for post in test_import['postText']]

In [7]:
def _unwrap_tag(value):
    """Return the first element of a list-like tag; pass anything else through.

    The 'tags' column is overwritten in place below, so this cell must be safe
    to run more than once. Without the type check a second run would index
    into the already-unwrapped string and silently turn 'phrase' into 'p'.
    """
    return value[0] if isinstance(value, (list, tuple)) else value


# Replace each row's tag list with its first element (the class label string).
train_import['tags'] = train_import['tags'].map(_unwrap_tag)
test_import['tags'] = test_import['tags'].map(_unwrap_tag)

In [8]:
# Count how many training rows carry each tag value (class balance check).
train_import['tags'].value_counts()

phrase     1367
passage    1274
multi       559
Name: tags, dtype: int64

In [9]:
# Pretrained checkpoint to fine-tune, and the size of the classification head
# (one output per tag value: phrase, passage, multi).
model_name = "bert-base-uncased"
num_labels = 3

# The tokenizer and the model are both loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [10]:
# Model input text: the post text, the tokenizer's separator token written out
# as text, then the joined paragraphs.
train_import['training_input'] = train_import['processedText'].str.cat(
    train_import['processedParagpraphs'], sep=tokenizer.sep_token
)
test_import['training_input'] = test_import['processedText'].str.cat(
    test_import['processedParagpraphs'], sep=tokenizer.sep_token
)

In [11]:
# Wrap both DataFrames as `datasets.Dataset` objects so they can be processed
# with `.map(...)` in the following cells.
train_dataset = Dataset.from_pandas(df=train_import)
test_dataset = Dataset.from_pandas(df=test_import)

In [12]:
# Display the dataset summary (feature names and number of rows).
train_dataset

Dataset({
    features: ['uuid', 'postId', 'postText', 'postPlatform', 'targetParagraphs', 'targetTitle', 'targetDescription', 'targetKeywords', 'targetMedia', 'targetUrl', 'provenance', 'spoiler', 'spoilerPositions', 'tags', 'processedParagpraphs', 'processedText', 'training_input'],
    num_rows: 3200
})

In [13]:
def tokenize(batch):
  """Tokenize the 'training_input' texts of a batch.

  Sequences are truncated to at most 512 tokens and padded to a common
  length within the batch. Uses the notebook-level `tokenizer`.
  """
  texts = batch['training_input']
  return tokenizer(texts, max_length=512, truncation=True, padding=True)

# Tokenize both datasets in batched mode; train first, then test.
train_data_tokenized, test_data_tokenized = (
    split.map(tokenize, batched=True) for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [14]:
# Tag values in the order that defines their integer class ids (0, 1, 2).
labels = ["phrase", "passage", "multi"]
label_to_id = dict(zip(labels, range(len(labels))))

def convert_labels(batch):
    """Add an integer 'labels' entry derived from the example's 'tags' string.

    Despite the parameter name, this is mapped without `batched=True`, so
    `batch` is a single example. The example is updated in place and returned.
    Raises KeyError if the tag is not a key of `label_to_id`.
    """
    tag = batch["tags"]
    batch["labels"] = label_to_id[tag]
    return batch

# Attach the integer class id to every example of both tokenized datasets.
train_data_merged = train_data_tokenized.map(function=convert_labels)
test_data_merged = test_data_tokenized.map(function=convert_labels)

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [16]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir=f"/content/drive/MyDrive/NLP Project Files/Maddy_final/{model_name}_results",
    # Optimisation schedule.
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    # Logging / checkpointing / evaluation cadence. Saving and evaluation both
    # happen once per epoch; `save_steps` is a step-based setting and is kept
    # here only to mirror the original configuration.
    logging_steps=100,
    save_steps=0,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    # Model selection: reload the checkpoint with the highest eval F1 at the end.
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
)

# Stop training once the selection metric has not improved for 3 evaluations.
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

In [17]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from an evaluation result.

    `eval_pred` unpacks into (logits, labels); the predicted class of each
    example is the argmax over the last axis of the logits.

    Returns a dict with keys "accuracy" and "f1".
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predicted_classes)
    macro_f1 = f1_score(labels, predicted_classes, average="macro")
    return {"accuracy": accuracy, "f1": macro_f1}

In [18]:
# Hold out 15% of the training data for validation.
# The permutation comes from a seeded generator so the split (and therefore
# the reported validation metrics) is identical on every Restart & Run All;
# the previous unseeded shuffle produced a different split on each run.
SPLIT_SEED = 42
indices = np.random.default_rng(SPLIT_SEED).permutation(len(train_data_merged))

# Sizes are taken from the dataset that is actually being subset below.
train_size = int(0.85 * len(train_data_merged))
val_size = len(train_data_merged) - train_size

# Convert to plain Python ints; the dataset is indexed with these values.
train_indices = indices[:train_size].tolist()
val_indices = indices[train_size:].tolist()

train_subset = torch.utils.data.Subset(train_data_merged, train_indices)
val_subset = torch.utils.data.Subset(train_data_merged, val_indices)

In [19]:
# Fine-tune on the training subset, evaluating on the validation subset with
# `compute_metrics`; the early-stopping callback can end training early.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_subset,
    eval_dataset=val_subset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback],
)

trainer.train()

# Final metrics on the validation subset (the Trainer's default eval dataset).
metrics = trainer.evaluate()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.0326,0.933341,0.489583,0.413021
2,0.8886,0.780476,0.672917,0.651164
3,0.6215,0.811128,0.66875,0.653842
4,0.3718,1.205534,0.63125,0.610587
5,0.1591,1.596825,0.65,0.634856
6,0.0746,1.966187,0.65,0.63729


In [20]:
# Score the trained model on the held-out data loaded from validation.jsonl,
# which was not used for training or model selection.
trainer.evaluate(eval_dataset=test_data_merged)

{'eval_loss': 0.891771674156189,
 'eval_accuracy': 0.63,
 'eval_f1': 0.6116233552077052,
 'eval_runtime': 9.6195,
 'eval_samples_per_second': 83.164,
 'eval_steps_per_second': 5.198,
 'epoch': 6.0}

In [21]:
# Persist the model currently held by the trainer to Drive.
trainer.save_model(
    output_dir=f"/content/drive/MyDrive/NLP Project Files/Maddy_final/{model_name}_results_best"
)