In [1]:
!pip install transformers
!pip install datasets
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m86.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m91.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, ht

In [1]:
import torch
import numpy as np
import pandas as pd
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, EarlyStoppingCallback)
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset
from transformers import LongformerTokenizerFast, LongformerForSequenceClassification

In [2]:
# Read both JSON-lines files (one JSON object per line) into DataFrames.
# Note that `test_import` is loaded from validation.jsonl.
# NOTE(review): both paths sit under /content/drive, so this presumably needs
# Google Drive to be mounted before the cell runs — confirm.
train_import, test_import = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        '/content/drive/MyDrive/NLP Project Files/Maddy/data/train.jsonl',
        '/content/drive/MyDrive/NLP Project Files/Maddy/data/validation.jsonl',
    )
)

In [3]:
# Show which columns the raw training frame provides before any processing.
train_import.columns

Index(['uuid', 'postId', 'postText', 'postPlatform', 'targetParagraphs',
       'targetTitle', 'targetDescription', 'targetKeywords', 'targetMedia',
       'targetUrl', 'provenance', 'spoiler', 'spoilerPositions', 'tags'],
      dtype='object')

In [4]:
# Flatten each row's `targetParagraphs` sequence into one string, with '. '
# inserted between consecutive paragraphs. The same step is applied to both
# frames so they end up with an identical `processedParagpraphs` column.
for frame in (train_import, test_import):
    frame['processedParagpraphs'] = frame['targetParagraphs'].map('. '.join)

In [5]:
# `postText` holds a sequence per row; keep only its first element as the
# post text used for modelling.
for frame in (train_import, test_import):
    frame['processedText'] = frame['postText'].map(lambda post_texts: post_texts[0])

In [6]:
def _first_tag(value):
    """Return the first element of a list-valued tag cell.

    A value that is already a plain string is returned unchanged. Without this
    guard, re-running the cell would index into the string and silently turn
    e.g. 'phrase' into 'p', because the `tags` column is overwritten in place.
    """
    return value if isinstance(value, str) else value[0]


# Replace the per-row tag list with its first entry. Safe to run repeatedly.
for frame in (train_import, test_import):
    frame['tags'] = frame['tags'].map(_first_tag)

In [7]:
# Class distribution of the training frame after the tag column was unwrapped.
train_import['tags'].value_counts()

phrase     1367
passage    1274
multi       559
Name: tags, dtype: int64

In [8]:
from transformers import set_seed

model_name = "allenai/longformer-base-4096"
num_labels = 3

# The classification head is newly initialised (see the loading message below),
# so seed the RNGs before building the model to make that initialisation
# repeatable across kernel restarts.
set_seed(42)

model = LongformerForSequenceClassification.from_pretrained(
    model_name,
    attention_window=512,
    num_labels=num_labels,
)
# Turn on gradient checkpointing through the model API; passing
# `gradient_checkpointing` as a config keyword to from_pretrained is deprecated.
model.gradient_checkpointing_enable()

# `model_max_length` is the tokenizer's length-limit option; the tokenize step
# further down still passes its own explicit max_length.
tokenizer = LongformerTokenizerFast.from_pretrained(model_name, model_max_length=3584)

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weigh

In [9]:
# Build the model input text: post text, then the tokenizer's separator token,
# then the joined article paragraphs.
for frame in (train_import, test_import):
    frame['training_input'] = (
        frame['processedText'] + tokenizer.sep_token + frame['processedParagpraphs']
    )

In [10]:
# Convert both pandas frames into Hugging Face `Dataset` objects.
train_dataset, test_dataset = map(Dataset.from_pandas, (train_import, test_import))

In [11]:
# Display the converted training dataset (its features and row count).
train_dataset

Dataset({
    features: ['uuid', 'postId', 'postText', 'postPlatform', 'targetParagraphs', 'targetTitle', 'targetDescription', 'targetKeywords', 'targetMedia', 'targetUrl', 'provenance', 'spoiler', 'spoilerPositions', 'tags', 'processedParagpraphs', 'processedText', 'training_input'],
    num_rows: 3200
})

In [12]:
def tokenize(batch):
    """Tokenize the `training_input` texts of a batch.

    Every sequence is truncated to at most 3584 tokens and padded up to exactly
    that length, so all encoded examples have the same size.
    """
    encoding_options = dict(padding='max_length', truncation=True, max_length=3584)
    return tokenizer(batch['training_input'], **encoding_options)


# Apply the tokenizer to both splits in batched mode.
train_data_tokenized, test_data_tokenized = (
    split.map(tokenize, batched=True)
    for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [13]:
# Class names; a name's position in this list is its integer label id.
labels = ["phrase", "passage", "multi"]
label_to_id = dict(zip(labels, range(len(labels))))


def convert_labels(batch):
    """Attach the integer class id of the example's tag under the `labels` key.

    Despite the parameter name, this is mapped without `batched=True` below, so
    each call receives a single example whose `tags` value is looked up directly
    in `label_to_id`.
    """
    batch["labels"] = label_to_id[batch["tags"]]
    return batch


# Add the numeric `labels` column to both tokenized splits.
train_data_merged, test_data_merged = (
    tokenized.map(convert_labels)
    for tokenized in (train_data_tokenized, test_data_tokenized)
)

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [14]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir=f"/content/drive/MyDrive/NLP Project Files/Maddy_final/{model_name}_results",
    # Optimisation schedule: 8 examples per device per step, with gradients
    # accumulated over 8 steps before each optimizer update.
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,
    weight_decay=0.01,
    fp16=True,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint with
    # the highest eval_f1 when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_steps=0,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    # Training-loss logging interval, in steps.
    logging_steps=100,
)

# Early-stopping callback configured with a patience of 3.
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

In [15]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    `eval_pred` is unpacked into (logits, reference labels); the predicted class
    is the argmax over the last axis of the logits. Returns a dict with the keys
    "accuracy" and "f1".
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    scores = {}
    scores["accuracy"] = accuracy_score(references, predicted_ids)
    scores["f1"] = f1_score(references, predicted_ids, average="macro")
    return scores

In [16]:
from sklearn.model_selection import train_test_split

# Integer class id of every training example, used only to stratify the split.
# Stored under its own name so the `labels` list of class names defined in an
# earlier cell is not overwritten with a list of ints.
label_ids = train_data_merged['labels']

# Hold out 15% of the training data for validation, keeping the class
# proportions equal in both parts; the fixed random_state makes it repeatable.
train_indices, val_indices = train_test_split(
    np.arange(len(train_data_merged)),
    test_size=0.15,
    random_state=42,
    stratify=label_ids,
)

# Plain Python ints rather than numpy integers.
train_indices = train_indices.tolist()
val_indices = val_indices.tolist()

# Dataset.select keeps both parts as Hugging Face Datasets, the same type the
# test split has when it is passed to the Trainer, instead of wrapping them in
# torch.utils.data.Subset.
train_subset = train_data_merged.select(train_indices)
val_subset = train_data_merged.select(val_indices)

In [17]:
import os

# CUDA_LAUNCH_BLOCKING=1 forces every CUDA kernel launch to run synchronously.
# That helps when tracking down a GPU-side error but slows down the whole run,
# so it is opt-in here instead of always on.
DEBUG_CUDA_SYNC = False
if DEBUG_CUDA_SYNC:
    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Fine-tune on the training part of the split, evaluate on the held-out
# validation part, and stop early via the callback configured above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=val_subset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback],
)

trainer.train()

# Called without arguments, so this scores the eval_dataset given to the
# Trainer (the validation part of the split).
metrics = trainer.evaluate()

You're using a LongformerTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.952761,0.51875,0.44313
2,No log,0.806379,0.645833,0.626497
2,0.902300,0.799178,0.677083,0.659974
4,0.902300,0.937634,0.65625,0.64508


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.952761,0.51875,0.44313
2,No log,0.806379,0.645833,0.626497
2,0.902300,0.799178,0.677083,0.659974
4,0.902300,0.937634,0.65625,0.64508
4,0.500400,1.161288,0.635417,0.624782
6,0.500400,1.319621,0.633333,0.62959


In [18]:
# Score the trained model on the held-out split loaded from validation.jsonl,
# which was not part of the train/validation split used during training.
trainer.evaluate(eval_dataset= test_data_merged)

{'eval_loss': 0.7565586566925049,
 'eval_accuracy': 0.6875,
 'eval_f1': 0.6756225403087633,
 'eval_runtime': 175.1247,
 'eval_samples_per_second': 4.568,
 'eval_steps_per_second': 0.571,
 'epoch': 6.0}

In [19]:
# Persist the model currently held by the trainer. The training arguments set
# load_best_model_at_end=True, so this is expected to be the checkpoint with the
# best eval_f1 rather than the last epoch.
trainer.save_model(f"/content/drive/MyDrive/NLP Project Files/Maddy_final/{model_name}_results_best")