In [1]:
!pip install transformers
!pip install datasets
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m101.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https:/

In [1]:
import torch
import numpy as np
import pandas as pd
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, EarlyStoppingCallback)
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset


In [2]:
# Locations of the JSON-lines splits on the mounted Drive.
TRAIN_PATH = '/content/drive/MyDrive/NLP Project Files/Maddy/data/train.jsonl'
VALIDATION_PATH = '/content/drive/MyDrive/NLP Project Files/Maddy/data/validation.jsonl'

# One JSON object per line; the validation file is used as the held-out test frame.
train_import = pd.read_json(TRAIN_PATH, lines=True)
test_import = pd.read_json(VALIDATION_PATH, lines=True)

In [3]:
# Show which columns the raw training frame provides.
train_import.columns

Index(['uuid', 'postId', 'postText', 'postPlatform', 'targetParagraphs',
       'targetTitle', 'targetDescription', 'targetKeywords', 'targetMedia',
       'targetUrl', 'provenance', 'spoiler', 'spoilerPositions', 'tags'],
      dtype='object')

In [4]:
# Collapse each article's list of paragraphs into one '. '-separated string.
train_import['processedParagpraphs'] = [
    '. '.join(paragraphs) for paragraphs in train_import['targetParagraphs']
]
test_import['processedParagpraphs'] = [
    '. '.join(paragraphs) for paragraphs in test_import['targetParagraphs']
]

In [5]:
# postText holds a sequence per row; keep only its first element as the post text.
train_import['processedText'] = train_import['postText'].apply(lambda post: post[0])
test_import['processedText'] = test_import['postText'].apply(lambda post: post[0])

In [6]:
def _first_tag(tags):
    """Return the first element of a list/tuple of tags; pass other values through.

    The pass-through makes this cell idempotent: the column is overwritten in
    place, so without the guard a second run would index into the already
    unwrapped string and turn e.g. "phrase" into "p".
    """
    return tags[0] if isinstance(tags, (list, tuple)) else tags


# Unwrap the single-element tag lists into plain string labels.
train_import['tags'] = train_import['tags'].map(_first_tag)
test_import['tags'] = test_import['tags'].map(_first_tag)

In [7]:
# Inspect the class balance of the training labels.
train_import['tags'].value_counts()

phrase     1367
passage    1274
multi       559
Name: tags, dtype: int64

In [8]:
# Checkpoint to fine-tune and the number of target classes.
model_name = "distilbert-base-uncased"
num_labels = 3

# Load the checkpoint's tokenizer, then the model with a sequence-classification
# head sized for `num_labels` classes.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

In [9]:
# Model input = post text + tokenizer separator token + flattened article body.
train_import['training_input'] = [
    post + tokenizer.sep_token + article
    for post, article in zip(train_import['processedText'], train_import['processedParagpraphs'])
]
test_import['training_input'] = [
    post + tokenizer.sep_token + article
    for post, article in zip(test_import['processedText'], test_import['processedParagpraphs'])
]

In [10]:
# Wrap both pandas frames as Hugging Face datasets (train first, then test).
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_import, test_import)
)

In [11]:
# Display the dataset's features and row count as a sanity check.
train_dataset

Dataset({
    features: ['uuid', 'postId', 'postText', 'postPlatform', 'targetParagraphs', 'targetTitle', 'targetDescription', 'targetKeywords', 'targetMedia', 'targetUrl', 'provenance', 'spoiler', 'spoilerPositions', 'tags', 'processedParagpraphs', 'processedText', 'training_input'],
    num_rows: 3200
})

In [12]:
def tokenize(batch):
  """Tokenize the 'training_input' texts of a batch.

  Sequences are truncated to at most 512 tokens and padded to the longest
  sequence in the batch. Relies on the module-level `tokenizer`.
  """
  texts = batch['training_input']
  return tokenizer(texts, max_length=512, truncation=True, padding=True)

# Apply the tokenizer batch-wise to both splits (train first, then test).
train_data_tokenized, test_data_tokenized = (
    split.map(tokenize, batched=True) for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [13]:
# Class names; a label's position in this list is its integer id.
labels = ["phrase", "passage", "multi"]
label_to_id = dict(zip(labels, range(len(labels))))

def convert_labels(batch):
    """Add an integer 'labels' entry derived from the example's 'tags' string.

    The tag is looked up in the module-level `label_to_id` mapping, so an
    unknown tag raises KeyError. The (mutated) input mapping is returned.
    """
    tag = batch["tags"]
    batch["labels"] = label_to_id[tag]
    return batch

# Attach integer class ids to every example; convert_labels works on one
# example at a time, so the map is explicitly non-batched.
train_data_merged = train_data_tokenized.map(convert_labels, batched=False)
test_data_merged = test_data_tokenized.map(convert_labels, batched=False)

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [14]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, then
# restore the checkpoint with the highest macro-F1 when training ends.
training_args = TrainingArguments(
    output_dir=f"/content/drive/MyDrive/NLP Project Files/Maddy_final/{model_name}_results",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=100,
    # save_steps is only consulted for the "steps" strategy, so it is not set here.
    save_strategy="epoch",
    evaluation_strategy="epoch",
    # Keep only the most recent checkpoints (the best one is still retained for
    # load_best_model_at_end) instead of writing one full copy per epoch to Drive.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    # Mixed precision needs a CUDA GPU; fall back to fp32 on CPU-only runtimes
    # rather than failing when the arguments are constructed.
    fp16=torch.cuda.is_available(),
)

# Stop once the monitored metric has not improved for 3 consecutive evaluations.
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

In [15]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for an evaluation pass.

    Args:
        eval_pred: A (logits, labels) pair; the predicted class of each row is
            the argmax of its logits over the last axis.

    Returns:
        dict with keys "accuracy" and "f1".
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predicted_ids)
    macro_f1 = f1_score(labels, predicted_ids, average="macro")
    return {"accuracy": accuracy, "f1": macro_f1}

In [16]:
# Hold out 15% of the labelled training data for validation / early stopping.
# The shuffle is seeded so the split (and therefore the reported metrics) is
# reproducible across kernel restarts.
SPLIT_SEED = 42

# Size the permutation from the dataset that is actually indexed below.
num_examples = len(train_data_merged)
indices = np.random.default_rng(SPLIT_SEED).permutation(num_examples)

train_size = int(0.85 * num_examples)
val_size = num_examples - train_size

# .tolist() yields plain Python ints, which is what the dataset is indexed with.
train_indices = indices[:train_size].tolist()
val_indices = indices[train_size:].tolist()

train_subset = torch.utils.data.Subset(train_data_merged, train_indices)
val_subset = torch.utils.data.Subset(train_data_merged, val_indices)

In [17]:
# Assemble the Trainer: the 85% subset is used for optimisation, the 15%
# subset for per-epoch evaluation and early stopping.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=val_subset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback],
)

# Fine-tune the model.
trainer.train()

# Score the model left in the trainer on the validation subset.
metrics = trainer.evaluate()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.0298,0.843663,0.64375,0.586402
2,0.7712,0.855148,0.645833,0.593239
3,0.5304,0.924348,0.629167,0.592473
4,0.3475,1.212509,0.645833,0.608518
5,0.2146,1.355793,0.66875,0.62863
6,0.1286,2.029512,0.622917,0.592678
7,0.0993,2.074842,0.63125,0.598163
8,0.0261,2.122608,0.639583,0.610683


In [18]:
# Evaluate on the held-out split loaded from validation.jsonl.
trainer.evaluate(eval_dataset=test_data_merged)

{'eval_loss': 1.327897548675537,
 'eval_accuracy': 0.6675,
 'eval_f1': 0.6501587491001606,
 'eval_runtime': 4.9598,
 'eval_samples_per_second': 161.296,
 'eval_steps_per_second': 10.081,
 'epoch': 8.0}

In [19]:
# Persist the model currently held by the trainer next to the training results.
best_model_dir = f"/content/drive/MyDrive/NLP Project Files/Maddy_final/{model_name}_results_best"
trainer.save_model(best_model_dir)