# Models

In [1]:
import os
import pandas as pd
from datasets import Dataset
import numpy as np
import pickle
from transformers import (AutoTokenizer,
                   DataCollatorWithPadding, 
                   AutoModelForSequenceClassification,
                   TrainingArguments,
                   Trainer)
import evaluate
import torch
from torch.nn import CrossEntropyLoss


from utils import (read_twitter_file,
                   create_datasets)

%load_ext autoreload
%autoreload 2

In [2]:
## create dataset with 100_000 samples
ds = create_datasets(sub_sampling=10_000)
ds = ds.train_test_split(test_size=.3)

## First model : BERTForSequenceClassification


#### Tokenizing the datasets

In [3]:
BERT_MODEL = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL, batch_size=16)

train_ds = ds["train"].map(lambda x : tokenizer(x["tweet"], return_tensors="pt", truncation=True, padding='max_length', max_length=512))
test_ds = ds["test"].map(lambda x : tokenizer(x["tweet"], return_tensors="pt", truncation=True, padding='max_length', max_length=512))

train_ds = train_ds.remove_columns(["tweet"])
test_ds = test_ds.remove_columns(["tweet"])


Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [4]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [5]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

model = AutoModelForSequenceClassification.from_pretrained(
    BERT_MODEL, num_labels=1
)

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels").long()
        # forward pass
        outputs = model(input_ids=inputs["input_ids"].squeeze(1), attention_mask=inputs["attention_mask"].squeeze(1))
        logits = outputs.get("logits")
        # compute custom loss (suppose one has 3 labels with different weights)
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

training_args = TrainingArguments(
    output_dir="models/baseline_tweet_analysis",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch"
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classi

  0%|          | 0/876 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
model(**train_ds[0])

AttributeError: 'list' object has no attribute 'size'