In [1]:
!pip install -q transformers[torch] datasets wandb

In [2]:
# Hub identifier of the dataset loaded by this notebook.
dataset_ckpt = "zeroshot/twitter-financial-news-topic"
# Checkpoint used for both the tokenizer and the fine-tuned classifier below.
teacher_model_ckpt = "roberta-base"
# Smaller checkpoint reserved for a student model (not referenced in the cells visible here).
student_model_ckpt = "google/bert_uncased_L-6_H-512_A-8"

In [3]:
from huggingface_hub import notebook_login
from datasets import load_dataset
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch
from transformers import AutoModelForSequenceClassification
from torch import nn
from torch import optim
from torch.nn import functional as F
from transformers import AutoTokenizer

In [4]:
# Load the dataset named by `dataset_ckpt`; per the cell output that follows it
# is a DatasetDict with `train` and `validation` splits.
data = load_dataset(dataset_ckpt)

In [5]:
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16990
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 4117
    })
})

In [6]:
# Hold out 10% of the original training split as a test set. The fixed seed makes
# the shuffle deterministic, so re-running the notebook reproduces the same split
# (and therefore comparable metrics) instead of a fresh random partition each time.
train_test = data['train'].train_test_split(test_size = 0.1, seed = 42)


In [7]:
# Name the three splits used from here on: the two halves produced by
# `train_test_split`, plus the dataset's own validation split.
train_data, test_data = train_test['train'], train_test['test']
valid_data = data['validation']

In [8]:
from transformers import AutoTokenizer
# Instantiate the tokenizer from the teacher checkpoint, the same checkpoint the
# sequence classifier is loaded from further down.
tokenizer = AutoTokenizer.from_pretrained(teacher_model_ckpt)

In [9]:
def tokenize(batch):
  """Encode the "text" column of `batch` with the notebook-level `tokenizer`.

  Padding and truncation are both enabled; the tokenizer's output is returned
  unchanged so it can be used as a `Dataset.map` callback.
  """
  texts = batch["text"]
  encoded = tokenizer(texts, truncation=True, padding=True)
  return encoded

In [10]:
# Run `tokenize` over each split in batched mode, in the order train,
# validation, test, and bind the encoded datasets to their own names.
train_data_encoded, valid_data_encoded, test_data_encoded = (
    split.map(tokenize, batched=True, batch_size=None)
    for split in (train_data, valid_data, test_data)
)

Map:   0%|          | 0/15291 [00:00<?, ? examples/s]

Map:   0%|          | 0/1699 [00:00<?, ? examples/s]

In [11]:
train_data_encoded

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 15291
})

In [12]:
from transformers import AutoModelForSequenceClassification

In [13]:
# Use the GPU when PyTorch reports CUDA as available; otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# To force CPU execution regardless of hardware: device = torch.device('cpu')

In [14]:
from torch import nn
# Dropout layer with drop probability 0.5, reused when configuring the classifier.
drop = nn.Dropout(0.5)

In [15]:
# Number of target classes for the classification head.
num_labels = 20
# Load the teacher checkpoint with a new classification head and move it to `device`.
#
# The head's dropout rate is set through the config (`classifier_dropout`) rather
# than by assigning `model.dropout`: RobertaForSequenceClassification has no
# top-level `dropout` that its forward pass calls, so that assignment only
# registered an unused module and left the head at its default rate. Passing the
# value via the config applies it where the head actually reads it and also
# records it in the saved/pushed config.
model = AutoModelForSequenceClassification.from_pretrained(
    teacher_model_ckpt,
    num_labels = num_labels,
    classifier_dropout = drop.p,
).to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [17]:
def compute_metrics(pred):
  """Score a prediction object for the Trainer.

  Reference labels come from `pred.label_ids`; the predicted class is the argmax
  over the last axis of `pred.predictions`. Returns a dict with accuracy and the
  weighted-average F1, precision and recall.
  """
  y_true = pred.label_ids
  y_pred = pred.predictions.argmax(-1)
  weighted = {'average': 'weighted'}
  return {
      'accuracy': accuracy_score(y_true, y_pred),
      "f1": f1_score(y_true, y_pred, **weighted),
      "precision": precision_score(y_true, y_pred, **weighted),
      "recall": recall_score(y_true, y_pred, **weighted),
  }

In [24]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:
from transformers import Trainer, TrainingArguments

In [23]:
# Per-device batch size shared by training and evaluation.
batch_size = 16
# Number of full batches in the training split; used as the logging interval.
logging_steps = len(train_data) // batch_size

# Output directory name (also the repository name when pushing to the Hub).
model_name = "roberta-based_uncased-finetuned-financial-headline"

training_args = TrainingArguments(
    # where checkpoints are written / pushed
    output_dir=model_name,
    push_to_hub=True,
    # optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluation and logging cadence
    evaluation_strategy='steps',
    logging_steps=logging_steps,
    report_to='tensorboard',
    log_level="error",
    disable_tqdm=False,
)

In [24]:
TrainingArguments?

In [25]:
# Bundle the model, training arguments, encoded train/validation splits,
# tokenizer and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data_encoded,
    eval_dataset=valid_data_encoded,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)



In [None]:
# Run fine-tuning with the model, arguments and datasets configured on `trainer`.
trainer.train()

Step,Training Loss,Validation Loss


In [None]:
# Upload the trained model to the Hub.
# NOTE(review): the string is passed as the first positional argument, presumably
# the commit message — confirm against the Trainer.push_to_hub signature.
trainer.push_to_hub('Training Complete')

In [None]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m98.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m67.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from transformers import BertForSequenceClassification, BertConfig

In [None]:
# Load the bert-base-uncased config with the classifier dropout raised to 0.5.
# `from_pretrained` is a classmethod: calling it on a `BertConfig(...)` instance
# throws that instance away, so the override must be passed as a keyword to
# `from_pretrained` itself for it to take effect.
config = BertConfig.from_pretrained('bert-base-uncased', classifier_dropout = 0.5)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Load the pretrained weights using the `config` object built above.
# `from_pretrained` is a classmethod, so constructing `BertForSequenceClassification(config)`
# first only allocated a randomly initialised model that was immediately discarded,
# and `config` never reached the loaded model. Passing it explicitly fixes both.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config = config)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
from torch import nn
# Dropout layer with drop probability 0.5, assigned onto the BERT model below.
drop = nn.Dropout(0.5)

In [None]:
# Replace the model's top-level `dropout` module with the p=0.5 layer created above.
# NOTE(review): presumably this is the dropout applied ahead of the BERT
# classifier layer — confirm against BertForSequenceClassification.forward.
model.dropout = drop

In [None]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,