In [1]:
!pip install -q transformers datasets

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [16]:
dataset_ckpt = 'yelp_review_full'
teacher_model_ckpt = 'distilbert-base-uncased'
student_model_ckpt = 'google/bert_uncased_L-6_H-512_A-8'

In [17]:
from huggingface_hub import notebook_login
from datasets import load_dataset
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch
from transformers import AutoModelForSequenceClassification
from torch import nn
from torch import optim
from torch.nn import functional as F
from transformers import AutoTokenizer

In [18]:
data = load_dataset(dataset_ckpt, split = 'test')

In [19]:
#making our own sets
data = data.shuffle()
data = data.train_test_split(test_size = 0.1)
train_valid_data = data['train']
test_data = data['test']
train_valid_data = train_valid_data.train_test_split(test_size = 0.2)
train_data = train_valid_data['train']
valid_data = train_valid_data['test']

In [20]:
def get_num_rows(dataset):
  return dataset.num_rows

print(f'Train set has {get_num_rows(train_data)} texts')
print(f'Valid set has {get_num_rows(valid_data)} texts')
print(f'Test set has {get_num_rows(test_data)} texts')

Train set has 36000 texts
Valid set has 9000 texts
Test set has 5000 texts


In [21]:
tokenizer = AutoTokenizer.from_pretrained(teacher_model_ckpt)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [22]:
#now we would utilise pytorch's Dataset andDataloader classes to create our dataset

class MyData(Dataset):
  def __init__(self, data):
    targets = data['label']
    texts = data['text']

    tokens = tokenizer(texts, return_tensors = 'pt', truncation = True, padding = True)
    self.input_ids = tokens['input_ids']
    self.attention_mask = tokens['attention_mask']
    self.targets = torch.tensor(targets)
    self.length = len(texts)
  def __len__(self):
    return self.length
  def __getitem__(self, index):
    return self.input_ids[index],self.attention_mask[index], self.targets[index]

In [23]:
train_data = MyData(train_data)
valid_data = MyData(valid_data)
test_data = MyData(test_data)

In [24]:
# now we build our loaders
batch_size = 8
train_loader = DataLoader(train_data, shuffle = True, batch_size = batch_size)
valid_loader = DataLoader(valid_data, shuffle  = True, batch_size = batch_size)
test_loader = DataLoader(test_data, shuffle = True, batch_size = batch_size)


In [25]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [26]:
from tqdm import tqdm
# this is a pytorch function to help us compute loss as we train
def accuracy_score(batch, model):
  with torch.no_grad():
    outputs = model(
        batch[0].to(device),
        batch[1].to(device)
    )
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim = 1)
    class_predictions = torch.argmax(probabilities, dim = 1)
    return torch.mean((class_predictions == batch[2].to(device)).to(torch.float)).data.item()

In [27]:
teacher_model = AutoModelForSequenceClassification.from_pretrained(teacher_model_ckpt, num_labels = 5)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
epochs = 4
learning_rate = 2e-5
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(teacher_model.parameters(), lr = learning_rate)

teacher_model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [29]:
for epoch in tqdm(range(epochs), total = epochs):
  #start training loader
  teacher_model.train()
  training_loss = 0.0
  training_accuracy = 0.0
  valid_loss = 0.0
  valid_accuracy = 0.0
  for a in tqdm(train_loader, total = len(train_loader)):
    optimizer.zero_grad()
    input_ids = a[0].to(device)
    attention_mask = a[1].to(device)
    target_tensors = a[2].to(device)
    output = teacher_model(input_ids = input_ids, attention_mask = attention_mask)
    loss = criterion(output.logits, target_tensors)
    loss.backward()
    optimizer.step()
    training_loss += loss.data.item()
    training_accuracy += accuracy_score(a, teacher_model)
  teacher_model.eval()
  for b in tqdm(valid_loader, total = len(valid_loader)):
    input_ids = b[0].to(device)
    attention_mask = b[1].to(device)
    target_tensors = b[2].to(device)
    output = teacher_model(input_ids = input_ids, attention_mask = attention_mask)
    loss = criterion(output.logits, target_tensors)
    valid_loss += loss.data.item()
    valid_accuracy += accuracy_score(b, teacher_model)

  training_accuracy /= len(train_loader)
  valid_accuracy /= len(valid_loader)
  training_loss /= len(train_loader)
  valid_loss /= len(valid_loader)
  #we would also test accuracy on validation
  print(f"""
    After epochs {epoch + 1}, training loss was {training_loss}, validation_loss was {valid_loss}. training_accuracy {training_accuracy} valid_accuracy {valid_accuracy}
  """)




  0%|          | 0/4 [00:00<?, ?it/s]
  0%|          | 0/4500 [00:00<?, ?it/s][A../aten/src/ATen/native/cuda/Loss.cu:240: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [0,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:240: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [2,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:240: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [3,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:240: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [4,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:240: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [5,0,0] Assertion `t >= 0 && t < n_classes` failed.
  0%|          | 0/4500 [00:00<?, ?it/s]
  0%|          | 0/4 [00:00<?, ?it/s]


RuntimeError: unique_by_key: failed to synchronize: cudaErrorAssert: device-side assert triggered