In [None]:
from collections import Counter

import pandas as pd
import torch
import torch.nn as nn
from datasets import load_dataset
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import Dataset,DataLoader
from transformers import AutoTokenizer
from transformers import DistilBertModel

In [2]:
# Tokenizer for the 'distilbert-base-uncased' checkpoint -- the same checkpoint
# name the classifier below loads its encoder weights from.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Device: {device}')

Device: cuda


#### Import data

In [4]:
# Load the 'emotion' dataset; the cells below read its 'train', 'validation'
# and 'test' splits.
raw_data = load_dataset('emotion')

Using custom data configuration default
Reusing dataset emotion (/home/sabber/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Label distribution of every split (label id -> number of examples).
# `Counter` is imported in the notebook's import cell.
for split in ('train', 'validation', 'test'):
    print(Counter(raw_data[split]['label']))

Counter({1: 5362, 0: 4666, 3: 2159, 4: 1937, 2: 1304, 5: 572})
Counter({1: 704, 0: 550, 3: 275, 4: 212, 2: 178, 5: 81})
Counter({1: 695, 0: 581, 3: 275, 4: 224, 2: 159, 5: 66})


In [6]:
# Materialise each split as a pandas DataFrame.
train, validation, test = (
    raw_data[split_name].to_pandas()
    for split_name in ('train', 'validation', 'test')
)

In [7]:
# (rows, columns) of the train / validation / test frames, in that order.
tuple(frame.shape for frame in (train, validation, test))

((16000, 2), (2000, 2), (2000, 2))

In [8]:
# Peek at the first five training rows.
train.head(n=5)

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


### One-hot encode labels

In [9]:
# handle_unknown='ignore': a category that was not seen during fitting is
# encoded as an all-zero row instead of raising an error.
encoder = OneHotEncoder(handle_unknown='ignore')

In [10]:
# Fit the encoder on the training labels only, then reuse that fitted mapping
# for validation and test. Re-fitting per split would derive the column layout
# from whatever classes happen to occur in that split, so a split missing a
# class would silently produce fewer / shifted columns than the model expects.
train_labels = pd.DataFrame(encoder.fit_transform(train[['label']]).toarray())
val_labels = pd.DataFrame(encoder.transform(validation[['label']]).toarray())
test_labels = pd.DataFrame(encoder.transform(test[['label']]).toarray())

### Define parameters

In [11]:
## Parameters
# Optimisation settings.
LEARNING_RATE: float = 1e-3
EPOCHS: int = 10

# Batching settings shared by the data loaders.
BATCH_SIZE: int = 16
SHUFFLE: bool = True

# Width of the one-hot label matrix, i.e. the number of output logits.
NUM_CLASSES: int = len(train_labels.columns)

# NOTE(review): not referenced anywhere else in the visible notebook.
VALID_DATA_PERCENTAGE: float = 0.2

In [12]:

class EmotionDataset:
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
            
    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]
        inputs = self.tokenizer(text, max_length=self.max_len,
                                        padding="max_length", truncation=True, return_tensors='pt')
        
        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(label, dtype=torch.long)
        }

In [13]:
## Dataset
# Each split pairs its raw texts with the matching one-hot label matrix;
# every example is padded / truncated to 128 tokens.
train_dataset = EmotionDataset(
    texts=train['text'].tolist(),
    labels=train_labels.to_numpy(),
    tokenizer=tokenizer,
    max_len=128,
)
validation_dataset = EmotionDataset(
    texts=validation['text'].tolist(),
    labels=val_labels.to_numpy(),
    tokenizer=tokenizer,
    max_len=128,
)
test_dataset = EmotionDataset(
    texts=test['text'].tolist(),
    labels=test_labels.to_numpy(),
    tokenizer=tokenizer,
    max_len=128,
)

In [14]:
### Data loader
# Train and validation loaders share BATCH_SIZE / SHUFFLE.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=SHUFFLE,
)
val_dataloader = DataLoader(
    validation_dataset,
    batch_size=BATCH_SIZE,
    shuffle=SHUFFLE,
)
# The test loader yields one example per batch, in dataset order.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=1,
)

In [15]:
class DistillBeterClassifier(nn.Module):
    """DistilBERT encoder followed by a two-layer classification head.

    The head maps the hidden state of the first token (768 features) to 512
    features and then to ``NUM_CLASSES`` logits. No sigmoid / softmax is
    applied; ``forward`` returns raw logits.
    """

    def __init__(self):
        super().__init__()
        self.distillbert = DistilBertModel.from_pretrained('distilbert-base-uncased', return_dict=True)
        self.layer_1 = nn.Linear(768, 512)
        self.layer_2 = nn.Linear(512, NUM_CLASSES)
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()

    def forward(self, ids, masks):
        """Compute class logits for a batch of token ids.

        Args:
            ids: Token ids shaped ``(batch, seq_len)`` or ``(batch, 1, seq_len)``.
            masks: Attention mask with the same layout as ``ids``.

        Returns:
            Tensor of shape ``(batch, NUM_CLASSES)`` with unnormalised logits.
        """
        # Collapse any extra singleton axis into the batch axis. A bare
        # .squeeze() would also drop the batch axis when batch == 1, handing
        # the encoder a 1-D tensor.
        ids = ids.reshape(-1, ids.size(-1))
        masks = masks.reshape(-1, masks.size(-1))

        encoder_out = self.distillbert(input_ids=ids, attention_mask=masks)
        hidden_states = encoder_out[0]          # first output of the encoder
        first_token = hidden_states[:, 0]       # hidden state at position 0

        features = self.relu(self.dropout(first_token))
        features = self.relu(self.layer_1(features))
        logits = self.layer_2(features)
        return logits
        

In [16]:
# Build the classifier, then move its parameters to the selected device.
model = DistillBeterClassifier()
model = model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
# Loss on raw logits against the (float) one-hot targets.
criterion = nn.BCEWithLogitsLoss()

# Adam over every model parameter (encoder and head alike).
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# With step_size=1, each call to scheduler.step() multiplies the learning
# rate by 0.9.
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer=optimizer,
    step_size=1,
    gamma=0.9,
)

In [18]:
def evaluate(dataloader):
    """Return the mean per-batch loss of the global ``model`` over ``dataloader``.

    Puts the model in eval mode and runs without gradient tracking. Each batch
    must be a mapping with ``"ids"``, ``"mask"`` and ``"labels"`` tensors.
    Uses the notebook-level ``model``, ``criterion`` and ``device``.
    """
    model.eval()
    running_loss = 0.0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["ids"].to(device, dtype=torch.long)
            attention_mask = batch["mask"].to(device, dtype=torch.long)
            # The loss expects floating-point targets.
            labels = batch["labels"].to(device, dtype=torch.float)

            logits = model(ids=input_ids, masks=attention_mask)
            running_loss += criterion(logits, labels).item()

    # Average over the number of batches, not the number of examples.
    return running_loss / len(dataloader)

In [19]:

def train(dataloader):
    """Run one training epoch over ``dataloader`` and return the mean batch loss.

    Uses the notebook-level ``model``, ``criterion``, ``optimizer``,
    ``scheduler`` and ``device``. Each batch must be a mapping with ``"ids"``,
    ``"mask"`` and ``"labels"`` tensors.

    The learning-rate scheduler is advanced once per call (i.e. once per
    epoch). Stepping it after every batch would apply the decay factor
    hundreds of times per epoch and drive the learning rate to ~0 almost
    immediately.

    NOTE: this function reuses the name ``train``, which an earlier cell binds
    to the training DataFrame; re-running cells that index ``train[...]`` after
    this definition will fail.
    """
    model.train()
    total_loss = 0.0
    for batch in dataloader:
        ids = batch["ids"].to(device, dtype=torch.long)
        mask = batch["mask"].to(device, dtype=torch.long)
        # The loss expects floating-point targets.
        targets = batch["labels"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(ids=ids, masks=mask)

        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # One scheduler step per epoch.
    scheduler.step()

    # Average over the number of batches, not the number of examples.
    return total_loss / len(dataloader)

In [20]:
### Final training
# For each epoch (numbered from 1): one pass over the training loader, then
# one pass over the validation loader, then print both mean batch losses.
for epoch in range(1, EPOCHS +1):
    train_loss = train(train_dataloader)
    valid_loss = evaluate(val_dataloader)
    print(
        f'Epoch: {epoch} | Train Loss: {train_loss:.4f} | Valid loss: {valid_loss:.4f}'
    )

  "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
  "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
