
Class torch.nn.Module [SOURCE]
bASE CLASS FOR ALL NEURAL NETWORK MODULES.
YOUR MODELS SHOULD ALSO SUBCLASS THIS CLASS
MODULES CAN ALSO CONTAIN OTHER MODULES, ALLOWING TO NEST THEM IN  A TREE STRUCTURE. 
YOU CAN ASSIGN THE SUBMODULES AS REGULAR ATTRIBUTES


In [2]:
!pip install torch

Collecting torch
  Using cached torch-1.13.1-cp37-cp37m-manylinux1_x86_64.whl (887.5 MB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch)
  Using cached nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
Collecting nvidia-cudnn-cu11==8.5.0.96 (from torch)
  Using cached nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl (557.1 MB)
Collecting nvidia-cublas-cu11==11.10.3.66 (from torch)
  Using cached nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl (317.1 MB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch)
  Using cached nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)
Installing collected packages: nvidia-cuda-runtime-cu11, nvidia-cuda-nvrtc-cu11, nvidia-cublas-cu11, nvidia-cudnn-cu11, torch
Successfully installed nvidia-cublas-cu11-11.10.3.66 nvidia-cuda-nvrtc-cu11-11.7.99 nvidia-cuda-runtime-cu11-11.7.99 nvidia-cudnn-cu11-8.5.0.96 torch-1.13.1
[0m

In [3]:
! pip install transformers datasets

Collecting transformers
  Using cached transformers-4.30.2-py3-none-any.whl (7.2 MB)
Collecting datasets
  Using cached datasets-2.13.1-py3-none-any.whl (486 kB)
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Using cached huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Using cached tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
Collecting safetensors>=0.3.1 (from transformers)
  Using cached safetensors-0.3.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Collecting tqdm>=4.27 (from transformers)
  Using cached tqdm-4.65.0-py3-none-any.whl (77 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
Collecting packaging>=20.0 (from transformers)
  Using cached packaging-23.1-py3-none-any.whl (48 kB)
Installing collected packages: tokenizers, safetensors, xxhash, tqdm, pack

In [4]:
import math


import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import dataset

import numpy as np
import matplotlib.pyplot as plt

In [5]:
class Model (nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1,20,5)
        self.conv2 = nn.Conv2d(20,20,5)
    def forward(self, x):
        x = F.relu(self.conv1(x))
        return F.relu(self.conv2(x))

In [6]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_k, d_model, n_heads):
        super().__init__()
        self.d_k = d_k
        self.n_heads = n_heads
        
        # this is for implementing the following formula
        # Attention(Q,K,V)  = softmax((QK^T)/sqrt(dk))V 
        self.key = nn.Linear(d_model, d_k*n_heads)
        self.query = nn.Linear(d_model, d_k*n_heads)
        self.value = nn.Linear(d_model, d_k*n_heads)
        
        #final linear layer
        self.fc = nn.Linear(d_k*n_heads, d_model)
    
    def forward(self, q,k,v, mask =None):
        
        q = self.query(q) # NxT x (hd_k)
        k = self.key(k) # N x T x (hd_k)
        v = self.value(v) # N x T x (hd_v)
        
        N = q.shape[0]
        T = q.shape[1]
        
        q = q.view(N,T,self.n_heads, self.d_k).transpose(1,2)
        k = k.view(N,T,self.n_heads, self.d_k).transpose(1,2)
        v = v.view(N,T,self.n_heads, self.d_k).transpose(1,2)
        
        # Compute attention weights
        # (N,h,T,d_k) X (N,h,d_k,T) --> (N,h,T,T)
        # in transpose switch dimension 2 and 1, 
        """
        # Its like doing this:
        for n in range(N):
            for head, in range(h):
                score[n,head] = q[n,head]@ k[n,head].T
                # (T, d_k) X (d_k,T) -> (T,T)
        """
        
        attn_scores = q @ k.transpose(-2,-1) / math.sqrt(self.d_k)
        
        if (not mask is None):
            #(N,T) -> (N,1,1,T)
            #for correct broadcasting, it applies the mask in the aprropriate places
            # wherever the first argument is true apply the value of the second argument, 
            # in this case, infinity(for softmax purposes)
            attn_scores = attn_scores.masked_fill(
                mask[:, None, None, :] == 0, float('-inf'))
        attn_weights = F.softmax(attn_scores, dim=-1)
        
        # compute attention-weighted values
        # (N,h,T, T) X (N, h, T, d_k) -->  (N, h, , T, d_k)
        
        A = attn_weights @ v

        A = A.transpose(1,2) # (N,T,h,d_k)
        A = A.contiguous().view(N, T, self.d_k*self.n_heads) # (N,T,h*d_k)
        
        return self.fc(A)
        
        
        
        

In [7]:
class TransformerBlock(nn.Module):
    def __init__(self, d_k, d_model, n_heads, dropout_prob =0.1):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.mha = MultiHeadAttention(d_k, d_model, n_heads)
        self.ann = nn.Sequential(
            nn.Linear(d_model, d_model*4),
            nn.GELU(),
            nn.Linear(d_model * 4, d_model),
            nn.Dropout(dropout_prob))
        self.dropout = nn.Dropout(p=dropout_prob)
        
    def forward(self, x, mask=None):
        # Passing x as query, key and value , thats why 3 x
        
        x = self.ln1(x + self.mha(x, x, x, mask))
        x = self.ln2(x + self.ann(x))
        x = self.dropout(x)
        return x

In [8]:
class PositionalEncoding(nn.Module):
    def __init__(self,d_model, max_len =2048, dropout_prob = 0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout_prob)
        
        position = torch.arange(max_len).unsqueeze(1)
        exp_term = torch.arange(0,d_model,2)
        div_term = torch.exp(exp_term * (-math.log(1000.0)/d_model))
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position*div_term)
        pe[0, :, 1::2] = torch.cos(position*div_term)
        self.register_buffer('pe', pe)
    
    def forward(self, x):
        # x.shape: NxTxD
        x= x+ self.pe[:,:x.size(1), :]
        return self.dropout(x)

In [9]:
class Encoder(nn.Module):
    def __init__(self,
                  vocab_size,
                  max_len,
                  d_k,
                  d_model,
                  n_heads,
                  n_layers,
                  n_classes,
                  dropout_prob):
        
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        transformer_blocks =[
            TransformerBlock(
            d_k,
            d_model,
            n_heads,
            dropout_prob) for _ in range(n_layers)]
        self.transformer_blocks = nn.Sequential(*transformer_blocks)
        self.ln = nn.LayerNorm(d_model)
        self.fc = nn.Linear(d_model, n_classes)
        
    def forward(self, x, mask=None):
        x = self.embedding(x)
        x = self.pos_encoding(x)
        for block in self.transformer_blocks:
            x=block(x,mask)
            
        # many-to-one (x has the shape N x T x D)
        x = x[:,0,:]    
        x = self.fc(x)
        return x

In [10]:
model = Encoder(20_000, 1024,16,64,4,2,5,0.1)

In [11]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

cuda:0


Encoder(
  (embedding): Embedding(20000, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): MultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, 

In [12]:
x = np.random.randint(0,20_000, size = (8,512))
x_t = torch.tensor(x).to(device)

In [13]:
mask = np.ones((8,512))
mask[:,256:]=0
mask_t = torch.tensor(mask).to(device)

In [14]:
y = model(x_t,mask_t)

In [15]:
y.shape

torch.Size([8, 5])

In [16]:
!pip install transformers datasets

[0m

In [17]:
from transformers import AutoTokenizer, DataCollatorWithPadding

In [18]:
checkpoint = 'distilbert-base-cased' # does not use segment embeddings - sentences separated by special tokens
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [19]:
from datasets import load_dataset

In [20]:
raw_datasets = load_dataset("glue", "sst2") #dataset for sentiment analysis glue benchmark

Found cached dataset glue (/root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [21]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [22]:
def tokenize_fn(batch):
    return tokenizer(batch['sentence'], truncation = True)

In [23]:
tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Loading cached processed dataset at /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-260390e2989cde42.arrow


Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-05d706196ce03b81.arrow


In [24]:
data_collator

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [25]:
tokenized_datasets
#attention mask indicates wich tokens are real tokens and not padding

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [26]:
tokenized_datasets = tokenized_datasets.remove_columns(["sentence","idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

In [27]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [28]:
from torch.utils.data import DataLoader

In [29]:
train_loader = DataLoader(
    tokenized_datasets["train"],
    shuffle = True,
    batch_size=32,
    collate_fn = data_collator)

In [30]:
valid_loader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=32,
    collate_fn = data_collator)

In [31]:
for batch in train_loader:
    for k,v in batch.items():
        print("k:",k, "v.shape:", v.shape)
    break

    

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


k: labels v.shape: torch.Size([32])
k: input_ids v.shape: torch.Size([32, 46])
k: attention_mask v.shape: torch.Size([32, 46])


In [32]:
set (tokenized_datasets['train']['labels'])

{0, 1}

In [33]:
tokenizer.vocab_size

28996

In [34]:
tokenizer.max_model_input_sizes

{'distilbert-base-uncased': 512,
 'distilbert-base-uncased-distilled-squad': 512,
 'distilbert-base-cased': 512,
 'distilbert-base-cased-distilled-squad': 512,
 'distilbert-base-german-cased': 512,
 'distilbert-base-multilingual-cased': 512}

In [35]:
model = Encoder(
    vocab_size = tokenizer.vocab_size,
    max_len = tokenizer.max_model_input_sizes[checkpoint],
    d_k = 16,
    d_model = 64,
    n_heads = 4,
    n_layers = 2,
    n_classes = 2,
    dropout_prob = 0.1)

In [36]:
model.to(device)

Encoder(
  (embedding): Embedding(28996, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): MultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, 

In [37]:
#Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [38]:
from datetime import datetime

In [43]:
def train(model, criterion, optimizer, train_loader, valid_loader, epochs):
    train_losses = np.zeros(epochs)
    test_losses = np.zeros(epochs)
    
    for it in range(epochs):
        model.train()
        t0 = datetime.now()
        train_loss = 0
        n_train = 0
        for batch in train_loader:
            #move data to gpu
            batch = {k: v.to(device) for k, v in batch.items()}
            
            #zero the parameter gradients
            optimizer.zero_grad()
            
            #Forward pass
            outputs = model (batch['input_ids'], batch['attention_mask'])
            loss = criterion(outputs, batch['labels'])
            
            #Backward and optimize
            loss.backward()
            optimizer.step()
            
            # loss is the average crosentropy over the batch
            # multiply the loss with the full batch size in order to posteriorly take de average loss for the whole train set
            train_loss += loss.item()*batch['input_ids'].size(0)
            n_train += batch['input_ids'].size(0)
            
        #Get average train loss
        train_loss = train_loss/n_train
        
        model.eval()
        test_loss = 0
        n_test = 0
        
        for batch in valid_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model (batch['input_ids'], batch['attention_mask'])
            loss = criterion(outputs, batch['labels'])
            test_loss += loss.item()*batch['input_ids'].size(0)
            n_test += batch['input_ids'].size(0)
        test_loss = test_loss/n_test
        
        # Save losses
        
        train_losses[it] = train_loss
        test_losses[it] =  test_loss
        
        dt = datetime.now() - t0
        print (f'Epoch {it+1}/{epochs}, Train Loss: {train_loss: .4f},\
        Test Loss {test_loss:.4f}, Duration {dt}')
    return train_losses, test_losses
            
            
            

In [44]:
train_losses, test_losses = train(
    model, criterion, optimizer, train_loader, valid_loader, epochs=4)

Epoch 1/4, Train Loss:  0.2996,        Test Loss 0.5081, Duration 0:00:20.324413
Epoch 2/4, Train Loss:  0.2616,        Test Loss 0.6129, Duration 0:00:20.271290
Epoch 3/4, Train Loss:  0.2294,        Test Loss 0.5449, Duration 0:00:20.344591
Epoch 4/4, Train Loss:  0.2092,        Test Loss 0.5513, Duration 0:00:20.281684


In [48]:
# Accuracy
model.eval()
n_correct = 0
n_total = 0
for batch in train_loader:
    # Move to GPU
    batch = {k: v.to(device) for k,v in batch.items()}
    
    #Forward pass
    outputs  = model (batch['input_ids'], batch['attention_mask'])
    
    # Get prediction
    # torch.max returns both max and argmax
    
    _, predictions = torch.max(outputs, 1)
    
    #update counts
    n_correct += (predictions == batch['labels']).sum().item()
    n_total += batch['labels'].shape[0]
    
train_acc = n_correct /n_total
n_correct = 0
n_total = 0

for batch in valid_loader:
    
    # Move to GPU
    batch = {k: v.to(device) for k,v in batch.items()}
    
    #Forward pass
    outputs = model(batch['input_ids'], batch['attention_mask'])
    
    # Get Prediction
    # torch.max returns both max and argmax
    
    _, predictions = torch.max(outputs, 1)
    
    #update counts
    n_correct += (predictions == batch['labels']).sum().item()
    n_total += batch['labels'].shape[0]
    
test_acc = n_correct /n_total
print(f'Train acc: {train_acc: .4f}, Test acc: {test_acc: .4f}') 
    

    

Train acc:  0.9425, Test acc:  0.7844
