# GPT mini mini: Treinando meu primeiro transformer de Zero (Parte 1)

- Motivação 
- Setup de desenvolvimento
- Creação de Dataloader para Pytorch
- Definição do problema de modelos de linguagem 
- Definição do modelo
- Codificação posicional 
- Generação de texto
- Mecanismo de atenção
- Modelo GPT


In [1]:
from IPython.display import display, HTML
display(HTML(
"""
<a target="_blank" href="https://colab.research.google.com/github/pedrodiamel/gpt_mini_mini/blob/main/books/gpt_mini_mini_dev.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
"""
))

In [2]:
!nvidia-smi

Tue Sep  5 01:14:21 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.108.03   Driver Version: 510.108.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A4500    On   | 00000000:01:00.0  On |                  Off |
| 30%   37C    P5    57W / 200W |    613MiB / 20470MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA RTX A4500    On   | 00000000:02:00.0 Off |                  Off |
| 30%   31C    P8    12W / 200W |     10MiB / 20470MiB |      0%      Default |
|       

In [None]:
using_colab = False

In [None]:
if using_colab:
    import torch
    print("PyTorch version:", torch.__version__)
    print("CUDA is available:", torch.cuda.is_available())
    import sys
    #!pip install git+https://github.com/pedrodiamel/gpt_mini_mini
    !{sys.executable} -m pip install "git+https://github.com/pedrodiamel/gpt_mini_mini"
    !mkdir -p /.datasets/llms/brasiliansong/
    !wget -P /.datasets/llms/brasiliansong/ https://raw.githubusercontent.com/pedrodiamel/gpt_mini_mini/main/data/brasiliansong/input.txt
    

In [3]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# TORCH MODULE
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# LOCAL MODULE
from llms.datasets.datasets import CharDataset
from llms.transformer import NeuralNetTransformer

plt.ion()   # interactive mode

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
!ls /.datasets/llms/brasiliansong

500-greatest-songs-of-all-time.zip  input_000.txt
bad-bunny-lyrics.zip		    input_001.txt
bossa-nova-lyrics		    input_spotify.txt
bossa-nova-lyrics.zip		    input.txt
brazilian-songs-lyrics		    letras-de-musicas-brasileiras.zip
brazilian-songs-lyrics.zip	    letras-de-rap-en-español.zip
datasets.txt			    spotify-million-song
eminem-lyrics-from-all-albums.zip   spotify-million-song-dataset.zip


In [4]:
DATASET_PATH = "/.datasets/llms/brasiliansong/input.txt" 

In [5]:
with open(DATASET_PATH, "r", encoding="utf-8") as f:
    data = f.read()

In [6]:
print("quantidade de caracteres", len(data))

quantidade de caracteres 8990763


In [7]:
print(data[:500])

10 Beijos de Rua:
Acabou outra vez. 
Foi cena repetida. 
Um tchau com gosto de fica. 
Me chamaram aqui pra sair. 
Nem no clima eu to. 
Mas só de raiva eu vou. 
Meu coração nem ia. 
Mas só que a teimosia. 
Chegou em mim parou, parou. 
Na primeira boca já senti remorso. 
Beijando mal de propósito. 
Torcendo pra acabar. 
O que eu nem devia começar. 
10 de beijos de rua. 
Não valem metade do seu. 
No canto da boca. 
Imagina na boca. 
10 beijos de rua. 
Não causa efeito. 
De quando me abraça com roup


In [24]:
voc = sorted(list(set(data)))
print(len(voc))
print("".join(voc))

160
	
 !"#$%&'()*+,-./0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡ª°²´º¿ÀÁÂÃÇÈÉÊËÍÒÓÔÕÚßàáâãäçèéêëìíîïñòóôõöùúûüāœ​–—―‘’“”„…♪♫ﬂ﻿


In [10]:
# voc = [w for l in data.split("\n") for w in l.split(" ")]
# voc = sorted(list(set(voc)))

# print(len(voc))
# print("".join(voc))

In [20]:
# vamos tokenizar 
stoi = {s:i for i,s in enumerate(voc)}
itos = {i:s for i,s in enumerate(voc)}

encoder = lambda s: [stoi[c] for c in s] 
decoder = lambda t: "".join([itos[i] for i in t])

print(encoder("Olá pessoal"))
print(decoder(encoder("Olá pessoal")))


[47, 75, 121, 2, 79, 68, 82, 82, 78, 64, 75]
Olá pessoal


In [23]:
import tiktoken
enc = tiktoken.get_encoding("gpt2")
print("tamanho do voc", enc.n_vocab)
print(enc.encode("Olá pessoal"))
print(enc.decode(enc.encode("Olá pessoal")))


tamanho do voc 50257
[30098, 6557, 279, 408, 78, 282]
Olá pessoal


In [25]:

import torch
from torch.utils.data import Dataset


class CharDataset(Dataset):

    def __init__(self, pathname, block_size, train=True, download=False):
        """CharDataset
        Args:
            pathname (str): path to dataset
            block_size (int): block size for context window
            train (bool): train or test
            download (bool): download dataset if not found
        Ref:
            https://github.com/facebookresearch/xformers/blob/main/examples/microGPT.py
        """

        if not os.path.isfile(pathname):
            raise FileNotFoundError("Dataset not found.")

        with open(pathname, "r", encoding="utf-8") as f:
            data = f.read()

        voc = sorted(list(set(data)))
        vocab_size = len(voc)

        self.stoi = {ch: i for i, ch in enumerate(voc)}
        self.itos = {i: ch for i, ch in enumerate(voc)}

        n = int(0.9 * len(data)) # 90% para treinamento (99% treinamento)
        data = data[:n] if train else data[n:]
        data_size = len(data)

        self.pathname = pathname
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data
        self.count = data_size
        self.voc = voc

    def __len__(self):
        return self.count - self.block_size

    def __getitem__(self, i):
        chunk = self.data[i : i + self.block_size + 1]
        dix = [self.stoi[s] for s in chunk]

        # src and target are off by one, we want the model to predict the next word
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y

    def to_tokens(self, message, device):
        return torch.tensor([self.stoi[s] for s in message], dtype=torch.long)[None, ...].to(device)

    def from_tokens(self, tokens):
        return "".join([self.itos[int(i)] for i in tokens])


In [28]:
block_size=8
dataset = CharDataset(DATASET_PATH, block_size, train=False)
print(len(dataset))

899069


In [29]:
x,y = dataset[0]
print(x, dataset.from_tokens(x))
print(y, dataset.from_tokens(y))

tensor([76, 68,  2, 68, 77, 70, 64, 77]) me engan
tensor([68,  2, 68, 77, 70, 64, 77, 78]) e engano


In [30]:
from torch.utils.data import DataLoader
from torch.utils.data.sampler import RandomSampler

batch_size = 4
block_size = 8 # context windows 
workers = 1

# Create dataset
dataset = CharDataset(DATASET_PATH, block_size, train=False)

# Load data
dataloader = DataLoader(
    dataset,
    batch_size=4,
    sampler=RandomSampler(dataset),
    num_workers=workers,
    pin_memory=False,
    drop_last=True,
)

print(len(dataloader))
print(len(dataset)/4)

224767
224767.25


In [37]:

# for epoch in range(epochs)
# uma epochs 
for x,y in dataloader:
    print("x \in ", x.shape)
    print("y \in ", y.shape)

    # yh = f(x)_w = [yh]_[B,T,Cv]  
    # J = CE(y, yh)

    # pytorch opt
    # W^(t+1) = W^t - lr*grad(J) (*) 
    # opt.zero_grad()
    # J.backword()
    # opt.step()
    
    break


x \in  torch.Size([4, 8])
y \in  torch.Size([4, 8])


In [41]:


import torch
import torch.nn as nn
from torch.nn import functional as F

class GPTmm(nn.Module):

    def __init__(self, voc_size):
        super().__init__()

        k=voc_size
        self.emb = nn.Embedding(voc_size, k)
    

    def forward(self, x):
        # [x]_(B,T)
        # [yh]_(B,T,Cv)
        yh = self.emb(x)

        
        return yh


voc_size = dataset.vocab_size
f = GPTmm(voc_size)
print(x.shape)
yh = f(x)

print("yh", yh.shape)



torch.Size([4, 8])
yh torch.Size([4, 8, 160])


In [42]:
opt = torch.optim.AdamW( f.parameters(), lr=1e-3 )

In [45]:

def train(max_iter=100):

    f.train()
    for i,(x,y) in enumerate(dataloader):
        # yh = f(x)_w = [yh]_(B,T,Cv) (*) Attention 
        # J = CE(y, yh)
    
        yh = f(x) # [yh]_(B,T,Cv)
        J = F.cross_entropy(yh.view(-1, voc_size), y.view(-1))
    
        # pytorch opt
        # W^(t+1) = W^t - lr*grad(J) (1)
        # opt.zero_grad()
        # J.backward() => grad(J)
        # opt.step()
    
        opt.zero_grad(set_to_none=True)
        J.backward()
        opt.step()

        if i % 20 == 0:
            print(J.item()) 

        if i > max_iter:
            break
            


epochs = 1
for epoch in range(epochs):
    print("Epoch ", epoch)
    train(max_iter=1000)

Epoch  0
5.172210693359375
4.915787696838379
4.8538498878479
5.1753692626953125
5.0744547843933105
4.736363410949707
4.945469856262207
5.241998672485352
5.066843032836914
5.103704929351807
4.980006217956543
4.760480880737305
4.814445972442627
5.07875919342041
5.119041442871094
4.839418888092041
4.958718776702881
4.954410076141357
4.830076694488525
5.0022664070129395
4.921172142028809
4.722481727600098
4.558915615081787
4.770347595214844
4.54874849319458
4.726893901824951
4.735639572143555
4.409339904785156
4.56466817855835
4.341578006744385
4.418620586395264
4.308345317840576
4.400155544281006
4.572384357452393
4.38454532623291
4.264686584472656
4.5832977294921875
4.251803398132324
4.47282075881958
4.664233207702637
4.496827602386475
4.460476398468018
4.590721607208252
4.337376594543457
4.1955695152282715
4.213145732879639
4.490564346313477
4.604422092437744
4.428208351135254
4.094630718231201
4.347883701324463
