# Configure MLflow 🧐

In [1]:
import requests
from getpass import getpass
import datetime

**Set Environment Variables**


**Initialize MLflow**

In [2]:
!pip install mlflow --quiet

import mlflow
import os

**Set Local Configurations**

Under the [Token tab](https://dagshub.com/user/settings/tokens) in the user setting, copy the default token and use it here.

### Source
- https://web.stanford.edu/~jurafsky/slp3/3.pdf
- https://huyenchip.com/2023/05/02/rlhf.html#language_model

In [3]:
# REPO_OWNER = input('Repo owner: ').strip()
# REPO_NAME = input('Repo name: ').strip()


REPO_OWNER = 'lnnchau'
REPO_NAME = 'mlops-lyrics-lm'
USER_NAME = REPO_OWNER

os.environ['MLFLOW_TRACKING_USERNAME'] = USER_NAME
os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass('Enter your DAGsHub access token: ')

mlflow.set_tracking_uri(f'https://dagshub.com/{REPO_OWNER}/{REPO_NAME}.mlflow')

## Language model
- GPT is a LLM (large language models)
- Language model encodes the statistical information of the language. It tells you what's likely to appear in a context.
    - Example: (find 2 examples predicting next word / fill in the blanks)
    - To train a language model, you feed it a lot of text (training data) so that it can learn the statistical information from it.
- Word-level vs Character-level?
- This notebook: Character-level


## n-gram model
### What this model do?
- Predict how likely the next word is, given n-1 preceding words in the sequence.
- For example,
    - n=2 (bigram model)
    - assume our dictionary has the following words: to, I, movies, like, watch, you, we, books
    - input: `I like to watch ____`
    - context sequence: `watch`
    - what the bigram does, is that, it'll go over all words in the dictionary, and compute how likely it is the next word - more formally, `P(word_i|watch)`

### How to train this model?
yadayadayada

#### Build the vocabulary
- Some concepts in NLP
- `Document`: text objects, which could be an article, a movie review, a passage or even a sentence.
- `Corpus`: list of documents
- `Vocabulary`: list of all the tokens in all documents. based on the task, token could be either a word, a character, or parts of the word (e.g. `playing` can be split into two tokens `play` and `ing`)

In [4]:
import pandas as pd

In [5]:
data = pd.read_csv('/content/drive/MyDrive/spotify_millsongdata.csv')
data.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [6]:
corpus = data.text.str.lower()
sample_document = corpus[0]

print(f'Corpus has {len(corpus)} documents')
print(f'Sample document: {sample_document}')

Corpus has 57650 documents
Sample document: look at her face, it's a wonderful face  
and it means something special to me  
look at the way that she smiles when she sees me  
how lucky can one fellow be?  
  
she's just my kind of girl, she makes me feel fine  
who could ever believe that she could be mine?  
she's just my kind of girl, without her i'm blue  
and if she ever leaves me what could i do, what could i do?  
  
and when we go for a walk in the park  
and she holds me and squeezes my hand  
we'll go on walking for hours and talking  
about all the things that we plan  
  
she's just my kind of girl, she makes me feel fine  
who could ever believe that she could be mine?  
she's just my kind of girl, without her i'm blue  
and if she ever leaves me what could i do, what could i do?




In [7]:
corpus_as_string = ' '.join(corpus.values)
vocab = set(corpus_as_string)
vocab_size = len(vocab)

print(f'Length of vocab: {vocab_size}')
print(sorted(list(vocab)))

Length of vocab: 51
['\n', '\r', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '?', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [8]:
class Tokenizer:
    def __init__(self, vocab):
        # create a mapping from characters to integers
        self.stoi = { ch:i for i,ch in enumerate(vocab) }
        self.itos = { i:ch for i,ch in enumerate(vocab) }

    def encode(self, s):
        # encoder: take a string, output a list of integers
        return [self.stoi[c] for c in s]

    def decode(self, l):
        # decoder: take a list of integers, output a string
        return ''.join([self.itos[i] for i in l])

tokenizer = Tokenizer(vocab)
print(tokenizer.encode("hii there"))
print(tokenizer.decode(tokenizer.encode("hii there")))

[48, 14, 14, 16, 2, 48, 32, 38, 32]
hii there


In [9]:
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(vocab) }
itos = { i:ch for i,ch in enumerate(vocab) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(encode("hii there"))
print(decode(encode("hii there")))

[48, 14, 14, 16, 2, 48, 32, 38, 32]
hii there


In [10]:
# let's now encode the entire text dataset and store it into a torch.Tensor
import torch # we use PyTorch: https://pytorch.org
data = torch.tensor(encode(corpus_as_string), dtype=torch.long)
print(data.shape, data.dtype)
# print(data[:1000]) # the 1000 characters we looked at earier will to the GPT look like this

torch.Size([70426172]) torch.int64


#### Create dataset
- train_size: 90%
- val_size: 105
- seq_len (block size for now): 8
    - what is the maximum context length for predictions?
- batch_size = 4 # how many independent sequences will we process in parallel?


In [11]:
# Let's now split up the data into train and validation sets
n = int(0.8*len(data)) # first 90% will be train, rest val

train_ratio, val_ratio, test_ratio = 0.9, 0.05, 0.05

train_size = int(train_ratio * len(data))
val_size = int(val_ratio * len(data))
test_size = int(test_ratio * len(data))

train_data = data[:train_size]
val_data = data[train_size:train_size+val_size]
test_data = data[-test_size:]

assert len(train_data) == train_size
assert len(val_data) == val_size
assert len(test_data) == test_size

In [12]:
block_size = 8
train_data[:block_size+1]

tensor([41,  0,  0, 36, 16, 28,  2, 16, 48])

In [13]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([41]) the target: 0
when input is tensor([41,  0]) the target: 0
when input is tensor([41,  0,  0]) the target: 36
when input is tensor([41,  0,  0, 36]) the target: 16
when input is tensor([41,  0,  0, 36, 16]) the target: 28
when input is tensor([41,  0,  0, 36, 16, 28]) the target: 2
when input is tensor([41,  0,  0, 36, 16, 28,  2]) the target: 16
when input is tensor([41,  0,  0, 36, 16, 28,  2, 16]) the target: 48


In [14]:

torch.manual_seed(1337)

def get_batch(split, batch_size, block_size):
    '''
    Generate a small batch of data of inputs x and targets y

    return: x, y
        - x: (batch_size, block_size)
        - y: (batch_size, block_size)
    '''
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

get_batch('train', 4, 8)


(tensor([[44, 32, 19, 16,  5,  0,  2, 16],
         [ 9, 16, 49, 28,  5, 14, 44, 16],
         [32, 16, 17, 28, 41, 41,  9, 16],
         [16, 30,  0, 46, 38, 16, 28, 38]]),
 tensor([[32, 19, 16,  5,  0,  2, 16,  2],
         [16, 49, 28,  5, 14, 44, 16, 10],
         [16, 17, 28, 41, 41,  9, 16, 16],
         [30,  0, 46, 38, 16, 28, 38, 49]]))

#### Modeling: Simple BigramLanguageModel

In [15]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        # idx and targets are both (batch_size,seq_len) tensor of integers

        logits = self.token_embedding_table(idx) # (batch_size,seq_len,vocab_size)
        
        if targets is None:
            loss = None
        else:
            batch_size, seq_len, vocab_size = logits.shape

            # example: we have 2 classes [0, 1]
            # logits = [[0.5, 0.5], [0.3, 0.7], [0.6, 0.4]]
            # targets = [0, 1, 0]

            logits = logits.view(batch_size*seq_len, vocab_size)
            targets = targets.view(batch_size*seq_len)
  
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (batch_size, vocab_size)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (batch_size, vocab_size)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (batch_size, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

    @torch.no_grad()
    def evaluate_ppl(self, seq_tensor):
        # Implemented from https://web.stanford.edu/~jurafsky/slp3/3.pdf
        logits, _ = self(seq_tensor)    # logits = (batch_size, seq_len, vocab_size)

        batch_size, seq_len, vocab_size = logits.shape

        logits = logits.view(batch_size*seq_len, vocab_size)
        
        logits = logits[:-1, :] # to compute P(x_i|x_(i-1))
        probs = F.softmax(logits, dim=-1)
        
        ground_truths = seq_tensor.view(batch_size*seq_len)[1:]
        ppl = probs[
            torch.arange(batch_size*seq_len - 1),
            ground_truths]

        return torch.exp(ppl.mean())

In [16]:
class LyricsGenerator:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    def get_lyrics(self, start_phrase, max_new_tokens=2000):
        start_phrase_as_ids = self.tokenizer.encode(start_phrase)
        context = torch.tensor(start_phrase_as_ids, dtype=torch.long, device=self.device).reshape(1, -1)
        output_tokens = self.model.generate(idx=context, max_new_tokens=max_new_tokens)[0].tolist()

        return self.tokenizer.decode(output_tokens)

In [17]:
class Config:
    def __init__(self,
                 batch_size,
                 num_iterations,
                 lr,
                 vocab_size,
                 block_size):
        self.batch_size = batch_size
        self.num_iterations = num_iterations
        self.lr = lr
        self.vocab_size = vocab_size
        self.block_size = block_size

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

def train(model_class, config: Config):
    mlflow.set_experiment_tag('architecture', model_class.__name__)
    mlflow.log_params(config.__dict__)

    m = model_class(config.vocab_size)
    m.to(config.device)

    optimizer = torch.optim.AdamW(m.parameters(), lr=config.lr)

    for step in range(config.num_iterations):
        # sample a batch of data
        xb, yb = get_batch(
            'train',
            batch_size=config.batch_size,
            block_size=config.block_size)
        xb, yb = xb.to(config.device), yb.to(config.device)

        # evaluate the loss
        logits, loss = m(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        mlflow.log_metric('loss', loss.item(), step=step)

    m.to("cpu")
    mlflow.pytorch.log_model(m, "model")

    return m

def evaluate_test(model, test_data):
    model.eval()

    model = model.to(device)
    test_data = test_data.to(device)
    ppl = model.evaluate_ppl(test_data.view(1, -1))

    mlflow.log_metric('ppl', ppl)
    return ppl


In [18]:
import pickle
import time

#### Modeling: Attention

In [19]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 1000
eval_interval = 50
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
# ------------

torch.manual_seed(1337)

@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split, batch_size, block_size)
            X, Y = X.to(device), Y.to(device)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

# super simple bigram model
class BigramLanguageModelAttention(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    @torch.no_grad()
    def evaluate_ppl(self, seq_tensor):
        _, test_size = seq_tensor.shape
        num_chunks = test_size // block_size

        # update test size to fit with block_size
        test_size = num_chunks * block_size
        seq_tensor = seq_tensor[:, :test_size]

        # split seq_tensor into chunks of blocksize
        test_input = seq_tensor.view(num_chunks, block_size)

        # Implemented from https://web.stanford.edu/~jurafsky/slp3/3.pdf
        logits, _ = self(test_input)    # logits = (batch_size, seq_len, vocab_size)

        batch_size, seq_len, vocab_size = logits.shape

        logits = logits.view(test_size, vocab_size)
        logits = logits[:-1, :] # to compute P(x_i|x_(i-1))
        probs = F.softmax(logits, dim=-1)
        
        # TODO: write docs to explain this part
        ground_truths = test_input.view(test_size)[1:]
        ppl = probs[
            torch.arange(test_size - 1),
            ground_truths]

        return torch.exp(ppl.mean())

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx


In [20]:
import pickle
import time

tokenizer = Tokenizer(vocab)
tokenizer_path = 'tokenizer.pkl'

with open(tokenizer_path, 'wb') as f:
    pickle.dump(tokenizer, f)

with mlflow.start_run(run_name=str(time.time())):
    model_class = BigramLanguageModelAttention
    mlflow.set_experiment_tag('architecture', model_class.__name__)

    model = model_class()
    model = model.to(device)
    # print the number of parameters in the model
    print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

    # create a PyTorch optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    model.train()

    for iter in range(max_iters):
        # every once in a while evaluate the loss on train and val sets
        if iter % eval_interval == 0 or iter == max_iters - 1:
            losses = estimate_loss(model)
            print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

        # sample a batch of data
        xb, yb = get_batch('train', batch_size, block_size)
        xb, yb = xb.to(device), yb.to(device)

        # evaluate the loss
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    model = model.to("cpu")
    mlflow.pytorch.log_model(model, "model")

    torch.cuda.empty_cache()

    test_data = data[-test_size:]
    evaluate_test(model, test_data)

    mlflow.log_artifact(tokenizer_path, "model/artifacts")


0.207923 M parameters
step 0: train loss 4.0454, val loss 4.0417
step 50: train loss 2.5634, val loss 2.5490
step 100: train loss 2.4028, val loss 2.3996
step 150: train loss 2.3235, val loss 2.3084
step 200: train loss 2.2427, val loss 2.2291
step 250: train loss 2.1793, val loss 2.1540
step 300: train loss 2.1299, val loss 2.1193
step 350: train loss 2.0988, val loss 2.0845
step 400: train loss 2.0593, val loss 2.0535
step 450: train loss 2.0357, val loss 2.0230
step 500: train loss 2.0037, val loss 1.9843
step 550: train loss 1.9639, val loss 1.9430
step 600: train loss 1.9216, val loss 1.9276
step 650: train loss 1.9052, val loss 1.9038
step 700: train loss 1.8869, val loss 1.8704
step 750: train loss 1.8807, val loss 1.8613
step 800: train loss 1.8454, val loss 1.8418
step 850: train loss 1.8248, val loss 1.8211
step 900: train loss 1.8214, val loss 1.8071
step 950: train loss 1.8071, val loss 1.7952




step 999: train loss 1.7869, val loss 1.7788




#### ARCHIVE

In [None]:
batch_size = 128
num_iterations = 1000
lr=1e-3

config = Config(
    batch_size=batch_size,
    num_iterations=num_iterations,
    lr=lr,
    vocab_size=vocab_size,
    block_size=block_size
)

test_data = data[-test_size:]

tokenizer = Tokenizer(vocab)
tokenizer_path = 'tokenizer.pkl'

with open(tokenizer_path, 'wb') as f:
    pickle.dump(tokenizer, f)

with mlflow.start_run(run_name=str(time.time())):
    model = train(BigramLanguageModel, config)
    evaluate_test(model, test_data)

    mlflow.log_artifact(tokenizer_path, "model/artifacts")

# lyrics_gen = LyricsGenerator(model, tokenizer)
# lyrics_gen.get_lyrics("last christmas i gave you my heart", 500)