<a href="https://colab.research.google.com/github/nancii-jif/305b_final/blob/main/hw4_mini_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 0.0 Setup

In [2]:
# torch imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import matplotlib.pyplot as plt
from tqdm import tqdm

import requests
import os

torch.manual_seed(305)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

We set default values for some global hyperparameters, but feel free to change these during development as needed.

In [3]:
# Global hyperparameters
SMALL_ITERS = 1000
LARGE_ITERS = 2000
EVAL_ITERS = 100
CONTEXT_WINDOW_SIZE = 128

In [4]:
# download the tiny shakespeare dataset
input_file_path = 'input.txt'

if not os.path.exists(input_file_path):
    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    with open(input_file_path, 'w') as f:
        f.write(requests.get(data_url).text)

with open(input_file_path, 'r') as f:
    data = f.read()
print(f"length of dataset in characters: {len(data):,}")

length of dataset in characters: 1,115,394


### 0.1 Helper Functions

In [5]:
# function for getting batches of data
def get_batch(split, context_window_size, device, batch_size=32, token_type='morf'):
    """
    generate a small batch of data of inputs x and targets y

    Args:
        split: 'train' or 'val'
        device: 'cpu' or 'cuda' (should be 'cuda' if available)
    """
    if token_type == 'bpe':
        data = train_data_bpe if split == 'train' else val_data_bpe
    elif token_type == 'morf':
        data = train_data_morf if split == 'train' else val_data_morf
    else:
      data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - context_window_size, (batch_size,))
    x = torch.stack([data[i:i+context_window_size] for i in ix])
    y = torch.stack([data[i+1:i+context_window_size+1] for i in ix])
    # x_labels = torch.stack([train_labels[i:i+context_window_size] for i in ix])
    # y_labels = torch.stack([train_labels[i+1:i+context_window_size+1] for i in ix])
    # x = torch.cat((x, x_labels), dim=2)
    # y = torch.cat((y, y_labels), dim=2)
    x = x.to(device)
    y = y.to(device)
    # x_labels = x_labels.to(device)
    # y_labels = y_labels.to(device)
    return x, y

# helper function for tracking loss during training
# given to you
@torch.no_grad()
def estimate_loss(model, eval_iters, context_window_size, device, token_type='morf'):
    """
    Args:
      model: model being evaluated
      eval_iters: number of batches to average over
      context_window_size: size of the context window
      device: 'cpu' or 'cuda' (should be 'cuda' if available)
    """
    out = {}
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split, context_window_size, device, token_type=token_type)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    return out

In [6]:
class dataLoader:
  def __init__(self, data, token_type='bpe', tokenizer_model=None):
    self.data = data
    self.tokenizer_model = tokenizer_model
    self.n = len(data)
    self.vocab = sorted(set(self.data))
    if not tokenizer_model:
      self.vocab_size = len(self.vocab)
    else:
      self.vocab_size = tokenizer_model.vocab_size()
    self.stoi = { seg:i for i,seg in enumerate(self.vocab) }
    self.itos = { i:seg for i,seg in enumerate(self.vocab) }
    self.token_type = token_type
    self.tokenizer_model = tokenizer_model
    self.train_data = None
    self.val_data = None

  def encoder(self, tokens):
    return [self.stoi[t] for t in tokens]

  def decoder(self, indices):
    return ''.join([self.itos[i] for i in indices])

  def get_train_val(self, train_test_split=.9):
    train_data_raw = self.data[:int(self.n*0.9)]
    val_data_raw = self.data[int(self.n*0.9):]
    if self.tokenizer_model is not None:
      self.train_data = torch.tensor(self.tokenizer_model.encode(train_data_raw, out_type=int))
      self.val_data = torch.tensor(self.tokenizer_model.encode(val_data_raw, out_type=int))
    else:
      self.train_data = torch.tensor(self.encoder(train_data_raw, out_type=int))
      self.val_data = torch.tensor(self.encoder(val_data_raw, out_type=int))
    return self.train_data, self.val_data



### 0.2 Evaluations

In [None]:
!git clone https://github.com/google-research/bleurt.git
%cd bleurt
!pip install -r requirements.txt
!pip install .

In [None]:
!wget https://storage.googleapis.com/bleurt-oss-21/BLEURT-20.zip
!unzip BLEURT-20.zip

from bleurt import score
bleurt_scorer = score.BleurtScorer("BLEURT-20")

In [15]:
import pandas as pd
def eval_context_generation(contexts, targets, tlm=None, tokenizer_model=None, token_type='morf', scorer=None):
  cond_gens = []
  bleurt_scores = []
  for i, context in enumerate(contexts):
    if token_type == 'bpe' and tokenizer_model is not None:
      context_tokens = torch.tensor(tokenizer_model.encode(context, out_type=int), device=device).reshape(1, -1)
      cond_gen = (tlm.generate(context_tokens, max_new_tokens=CONTEXT_WINDOW_SIZE)[0].tolist())
      cond_gens.append(tokenizer_model.decode(cond_gen, out_type='str'))


    elif token_type == 'morf' and tokenizer_model is not None:
      context_cleaned = TOKEN_RE.findall(context)
      context_tokens = []
      for word in context_cleaned:
        if word.isalpha():
          segs, _ = tokenizer_model.viterbi_segment(word)
          context_tokens.extend(s for s in segs)
        else:
          context_tokens.append(word) # non-letter char
      context_tokens = torch.tensor(dataloader_morf.encode(context_tokens), device=device).reshape(1, -1)
      cond_gen = (tlm.generate(context_tokens, max_new_tokens=CONTEXT_WINDOW_SIZE)[0].tolist())
      cond_gens.append(dataloader_morf.decode(cond_gen))
    if scorer is not None:
      bleurt_score = scorer.score(references=[targets[i]], candidates=[cond_gen])
      bleurt_scores.append(bleurt_score)
    return_df = pd.DataFrame({'context': contexts, 'target': targets, 'cond_gen': cond_gens})
    return return_df, bleurt_scores

In [14]:
from bleurt import score
bleurt_scorer = score.BleurtScorer("BLEURT-20")
ref = """So Romeo would, were he not Romeo call'd,
Retain that dear perfection which he owes
Without that title. Romeo, doff thy name,
And for that name which is no part of thee
Take all myself."""
gen1 = """Which is grust, be mind will what is arms:
I am shall'd enter he's raised up fortune
Depose upon that up the vows their isonace ailing,
And trust by the death thrals up with his sshour
Acquainful for Buckingham.

MERCUTIO:
Then chast that marry here!
I co
"""
gen2 = """Isum, i, and by to find better Tower.
Well, a lord, that I know this man's king.
Soft and here art thou split'st Romeo, man of my sad
My bounty of state and Dick, he sought to signify
And aid those that with no cause may contain out all
For though not cause were they chosen, our"""
bleurt_score = bleurt_scorer.score(references=[ref], candidates=[gen2])
print(bleurt_score)

[0.26428818702697754]


In [16]:
def graph_loss_and_perplexity(train_losses, val_losses, train_perps, val_perps, transformer_models=['vanilla', 'bpe', 'morf', 'bias']):
    """
    Plots training/validation loss and perplexity for multiple models.

    Args:
        train_losses: list of lists, shape (num_models, num_iters)
        val_losses: list of lists, same
        train_perps: list of lists, same
        val_perps: list of lists, same
        transformer_models: list of model names, same order as data
    """

    plt.figure(figsize=(14, 5))

    # --- Loss Plot ---
    plt.subplot(1, 2, 1)
    for i, model in enumerate(transformer_models):
        plt.plot(train_losses[i], label=f"{model} - train", linestyle='-')
        plt.plot(val_losses[i], label=f"{model} - val", linestyle='--')
    plt.title("train and val loss across models")
    plt.xlabel("iter")
    plt.ylabel("loss")
    plt.legend()
    plt.grid(True)

    # --- Perplexity Plot ---
    plt.subplot(1, 2, 2)
    for i, model in enumerate(transformer_models):
        plt.plot(train_perps[i], label=f"{model} - train", linestyle='-')
        plt.plot(val_perps[i], label=f"{model} - val", linestyle='--')
    plt.title("train and val perplexity across models")
    plt.xlabel("iter")
    plt.ylabel("loss")
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    plt.show()

## 1 Baselines

### 1.0 Vanilla Transformer (character-level tokenization)

In [None]:
# get all the unique characters that occur in this text
chars = sorted(list(set(data)))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

### 1.1 Byte Pair Encoding

In [7]:
import sentencepiece as spm

In [8]:
spm.SentencePieceTrainer.Train(input='input.txt',
                               model_prefix='bpe_baseline',
                               vocab_size=3000,
                               model_type='bpe',
                               character_coverage=1.0,
                               add_dummy_prefix=False,
                               user_defined_symbols = ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?'])
sp = spm.SentencePieceProcessor()
sp.load('bpe_baseline.model')

True

In [9]:
data_loader_bpe = dataLoader(data, token_type='bpe', tokenizer_model=sp)
train_data_bpe, val_data_bpe = data_loader_bpe.get_train_val()
vocab_size_bpe = data_loader_bpe.vocab_size
print(f"train has {len(train_data_bpe):,} tokens")
print(f"val has {len(val_data_bpe):,} tokens")

train has 325,421 tokens
val has 37,583 tokens


In [10]:
bpe_vocab_size = sp.vocab_size()
n = len(data)
train_chars = data[:int(n*.9)]
val_chars = data[int(n*.9):]
train_data_bpe = torch.tensor(sp.encode(train_chars, out_type=int))
val_data_bpe = torch.tensor(sp.encode(val_chars, out_type=int))
print(f"train has {len(train_data_bpe):,} tokens")
print(f"val has {len(val_data_bpe):,} tokens")

train has 325,421 tokens
val has 37,583 tokens


#### BPE Transformer (=Vanilla)

In [11]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size, context_window_size, embed_size=384):
        """
        Args:
          head_size: int, size of the head embedding dimension (K)
          context_window_size: int, number of tokens considered in the past for attention (T)
          embed_size: int, size of the token embedding dimension (D)
        """
        super().__init__()
        self.head_size = head_size
        self.key = nn.Linear(embed_size, head_size, bias=False) # query
        self.query = nn.Linear(embed_size, head_size, bias=False) # key
        self.value = nn.Linear(embed_size, embed_size, bias=False)

        # not a param of the model, so registered as a buffer
        self.register_buffer('tril', torch.tril(
            torch.ones(context_window_size, context_window_size)))

    def forward(self, x):
        """
        Args:
          x: (B,T,D) tensor of token embeddings

        Returns:
          (B,T,D) tensor of attention-weighted token embeddings
        """
        _, T, _ = x.size()
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)
        attn_scores = q @ k.transpose(-2, -1)
        masked_scores = attn_scores.masked_fill(self.tril == 0, float('-inf'))
        masked_scores = masked_scores / self.head_size ** .5

        attn = F.softmax(masked_scores, dim=-1) @ v
        return attn

In [12]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, context_window_size, num_heads, head_size, embed_size=384):
        """
        Args:
            context_window_size: int, number of tokens considered in the past for attention (T)
            num_heads: int, number of heads (H)
            head_size: int, size of the head embedding dimension
            embed_size: int, size of the token embedding dimension
        """
        super().__init__()
        # TODO, your code below
        self.head_size = head_size
        self.heads = nn.ModuleList([Head(head_size, context_window_size, embed_size) for _ in range(num_heads)])
        self.linear = nn.Linear(num_heads * embed_size, embed_size)

    def forward(self, x):
        attn_list = [head.forward(x) for head in self.heads]
        mhsa = torch.cat(attn_list, dim=-1)
        mhsa = self.linear(mhsa)
        return mhsa

In [13]:
# run this cell to initialize this deep learning module that you should use in the code your write later
# you don't need to edit this layer
class FeedForward(nn.Module):
    """ a simple linear layer followed by a non-linearity
        Given to you, you don't need to write any code here!
    """

    def __init__(self, embed_size):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embed_size, 4 * embed_size),
            nn.ReLU(),
            nn.Linear(4 * embed_size, embed_size),
        )

    def forward(self, x):
        return self.net(x)

In [14]:
class TransformerBlock(nn.Module):
    """ Transformer block: communication across sequence length, followed by communication across embedding space
        Uses multi-headed attention
    """

    def __init__(self, vocab_size, context_window_size, embed_size=384, num_heads=6):
        super().__init__()
        self.ln1 = nn.LayerNorm(embed_size)
        self.ln2 = nn.LayerNorm(embed_size)

        # TODO: your code below
        head_size = embed_size // num_heads
        self.feed_forward = FeedForward(embed_size)
        self.atten_heads = MultiHeadAttention(context_window_size, num_heads, head_size, embed_size)


    def forward(self, x):
        x = x + self.atten_heads(self.ln1(x)) # communication over sequence length
        x = x + self.feed_forward(self.ln2(x)) # communication across embedding space
        return x

In [15]:
class TransformerLM(nn.Module):

    def __init__(self, vocab_size, context_window_size, embed_size=384, num_heads=6, n_layers=6):
        """
          Args:
              vocab_size: int, number of tokens in the vocabulary (V)
              context_window_size: int, size of the context window (T)
              embed_size: int, embedding size (D)
              num_heads: int, number of heads (H)
              n_layers: int, number of layers (M)
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.context_window_size = context_window_size
        self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
        self.position_embedding_table = nn.Embedding(context_window_size, embed_size)
        self.blocks = nn.Sequential(*[
            TransformerBlock(vocab_size,
                             context_window_size,
                             embed_size=embed_size,
                             num_heads=num_heads)
            for _ in range(n_layers)])

        # final layer norm
        self.ln_f = nn.LayerNorm(embed_size)
        self.lm_head = nn.Linear(embed_size, vocab_size)

        # good initialization
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, token_ids, targets=None):
        """
        Agrgs:
            token_ids: tensor of integers, provides the contet, shape (B, T)
            targets: tensor of integers, provides the tokens we are preidcitng, shape (B, T)
        """
        B, T = token_ids.shape

        # token_ids and targets are both (B, T) tensor of integers
        tok_emb = self.token_embedding_table(token_ids) # (B, T, D)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T, D)
        x = tok_emb + pos_emb # (B, T, D)

        # TODO: your code below
        blocked = self.blocks(x)
        normalized = self.ln_f(blocked)
        logits = self.lm_head(normalized)

        if targets is None:
            loss = None
        else:
            logits_flat = logits.view(-1, logits.size(-1))
            targets_flat = targets.view(-1)
            loss = F.cross_entropy(logits_flat, targets_flat)

        return logits, loss

    @torch.no_grad()
    def generate(self, token_ids, max_new_tokens):
        """
        Args:
            token_ids: tensor of integers forming the context, shape (B, T)
            max_new_tokens: int, max number of tokens to generate
        """
        # TOOD, your code below
        T = token_ids.size(-1)
        curr_T = T
        for _ in range(max_new_tokens):
          token_ids_subset = token_ids[:, -self.context_window_size:]
          logits, _ = self.forward(token_ids_subset)
          logits = logits[:, -1, :] # for each entry in the batch, gets the last token
          probs = F.softmax(logits, dim=-1)
          next_token = torch.multinomial(probs, num_samples=1) # sample next token
          token_ids = torch.cat((token_ids, next_token), dim=1)
        return token_ids

In [16]:
trans_bpe = TransformerLM(vocab_size_bpe, CONTEXT_WINDOW_SIZE)
tlm_bpe = trans_bpe.to(device)
learning_rate = 5e-4
optimizer = torch.optim.Adam(trans_bpe.parameters(), lr=learning_rate)
eval_interval = 100
loss_list = []
losses_dict = {'train' : [], 'val' : []}
perplexities = { 'train' : [], 'val' : [] }

for it in tqdm(range(LARGE_ITERS)):

    # every once in a while evaluate the loss on train and val sets
    if it % eval_interval == 0 or it == LARGE_ITERS - 1:
        print(f"iteration {it}")
        losses = estimate_loss(tlm_bpe, EVAL_ITERS, CONTEXT_WINDOW_SIZE, device, token_type='bpe')
        perplexities['train'].append(torch.exp(losses['train']))
        perplexities['val'].append(torch.exp(losses['val']))
        losses_dict['train'].append(losses['train'])
        losses_dict['val'].append(losses['val'])
        print(f"step {it}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train', CONTEXT_WINDOW_SIZE, device, token_type='bpe')

    # evaluate the loss
    logits, loss = tlm_bpe(xb, yb)
    loss_list.append(loss.detach().item())
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

  0%|          | 0/2000 [00:00<?, ?it/s]

iteration 0
step 0: train loss 8.1313, val loss 8.1321


 10%|█         | 200/2000 [00:49<05:47,  5.19it/s]

iteration 200


 10%|█         | 201/2000 [01:04<2:17:49,  4.60s/it]

step 200: train loss 4.9446, val loss 5.0268


 20%|██        | 400/2000 [01:44<05:23,  4.94it/s]

iteration 400


 20%|██        | 401/2000 [01:58<1:54:48,  4.31s/it]

step 400: train loss 4.5320, val loss 4.7117


 30%|███       | 600/2000 [02:37<04:35,  5.09it/s]

iteration 600


 30%|███       | 601/2000 [02:51<1:41:52,  4.37s/it]

step 600: train loss 4.1970, val loss 4.5186


 40%|████      | 800/2000 [03:31<03:53,  5.14it/s]

iteration 800


 40%|████      | 801/2000 [03:45<1:26:19,  4.32s/it]

step 800: train loss 3.9392, val loss 4.4183


 50%|█████     | 1000/2000 [04:25<03:25,  4.86it/s]

iteration 1000


 50%|█████     | 1001/2000 [04:39<1:12:23,  4.35s/it]

step 1000: train loss 3.6689, val loss 4.3252


 60%|██████    | 1200/2000 [05:19<02:37,  5.08it/s]

iteration 1200


 60%|██████    | 1201/2000 [05:33<57:40,  4.33s/it]

step 1200: train loss 3.4021, val loss 4.3377


 70%|███████   | 1400/2000 [06:12<01:57,  5.09it/s]

iteration 1400


 70%|███████   | 1401/2000 [06:26<43:15,  4.33s/it]

step 1400: train loss 3.0929, val loss 4.3715


 80%|████████  | 1600/2000 [07:06<01:17,  5.15it/s]

iteration 1600


 80%|████████  | 1601/2000 [07:20<28:48,  4.33s/it]

step 1600: train loss 2.7547, val loss 4.4441


 90%|█████████ | 1800/2000 [07:59<00:38,  5.14it/s]

iteration 1800


 90%|█████████ | 1801/2000 [08:13<14:21,  4.33s/it]

step 1800: train loss 2.3030, val loss 4.6362


100%|█████████▉| 1999/2000 [08:52<00:00,  5.06it/s]

iteration 1999


100%|██████████| 2000/2000 [09:06<00:00,  3.66it/s]

step 1999: train loss 1.8395, val loss 4.9010





In [None]:
perplexities_bpe = perplexities
losses_bpe = losses_dict
graph_loss_and_perplexity(losses_bpe['train'], losses_bpe['val'], perplexities_bpe['train'], perplexities_bpe['val'], transformer_models=['bpe'])


In [3]:
torch.cuda.empty_cache()

NameError: name 'torch' is not defined

In [19]:
context1 = """JULIET:
O Romeo, Romeo! wherefore art thou Romeo?
Deny thy father and refuse thy name;
Or, if thou wilt not, be but sworn my love,
And I'll no longer be a Capulet.

ROMEO:

JULIET:
'Tis but thy name that is my enemy;
Thou art thyself, though not a Montague.
What's Montague? it is nor hand, nor foot,
Nor arm, nor face, nor any other part
Belonging to a man. O, be some other name!
What's in a name? that which we call a rose
By any other name would smell as sweet;"""

context1_tokens = torch.tensor(sp.encode(context1, out_type=int), device=device).reshape(1, -1)
cond_gen = (tlm_bpe.generate(context1_tokens, max_new_tokens=CONTEXT_WINDOW_SIZE)[0].tolist())
print(sp.decode(cond_gen, out_type=str))


JULIET:
O Romeo, Romeo! wherefore art thou Romeo?
Deny thy father and refuse thy name;
Or, if thou wilt not, be but sworn my love,
And I'll no longer be a Capulet.

ROMEO:

JULIET:
'Tis but thy name that is my enemy;
Thou art thyself, though not a Montague.
What's Montague? it is nor hand, nor foot,
Nor arm, nor face, nor any other part
Belonging to a man. O, be some other name!
What's in a name? that which we call a rose
By any other name would smell as sweet;
Which, rather in the first I respect not this
To harkic body in it what I can?
We talk in secret, in my lady and very comforts.
Come, come, Bobhy, go in, is't. Howe'd your
swecover acquire, he shall be good goper: the trespass of my capital
The part of your knaves slaves or my sepity.

QUEEN ELIZABETH:
Come, Clarence, thou art must not think.

GLOUCESTER:
What is your grace hearing I will do


#### Question 1.4.3: Generating text!

Now with our trained model, we can generate some text that is somewhat like the style of Shakespeare! Below we will do both unconditional and conditional generation.

In [None]:
# unconditional generation from the model
start_context = torch.zeros((1, 256), dtype=torch.long, device=device)
uncond_gen = (tlm.generate(start_context, max_new_tokens=50)[0].tolist())
print(decode(uncond_gen))

In [22]:
# conditional generation from the model

context1 = """ROMEO:
He jests at scars that never felt a wound.
But, soft! what light through yonder window breaks?
It is the east, and Juliet is the sun.
Arise, fair sun, and kill the envious moon,
Who is already sick and pale with grief,
That thou her maid art far more fair than she:
Be not her maid, """

context1_tokens = torch.tensor(encode(context1), device=device).reshape(1, -1)

In [23]:
cond_gen = (tlm.generate(context1_tokens, max_new_tokens=CONTEXT_WINDOW_SIZE)[0].tolist())
print(decode(cond_gen))

ROMEO:
He jests at scars that never felt a wound.
But, soft! what light through yonder window breaks?
It is the east, and Juliet is the sun.
Arise, fair sun, and kill the envious moon,
Who is already sick and pale with grief,
That thou her maid art far more fair than she:
Be not her maid, and own poison or fals.
Orread it towardly, thy liege.

MENENIUS:
He is: my brother; I be deepise thee.
Let me to gove you and t


TODO: Choose your own context from Shakespeare, and perform conditional generation from that text. Does this look reasonable to you? Why or why not?

In [24]:
context_is = """JULIET:
'Tis but thy name that is my enemy;
Thou art thyself, though not a Montague.
What's Montague? it is nor hand, nor foot,
Nor arm, nor face, nor any other part
Belonging to a man. O, be some other name!
What's in a name? that which we call a rose
By any other name would smell as sweet;"""

target_is = """So Romeo would, were he not Romeo call'd,
Retain that dear perfection which he owes
Without that title. Romeo, doff thy name,
And for that name which is no part of thee
Take all myself."""

context_is_tokens = torch.tensor(encode(context_is), device=device).reshape(1, -1)

In [31]:
cond_gen_1 = (tlm.generate(context_is_tokens, max_new_tokens=CONTEXT_WINDOW_SIZE)[0].tolist())
print(decode(cond_gen_1))

JULIET:
'Tis but thy name that is my enemy;
Thou art thyself, though not a Montague.
What's Montague? it is nor hand, nor foot,
Nor arm, nor face, nor any other part
Belonging to a man. O, be some other name!
What's in a name? that which we call a rose
By any other name would smell as sweet;
That I was naise Mirtuous enames myself
We were foolixment, and marrial be'Aut and
His cruel forgive fatered, and dishonours an


In [29]:
context_oos = """EMILIA  Pray you say nothing, pray you.
Who cannot feel nor see the rain, being in 't,
Knows neither wet nor dry. If that you were
The groundpiece of some painter, I would buy you
T' instruct me 'gainst a capital grief-indeed,
Such heart-pierced demonstration. But, alas,
Being a natural sister of our sex,"""

target_oos = """Your sorrow beats so ardently upon me
That it shall make a counter-reflect 'gainst
My brother's heart and warm it to some pity,
Though it were made of stone. Pray have good
comfort."""

context_oos_tokens = torch.tensor(encode(context_oos), device=device).reshape(1, -1)

In [30]:
cond_gen_2 = (tlm.generate(context_oos_tokens, max_new_tokens=CONTEXT_WINDOW_SIZE)[0].tolist())
print(decode(cond_gen_2))

EMILIA  Pray you say nothing, pray you.
Who cannot feel nor see the rain, being in 't,
Knows neither wet nor dry. If that you were
The groundpiece of some painter, I would buy you
T' instruct me 'gainst a capital grief-indeed,
Such heart-pierced demonstration. But, alas,
Being a natural sister of our sex,
A man'twabaur's told and misstate
As so mine a world down your soul tranion!
Cry from mere own, then a sid
Decouraclan silits P


---

_your answer here_

---

#### Question 1.4.4

The negative log-likelihood (averaged per token) we have been using to train our models can be expressed as
\begin{equation*}
  L = -\frac{1}{T} \sum_{t = 1}^{T} \log p(s[t] | \text{context})
\end{equation*}
for some document $s$, where $s[t]$ is the $t$th token of the doc. The natural language processing (NLP) community often reports the quantity
\begin{equation*}
  \text{perplexity} = \exp(L).
\end{equation*}

Give an intuitive interpretation of what perplexity is. Why might it be a more intuitive or natual measure to report than negative log-likelihood? Does the reported perplexity of your trained `TransformerLM` model make sense in terms of samples it generates? (Be sure to distinguish betwen `train` and `validation` perplexity. Which of `train` and `val` perplexity is more helpful for understanding your generated samples? Why?). (*Hint: your answer to Question 1.1.6 may be helpful*).

## Part 2: Mini-Project

Quick recap: So far we have

1. Preprocessed the Shakespeare dataset by encoding individual characters into integer tokens.
2. Implemented single headed attention and then further generalized to multiheaded attention. We further combined multiheaded attention with deep learning to create the transformer architecture.
3. Trained our transformer and generated output that looks to be in the style of Shakespeare.

Up to this point, the performance of our simple language model has clearly made a lot of progress. We can see that our model has learned to generate text that is close to the style of Shakespeare, although there are still many quirks and room for improvement.

### Project Outline

Find some area of possible improvement.
We interpret "improvement" quite loosely, but please state precisely why your proposed innovation might improve the model, and provide evidence that it does (or does not!) improve.
For your idea, **formulate a hypothesis** for why this change should result in a better model. **Implement your changes** and **report any findings**.

_Notes_: As this assignment is being treated as a project, you should expect training to take longer than previous assignments. However, please use your judgement to decide what is reasonable. We will not expect you to run training procedures that take more than 2 hours on the free Google Colab computing resources and we certainly do not expect you to acquire additional compute. The proposed improvements should not solely rely on increased computing demands.

_Hints_: There are many aspects to assessing a model. For example, not only is quality of generated text important, it is also of interest to reduce costs associated with training.

### Deliverables

In addition to a pdf of your python notebook, the submission for this project will be a written report no more than 4 pages in length using the [NeurIPS LaTex template](https://neurips.cc/Conferences/2023/PaperInformation/StyleFiles). Your report should include detailed analysis of the hypotheses you chose to test along with any conclusions.

The page limit for the report does not include bibliography or appendices. Make sure to keep the "ready for submission" option to help us grade anonymously. Your writeup should also contain a link to any code used to generate the project so that we can reference it while grading (Google Drive folder with colab notebooks or Github repo are both fine). You should have at least one plot in your main text (which is capped at 4 pages).

ideas:


**morphology awareness**
1. positional embedding might not be that helpful since ancient english grammar has more granular subject and pronoun agreement rules (for instance, "love you me" cannot distinguish between the subject and noun whereas "lovest thou me" specifies that it's "you" who loves "me" due to subject-verb agreement), so a better metric for evaluation of the output could be whether the model learns and applies these agreement rules appropriately
2. more transparent morpheme usage: early modern english due to latin and french influence, often employed derivational morphemes that remain etymologically transparent, such as "methinks, consort, dissuade, etc" and prefixes like "en-, be-, dis-" and suffixes like "-ment, -tion, -ness" were more widely and efficiently used in spoken language which is the main form of text in Shakespeare plays

==> therefore we want to use a more morpheme-aware encoding to see if the model can captures these grammatical rules better

**fails to capture main idea of paragraph**


**TODO**


libraries: Morfessor, stanford's Stanza,


BME paper

### 1. segmenter training

## 0. PROJECT SETUP

In [32]:
# download the tiny shakespeare dataset
input_file_path = 'input.txt'

if not os.path.exists(input_file_path):
    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    with open(input_file_path, 'w') as f:
        f.write(requests.get(data_url).text)

with open(input_file_path, 'r') as f:
    data = f.read()
print(f"length of dataset in characters: {len(data):,}")

length of dataset in characters: 1,115,394


In [33]:
# get all the unique characters that occur in this text
chars = sorted(list(set(data)))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

all the unique characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab size: 65


## 1. Baseline Morfessor

In [34]:
pip install morfessor



In [35]:
from collections import Counter
import morfessor
from morfessor import MorfessorIO
import re
import math

morf = morfessor.BaselineModel()

TOKEN_RE = re.compile(r"\w+|[^\w\s]|\s", re.UNICODE)
data_re = TOKEN_RE.findall(data)
words = [t for t in data_re if t.isalpha()]
words_counted = Counter(words)
# print(words_counted)

morf.load_data(
    ((c, w) for w, c in words_counted.items()), # TODO: could use log
    init_rand_split=0.3,
    freqthreshold=1
)
io = MorfessorIO()
# anno_data = io.read_annotations_file('labeled_segmentations.txt')
# print(anno_data)
# morf.set_annotations(anno_data)
# morf.set_parameters(alpha=0.5)
morf.train_batch()
print(morf.viterbi_segment("unlovable"))

[38;2;0;255;0m100%[39m [38;2;0;255;0m(13320 of 13320)[39m |##################| Elapsed Time: 0:00:03 Time:  0:00:03
[38;2;0;255;0m100%[39m [38;2;0;255;0m(13320 of 13320)[39m |##################| Elapsed Time: 0:00:02 Time:  0:00:02
[38;2;0;255;0m100%[39m [38;2;0;255;0m(13320 of 13320)[39m |##################| Elapsed Time: 0:00:02 Time:  0:00:02
[38;2;0;255;0m100%[39m [38;2;0;255;0m(13320 of 13320)[39m |##################| Elapsed Time: 0:00:02 Time:  0:00:02


(['un', 'lo', 'v', 'able'], 38.56895234781196)


In [36]:
print(morf.viterbi_segment("loveth"))

(['love', 'th'], 16.54466395389567)


In [37]:
def label_segment_role(segments, prefix_dict, suffix_dict, data_dict, num_of_roles=3):# usually
  probabilities = torch.zeros(len(segments), num_of_roles)
  for i, seg in enumerate(segments):
    if seg.isalpha():
      if seg in prefix_dict:
        probabilities[i][0] = prefix_dict[seg] / data_dict[seg]
      if seg in suffix_dict:
        probabilities[i][-1] = suffix_dict[seg] / data_dict[seg]
      if seg in data_dict:
        # print(seg, data_dict[seg], prefix_dict[seg], suffix_dict[seg])
        probabilities[i][1] = 1 - probabilities[i][0] - probabilities[i][1]
      else:
        probabilities[i] = torch.ones(num_of_roles) / num_of_roles
        # print(f'error: unknown segment {seg}')
    else:
      probabilities[i] = torch.ones(num_of_roles) / num_of_roles
  probabilities = torch.softmax(probabilities, dim=1)
  return probabilities

In [38]:
data_mo = []
prefix_dict = {}
suffix_dict = {}
root_dict = {}
for word in data_re:
    if word.isalpha():
        segs, _ = morf.viterbi_segment(word)
        if len(segs) > 1:
          if len(segs[0]) <= 4:
            if segs[0] not in prefix_dict:
              prefix_dict[segs[0]] = 0
            prefix_dict[segs[0]] += 1
          if len(segs[-1]) <= 4:
            if segs[-1] not in suffix_dict:
              suffix_dict[segs[-1]] = 0
            suffix_dict[segs[-1]] += 1
        data_mo.extend(s for s in segs)
    else:
        data_mo.append(word) # non-letter char
morphemes = sorted(set(data_mo))
morf_vocab_size = len(morphemes)
print(f'the vocab size (morphed) is {morf_vocab_size}')

the vocab size (morphed) is 7101


In [39]:
data_mo_dict = Counter(data_mo)

In [22]:
label_segment_role(['un'], prefix_dict, suffix_dict, data_mo_dict)

tensor([[0.5537, 0.2272, 0.2191]])

In [40]:
data_mo_labels = label_segment_role(data_mo, prefix_dict, suffix_dict, data_mo_dict)

In [24]:
data_mo_labels

tensor([[0.2119, 0.5761, 0.2119],
        [0.3333, 0.3333, 0.3333],
        [0.2119, 0.5761, 0.2119],
        ...,
        [0.2119, 0.5761, 0.2119],
        [0.3333, 0.3333, 0.3333],
        [0.3333, 0.3333, 0.3333]])

In [41]:
data_mo_labels.to('cpu')

tensor([[0.2119, 0.5761, 0.2119],
        [0.3333, 0.3333, 0.3333],
        [0.2119, 0.5761, 0.2119],
        ...,
        [0.2119, 0.5761, 0.2119],
        [0.3333, 0.3333, 0.3333],
        [0.3333, 0.3333, 0.3333]])

In [42]:
n = len(data_mo)
train_tokens = data_mo[:int(n * 0.9)]
val_tokens = data_mo[int(n * 0.9):]
train_labels = data_mo_labels[:int(n * 0.9), :]
print(len(train_tokens), len(train_labels))
print(len(val_tokens))

436886 436886
48543


compare size of morpheme set and data to confirm that the former makes sense as token level

In [55]:
# Create vocab
stoi = { seg:i for i,seg in enumerate(morphemes) }
itos = { i:seg for i,seg in enumerate(morphemes) }

# Encoding and decoding
def encode(tokens): # morephemes
    return [stoi[t] for t in tokens]

def decode(indices):
    return ''.join([itos[i] for i in indices])


train_data_mo = torch.tensor(encode(train_tokens), dtype=torch.long)
val_data_mo = torch.tensor(encode(val_tokens), dtype=torch.long)

we will first try out the new tokenizer on multihead attention LM to save compute/time

In [44]:
# function for getting batches of data
def get_batch(split, context_window_size, device, batch_size=32, token_type='morf'):
    """
    generate a small batch of data of inputs x and targets y

    Args:
        split: 'train' or 'val'
        device: 'cpu' or 'cuda' (should be 'cuda' if available)
    """
    if token_type == 'bpe':
        data = train_data_bpe if split == 'train' else val_data_bpe
    elif token_type == 'morf':
        data = train_data_mo if split == 'train' else val_data_mo
    ix = torch.randint(len(data) - context_window_size, (batch_size,))
    x = torch.stack([data[i:i+context_window_size] for i in ix])
    y = torch.stack([data[i+1:i+context_window_size+1] for i in ix])
    # x_labels = torch.stack([train_labels[i:i+context_window_size] for i in ix])
    # y_labels = torch.stack([train_labels[i+1:i+context_window_size+1] for i in ix])
    # x = torch.cat((x, x_labels), dim=2)
    # y = torch.cat((y, y_labels), dim=2)
    x = x.to(device)
    y = y.to(device)
    # x_labels = x_labels.to(device)
    # y_labels = y_labels.to(device)
    return x, y

# helper function for tracking loss during training
# given to you
@torch.no_grad()
def estimate_loss(model, eval_iters, context_window_size, device, token_type='morf'):
    """
    Args:
      model: model being evaluated
      eval_iters: number of batches to average over
      context_window_size: size of the context window
      device: 'cpu' or 'cuda' (should be 'cuda' if available)
    """
    out = {}
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        print(f'token type is {token_type}')
        for k in range(eval_iters):
            X, Y = get_batch(split, context_window_size, device, token_type=token_type)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    return out

In [71]:
# Initialize model
mha_model_van = MultiHeadedAttentionLM(morf_vocab_size, CONTEXT_WINDOW_SIZE)
mha_morf_van = mha_model_van.to(device)

# create a PyTorch optimizer
learning_rate = 6e-4
optimizer = torch.optim.AdamW(mha_model_van.parameters(), lr=learning_rate)

eval_interval = 200

loss_list = []

for it in tqdm(range(SMALL_ITERS)):

    # every once in a while evaluate the loss on train and val sets
    if it % eval_interval == 0 or it == SMALL_ITERS - 1:
        print(f"iteration {it}")
        losses = estimate_loss(mha_morf_val, EVAL_ITERS, CONTEXT_WINDOW_SIZE, device)
        print(f"step {it}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train', CONTEXT_WINDOW_SIZE, device, token_type='morf')

    # evaluate the loss
    logits, loss = mha_morf_van(xb, yb)
    loss_list.append(loss.detach().item())
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

NameError: name 'MultiHeadedAttentionLM' is not defined

In [43]:
torch.cuda.empty_cache()

### 2. Labels as embedding Morfessor

In [45]:
class MultiHeadedAttentionLM_labeled(nn.Module):

    def __init__(self, vocab_size, context_window_size, morf_probabilities=None, embed_size=384, num_heads=6, num_roles=3):
      super().__init__()
      self.head_size = embed_size // num_heads
      self.context_window_size = context_window_size


      self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
      self.position_embedding_table = nn.Embedding(context_window_size, embed_size)

      self.atten_head = MultiHeadAttention(context_window_size, num_heads, self.head_size, embed_size)
      self.lm_head = nn.Linear(embed_size, vocab_size)
      self.morf_probabilities = morf_probabilities # [n, 3]
      self.morf_proj = nn.Linear(num_roles, embed_size)
      # TODO: your code below


    def forward(self, token_ids, targets=None):
        """
        Args:
          token_ids: (B, T) token ids that make up the context (batch has size B, each entry in the
                     batch has length T)
          targets: (B, T) token ids corresponding to the target of each context in token_ids

        Returns:
          logits: (B, T, V), logits[b,t] gives the length V vector of logits for the next token
                  prediction in string b up to t tokens
          loss: scalar, negative log likelihood of target given context
        """
        # TODO: your code below
        B, T = token_ids.shape # (batch size, length)
        tok_emb = self.token_embedding_table(token_ids) # (B,T,D)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,D)
        # batch_prob = self.morf_probabilities[token_ids, :]
        batch_prob = self.morf_probabilities.to(token_ids.device)[token_ids]

        morf_emb = self.morf_proj(batch_prob)
        x = tok_emb + pos_emb + morf_emb# (B,T,D)

        x = self.atten_head(x)
        logits = self.lm_head(x) # (B,T,V)

        # TODO: your code here
        if targets is None:
            loss = None
        else:
            logits_flat = logits.view(-1, logits.size(-1))
            targets_flat = targets.view(-1)
            loss = F.cross_entropy(logits_flat, targets_flat)

        return logits, loss

    @torch.no_grad()
    def generate(self, token_ids, max_new_tokens):
        """
        Args:
          token_ids: (B, T) tensor of token ids to provide as context
          max_new_tokens: int, maximum number of new tokens to generate

        Returns:
          (B, T+max_new_tokens) tensor of context with new tokens appended
        """
        for _ in range(max_new_tokens):
            logits, _ = self.forward(token_ids)
            logits = logits[:, -1, :] # for each entry in the batch, gets the last token
            next_token = torch.argmax(logits, keepdim=True) # (B, 1)
            token_ids = torch.cat((token_ids, next_token), dim=-1)
        return token_ids

In [46]:
class TransformerLM_labeled(nn.Module):

    def __init__(self, vocab_size, context_window_size, morf_probabilities=None, embed_size=384, num_heads=6, n_layers=6, num_roles=3):
        """
          Args:
              vocab_size: int, number of tokens in the vocabulary (V)
              context_window_size: int, size of the context window (T)
              embed_size: int, embedding size (D)
              num_heads: int, number of heads (H)
              n_layers: int, number of layers (M)
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.context_window_size = context_window_size
        self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
        self.position_embedding_table = nn.Embedding(context_window_size, embed_size)
        self.blocks = nn.Sequential(*[
            TransformerBlock(vocab_size,
                             context_window_size,
                             embed_size=embed_size,
                             num_heads=num_heads)
            for _ in range(n_layers)])

        self.morf_probabilities = morf_probabilities # [n, 3]
        self.morf_proj = nn.Linear(num_roles, embed_size)

        # final layer norm
        self.ln_f = nn.LayerNorm(embed_size)
        self.lm_head = nn.Linear(embed_size, vocab_size)

        # good initialization
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, token_ids, targets=None):
        """
        Agrgs:
            token_ids: tensor of integers, provides the contet, shape (B, T)
            targets: tensor of integers, provides the tokens we are preidcitng, shape (B, T)
        """
        B, T = token_ids.shape

        # token_ids and targets are both (B, T) tensor of integers
        tok_emb = self.token_embedding_table(token_ids) # (B, T, D)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T, D)

        batch_prob = self.morf_probabilities.to(token_ids.device)[token_ids]

        morf_emb = self.morf_proj(batch_prob)
        x = tok_emb + pos_emb + morf_emb # (B,T,D)

        # TODO: your code below
        blocked = self.blocks(x)
        normalized = self.ln_f(blocked)
        logits = self.lm_head(normalized)

        if targets is None:
            loss = None
        else:
            logits_flat = logits.view(-1, logits.size(-1))
            targets_flat = targets.view(-1)
            loss = F.cross_entropy(logits_flat, targets_flat)

        return logits, loss

    @torch.no_grad()
    def generate(self, token_ids, max_new_tokens):
        """
        Args:
            token_ids: tensor of integers forming the context, shape (B, T)
            max_new_tokens: int, max number of tokens to generate
        """
        # TOOD, your code below
        T = token_ids.size(-1)
        curr_T = T
        for _ in range(max_new_tokens):
          token_ids_subset = token_ids[:, -self.context_window_size:]
          logits, _ = self.forward(token_ids_subset)
          logits = logits[:, -1, :] # for each entry in the batch, gets the last token
          probs = F.softmax(logits, dim=-1)
          next_token = torch.multinomial(probs, num_samples=1) # sample next token
          token_ids = torch.cat((token_ids, next_token), dim=1)
        return token_ids

In [47]:
trans_la = TransformerLM_labeled(morf_vocab_size, CONTEXT_WINDOW_SIZE, morf_probabilities=data_mo_labels)
tlm_la = trans_la.to(device)
learning_rate = 5e-4
optimizer = torch.optim.Adam(trans_la.parameters(), lr=learning_rate)
eval_interval = 200
loss_list = []

for it in tqdm(range(LARGE_ITERS)):

    # every once in a while evaluate the loss on train and val sets
    if it % eval_interval == 0 or it == LARGE_ITERS - 1:
        print(f"iteration {it}")
        losses = estimate_loss(trans_la, EVAL_ITERS, CONTEXT_WINDOW_SIZE, device)
        print(f"step {it}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train', CONTEXT_WINDOW_SIZE, device, token_type='morf')

    # evaluate the loss
    logits, loss = tlm_la(xb, yb)
    loss_list.append(loss.detach().item())
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

  0%|          | 0/2000 [00:00<?, ?it/s]

iteration 0
token type is morf
token type is morf


  0%|          | 1/2000 [00:13<7:31:43, 13.56s/it]

step 0: train loss 8.5914, val loss 8.6115


 10%|█         | 200/2000 [00:54<06:04,  4.94it/s]

iteration 200
token type is morf
token type is morf


 10%|█         | 201/2000 [01:08<2:10:06,  4.34s/it]

step 200: train loss 3.4744, val loss 3.6050


 20%|██        | 400/2000 [01:48<05:26,  4.90it/s]

iteration 400
token type is morf
token type is morf


 20%|██        | 401/2000 [02:02<1:58:04,  4.43s/it]

step 400: train loss 3.3093, val loss 3.4854


 30%|███       | 600/2000 [02:43<04:43,  4.94it/s]

iteration 600
token type is morf
token type is morf


 30%|███       | 601/2000 [02:57<1:42:21,  4.39s/it]

step 600: train loss 3.0496, val loss 3.2854


 40%|████      | 800/2000 [03:37<04:01,  4.97it/s]

iteration 800
token type is morf
token type is morf


 40%|████      | 801/2000 [03:51<1:28:19,  4.42s/it]

step 800: train loss 2.8517, val loss 3.2078


 50%|█████     | 1000/2000 [04:32<03:22,  4.94it/s]

iteration 1000
token type is morf
token type is morf


 50%|█████     | 1001/2000 [04:46<1:13:44,  4.43s/it]

step 1000: train loss 2.7110, val loss 3.1579


 60%|██████    | 1200/2000 [05:27<02:42,  4.92it/s]

iteration 1200
token type is morf
token type is morf


 60%|██████    | 1201/2000 [05:41<59:00,  4.43s/it]

step 1200: train loss 2.5922, val loss 3.1489


 70%|███████   | 1400/2000 [06:21<02:01,  4.93it/s]

iteration 1400
token type is morf
token type is morf


 70%|███████   | 1401/2000 [06:36<44:18,  4.44s/it]

step 1400: train loss 2.4443, val loss 3.1671


 80%|████████  | 1600/2000 [07:16<01:20,  5.00it/s]

iteration 1600
token type is morf
token type is morf


 80%|████████  | 1601/2000 [07:30<29:37,  4.45s/it]

step 1600: train loss 2.3069, val loss 3.1663


 90%|█████████ | 1800/2000 [08:10<00:40,  4.94it/s]

iteration 1800
token type is morf
token type is morf


 90%|█████████ | 1801/2000 [08:25<14:37,  4.41s/it]

step 1800: train loss 2.1487, val loss 3.2005


100%|█████████▉| 1999/2000 [09:05<00:00,  4.94it/s]

iteration 1999
token type is morf
token type is morf


100%|██████████| 2000/2000 [09:19<00:00,  3.57it/s]

step 1999: train loss 1.9744, val loss 3.2785





In [59]:
context_is = """JULIET:
'Tis but thy name that is my enemy;
Thou art thyself, though not a Montague.
What's Montague? it is nor hand, nor foot,
Nor arm, nor face, nor any other part
Belonging to a man. O, be some other name!
What's in a name? that which we call a rose
By any other name would smell as sweet;"""

target_is = """So Romeo would, were he not Romeo call'd,
Retain that dear perfection which he owes
Without that title. Romeo, doff thy name,
And for that name which is no part of thee
Take all myself."""


In [None]:
TOKEN_RE = re.compile(r"\w+|[^\w\s]|\s", re.UNICODE)
context_is_cleaned = TOKEN_RE.findall(context_is)
context_is_mo = []
for word in context_is_cleaned:
    if word.isalpha():
        segs, _ = morf.viterbi_segment(word)
        context_is_mo.extend(s for s in segs)
    else:
        context_is_mo.append(word) # non-letter char
context_is_mo

In [57]:
start_context = torch.zeros((1, CONTEXT_WINDOW_SIZE), dtype=torch.long, device=device)
uncond_gen = (tlm_la.generate(start_context, max_new_tokens=50)[0].tolist())
print(decode(uncond_gen))

































































































































LEONTESamPAULINA:
Were I ntolerable I Do Shame And defend And 'tis To Believe't.

First 
I must stick 't In canker'


In [64]:
context_is_tokens = torch.tensor(encode(context_is_mo), device=device).reshape(1, -1)
print(context_is_tokens.shape)
cond_gen_is = (tlm_la.generate(context_is_tokens, max_new_tokens=CONTEXT_WINDOW_SIZE)[0].tolist())
print(decode(cond_gen_is))

torch.Size([1, 138])
JULIET:
'Tis but thy name that is my enemy;
Thou art thyself, though not a Montague.
What's Montague? it is nor hand, nor foot,
Nor arm, nor face, nor any other part
Belonging to a man. O, be some other name!
What's in a name? that which we call a rose
By any other name would smell as sweet;
Isum, i, and by to find better Tower.
Well, a lord, that I know this man's king.
Soft and here art thou split'st Romeo, man of my sad
My bounty of state and Dick, he sought to signify
And aid those that with no cause may contain out all
For though not cause were they chosen, our


In [70]:
context1 = """ROMEO:
He jests at scars that never felt a wound.
But, soft! what light through yonder window breaks?
It is the east, and Juliet is the sun.
Arise, fair sun, and kill the envious moon,
Who is already sick and pale with grief,
That thou her maid art far more fair than she:
Be not her maid, """

context1_cleaned = TOKEN_RE.findall(context1)
context1_mo = []
for word in context1_cleaned:
    if word.isalpha():
        segs, _ = morf.viterbi_segment(word)
        context1_mo.extend(s for s in segs)
    else:
        context1_mo.append(word) # non-letter char
context1_tokens = torch.tensor(encode(context1_mo), device=device).reshape(1, -1)
cond_gen_1 = (tlm_la.generate(context1_tokens, max_new_tokens=CONTEXT_WINDOW_SIZE)[0].tolist())
print(decode(cond_gen_1))

ROMEO:
He jests at scars that never felt a wound.
But, soft! what light through yonder window breaks?
It is the east, and Juliet is the sun.
Arise, fair sun, and kill the envious moon,
Who is already sick and pale with grief,
That thou her maid art far more fair than she:
Be not her maid, till mistress be bereft,
Back for a night that ne'er else be Saw'st you take'd a rule?

Lis to thy bed to Romeo; and there's no fond
To speak even worse than we'll pluck thee to Mantua.

Nurse:
Tell her! how it did time the man do be much,
though it be moody to 


In [36]:
mha_model = MultiHeadedAttentionLM_labeled(morf_vocab_size, CONTEXT_WINDOW_SIZE, morf_probabilities=data_mo_labels)
mha_morf = mha_model.to(device)

# create a PyTorch optimizer
learning_rate = 6e-4
optimizer = torch.optim.AdamW(mha_model.parameters(), lr=learning_rate)

eval_interval = 200

loss_list = []

for it in tqdm(range(SMALL_ITERS)):

    # every once in a while evaluate the loss on train and val sets
    if it % eval_interval == 0 or it == SMALL_ITERS - 1:
        print(f"iteration {it}")
        losses = estimate_loss(mha_morf, EVAL_ITERS, CONTEXT_WINDOW_SIZE, device, token_type='morf')
        print(f"step {it}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train', CONTEXT_WINDOW_SIZE, device, token_type='morf')

    # evaluate the loss
    logits, loss = mha_morf(xb, yb)
    loss_list.append(loss.detach().item())
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

  0%|          | 0/1000 [00:00<?, ?it/s]

iteration 0
token type is morf
token type is morf
step 0: train loss 8.8930, val loss 8.8919


 20%|█▉        | 199/1000 [00:22<01:06, 12.14it/s]

iteration 200
token type is morf
token type is morf


 20%|██        | 201/1000 [00:28<12:32,  1.06it/s]

step 200: train loss 3.4561, val loss 3.6228


 40%|███▉      | 399/1000 [00:44<00:50, 11.94it/s]

iteration 400
token type is morf
token type is morf


 40%|████      | 401/1000 [00:50<09:33,  1.05it/s]

step 400: train loss 3.2130, val loss 3.4555


 60%|█████▉    | 599/1000 [01:07<00:33, 11.80it/s]

iteration 600
token type is morf
token type is morf


 60%|██████    | 601/1000 [01:13<06:26,  1.03it/s]

step 600: train loss 2.9324, val loss 3.3251


 80%|███████▉  | 799/1000 [01:30<00:17, 11.58it/s]

iteration 800
token type is morf
token type is morf


 80%|████████  | 801/1000 [01:36<03:16,  1.01it/s]

step 800: train loss 2.7393, val loss 3.3261


100%|█████████▉| 999/1000 [01:54<00:00, 11.52it/s]

iteration 999
token type is morf
token type is morf


100%|██████████| 1000/1000 [02:00<00:00,  8.32it/s]

step 999: train loss 2.5665, val loss 3.3462





In [39]:
torch.cuda.empty_cache()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


### 3. attention bias Morfessor

In [35]:
class Head_morf(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size, context_window_size, embed_size=384):
        """
        Args:
          head_size: int, size of the head embedding dimension (K)
          context_window_size: int, number of tokens considered in the past for attention (T)
          embed_size: int, size of the token embedding dimension (D)
        """
        super().__init__()
        self.head_size = head_size
        self.key = nn.Linear(embed_size, head_size, bias=False) # query
        self.query = nn.Linear(embed_size, head_size, bias=False) # key
        self.value = nn.Linear(embed_size, embed_size, bias=False)
        self.bias_proj = nn.Linear(3, 1, bias=False)

        # not a param of the model, so registered as a buffer
        self.register_buffer('tril', torch.tril(
            torch.ones(context_window_size, context_window_size)))

    def forward(self, x, batch_prob=None):
        """
        Args:
          x: (B,T,D) tensor of token embeddings

        Returns:
          (B,T,D) tensor of attention-weighted token embeddings
        """
        _, T, _ = x.size()
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)
        attn_scores = q @ k.transpose(-2, -1)
        if batch_prob is not None:
          bias = self.bias_proj(batch_prob).squeeze(-1).unsqueeze(1)
          attn_scores = attn_scores + bias
        masked_scores = attn_scores.masked_fill(self.tril == 0, float('-inf'))
        masked_scores = masked_scores / self.head_size ** .5

        attn = F.softmax(masked_scores, dim=-1) @ v
        return attn

In [36]:
class MultiHeadAttention_morf(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, context_window_size, num_heads, head_size, embed_size=384):
        """
        Args:
            context_window_size: int, number of tokens considered in the past for attention (T)
            num_heads: int, number of heads (H)
            head_size: int, size of the head embedding dimension
            embed_size: int, size of the token embedding dimension
        """
        super().__init__()
        # TODO, your code below
        self.head_size = head_size
        self.heads = nn.ModuleList([Head_morf(head_size, context_window_size, embed_size) for _ in range(num_heads)])
        self.linear = nn.Linear(num_heads * embed_size, embed_size)

    def forward(self, x, batch_prob):
        attn_list = [head.forward(x, batch_prob) for head in self.heads]
        mhsa = torch.cat(attn_list, dim=-1)
        mhsa = self.linear(mhsa)
        return mhsa

In [40]:
class MultiHeadedAttentionLM_morf(nn.Module):

    def __init__(self, vocab_size, context_window_size, morf_probabilities=None, embed_size=384, num_heads=6, num_roles=3):
      super().__init__()
      self.head_size = embed_size // num_heads
      self.context_window_size = context_window_size


      self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
      self.position_embedding_table = nn.Embedding(context_window_size, embed_size)

      self.atten_head = MultiHeadAttention_morf(context_window_size, num_heads, self.head_size, embed_size)
      self.lm_head = nn.Linear(embed_size, vocab_size)
      self.morf_probabilities = morf_probabilities # [n, 3]
      # TODO: your code below


    def forward(self, token_ids, targets=None):
        """
        Args:
          token_ids: (B, T) token ids that make up the context (batch has size B, each entry in the
                     batch has length T)
          targets: (B, T) token ids corresponding to the target of each context in token_ids

        Returns:
          logits: (B, T, V), logits[b,t] gives the length V vector of logits for the next token
                  prediction in string b up to t tokens
          loss: scalar, negative log likelihood of target given context
        """
        # TODO: your code below
        B, T = token_ids.shape # (batch size, length)
        tok_emb = self.token_embedding_table(token_ids) # (B,T,D)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,D)
        # batch_prob = self.morf_probabilities[token_ids, :]
        batch_prob = self.morf_probabilities.to(token_ids.device)[token_ids]

        x = tok_emb + pos_emb # (B,T,D)

        x = self.atten_head(x, batch_prob)
        logits = self.lm_head(x) # (B,T,V)

        # TODO: your code here
        if targets is None:
            loss = None
        else:
            logits_flat = logits.view(-1, logits.size(-1))
            targets_flat = targets.view(-1)
            loss = F.cross_entropy(logits_flat, targets_flat)

        return logits, loss

    @torch.no_grad()
    def generate(self, token_ids, max_new_tokens):
        """
        Args:
          token_ids: (B, T) tensor of token ids to provide as context
          max_new_tokens: int, maximum number of new tokens to generate

        Returns:
          (B, T+max_new_tokens) tensor of context with new tokens appended
        """
        for _ in range(max_new_tokens):
            logits, _ = self.forward(token_ids)
            logits = logits[:, -1, :] # for each entry in the batch, gets the last token
            next_token = torch.argmax(logits, keepdim=True) # (B, 1)
            token_ids = torch.cat((token_ids, next_token), dim=-1)
        return token_ids

In [41]:
mha_model_bias = MultiHeadedAttentionLM_morf(morf_vocab_size, CONTEXT_WINDOW_SIZE, morf_probabilities=data_mo_labels)
mha_bias = mha_model_bias.to(device)

# create a PyTorch optimizer
learning_rate = 6e-4
optimizer = torch.optim.AdamW(mha_model_bias.parameters(), lr=learning_rate)

eval_interval = 200

loss_list = []

for it in tqdm(range(SMALL_ITERS)):

    # every once in a while evaluate the loss on train and val sets
    if it % eval_interval == 0 or it == SMALL_ITERS - 1:
        print(f"iteration {it}")
        losses = estimate_loss(mha_bias, EVAL_ITERS, CONTEXT_WINDOW_SIZE, device, token_type='morf')
        print(f"step {it}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train', CONTEXT_WINDOW_SIZE, device, token_type='morf')

    # evaluate the loss
    logits, loss = mha_bias(xb, yb)
    loss_list.append(loss.detach().item())
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

  0%|          | 0/1000 [00:00<?, ?it/s]

iteration 0
token type is morf
token type is morf
step 0: train loss 8.8699, val loss 8.8719


 20%|█▉        | 199/1000 [00:23<01:09, 11.47it/s]

iteration 200
token type is morf
token type is morf


 20%|██        | 201/1000 [00:29<13:06,  1.02it/s]

step 200: train loss 3.4834, val loss 3.6438


 40%|███▉      | 399/1000 [00:47<00:53, 11.14it/s]

iteration 400
token type is morf
token type is morf


 40%|████      | 401/1000 [00:53<10:06,  1.01s/it]

step 400: train loss 3.2159, val loss 3.4625


 60%|█████▉    | 599/1000 [01:12<00:38, 10.36it/s]

iteration 600
token type is morf
token type is morf


 60%|██████    | 601/1000 [01:18<07:12,  1.08s/it]

step 600: train loss 2.9458, val loss 3.3439


 80%|████████  | 800/1000 [01:39<00:20,  9.81it/s]

iteration 800
token type is morf
token type is morf


 80%|████████  | 801/1000 [01:46<07:11,  2.17s/it]

step 800: train loss 2.7680, val loss 3.3446


100%|█████████▉| 999/1000 [02:05<00:00, 10.17it/s]

iteration 999
token type is morf
token type is morf


100%|██████████| 1000/1000 [02:12<00:00,  7.55it/s]

step 999: train loss 2.6198, val loss 3.3751





## Submission Instructions

You will generate two PDFs: one from Part 1, which involves completing this Colab to create a transformer baseline; and one from the mini-project in Part 2, which will be your write-up of no longer than 4 pages. Be sure to include a link to your code for Part 2 somewhere in your writeup.

**Combine the two PDFs into a single PDF and submit on gradescope. Tag your PDF correctly.**

If you work in a group of two, submit one assignment on gradescope and tag your group members. If you complete the assignment individually, submit as usual.