In [1]:
%pylab inline

import matplotlib.pyplot as plt
import requests
import zipfile
import re
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as f
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab


from functools import partial,reduce
from tqdm import tqdm, trange
tqdm = partial(tqdm, position=0, leave=True)
trange = partial(trange, position=0, leave=True)

import numpy as np


DEVICE = 'cuda:0'

Populating the interactive namespace from numpy and matplotlib


In [2]:
!nvidia-smi

Thu Jul 21 14:49:02 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    27W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
url = 'http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip'

r = requests.get(url)

ul = url.split('/')
name = ul[len(ul) - 1]

with open(name, 'wb') as file:
  file.write(r.content)

with zipfile.ZipFile(name, "r") as zip_ref:
  zip_ref.extractall("./")

!mv 'cornell movie-dialogs corpus' 'data'
!ls 'data'

chameleons.pdf		       movie_lines.txt		  README.txt
movie_characters_metadata.txt  movie_titles_metadata.txt
movie_conversations.txt        raw_script_urls.txt


In [4]:
FIELD_SPLITTER = '+++$+++'

MAX_SAMPLES = 50000
MAX_LENGTH = 40

UNK_TOKEN = '<unk>'
PAD_TOKEN = '<PAD>'
BOS_TOKEN = '<BOS>'
EOS_TOKEN = '<EOS>'

UNK_TOKEN_IND = 0
PAD_TOKEN_IND = 1
BOS_TOKEN_IND = 2
EOS_TOKEN_IND = 3

BATCH = 128

In [5]:
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z?.!,]+", " ", sentence)
    sentence = sentence.strip()
    return sentence

In [6]:
text_transform = lambda x, voc, tokenizer: [voc['<BOS>']] + [voc[token] for token in tokenizer(x)] + [voc['<EOS>']]

In [7]:
# this is terrible as fuck because torchtext is terrible as fuck
def load_conversations(path_to_movie_lines, path_to_movie_conversations):
    id2line = {}
    with open(path_to_movie_lines, errors='ignore') as file:
        lines = file.readlines()
        for line in lines:
            parts = line.replace('\n', '').split(' +++$+++ ')
            id2line[parts[0]] = parts[4]

    inputs, outputs = [], []
    with open(path_to_movie_conversations, 'r') as file:
        lines = file.readlines()
        for line in lines:
            parts = line.replace('\n', '').split(' +++$+++ ')
            conversation = [line[1:-1] for line in parts[3][1:-1].split(', ')]
            for i in range(len(conversation) - 1):
                inputs.append(preprocess_sentence(id2line[conversation[i]]))
                outputs.append(preprocess_sentence(id2line[conversation[i + 1]]))
                if len(inputs) >= MAX_SAMPLES:
                    return inputs, outputs
    return inputs, outputs


In [8]:
def get_dataloader(path_to_movie_lines,
                   path_to_movie_conversations):
    questions, answers = load_conversations(path_to_movie_lines, path_to_movie_conversations)

    tokenizer = get_tokenizer('basic_english')

    counter = Counter()
    for sent in questions + answers:
        counter.update(tokenizer(sent))

    voc = vocab(counter)
    voc.insert_token(token=UNK_TOKEN, index=UNK_TOKEN_IND)
    voc.set_default_index(index=UNK_TOKEN_IND)
    voc.insert_token(token=PAD_TOKEN, index=PAD_TOKEN_IND)
    voc.insert_token(token=BOS_TOKEN, index=BOS_TOKEN_IND)
    voc.insert_token(token=EOS_TOKEN, index=EOS_TOKEN_IND)

    q_tokenized = [text_transform(t, voc, tokenizer) for t in questions]
    a_tokenized = [text_transform(t, voc, tokenizer) for t in answers]

    import tensorflow as tf # todo
    q_padded = tf.keras.preprocessing.sequence.pad_sequences(
        q_tokenized, maxlen=MAX_LENGTH, padding='post', value=1.0)

    a_padded = tf.keras.preprocessing.sequence.pad_sequences(
        a_tokenized, maxlen=MAX_LENGTH, padding='post', value=1.0)

    print("Vocab len", len(voc))

    dataloader = DataLoader(
        list(
            zip(
                  q_padded.astype(np.float32),
                  a_padded.astype(np.float32),
                )
            ),
            batch_size=BATCH,
            shuffle=False,
    )

    print(voc)
    torch.save(voc, 'vocab')

    return dataloader, text_transform, voc


In [9]:
lines_path = 'data/movie_lines.txt'
conversations_path = 'data/movie_conversations.txt'    

dataloader, text_transform, voc = get_dataloader(lines_path,
                                                 conversations_path)

Vocab len 23068
Vocab()


In [10]:
test_sample = None

for i,x in enumerate(dataloader):
  if i > 1: break
  print(x[0].shape)
  test_sample = x[0]

torch.Size([128, 40])
torch.Size([128, 40])


In [11]:
# https://arxiv.org/pdf/1706.03762.pdf
def sdpa_attention(q, k, v):
    d = torch.sqrt(torch.tensor(k.shape[len(k.shape) - 1]))
    scales = torch.matmul(q, k) / d
    # mask?
    weights = f.softmax(scales, dim=-1)
    res = torch.matmul(weights, v)
    return res
    

In [12]:
class MultiheadAttention(nn.Module):
  def __init__(self):
    super(MultiheadAttention, self).__init__()

    self.V = nn.Linear(1, 1)
    self.K = nn.Linear(1, 1)
    self.Q = nn.Linear(1, 1)
    self.O = nn.Linear(1, 1)

  def forward(self, q, k, v):
    q = self.Q(q)
    k = self.Q(k)
    v = self.Q(v)

    a = sdpa_attention(q, k, v)
    o = self.O(a)

    return o


In [13]:
# https://pytorch.org/tutorials/beginner/transformer_tutorial.html
class PositionalEncoding1(nn.Module):
    def __init__(self, vocab_len, dropout=0.1):
      super(PositionalEncoding, self).__init__()
      self.dropout = nn.Dropout(p=dropout)

      pe = torch.zeros(MAX_LENGTH, vocab_len)
      position = torch.arange(0, MAX_LENGTH, dtype=torch.float).unsqueeze(1)
      div_term = torch.exp(torch.arange(0, vocab_len, 2).float() *
                           -(torch.log(torch.tensor(10000.0)).item() / vocab_len))
      pe[:, 0::2] = torch.sin(position * div_term)
      pe[:, 1::2] = torch.cos(position * div_term)
      pe = pe.unsqueeze(0).transpose(0, 1)
      self.register_buffer('pe', pe)

    def forward(self, x):
      y = self.pe[:x.size(0), :]
      print('Y SHAPE', y.shape)
      print('X SHAPE', x.shape)
      x = x + y
      return self.dropout(x)

In [14]:
print(test_sample.shape)
print(test_sample.dtype)

torch.Size([128, 40])
torch.float32


In [15]:
class Transformer(nn.Module):
  def __init__(self, vocab_len, embed_dim=64):
    super(Transformer, self).__init__()
    self.attention = MultiheadAttention()
    self.norm = nn.LayerNorm(embed_dim)
    self.lin = nn.Linear(embed_dim, embed_dim)

  def forward(self, x):
    x = self.norm(x + self.attention(x, x, x))
    x = self.norm(x + self.lin(x))
    return x


In [28]:
# https://medium.com/the-dl/transformers-from-scratch-in-pytorch-8777e346ca51
def positional_encoding(seq_len, vocab_len):
    pos = torch.arange(seq_len, dtype=torch.float, device=DEVICE).reshape(1, -1, 1)
    dim = torch.arange(vocab_len, dtype=torch.float, device=DEVICE).reshape(1, 1, -1)
    phase = pos / (1e4 ** (dim // vocab_len))

    return torch.where(dim.long() % 2 == 0, torch.sin(phase), torch.cos(phase))

In [111]:
class Model(nn.Module):
  def __init__(self, vocab_len, emb_dim, enc_size, dec_size, inp_features, nhead):
    super(Model, self).__init__()
    self.vocab_len = vocab_len
    self.emb_dim = emb_dim
    self.inp_features = inp_features
    
    # self.p_encoding = PositionalEncoding(vocab_len)
    self.embedding = nn.Embedding(vocab_len, emb_dim)

    self.encoder = nn.Sequential(
      *[nn.TransformerEncoderLayer(inp_features, nhead) for _ in range(enc_size)]
    )
    self.decoder = nn.Sequential(
      *[nn.TransformerDecoderLayer(inp_features, nhead) for _ in range(dec_size)]
    )

    self.lin = nn.Linear(inp_features, inp_features)

  
  def forward(self, x, y):
    x = self.embedding(x.int())
    e_enc = positional_encoding(x.shape[1], x.shape[2])
    x += e_enc
    x = self.encoder(x)

    # d_enc = positional_encoding(y.shape[0], y.shape[1])
    # y = y.reshape(1, y.shape[0], y.shape[1])
    print('\nY SHAPE', y.shape)
    print('\n ENC SHAPE', e_enc.shape)
    y += e_enc
    print('BEFORE')
    y = self.decoder(y, x)
    print('AFTER')
    y = torch.softmax(self.lin(y), dim=-1)

    return y


In [112]:
emb_dim = 200
num_heads = 8
enc_size = 1
dec_size = 1

model = Model(len(voc), emb_dim, enc_size, dec_size, emb_dim, num_heads).to(DEVICE)

In [113]:
EPOCHS = 1
losses = []

optimizer = optim.Adam(params=model.parameters(), lr=0.001)
loss_fn = nn.NLLLoss()

In [114]:
for epoch in trange(EPOCHS):
  for i, data in tqdm(enumerate(dataloader)):
    i_seq, answ = data[0].to(DEVICE), data[1].to(DEVICE)
    print()

    out = model(i_seq) # todo
    print(out.shape)

    break

  """
0it [00:00, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]


Y SHAPE torch.Size([1, 128, 40])

 ENC SHAPE torch.Size([1, 128, 40])
BEFORE





TypeError: ignored

In [None]:
plt.plot(range(len(losses)), losses, 'b', label='Loss') plt.title('Losses')
plt.show()

In [None]:
query = 'test'

q_tr = text_transform(query, voc, get_tokenizer('basic_english'))
import tensorflow as tf
q_p = tf.keras.preprocessing.sequence.pad_sequences(
    [q_tr],
    maxlen=MAX_LENGTH,
    padding='post',
    value=1.0
)
q_t = torch.Tensor(q_p).float().to(DEVICE)


enc = encoder(q_t)
dec, _ = decoder(enc)

print(dec.int())

tokens = [t.int().item() for t in dec[0]]
words = voc.get_itos()
result = ' '.join(list(filter(lambda w: '<' not in w and '>' not in w, [words[t] for t in tokens])))
print('\n\n')
print(result)



In [None]:
!tail -n 10 data/movie_conversations.txt 

In [None]:
!ls data

In [None]:
!cat data/README.txt

In [None]:
# # trash




# def preprocess(x):
#   x_no_new = x.replace('\n', '')
#   text = x_no_new.split(FIELD_SPLITTER).pop()
#   embedding = g_vectors.get_vecs_by_tokens(tokenizer(text), lower_case_backup=True)
#   return embedding

# tokenizer = get_tokenizer('basic_english')
# g_vectors = GloVe(name='840B')
# g_vocab = vocab(g_vectors.stoi)


# train_iter = tt.data.BucketIterator(
#   dataset=train_obj,
#   batch_size = 2,
#   sort_key=lambda x: len(x.review),
#   shuffle=True,
#   device=DEVICE
# )

# trainloader = torch.utils.data.DataLoader(
# 	,
# 	batch_size=BATCH,
# 	num_workers=12,
# 	shuffle=True
# )

In [None]:

# embeddings = global_vectors.get_vecs_by_tokens(tokenizer("Hello, How are you?"),
#                                                lower_case_backup=True)
# embeddings
# 
# 
# 
# def batch(iterable, size):
#     from itertools import chain, islice
#     iterator = iter(iterable)
#     for first in iterator:
#         yield list(chain([first], islice(iterator, size - 1)))