In [None]:
%pylab inline

import matplotlib.pyplot as plt
import requests
import zipfile
import re
from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, GloVe, vocab


from functools import partial,reduce
from tqdm import tqdm, trange
tqdm = partial(tqdm, position=0, leave=True)
trange = partial(trange, position=0, leave=True)

import numpy as np


DEVICE = 'cuda:0'

Populating the interactive namespace from numpy and matplotlib


In [None]:
url = 'http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip'

r = requests.get(url)

ul = url.split('/')
name = ul[len(ul) - 1]

with open(name, 'wb') as file:
  file.write(r.content)

with zipfile.ZipFile(name, "r") as zip_ref:
  zip_ref.extractall("./")

!mv 'cornell movie-dialogs corpus' 'data'
!ls 'data'

mv: cannot move 'cornell movie-dialogs corpus' to 'data/cornell movie-dialogs corpus': Directory not empty
 chameleons.pdf			 movie_lines.txt
'cornell movie-dialogs corpus'	 movie_titles_metadata.txt
 movie_characters_metadata.txt	 raw_script_urls.txt
 movie_conversations.txt	 README.txt


In [None]:
FIELD_SPLITTER = '+++$+++'

MAX_SAMPLES = 50000
MAX_LENGTH = 40

UNK_TOKEN = '<unk>'
PAD_TOKEN = '<PAD>'
BOS_TOKEN = '<BOS>'
EOS_TOKEN = '<EOS>'

UNK_TOKEN_IND = 0
PAD_TOKEN_IND = 1
BOS_TOKEN_IND = 2
EOS_TOKEN_IND = 3

BATCH = 1

In [None]:
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z?.!,]+", " ", sentence)
    sentence = sentence.strip()
    return sentence

In [None]:
text_transform = lambda x, voc, tokenizer: [voc['<BOS>']] + [voc[token] for token in tokenizer(x)] + [voc['<EOS>']]

In [None]:
# this is terrible as fuck because torchtext is terrible as fuck
def load_conversations(path_to_movie_lines, path_to_movie_conversations):
    id2line = {}
    with open(path_to_movie_lines, errors='ignore') as file:
        lines = file.readlines()
        for line in lines:
            parts = line.replace('\n', '').split(' +++$+++ ')
            id2line[parts[0]] = parts[4]

    inputs, outputs = [], []
    with open(path_to_movie_conversations, 'r') as file:
        lines = file.readlines()
        for line in lines:
            parts = line.replace('\n', '').split(' +++$+++ ')
            conversation = [line[1:-1] for line in parts[3][1:-1].split(', ')]
            for i in range(len(conversation) - 1):
                inputs.append(preprocess_sentence(id2line[conversation[i]]))
                outputs.append(preprocess_sentence(id2line[conversation[i + 1]]))
                if len(inputs) >= MAX_SAMPLES:
                    return inputs, outputs
    return inputs, outputs


In [None]:
def get_dataloader(path_to_movie_lines,
                   path_to_movie_conversations):
    questions, answers = load_conversations(path_to_movie_lines, path_to_movie_conversations)

    tokenizer = get_tokenizer('basic_english')

    counter = Counter()
    for sent in questions + answers:
        counter.update(tokenizer(sent))

    voc = vocab(counter)
    voc.insert_token(token=UNK_TOKEN, index=UNK_TOKEN_IND)
    voc.set_default_index(index=UNK_TOKEN_IND)
    voc.insert_token(token=PAD_TOKEN, index=PAD_TOKEN_IND)
    voc.insert_token(token=BOS_TOKEN, index=BOS_TOKEN_IND)
    voc.insert_token(token=EOS_TOKEN, index=EOS_TOKEN_IND)

    q_tokenized = [text_transform(t, voc, tokenizer) for t in questions]
    a_tokenized = [text_transform(t, voc, tokenizer) for t in answers]

    import tensorflow as tf # todo
    q_padded = tf.keras.preprocessing.sequence.pad_sequences(
        q_tokenized, maxlen=MAX_LENGTH, padding='post', value=1.0)

    a_padded = tf.keras.preprocessing.sequence.pad_sequences(
        a_tokenized, maxlen=MAX_LENGTH, padding='post', value=1.0)

    print("Vocab len", len(voc))

    dataloader = DataLoader(
        list(
            zip(
                  q_padded.astype(np.float32),
                  a_padded.astype(np.float32),
                )
            ),
            batch_size=BATCH,
            shuffle=False,
    )

    print(voc)
    torch.save(voc, 'vocab')

    return dataloader, text_transform, voc


In [None]:
lines_path = 'data/movie_lines.txt'
conversations_path = 'data/movie_conversations.txt'    

dataloader, text_transform, voc = get_dataloader(lines_path,
                                                 conversations_path)

Vocab len 23068
Vocab()


In [None]:
test_sample = None

for i,x in enumerate(dataloader):
  if i > 1: break
  print(x[0].shape)
  test_sample = x[0]

torch.Size([1, 40])
torch.Size([1, 40])


In [116]:
class LSTM(nn.Module):
  def __init__(self, isize, osize):
    super(LSTM, self).__init__()

    self.isize = isize
    self.osize = osize

    self.forget_cell = nn.Linear(isize, osize)

  def forget_gate(self, x_t, h_tm1):
    wx = self.forget_cell(x_t)
    # wh = self.forget_cell(h_tm1)
    wh = torch.bmm(self.forget_cell.weight, h_tm1) # questionable
    out = wx + wh
    return out
  
  def forward(self, x, h0=None, c0=None):
    if not h0:
      h0 = torch.zeros(x.shape[0], self.isize, self.osize)
    if not c0:
      c0 = torch.zeros(x.shape[0], self.isize, self.osize)

    x = self.forget_gate(x, h0)
    return x, (h0, c0)


In [117]:
my_lstm = LSTM(40, 80)

res, _ = my_lstm(test_sample)

print(res.shape)

RuntimeError: ignored

In [None]:
print(test_sample.shape)
print(test_sample.dtype)

torch.Size([1, 40])
torch.float32


In [92]:
t_lstm = nn.LSTM(40, 80, batch_first=True)

res, (h_s, c_s) = t_lstm(test_sample)

print(res)
print('\n---------')
print(h_s)
print('\n---------')
print(c_s)
print('\n---------')

tensor([[ 1.8355e-04, -7.4787e-01,  4.7529e-07, -6.5586e-09, -1.9901e-06,
          3.0464e-04, -6.2128e-02, -3.6935e-01, -4.1232e-01,  1.0612e-02,
         -1.6042e-08, -1.7682e-15,  1.0477e-02, -5.2509e-03, -5.9016e-05,
          7.1852e-08,  7.5987e-01, -1.3082e-03, -1.2029e-04, -6.3735e-04,
         -4.5338e-10,  3.4926e-01,  4.5784e-04, -1.1458e-03, -7.1221e-02,
         -7.8594e-02, -9.8207e-05, -6.2047e-03, -3.5558e-01,  3.2264e-01,
         -1.6236e-02, -7.4238e-05,  6.5434e-01, -7.4483e-01,  2.2936e-01,
          7.4398e-01, -4.0374e-08,  9.0992e-03,  7.8316e-04, -2.6333e-04,
         -7.6153e-01, -5.4519e-01,  5.1999e-01, -1.5924e-01,  1.7522e-01,
          1.4311e-05,  5.8986e-01,  1.6562e-02, -6.0404e-02,  8.6899e-02,
          1.1045e-03,  6.5779e-01,  7.1191e-03,  4.3064e-08,  4.7039e-06,
          2.6200e-08,  7.5646e-01,  4.9864e-01, -3.5055e-01, -3.4745e-01,
          7.2702e-01,  2.7896e-04,  5.5958e-01, -5.9909e-01,  7.9317e-07,
          7.4248e-01,  4.6254e-06,  3.

In [None]:
!cat data/movie_conversations.txt | tail -n 10

u9027 +++$+++ u9029 +++$+++ m616 +++$+++ ['L666460', 'L666461']
u9027 +++$+++ u9029 +++$+++ m616 +++$+++ ['L666485', 'L666486']
u9027 +++$+++ u9029 +++$+++ m616 +++$+++ ['L666546', 'L666547']
u9028 +++$+++ u9033 +++$+++ m616 +++$+++ ['L666497', 'L666498', 'L666499', 'L666500', 'L666501', 'L666502']
u9028 +++$+++ u9031 +++$+++ m616 +++$+++ ['L666262', 'L666263', 'L666264']
u9028 +++$+++ u9031 +++$+++ m616 +++$+++ ['L666324', 'L666325', 'L666326', 'L666327']
u9028 +++$+++ u9031 +++$+++ m616 +++$+++ ['L666575', 'L666576']
u9030 +++$+++ u9034 +++$+++ m616 +++$+++ ['L666256', 'L666257']
u9030 +++$+++ u9034 +++$+++ m616 +++$+++ ['L666369', 'L666370', 'L666371', 'L666372']
u9030 +++$+++ u9034 +++$+++ m616 +++$+++ ['L666520', 'L666521', 'L666522']


In [None]:
!ls data

 chameleons.pdf			 movie_lines.txt
'cornell movie-dialogs corpus'	 movie_titles_metadata.txt
 movie_characters_metadata.txt	 raw_script_urls.txt
 movie_conversations.txt	 README.txt


In [None]:
!cat data/README.txt

Cornell Movie-Dialogs Corpus

Distributed together with:

"Chameleons in imagined conversations: A new approach to understanding coordination of linguistic style in dialogs"
Cristian Danescu-Niculescu-Mizil and Lillian Lee
Proceedings of the Workshop on Cognitive Modeling and Computational Linguistics, ACL 2011.

(this paper is included in this zip file)

NOTE: If you have results to report on these corpora, please send email to cristian@cs.cornell.edu or llee@cs.cornell.edu so we can add you to our list of people using this data.  Thanks!


Contents of this README:

	A) Brief description
	B) Files description
	C) Details on the collection procedure
	D) Contact


A) Brief description:

This corpus contains a metadata-rich collection of fictional conversations extracted from raw movie scripts:

- 220,579 conversational exchanges between 10,292 pairs of movie characters
- involves 9,035 characters from 617 movies
- in total 304,713 utterances
- movie metadata included:
	- genres
	- rele

In [None]:
# # trash




# def preprocess(x):
#   x_no_new = x.replace('\n', '')
#   text = x_no_new.split(FIELD_SPLITTER).pop()
#   embedding = g_vectors.get_vecs_by_tokens(tokenizer(text), lower_case_backup=True)
#   return embedding

# tokenizer = get_tokenizer('basic_english')
# g_vectors = GloVe(name='840B')
# g_vocab = vocab(g_vectors.stoi)


# train_iter = tt.data.BucketIterator(
#   dataset=train_obj,
#   batch_size = 2,
#   sort_key=lambda x: len(x.review),
#   shuffle=True,
#   device=DEVICE
# )

# trainloader = torch.utils.data.DataLoader(
# 	,
# 	batch_size=BATCH,
# 	num_workers=12,
# 	shuffle=True
# )

In [None]:

# embeddings = global_vectors.get_vecs_by_tokens(tokenizer("Hello, How are you?"),
#                                                lower_case_backup=True)
# embeddings
# 
# 
# 
# def batch(iterable, size):
#     from itertools import chain, islice
#     iterator = iter(iterable)
#     for first in iterator:
#         yield list(chain([first], islice(iterator, size - 1)))