In [11]:
import os
from convokit import Corpus, download

# Path (under the user's home directory) of the Friends corpus to load.
filename = "~/.convokit/downloads/friends-corpus"
corpus = Corpus(filename=os.path.expanduser(filename))

# Sanity check: fetch a single utterance by id and show its speaker, then its text.
utterance = corpus.get_utterance('s07_e14_c01_u018')
print(utterance.speaker.id, utterance.text, sep="\n")

Rachel Green
Well, can I keep the presents and still be 29?


In [12]:
import re
# Matches every character that is NOT a digit, an ASCII letter, one of , . ? ! '
# or a space; anything it matches is removed from the utterance text.
re_pattern = "[^0-9a-zA-Z,.?!' ]"

def get_example(utterance):
    """Return the cleaned text of one utterance.

    Utterances whose speaker id is "TRANSCRIPT_NOTE" are not dialogue and are
    mapped to the empty string. For every other speaker, all characters
    matched by ``re_pattern`` are stripped from ``utterance.text``.
    """
    is_note = utterance.speaker.id == "TRANSCRIPT_NOTE"
    return '' if is_note else re.sub(re_pattern, '', utterance.text)

# One cleaned string per utterance, in corpus iteration order
# (transcript notes come back as empty strings from get_example).
examples = [get_example(utt) for utt in corpus.iter_utterances()]
examples[0]

"There's nothing to tell! He's just some guy I work with!"

## Build the vocabulary

In [13]:
# Gather every distinct character that appears in any example.
charset = set()
for example in examples:
    charset.update(example)  # iterating a str yields its individual characters
all_characters = list(charset)
print("Dictionary size:", len(all_characters))

# Character -> index in sorted order, numbered from 1 (index 0 is not assigned here),
# and the inverse index -> character lookup.
stoi = {ch: idx for idx, ch in enumerate(sorted(all_characters), start=1)}
itos = dict(zip(stoi.values(), stoi.keys()))
print(stoi)

Dictionary size: 68
{' ': 1, '!': 2, "'": 3, ',': 4, '.': 5, '0': 6, '1': 7, '2': 8, '3': 9, '4': 10, '5': 11, '6': 12, '7': 13, '8': 14, '9': 15, '?': 16, 'A': 17, 'B': 18, 'C': 19, 'D': 20, 'E': 21, 'F': 22, 'G': 23, 'H': 24, 'I': 25, 'J': 26, 'K': 27, 'L': 28, 'M': 29, 'N': 30, 'O': 31, 'P': 32, 'Q': 33, 'R': 34, 'S': 35, 'T': 36, 'U': 37, 'V': 38, 'W': 39, 'X': 40, 'Y': 41, 'Z': 42, 'a': 43, 'b': 44, 'c': 45, 'd': 46, 'e': 47, 'f': 48, 'g': 49, 'h': 50, 'i': 51, 'j': 52, 'k': 53, 'l': 54, 'm': 55, 'n': 56, 'o': 57, 'p': 58, 'q': 59, 'r': 60, 's': 61, 't': 62, 'u': 63, 'v': 64, 'w': 65, 'x': 66, 'y': 67, 'z': 68}


## Build the dataset

In [14]:
import torch
import torch.nn.functional as F

# Special start/end marker. It is registered at index 0 in both lookup tables.
# '@' is not among the characters kept by re_pattern, so it cannot collide
# with a character coming from the cleaned text.
SPECIAL_SE = '@'
itos[0] = SPECIAL_SE
stoi[SPECIAL_SE] = 0


def build_dataset(words, block_size, vocab=None, end_token=None):
    """Turn strings into (context, next-character) training pairs.

    Every word is terminated with the end token. For each character of the
    terminated word one row is emitted: the input is the indices of the
    ``block_size`` preceding characters (positions before the start of the
    word are padded with the end token's index) and the target is the index
    of the character itself.

    Args:
        words: Iterable of strings. Every character must be a key of the
            vocabulary.
        block_size: Number of preceding characters in each context; must be
            non-negative (0 yields empty contexts).
        vocab: Optional character -> index mapping. Defaults to the
            module-level ``stoi``.
        end_token: Optional single-character terminator/padding symbol.
            Defaults to the module-level ``SPECIAL_SE``.

    Returns:
        Tuple ``(X, Y)`` of int64 tensors with shapes ``(N, block_size)`` and
        ``(N,)``, where ``N`` is the sum of ``len(w) + 1`` over all words.
        Empty input yields tensors of shape ``(0, block_size)`` and ``(0,)``.

    Raises:
        ValueError: If ``block_size`` is negative.
        KeyError: If a character is missing from the vocabulary.
    """
    if block_size < 0:
        raise ValueError("block_size must be non-negative")
    # Resolve the defaults lazily so the function only touches notebook
    # globals when the caller did not supply its own vocabulary.
    if vocab is None:
        vocab = stoi
    if end_token is None:
        end_token = SPECIAL_SE
    pad_ix = vocab[end_token]

    X, Y = [], []

    for w in words:
        context = [pad_ix] * block_size
        for c in w + end_token:
            ix = vocab[c]
            X.append(context)
            Y.append(ix)
            # Slide the window: append first, then drop the oldest entry, so
            # the length stays at block_size even when block_size is 0.
            context = (context + [ix])[1:]

    # Explicit dtype and shape keep the result well-formed for empty input,
    # where torch.tensor([]) would otherwise be a 1-D float tensor.
    X = torch.tensor(X, dtype=torch.long).reshape(len(Y), block_size)
    Y = torch.tensor(Y, dtype=torch.long)
    print(X.shape, Y.shape)
    return X, Y