In [1]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
import itertools

In [2]:
CUDA = torch.cuda.is_available()
device = torch.device("cuda" if CUDA else "cpu")

In [3]:
lines_filepath = os.path.join("cornell movie-dialogs corpus", "movie_lines.txt")
conv_filepath = os.path.join("cornell movie-dialogs corpus", "movie_conversations.txt")

In [4]:
with open (lines_filepath, 'r', encoding='utf-8') as file:
    lines = file.readlines()
    for line in lines[:8]:
        print(line.strip()) 

L1045	u0	m0	BIANCA	They do not!
L1044	u2	m0	CAMERON	They do to!
L985	u0	m0	BIANCA	I hope so.
L984	u2	m0	CAMERON	She okay?
L925	u0	m0	BIANCA	Let's go.
L924	u2	m0	CAMERON	Wow
L872	u0	m0	BIANCA	Okay -- you're gonna need to learn how to lie.
L871	u2	m0	CAMERON	No


In [5]:
line_fields = ["lineID", "characterID","movieID","character","text"]
lines = {}
with open (lines_filepath, 'r', encoding='iso-8859-1') as f:
    for line in f:
        values = line.split("\t")
        lineObj = {}
        for i, field in enumerate(line_fields):
            lineObj[field] = values[i]
        lines[lineObj['lineID']] = lineObj

In [6]:
conv_fields = ["character1ID", "character2ID","movieID","utteranceIDs"]
conversations = []
with open (conv_filepath, 'r', encoding='iso-8859-1') as f:
    for line in f:
        values = line.split("\t")
        convObj = {}
        for i,field in enumerate(conv_fields):
            convObj[field] = values[i]        
        lineIds = eval(convObj["utteranceIDs"])
        split_lineIds = [re.findall(r'L\d+', string[0]) for string in lineIds]
        temp = []
        for string in lineIds:
            temp = re.findall(r'L\d+', string)
        
        convObj["lines"] = []
        for lineId in temp:
            convObj["lines"].append(lines[lineId])
        conversations. append(convObj)

In [7]:
qa_pairs = []
for conversation in conversations:
    for i in range(len(conversation["lines"]) -1):
        inputLine = conversation["lines"][i]["text"].replace("\r\r\n","").strip()
        targetLine = conversation["lines"][i+1]["text"].replace("\r\r\n","").strip()
        if inputLine and targetLine:
            qa_pairs.append([inputLine,targetLine])

In [8]:
datafile = os.path.join("cornell movie-dialogs corpus", "formated_movie_lines.txt")
delimiter = '\t'
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

print("\nWriting newly formatted file...")
with open(datafile,'w', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter)
    for pair in qa_pairs:
        writer.writerow(pair)
print("Done writing file")


Writing newly formatted file...
Done writing file


In [9]:
with open(datafile, 'rb') as file:
    lines = file.readlines()
    for line in lines[:8]:
        print(line)

b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell I thought we'd start with pronunciation if that's okay with you.\r\r\n"
b"Well I thought we'd start with pronunciation if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\r\r\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\r\r\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\r\r\n"
b"No no it's my fault -- we didn't have a proper introduction ---\tCameron.\r\r\n"
b"Cameron.\tThe thing is Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\r\r\n"
b"The thing is Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\tSeems like she could get a date easy enough...\r\r\n"
b'Why?\tUnsolved 

In [10]:
PAD_token = 0 # used for padding short sentences
SOS_token = 1 # start of sentence token <START
EOS_token = 2 # end of sentence token <END>

class Vocabulary:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    def trim(self, min_count):
        keep_words = []
        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)

        print("keep words {} / {} = {:.4f}".format(len(keep_words), len(self.word2index), len(keep_words)/len(self.word2index)))
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3
        for word in keep_words:
            self.addWord(word)

In [11]:
def unicodeToAscii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

In [12]:
unicodeToAscii("Montréal,Françoise....")

'Montreal,Francoise....'

In [13]:
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])",r" ", s)
    s = re.sub(r"[^a-zA-Z.!?]+",r" ",s)
    s = re.sub(r"\s", r" ", s).strip()
    return s

In [14]:
normalizeString("aa123aa!s's  dd?")

'aa aa s s dd'

In [15]:
print("Reading and processing file....Please wait")
lines = open(datafile, 'r', encoding='utf-8').read().strip().split('\n')
pairs = [[normalizeString(s) for s in pair.split('\t')] for pair in lines ]
print("Done reading")
voc = Vocabulary("cornell movie-dialogs corpus")

Reading and processing file....Please wait
Done reading


In [16]:
MAX_LENGTH = 10
# return true if the pairs are under the threshhold
def filterPair(p):
    return len(p[0].split()) < MAX_LENGTH and len(p[1].split()) < MAX_LENGTH
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [17]:
pairs = [pair for pair in pairs if len(pair)>1]
print("There are {} pairs/conversations in the dataset)".format(len(pairs)))
pairs = filterPairs(pairs)
print("After filtering, there are {} pairs/conversations".format(len(pairs)))

There are 221275 pairs/conversations in the dataset)
After filtering, there are 84210 pairs/conversations


In [18]:
for pair in pairs:
    voc.addSentence(pair[0])
    voc.addSentence(pair[1])
print("Counted words:", voc.num_words)
for pair in pairs[:10]:
    print(pair)

Counted words: 21492
['gosh if only we could find kat a boyfriend', 'let me see what i can do']
['c esc ma tete this is my head', 'right see you re ready for the quiz']
['that s because it s such a nice one', 'forget french']
['there', 'where']
['you have my word as a gentleman', 'you re sweet']
['hi', 'looks like things worked out tonight huh']
['you know chastity', 'i believe we share an art instructor']
['have fun tonight', 'tons']
['well no', 'then that s all you had to say']
['then that s all you had to say', 'but']


In [19]:
MIN_COUNT = 3
def trimRareWords(voc,pairs, MIN_COUNT):
    voc.trim(MIN_COUNT)
    keep_pairs = []
    for pair in pairs:
        input_sentence = pair[0]
        output_sentence = pair[1]
        keep_input = True
        keep_output = True
        for word in input_sentence.split(' '):
            if word not in voc.word2index: 
                keep_input =False
                break
        for word in output_sentence.split(' '):
            if word not in voc.word2index: 
                keep_output =False
                break
        if keep_input and keep_output:
            keep_pairs.append(pair)
    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs),  len(keep_pairs)/len(pairs)))
    return keep_pairs

pairs = trimRareWords(voc,pairs,MIN_COUNT)

keep words 9978 / 21489 = 0.4643
Trimmed from 84210 pairs to 71445, 0.8484 of total


In [20]:
def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]

In [21]:
indexesFromSentence(voc, pairs[1][0])

[31, 32, 33, 34, 32, 35, 10, 36, 37, 2]

In [22]:
inp = []
out = []
i = 0
for pair in pairs[:10]:
    inp.append(pair[0])
    out.append(pair[1])
print(inp)
print(len(inp))
indexes = [indexesFromSentence(voc, sentence) for sentence in inp]
indexes

['gosh if only we could find kat a boyfriend', 'that s because it s such a nice one', 'there', 'you have my word as a gentleman', 'hi', 'have fun tonight', 'well no', 'then that s all you had to say', 'but', 'do you listen to this crap']
10


[[3, 4, 5, 6, 7, 8, 9, 10, 11, 2],
 [31, 32, 33, 34, 32, 35, 10, 36, 37, 2],
 [40, 2],
 [26, 42, 23, 43, 44, 10, 45, 2],
 [47, 2],
 [42, 60, 53, 2],
 [62, 63, 2],
 [64, 31, 32, 65, 26, 66, 67, 68, 2],
 [69, 2],
 [18, 26, 73, 67, 21, 74, 2]]

In [23]:
def zeroPadding(l, fillvalue = 0):
    return list(itertools.zip_longest(*l, fillvalue=fillvalue))

In [24]:
leng = [len(ind) for ind in indexes]
max(leng)

10

In [25]:
test_result = zeroPadding(indexes)
print(len(test_result))
test_result

10


[(3, 31, 40, 26, 47, 42, 62, 64, 69, 18),
 (4, 32, 2, 42, 2, 60, 63, 31, 2, 26),
 (5, 33, 0, 23, 0, 53, 2, 32, 0, 73),
 (6, 34, 0, 43, 0, 2, 0, 65, 0, 67),
 (7, 32, 0, 44, 0, 0, 0, 26, 0, 21),
 (8, 35, 0, 10, 0, 0, 0, 66, 0, 74),
 (9, 10, 0, 45, 0, 0, 0, 67, 0, 2),
 (10, 36, 0, 2, 0, 0, 0, 68, 0, 0),
 (11, 37, 0, 0, 0, 0, 0, 2, 0, 0),
 (2, 2, 0, 0, 0, 0, 0, 0, 0, 0)]

In [26]:
def binaryMatrix(l, values=0):
    m = []
    for i, seq in enumerate(l):
        m.append([])
        for token in seq:
            if token == PAD_token: 
                m[i].append(0)
            else: 
                m[i].append(1)
    return m

In [27]:
binary_result = binaryMatrix(test_result)
binary_result

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 0, 1, 0, 1, 1, 1, 0, 1],
 [1, 1, 0, 1, 0, 1, 0, 1, 0, 1],
 [1, 1, 0, 1, 0, 0, 0, 1, 0, 1],
 [1, 1, 0, 1, 0, 0, 0, 1, 0, 1],
 [1, 1, 0, 1, 0, 0, 0, 1, 0, 1],
 [1, 1, 0, 1, 0, 0, 0, 1, 0, 0],
 [1, 1, 0, 0, 0, 0, 0, 1, 0, 0],
 [1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]

In [28]:
# return padded input sequence tensor and as well as a tensor of lengths for each of the sequences in the batch
def inputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentenc in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    return padVar, lengths

In [29]:
# returns the padded target sequence tensor, paddinbg mask and max target length
def outputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentenc in l]
    max_target_len = max([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    mask = binaryMatrix(padList)
    mask = torch.ByteTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

In [30]:
# return all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
    pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch, output_batch = [], []
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    inp, lengths = inputVar(input_batch,voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len