In [1]:
import helper
data_dir = './Seinfeld_Scripts.txt'
text = helper.load_data(data_dir)

In [2]:
view_line_range = (0, 10)
import numpy as np

print('Dataset Stats')
print('Roughly the number of unique words: {}'.format(len({word: None for word in text.split()})))

lines = text.split('\n')
print('Number of lines: {}'.format(len(lines)))
word_count_line = [len(line.split()) for line in lines]
print('Average number of words in each line: {}'.format(np.average(word_count_line)))

print()
print('The lines {} to {}:'.format(*view_line_range))
print('\n'.join(text.split('\n')[view_line_range[0]:view_line_range[1]]))

Dataset Stats
Roughly the number of unique words: 46367
Number of lines: 109233
Average number of words in each line: 5.544240293684143

The lines 0 to 10:
jerry: do you know what this is all about? do you know, why were here? to be out, this is out...and out is one of the single most enjoyable experiences of life. people...did you ever hear people talking about we should go out? this is what theyre talking about...this whole thing, were all out now, no one is home. not one person here is home, were all out! there are people trying to find us, they dont know where we are. (on an imaginary phone) did you ring?, i cant find him. where did he go? he didnt tell me where he was going. he must have gone out. you wanna go out you get ready, you pick out the clothes, right? you take the shower, you get all ready, get the cash, get your friends, the car, the spot, the reservation...then youre standing around, what do you do? you go we gotta be getting back. once youre out, you wanna get back! y

In [3]:

import TestCase as tests
from collections import Counter

def create_lookup_tables(text):
    word_count = Counter(text)
    
    sorted_vocab = sorted(word_count, key=word_count.get, reverse=True)
    
    int_to_vocab = {ii: word for ii, word in enumerate(sorted_vocab)}
    vocab_to_int = {word: ii for ii, word in int_to_vocab.items()}

    return (vocab_to_int, int_to_vocab)

"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_create_lookup_tables(create_lookup_tables)


Tests Passed


In [4]:
def token_lookup():
    tokens = dict()
    tokens['.'] = '||period||'
    tokens[','] = '||comma||'
    tokens['"'] = '||quotation_mark||'
    tokens[';'] = '||semicolon||'
    tokens['!'] = '||exclam_mark||'
    tokens['?'] = '||question_mark||'
    tokens['('] = '||left_par||'
    tokens[')'] = '||right_par||'
    tokens['-'] = '||dash||'
    tokens['\n'] = '||return||'
    
    return tokens
tests.test_tokenize(token_lookup)

Tests Passed


In [5]:
helper.preprocess_and_save_data(data_dir, token_lookup, create_lookup_tables)

In [6]:
import helper
import TestCase as tests
import numpy as np


int_text, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()

In [7]:
import torch
train_on_gpu = torch.cuda.is_available()
if not train_on_gpu:
    print('No GPU found. Please use a GPU to train your neural network.')

In [8]:
from torch.utils.data import TensorDataset, DataLoader


def batch_data(words, sequence_length, batch_size):
    X = []
    y = []
    
    for i in range(0,len(words)-sequence_length):
        X.append(words[i:i+sequence_length])
        y.append(words[i+sequence_length])
    
    data = TensorDataset(torch.from_numpy(np.array(X, dtype=np.uint8)),
                         torch.from_numpy(np.array(y, dtype=np.uint8)))
    
    loader = DataLoader(data, shuffle=True, batch_size=batch_size)
    
    return loader

test_words =range(11)
loader = batch_data(test_words,3,4)

i = 0
for x_data, y_data in loader:
    i += 1
    print('##Batch {}\n'.format(i))
    print(x_data.size())
    print(x_data.data)
    print('\n')
    print(y_data.size())
    print(y_data.data)
    print('\n\n\n')

##Batch 1

torch.Size([4, 3])
tensor([[7, 8, 9],
        [6, 7, 8],
        [2, 3, 4],
        [1, 2, 3]], dtype=torch.uint8)


torch.Size([4])
tensor([10,  9,  5,  4], dtype=torch.uint8)




##Batch 2

torch.Size([4, 3])
tensor([[3, 4, 5],
        [0, 1, 2],
        [4, 5, 6],
        [5, 6, 7]], dtype=torch.uint8)


torch.Size([4])
tensor([6, 3, 7, 8], dtype=torch.uint8)






In [9]:
test_text = range(50)
t_loader = batch_data(test_text, sequence_length=5, batch_size=10)

data_iter = iter(t_loader)
sample_x, sample_y = data_iter.next()

print(sample_x.shape)
print(sample_x)
print()
print(sample_y.shape)
print(sample_y)

torch.Size([10, 5])
tensor([[19, 20, 21, 22, 23],
        [23, 24, 25, 26, 27],
        [44, 45, 46, 47, 48],
        [40, 41, 42, 43, 44],
        [28, 29, 30, 31, 32],
        [21, 22, 23, 24, 25],
        [30, 31, 32, 33, 34],
        [35, 36, 37, 38, 39],
        [ 6,  7,  8,  9, 10],
        [33, 34, 35, 36, 37]], dtype=torch.uint8)

torch.Size([10])
tensor([24, 28, 49, 45, 33, 26, 35, 40, 11, 38], dtype=torch.uint8)


In [10]:
import torch.nn as nn
import TestCase as tests

class RNN(nn.Module):
    
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5):
      
        super(RNN, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.output_size = output_size
        self.n_layers = n_layers
        
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=dropout, batch_first=True)
        
       
        self.dropout = nn.Dropout(dropout)
        
       
        self.fc = nn.Linear(hidden_dim, output_size)
        
    
    def forward(self, nn_input, hidden):
        
        nn_input_long = nn_input.long()
        embeds = self.embedding(nn_input_long)
        
        
        lstm_out, hidden = self.lstm(embeds,hidden)
        
       
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        
        out = self.dropout(lstm_out)
        out = self.fc(out)
        
        
        out = out.view(hidden[0].shape[1], -1, self.output_size)
        final_output = out[:,-1]
        
        
       
        return final_output, hidden
    
    
    def init_hidden(self, batch_size):
        
        weight = next(self.parameters()).data
        
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers,batch_size,self.hidden_dim).zero_().cuda(),
                      weight.new(self.n_layers,batch_size,self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers,batch_size,self.hidden_dim).zero_(),
                      weight.new(self.n_layers,batch_size,self.hidden_dim).zero_())
        
        return hidden

tests.test_rnn(RNN, train_on_gpu)

Tests Passed


In [11]:
def forward_back_prop(rnn, optimizer, criterion, inp, target, hidden):
    if (train_on_gpu):
        inp, target = inp.cuda(), target.cuda()
    
    
    hidden = tuple([each.data for each in hidden])
    
    
    rnn.zero_grad()
    
    
    out, hidden = rnn(inp, hidden)
    
   
    loss = criterion(out, target.long())
    loss.backward()
    
    
    optimizer.step()

    
    return loss.item(), hidden

tests.test_forward_back_prop(RNN, forward_back_prop, train_on_gpu)

Tests Passed


In [13]:

def train_rnn(rnn, batch_size, optimizer, criterion, n_epochs, show_every_n_batches=100):
    batch_losses = []
    
    rnn.train()

    print("Training for %d epoch(s)..." % n_epochs)
    for epoch_i in range(1, n_epochs + 1):
        
        
        hidden = rnn.init_hidden(batch_size)
        
        for batch_i, (inputs, labels) in enumerate(train_loader, 1):
            
           
            n_batches = len(train_loader.dataset)//batch_size
            if(batch_i > n_batches):
                break
            
            
            loss, hidden = forward_back_prop(rnn, optimizer, criterion, inputs, labels, hidden)          
           
            batch_losses.append(loss)

           
            if batch_i % show_every_n_batches == 0:
                print('Epoch: {:>4}/{:<4}  Loss: {}\n'.format(
                    epoch_i, n_epochs, np.average(batch_losses)))
                batch_losses = []

    return rnn

In [14]:
import numpy as np


sequence_length = 10 

batch_size = 128


train_loader = batch_data(int_text, sequence_length, batch_size)

In [15]:
num_epochs = 9
learning_rate = 0.0001
vocab_size = len(vocab_to_int) + 1
output_size = len(vocab_to_int)
embedding_dim = 100
hidden_dim = 512
n_layers = 2
show_every_n_batches = 500

In [16]:
rnn = RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5)
if train_on_gpu:
    rnn.cuda()

optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

trained_rnn = train_rnn(rnn, batch_size, optimizer, criterion, num_epochs, show_every_n_batches)
helper.save_model('trained_rnn', trained_rnn)
print('Model Trained and Saved')

Training for 9 epoch(s)...
Epoch:    1/9     Loss: 5.2582930002212525

Epoch:    1/9     Loss: 4.400785830974579

Epoch:    1/9     Loss: 4.267888197422027

Epoch:    1/9     Loss: 4.199936118602753

Epoch:    1/9     Loss: 4.140326580047607

Epoch:    1/9     Loss: 4.117083817958831

Epoch:    1/9     Loss: 4.0505142855644225

Epoch:    1/9     Loss: 3.9987132506370546

Epoch:    1/9     Loss: 3.973925922393799

Epoch:    1/9     Loss: 3.9416111588478087

Epoch:    1/9     Loss: 3.927637899875641

Epoch:    1/9     Loss: 3.9087087211608886

Epoch:    1/9     Loss: 3.8720106449127196

Epoch:    2/9     Loss: 3.8483739580778393

Epoch:    2/9     Loss: 3.841087972640991

Epoch:    2/9     Loss: 3.8386352286338807

Epoch:    2/9     Loss: 3.813948884963989

Epoch:    2/9     Loss: 3.8085884914398194

Epoch:    2/9     Loss: 3.800582652568817

Epoch:    2/9     Loss: 3.7811648926734924

Epoch:    2/9     Loss: 3.7660446820259095

Epoch:    2/9     Loss: 3.771831813812256

Epoch:    2/9   

In [18]:
import signal

from contextlib import contextmanager

import requests


DELAY = INTERVAL = 4 * 60  # interval time in seconds
MIN_DELAY = MIN_INTERVAL = 2 * 60
KEEPALIVE_URL = "https://nebula.udacity.com/api/v1/remote/keep-alive"
TOKEN_URL = "http://metadata.google.internal/computeMetadata/v1/instance/attributes/keep_alive_token"
TOKEN_HEADERS = {"Metadata-Flavor":"Google"}


def _request_handler(headers):
    def _handler(signum, frame):
        requests.request("POST", KEEPALIVE_URL, headers=headers)
    return _handler


@contextmanager
def active_session(delay=DELAY, interval=INTERVAL):
    
    token = requests.request("GET", TOKEN_URL, headers=TOKEN_HEADERS).text
    headers = {'Authorization': "STAR " + token}
    delay = max(delay, MIN_DELAY)
    interval = max(interval, MIN_INTERVAL)
    original_handler = signal.getsignal(signal.SIGALRM)
    try:
        signal.signal(signal.SIGALRM, _request_handler(headers))
        signal.setitimer(signal.ITIMER_REAL, delay, interval)
        yield
    finally:
        signal.signal(signal.SIGALRM, original_handler)
        signal.setitimer(signal.ITIMER_REAL, 0)


def keep_awake(iterable, delay=DELAY, interval=INTERVAL):
  
    with active_session(delay, interval): yield from iterable

In [19]:
import torch
import helper
import TestCase as tests

_, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()
trained_rnn = helper.load_model('trained_rnn')

In [20]:
import torch.nn.functional as F

def generate(rnn, prime_id, int_to_vocab, token_dict, pad_value, predict_len=100):
    
    rnn.eval()
    
   
    current_seq = np.full((1, sequence_length), pad_value)
    current_seq[-1][-1] = prime_id
    predicted = [int_to_vocab[prime_id]]
    
    for _ in range(predict_len):
        if train_on_gpu:
            current_seq = torch.LongTensor(current_seq).cuda()
        else:
            current_seq = torch.LongTensor(current_seq)
        
        
        hidden = rnn.init_hidden(current_seq.size(0))
        
        
        output, _ = rnn(current_seq, hidden)
        
        # get the next word probabilities
        p = F.softmax(output, dim=1).data
        if(train_on_gpu):
            p = p.cpu() 
         
        
        top_k = 5
        p, top_i = p.topk(top_k)
        top_i = top_i.numpy().squeeze()
        
        
        p = p.numpy().squeeze()
        word_i = np.random.choice(top_i, p=p/p.sum())
        
        
        word = int_to_vocab[word_i]
        predicted.append(word)
        
        if(train_on_gpu):
            current_seq = current_seq.cpu()
        
        
        if(train_on_gpu):
            current_seq = current_seq.cpu() 
            
        current_seq = np.roll(current_seq, -1, 1)
        current_seq[-1][-1] = word_i
    
    gen_sentences = ' '.join(predicted)
    for key, token in token_dict.items():
        ending = ' ' if key in ['\n', '(', '"'] else ''
        gen_sentences = gen_sentences.replace(' ' + token.lower(), key)
    gen_sentences = gen_sentences.replace('\n ', '\n')
    gen_sentences = gen_sentences.replace('( ', '(')
    return gen_sentences

In [21]:
import numpy as np
import helper


gen_length = 200 
prime_word = 'kramer' 
pad_word = helper.SPECIAL_WORDS['PADDING']
generated_script = generate(trained_rnn, vocab_to_int[prime_word + ':'], int_to_vocab, token_dict, vocab_to_int[pad_word], gen_length)
print(generated_script)

kramer:.

kramer:(from) oh, no, it's a lot of.

george:(on) oh, i don't have to be in a little, but i don't think i was just gonna have to get it in there.

jerry:(to jerry) you know what i mean about the one of a show?

jerry: no.

elaine: well, i think i have a show of the time in the ya. i mean, i know that i was just in my life.

george: i can't believe i don't have to see the other of my apartment, the.

elaine: well, i don't think i can.

morty: oh, no, no. no, no.

elaine: what?

jerry: well i don't know, i was gonna go back to the way to the other.

george: oh, yeah.

kramer: yeah.

kramer: well...

george:(looking up) well, you


In [23]:
f =  open("generated_script_1.txt","w")
f.write(generated_script)
f.close()

In [24]:
import nltk

hypothesis = text.split()[:120]
reference = generated_script.split()
#the maximum is bigram, so assign the weight into 2 half.
BLEUscore = nltk.translate.bleu_score.sentence_bleu([reference], hypothesis, weights = (0.5, 0.5))
print(BLEUscore)

0.08142544724996532


In [27]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
scores = scorer.score(text[:600], generated_script)
scores

{'rouge1': Score(precision=0.17857142857142858, recall=0.20491803278688525, fmeasure=0.19083969465648853),
 'rougeL': Score(precision=0.10714285714285714, recall=0.12295081967213115, fmeasure=0.11450381679389313)}

In [30]:
!pyminifier -O -o tv_script_generator --gzip TV_Script.py

TV_Script.py (10994) reduced to 4176 bytes (37.98% of original size)
