In [26]:
import torch
from torch import nn
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Get Text Data

In [27]:
with open('./datasets/borges_full.txt','r',encoding='utf8') as f:
    text = f.read()

In [3]:
# import urllib.request  # the lib that handles the url stuff
# target_url = 'https://ia601201.us.archive.org/2/items/BorgesObrasCompletasBorges/Borges-Obras-Completas-Borges_djvu.txt'
# data = urllib.request.urlopen(target_url)
# text = data.read().decode('utf-8')
# with open('./datasets/borges_full.txt', 'w') as f:
#     f.write(text)


In [28]:
print(text[:1000])



JORGE LUIS BORGES 


1929: Segundo Premio Municipal de Li- 
teratura. 

1944: Gran Premio de Honor de ia So- 
ciedad Argentina de Escritores. 

1949: Miembro de la Academia Goethea- 
na de San Pablo, Brasil. 

1950: Presidente de la Sociedad Argen- 
tina de Escritores (hasta 1953), 

1955: Director de i a Biblioteca Na- 
cional (hasta 1973). 

Miembro de número de la Academia 
Argentina de Letras, 

Director del Instituto de Literatura 
í Alemana de la Facultad de Filosofía y 
! Letras de la Universidad de Buenos 
Aires. 

1956: Primer Premio Nacional de Lite- 
ratura. 

Doctor honorís causa de ia Universi- 
dad de Cuyo (Mendoza), 

Profesor titular de Literatura Inglesa 
y Norteamericana de la Facultad de Fi- 
losofía y Letras de la Universidad de 
Buenos Aires, 

i 

1961: Premio Internacional de Literatu- 
ra Formentor, Mallorca. 

Commsndatore del Gobierno de Ita- 
lia. 

1962: Commandeur da 1‘Ordre des 
L&ttres et des Arta del Gobierno de 
Francia. 

1963: Gran Premio del Fondo 

In [104]:
r = 'pepe'    
tt = list(text)
print(len(tt))
len(set(tt))



2045312


131

In [119]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from collections import Counter

class Dataset(Dataset):
        
    def __init__(self,sequence_length,char_level):
        

        self.char_level = char_level
        self.words = self.load_words()
        self.uniq_words = self.get_uniq_words()

        self.index_to_word = {index+1: word for index, word in enumerate(self.uniq_words)}
        self.unk_word = 'UNK'
        self.unk_word_index = 0 
        self.index_to_word[self.unk_word_index] = self.unk_word
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}
        self.sequence_length = sequence_length
        self.words_indexes = [self.word_to_index[w] for w in self.words]
        

        

    def load_words(self):
        with open('./datasets/borges.txt','r',encoding='utf8') as f:
            text = f.read()        
        if self.char_level>1:
            return list(text)
        else:
            return text.split(' ')

    def get_uniq_words(self):
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        return len(self.words_indexes) - self.sequence_length
    
    def __getitem__(self, index):
        return (
            torch.tensor(self.words_indexes[index:index+self.sequence_length]),
            torch.tensor(self.words_indexes[index+1:index+self.sequence_length+1]),
        )    

batch_size=10    
sequence_length=50
char_level = True
    
dataset = Dataset(sequence_length, char_level=char_level)
dataloader = DataLoader(dataset, batch_size=batch_size)
    


In [120]:
import torch
from torch import nn

class TokenRNN(nn.Module):
    def __init__(self, dataset):
        super(TokenRNN, self).__init__()
        self.embedding_dim = 128
        self.lstm_size = 256
        self.num_layers = 2
        self.bidirectional = True

        n_vocab = len(dataset.uniq_words)
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=0.2,
            bidirectional=self.bidirectional
        )

        bir=1
        if self.bidirectional:
            bir=2
        self.fc = nn.Linear(self.lstm_size*bir, n_vocab)

    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        bir = 1
        if self.bidirectional:
            bir = 2
            
        return (torch.zeros(self.num_layers*bir, sequence_length, self.lstm_size),
                torch.zeros(self.num_layers*bir, sequence_length, self.lstm_size))
    
    
model = TokenRNN(dataset)
model

TokenRNN(
  (embedding): Embedding(6089, 128)
  (lstm): LSTM(128, 256, num_layers=2, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=512, out_features=6089, bias=True)
)

In [121]:
import argparse
import torch
import numpy as np
from torch import nn, optim
from torch.utils.data import DataLoader
import time

epochs=40
start = time.time()

def train(dataset, model):
    model.train()

    dataloader = DataLoader(dataset, batch_size=batch_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(epochs):
        state_h, state_c = model.init_state(sequence_length)

        for batch, (x, y) in enumerate(dataloader):
            if torch.rand(1).item()<0.3:

                optimizer.zero_grad()

                y_pred, (state_h, state_c) = model(x, (state_h, state_c))
                loss = criterion(y_pred.transpose(1, 2), y)

                state_h = state_h.detach()
                state_c = state_c.detach()

                loss.backward()
                optimizer.step()
                if batch%30==0:
                    print(time.time() - start)
                    # Be careful to overwrite our original name file!
                    model_name = 'borges_second_pass.net'
                    torch.save(model.state_dict(),model_name)
                    print({ 'epoch': epoch, 'batch': batch, 'loss': loss.item() })
            
train(dataset, model)

3.32216215133667
{'epoch': 0, 'batch': 60, 'loss': 7.30534029006958}
7.277010202407837
{'epoch': 0, 'batch': 120, 'loss': 7.602330207824707}
16.376984119415283
{'epoch': 0, 'batch': 240, 'loss': 7.081428050994873}
22.630858898162842
{'epoch': 0, 'batch': 330, 'loss': 6.060498237609863}
27.728796005249023
{'epoch': 0, 'batch': 420, 'loss': 6.613697052001953}
30.57123303413391
{'epoch': 0, 'batch': 450, 'loss': 6.12847900390625}
34.46856498718262
{'epoch': 0, 'batch': 510, 'loss': 5.417666912078857}
41.7786762714386
{'epoch': 0, 'batch': 600, 'loss': 6.3568220138549805}
56.40977716445923
{'epoch': 0, 'batch': 840, 'loss': 5.321784019470215}
60.75643515586853
{'epoch': 0, 'batch': 900, 'loss': 6.187918663024902}
65.06380200386047
{'epoch': 0, 'batch': 960, 'loss': 5.5062360763549805}
78.73007988929749
{'epoch': 0, 'batch': 1170, 'loss': 5.486883640289307}
80.43812394142151
{'epoch': 0, 'batch': 1200, 'loss': 5.677637100219727}
82.55122399330139
{'epoch': 0, 'batch': 1230, 'loss': 5.703907

774.9947738647461
{'epoch': 7, 'batch': 1260, 'loss': 0.38402777910232544}
777.7573261260986
{'epoch': 7, 'batch': 1320, 'loss': 0.4432324171066284}
783.9985918998718
{'epoch': 8, 'batch': 30, 'loss': 0.22700609266757965}
789.1653530597687
{'epoch': 8, 'batch': 90, 'loss': 0.27571526169776917}
795.186882019043
{'epoch': 8, 'batch': 180, 'loss': 0.37362661957740784}
798.8471350669861
{'epoch': 8, 'batch': 240, 'loss': 0.4533805847167969}
801.2080612182617
{'epoch': 8, 'batch': 270, 'loss': 0.20791307091712952}
810.4885809421539
{'epoch': 8, 'batch': 390, 'loss': 0.3121490180492401}
819.4840748310089
{'epoch': 8, 'batch': 540, 'loss': 0.3521694839000702}
821.0376479625702
{'epoch': 8, 'batch': 570, 'loss': 0.5914363861083984}
823.9201681613922
{'epoch': 8, 'batch': 600, 'loss': 0.470951110124588}
836.6789438724518
{'epoch': 8, 'batch': 810, 'loss': 0.21846620738506317}
838.1156001091003
{'epoch': 8, 'batch': 840, 'loss': 0.4724455177783966}
840.0287599563599
{'epoch': 8, 'batch': 870, 'l

KeyboardInterrupt: 

-------
------

## Saving the Model

https://pytorch.org/tutorials/beginner/saving_loading_models.html

In [122]:
# Be careful to overwrite our original name file!
model_name = 'borges_second_pass.net'
torch.save(model.state_dict(),model_name)

## Load Model

In [123]:
# MUST MATCH THE EXACT SAME SETTINGS AS MODEL USED DURING TRAINING!

model = TokenRNN(dataset)
model.load_state_dict(torch.load(model_name))
model.eval()

TokenRNN(
  (embedding): Embedding(6089, 128)
  (lstm): LSTM(128, 256, num_layers=2, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=512, out_features=6089, bias=True)
)

# Generating Predictions

In [130]:
def predict(dataset, model, text, next_words=100):
    model.eval()

    words = text.split(' ')
    state_h, state_c = model.init_state(len(words))

    def get_index(word):
        if word in dataset.word_to_index.keys():
            return dataset.word_to_index[word]
        else:
            return dataset.unk_word_index
    
    for i in range(0, next_words):
        
        x = torch.tensor([[get_index(w) for w in words[i:]]])
        y_pred, (state_h, state_c) = model(x, (state_h, state_c))

        last_word_logits = y_pred[0][-1]
        p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().numpy()
        word_index = np.random.choice(len(last_word_logits), p=p)
        words.append(dataset.index_to_word[word_index])

    return words

language_generated = predict(dataset, model, text='el mundo', next_words=100)

print(' '.join(language_generated).lower())



el mundo después, la rozaré de de agua controversia unk siglos unk uniédn melancélica curiosidad, vano se su
negrura. nombre leyé general unk hladik del conjeturar un segun si largo después, poncho. calle falsa, meses quedara hombre revelado si de claros, unk claros, nombre si su volver con que da lo era, me en
la el alcanza que traicionado, la sdlo
pueden al tristeza. me jaromir seis me ineptas para del terminado un dijo: solemos tan por de hladik pasd, si escritor. mismo.) weidenau...

el weidenau...

el weidenau...

el weidenau...

el weidenau...

el weidenau...

el primer para robertson) le jugador, por ignorar es poco pufalada
feliz torre en
una carlos desde no en
inverness


--------