In [1]:
import torch
from torch import nn
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
has_gpu = torch.cuda.is_available()
print(has_gpu)

if torch.cuda.is_available(): 
    dev = "cuda:0" 
else: 
    dev = "cpu" 
device = torch.device(dev) 

False


## Get Text Data

In [3]:
with open('./datasets/borges_full.txt','r',encoding='utf8') as f:
    text = f.read()

In [4]:
# import urllib.request  # the lib that handles the url stuff
# target_url = 'https://ia601201.us.archive.org/2/items/BorgesObrasCompletasBorges/Borges-Obras-Completas-Borges_djvu.txt'
# data = urllib.request.urlopen(target_url)
# text = data.read().decode('utf-8')
# with open('./datasets/borges_full.txt', 'w') as f:
#     f.write(text)


In [5]:
text = ''.join([i for i in text if i.isalpha() or i.isspace()])
print(text[0:1000])


Quiero dejar escrita una confesión que a m tiempo será 
íntima y general  ya que las cosas que le ocurren a un 
hombre les ocurren a todos Estoy hablando de algo ya 
remoto y perdido los días de mi santo t los más antiguos 
Yo recibía los regalos y yo pensaba que no era más que 
un chico y que no había hecho nada  absolutamente nada 
para merecerlos  PoY supuesto nunca lo dije la nifiez es 
tímida  Desde entonces me has dado tantas cosas y son 
tantos los años y los recuerdos  Padre Norah los abuelos 
tu memoria y en ella la memoria de los mayores  los 
patios los esclavos el agúatele la carga de los húsares 
del Perú y el oprobio de Rosas   tu prisión valerosa 
cuando tantos hombres callábamos  las mañanas del Paso 
del Molino f de Ginebra y de Austin f las compartidas cla 
ridades  T sombras tu fresca ancianidad tu amor a Dv 
ckens y a Ea de Queiroz Madre  vos misma  

Aquí estamos hablando los dos  et tout le resie est litié 
rature como escribió t con excelente literatura seríame 


In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from collections import Counter

class Dataset(Dataset):
        
    def __init__(self,sequence_length,char_level):
        

        self.char_level = char_level
        self.words = self.load_words()
        self.uniq_words = self.get_uniq_words()

        self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
        self.unk_word = 'UNK'
        self.unk_word_index = len(self.index_to_word)+1
        self.index_to_word[self.unk_word_index] = self.unk_word
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}
        self.sequence_length = sequence_length
        self.words_indexes = [self.word_to_index[w] for w in self.words]
        

    def load_words(self):
        with open('./datasets/borges.txt','r',encoding='utf8') as f:
            text = f.read()        
            text = ''.join([i for i in text if i.isalpha() or i.isspace()])
        if self.char_level==True:
            return list(text)
        else:
            return text.split(' ')
        

        

    def get_uniq_words(self):
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        return len(self.words_indexes) - self.sequence_length
    
    def __getitem__(self, index):
        return (
            torch.tensor(self.words_indexes[index:index+self.sequence_length]),
            torch.tensor(self.words_indexes[index+1:index+self.sequence_length+1]),
        )    

batch_size=10    
sequence_length=100
char_level = False
    
dataset = Dataset(sequence_length, char_level=char_level)
dataloader = DataLoader(dataset, batch_size=batch_size)
    


In [7]:
from collections import Counter
words = dataset.load_words()
len(set(words))

print(dataset.unk_word_index)
dataset.words_indexes.count(92)

6090


10

In [8]:
import torch
from torch import nn

dev = "cuda:0"

class TokenRNN(nn.Module):
    def __init__(self, dataset, use_gpu):
        super(TokenRNN, self).__init__()
        self.embedding_dim = 128
        self.lstm_size = 512
        self.num_layers = 2
        self.bidirectional = True
        self.use_gpu = use_gpu

        n_vocab = len(dataset.uniq_words)
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=0.2,
            bidirectional=self.bidirectional
        )

        bir=1
        if self.bidirectional:
            bir=2
        self.fc = nn.Linear(self.lstm_size*bir, n_vocab)

    def forward(self, x, prev_state):
        if self.use_gpu:
            x = x.to(device)
        embed = self.embedding(x)
        if self.use_gpu:
            embed = embed.to(device)
        output, state = self.lstm(embed, prev_state)
        if self.use_gpu:
            output = output.to(device)
        logits = self.fc(output)

        return logits, state

    def init_state(self, sequence_length):
        bir = 1
        if self.bidirectional:
            bir = 2
        
        h = torch.zeros(self.num_layers*bir, sequence_length, self.lstm_size)
        if self.use_gpu:
            h = h.to(device)
        return (h,h)
    
    
model = TokenRNN(dataset, True)
model

TokenRNN(
  (embedding): Embedding(6089, 128)
  (lstm): LSTM(128, 512, num_layers=2, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=1024, out_features=6089, bias=True)
)

In [9]:
#model.to(device)


In [10]:
import argparse
import torch
import numpy as np
from torch import nn, optim
from torch.utils.data import DataLoader
import time

epochs=10
start = time.time()
device = torch.device(device) 

def train(dataset, model):
    print(f" doing model.to(device) {model.use_gpu}")

    if model.use_gpu:
        model.to(device)
        
    model.train()

    dataloader = DataLoader(dataset, batch_size=batch_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0001)

    for epoch in range(epochs):
        state_h, state_c = model.init_state(sequence_length)
        if model.use_gpu:
            state_h = state_h.to(device)
            state_c = state_c.to(device)

        for batch, (x, y) in enumerate(dataloader):

            optimizer.zero_grad()

            if model.use_gpu:
                x.to(device)

            y_pred, (state_h, state_c) = model(x, (state_h, state_c))

            if model.use_gpu:
                y_pred = y_pred.to(device)
                y = y.to(device)

            loss = criterion(y_pred.transpose(1, 2), y)

            state_h = state_h.detach()
            state_c = state_c.detach()

            loss.backward()
            optimizer.step()
            if batch%500==0:
                # print(time.time() - start)
                # Be careful to overwrite our original name file!
#                model_name = 'borges_second_pass.net'
#                torch.save(model.state_dict(),model_name)
                print({ 'epoch': epoch, 'batch': batch, 'loss': loss.item() })

train(dataset, model)

 doing model.to(device) True
{'epoch': 0, 'batch': 0, 'loss': 8.713558197021484}
{'epoch': 0, 'batch': 500, 'loss': 5.4991278648376465}
{'epoch': 0, 'batch': 1000, 'loss': 5.258494853973389}
{'epoch': 1, 'batch': 0, 'loss': 6.857874393463135}
{'epoch': 1, 'batch': 500, 'loss': 3.3809866905212402}
{'epoch': 1, 'batch': 1000, 'loss': 2.907339096069336}
{'epoch': 2, 'batch': 0, 'loss': 5.977495193481445}
{'epoch': 2, 'batch': 500, 'loss': 1.9419424533843994}
{'epoch': 2, 'batch': 1000, 'loss': 1.5021312236785889}
{'epoch': 3, 'batch': 0, 'loss': 4.880244731903076}
{'epoch': 3, 'batch': 500, 'loss': 0.9647094011306763}
{'epoch': 3, 'batch': 1000, 'loss': 0.800978422164917}
{'epoch': 4, 'batch': 0, 'loss': 3.733320713043213}
{'epoch': 4, 'batch': 500, 'loss': 0.5864612460136414}
{'epoch': 4, 'batch': 1000, 'loss': 0.4179871380329132}
{'epoch': 5, 'batch': 0, 'loss': 2.725355625152588}
{'epoch': 5, 'batch': 500, 'loss': 0.3142109811306}
{'epoch': 5, 'batch': 1000, 'loss': 0.23613715171813965

-------
------

## Saving the Model

https://pytorch.org/tutorials/beginner/saving_loading_models.html

In [11]:
# Be careful to overwrite our original name file!
model_name = 'borges_second_pass.net'
torch.save(model.state_dict(),model_name)

## Load Model

In [12]:
# MUST MATCH THE EXACT SAME SETTINGS AS MODEL USED DURING TRAINING!
model_name = 'borges_second_pass.net'

model = TokenRNN(dataset, False)
model.load_state_dict(torch.load(model_name))
model.eval()

TokenRNN(
  (embedding): Embedding(6089, 128)
  (lstm): LSTM(128, 512, num_layers=2, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=1024, out_features=6089, bias=True)
)

In [13]:
torch.backends.cudnn.enabled = False

# Generating Predictions

In [14]:
def predict(dataset, model, text, next_words=100, use_gpu=False):
    model.eval()
    
    if use_gpu:
        model.to(device)

    words = text.split(' ')
    state_h, state_c = model.init_state(len(words))
    if use_gpu:
        state_h = state_h.to(device)
        state_c = state_c.to(device)

    def get_index(word):
        if word in dataset.word_to_index.keys():
            return dataset.word_to_index[word]
        else:
            return dataset.unk_word_index
    
    for i in range(0, next_words):
        
        x = torch.tensor([[get_index(w) for w in words[i:]]])

        y_pred, (state_h, state_c) = model(x, (state_h, state_c))

        last_word_logits = y_pred[0][-1]
        last_word_logits = last_word_logits.to('cpu')

        p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().numpy()
        word_index = np.random.choice(len(last_word_logits), p=p)
        words.append(dataset.index_to_word[word_index])

    return words

import random
r = random.randint(0, len(dataset.words))
text = "".join(dataset.words[r:r+dataset.sequence_length])
language_generated = predict(dataset, model, text="el universo", next_words=100, use_gpu=False)

print(' '.join(language_generated).lower())



el universo vocales esquina indigno (|saias ya manuscrito para mi, calificar
de esta vio en 1912 como como un detenido un dios se claros, de
las zaguan de confianza y una casi una pie, una refutacién de la jefes la desdefaron. es blasfematorio®. es space

hamiet, es blasfematorio®. y mortifica y ser el famoso el famoso el sentenciado diecinueve, fierro de shahrazad de piel de boletines...

observé de vertiginosa de vertiginosa de dialéctica, de san
lucas.

estos regresa destino. y carpécrates; un remoto
espejo asi sagrado a mera a refleja olviden que erratas, ver a voces a voces que se aterréd el tiempo de las palabras y de el


--------