# Char-RNN in Pytorch: The Wisdom of Marx

Let's try to implement to implement [Andrej's minmal char-RNN](https://gist.github.com/karpathy/d4dee566867f8291f086) in Pytorch! The difference is that we'll use LSTM layers instead of vanilla RNN, and we'll do it in batches with GPU. 

### Loading the Data

In [4]:
import unicodedata
import string
import torch
import random
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn.functional as F
import torch.utils.data

cudafloat = torch.cuda.FloatTensor 
cudalong = torch.cuda.LongTensor

raw_text = open('capital-vol1.txt', encoding='latin-1', mode='r').read()
chars = sorted(set(raw_text))
print('corpus has ' + str(len(raw_text)) + ' letters altogether')
print ('corpus has ' + str(len(chars)) + ' unique characters:', chars)

corpus has 1468303 letters altogether
corpus has 108 unique characters: ['\n', ' ', '!', '"', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '}', '\x91', '\x92', '\x93', '\x94', '\x96', '\xa0', '£', '°', '¼', '½', '¾', '×', 'à', 'â', 'æ', 'è', 'é', 'ê', 'î', 'ï', 'ô', 'û', 'ü']


First we'll clean up the text so that our output is limited to lower cased english characters plus simple punctuations.

In [5]:
# define the set of letters we want (ascii)
all_letters = string.ascii_lowercase + " .,;'-"
n_letters = len(all_letters)

# function to clean raw text
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

text = unicodeToAscii(raw_text)
text_length = len(text)
print('corpus has ' + str(text_length) + ' letters altogether')
print ('corpus has ' + str(len(set(text))) + ' unique characters after cleaning:', set(text))

corpus has 1427555 letters altogether
corpus has 32 unique characters after cleaning: {'c', 'd', 's', 'r', '-', 'u', 'q', 'f', 'e', 'v', 'w', '.', 'g', 'x', 'a', 'o', "'", 'k', 'y', 'i', 'h', 'm', 'n', 'z', 'p', 't', ' ', ',', 'b', 'l', 'j', ';'}


In [6]:
#setup idx and char mapping
chars_to_idx = dict((c, i) for i, c in enumerate(all_letters))
idx_to_chars = dict((i, c) for i, c in enumerate(all_letters))

### Converting text to vectors

We begin by creating our list of inputs and outputs. The step size indicates how many characters we move ahead for every sample we create. We generate the biggest sample size with the step size of one, and also the most redundancy in samples (which is not necessarily bad for our task).
1. inputs - list of win_size strings
2. outputs - list of character following the input string

In [7]:
def textToWin(text, win_size, step_size):
    inputs = []
    outputs = []
    for i in range(0, len(text) - win_size, step_size):
        window = text[i:win_size+i]
        inputs.append(window)
    outputs = [i for i in text[win_size::step_size]]
    return inputs, outputs

print(textToWin('hello world', 5, 1))

(['hello', 'ello ', 'llo w', 'lo wo', 'o wor', ' worl'], [' ', 'w', 'o', 'r', 'l', 'd'])


Next we the previous function to help convert our text list into torch tensors of indices. It's important to note that for tensors, the dimensions are explicit and viewable, whereas for a list, you can only view the most shallow dimension without iterating through it. 

You NEED to put your input into the right tensor format.. 

In [8]:
def textToTensor(text, win_size, step_size):
    inputs, outputs = textToWin(text, win_size, step_size)
    
    X = torch.zeros(len(inputs), win_size).long()
    y = torch.zeros(len(inputs)).long()
    
    for i, sent in enumerate(inputs):
        for t, char in enumerate(sent):
            X[i, t] = chars_to_idx[sent[t]]
        y[i] = chars_to_idx[outputs[i]]
    
    return X, y

test_text = "hello nice to meet you"

testX, testy = textToTensor(test_text, 5, 2)
print(testX, testy)


    7     4    11    11    14
   11    11    14    26    13
   14    26    13     8     2
   13     8     2     4    26
    2     4    26    19    14
   26    19    14    26    12
   14    26    12     4     4
   12     4     4    19    26
    4    19    26    24    14
[torch.LongTensor of size 9x5]
 
 26
  8
  4
 19
 26
  4
 19
 24
 20
[torch.LongTensor of size 9]



### The Model 

In [7]:
# the model
import time

class LSTMText(nn.Module):
    def __init__(self, window_size, hidden_size, n_layers, batch_size, dropout, num_embed=n_letters):
        super(LSTMText, self).__init__()
        self.num_embed = num_embed
        self.embed_dim = hidden_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.window_size = window_size
        self.output_size = num_embed
        self.batch_size = batch_size
        
        self.encoder = nn.Embedding(num_embed, self.embed_dim)
        self.lstm = nn.LSTM(self.embed_dim, hidden_size, n_layers, dropout=dropout)
        self.decoder = nn.Linear(hidden_size, self.output_size)
        self.dropout = nn.Dropout(dropout)
        self.logsoftmax = nn.LogSoftmax()
        
    def init_hidden(self):
        h0 = Variable(torch.zeros(self.n_layers, self.batch_size, self.hidden_size).type(cudafloat))
        c0 = Variable(torch.zeros(self.n_layers, self.batch_size, self.hidden_size).type(cudafloat))
        return h0, c0
    
    def forward(self, inputs, hidden):                  # takes input tensor of (batch_size, # of indices to extract = win_size)
        embed = self.encoder(inputs)                    # outputs (batch_size, seq_len, embedding_dim)
        embed = embed.view(batch_size, window_size, hidden_size)  # maintains 3D when batch size of 1 is passed into model
        embed = embed.permute(1, 0, 2)                  # getting dimensions right for LSTM.. DON"T use tensor.view
        output, hidden = self.lstm(embed, hidden)       # input tensor of (seq_len, batch, input_size), output (seq, batch, input_size)
        output = output[window_size-1, :, :]            # select the last vector in the seq_len (the last character)
        decoded = self.decoder(output)                  # decoded output = (batch, output)
        pred = self.logsoftmax(decoded)
        
        return pred, hidden

In [8]:
def repackage_hidden(h):
    """Wraps hidden states in new Variables, to detach them from their history."""
    if type(h) == Variable:
        return Variable(h.data)
    else:
        return tuple(repackage_hidden(v) for v in h)

In [9]:
learning_rate = 1e-3

window_size = 100
output_size = n_letters
hidden_size = 500
batch_size = 512
n_layers = 2
dropout = 0.3

model = LSTMText(window_size=window_size, hidden_size=hidden_size, n_layers=n_layers, batch_size=batch_size, dropout=dropout)
model.cuda()
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)
criterion = nn.NLLLoss(size_average=True)

In [10]:
step_size=3
n_epochs = 40
losses = []

X, y = textToTensor(text, window_size, step_size)
dataset = torch.utils.data.TensorDataset(X, y)
train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=1, pin_memory=True, drop_last=True)

for epoch in range(n_epochs):
    start_time = time.time()
    running_loss = 0.0
    hidden = model.init_hidden()
    
    for batch_idx, (data, target) in enumerate(train_loader):
        # send input to GPU and wrap in torch Variable
        data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
             
        # init, forward, backward, optimize
        hidden = repackage_hidden(hidden)
        output, hidden = model(data, hidden)
        loss = criterion(output, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.data[0]/len(train_loader)
        
    print('[%d] loss: %.3f tijme: %.3f' % (epoch + 1, running_loss, time.time() - start_time))

[1] loss: 1.468 tijme: 177.984
[2] loss: 1.216 tijme: 175.426
[3] loss: 1.143 tijme: 177.457
[4] loss: 1.095 tijme: 177.383
[5] loss: 1.059 tijme: 177.394
[6] loss: 1.030 tijme: 177.512
[7] loss: 1.006 tijme: 177.998
[8] loss: 0.983 tijme: 178.302
[9] loss: 0.964 tijme: 178.291
[10] loss: 0.946 tijme: 178.349
[11] loss: 0.929 tijme: 178.312
[12] loss: 0.915 tijme: 178.395
[13] loss: 0.902 tijme: 178.246
[14] loss: 0.888 tijme: 178.348
[15] loss: 0.877 tijme: 178.272
[16] loss: 0.866 tijme: 178.354
[17] loss: 0.856 tijme: 178.352
[18] loss: 0.847 tijme: 178.357
[19] loss: 0.840 tijme: 178.410
[20] loss: 0.831 tijme: 178.370
[21] loss: 0.823 tijme: 178.362
[22] loss: 0.818 tijme: 178.340
[23] loss: 0.812 tijme: 178.254
[24] loss: 0.806 tijme: 178.376
[25] loss: 0.802 tijme: 178.299
[26] loss: 0.797 tijme: 178.419
[27] loss: 0.792 tijme: 178.348
[28] loss: 0.790 tijme: 178.353
[29] loss: 0.786 tijme: 178.442
[30] loss: 0.782 tijme: 178.341
[31] loss: 0.779 tijme: 178.438
[32] loss: 0.776 

In [11]:
torch.save(model.state_dict(), 'pytorch_test_weight6')

### Generating text
We create a new model with a batch_size of 1 to take in one window of text. 

In [9]:
import sys

batch_size = 1

model_test = LSTMText(window_size=window_size, hidden_size=hidden_size, n_layers=n_layers, batch_size=batch_size, dropout=dropout)
model_test.cuda()
model_test.eval()
model_test.load_state_dict(torch.load('pytorch_test_weight6'))

In [5]:
# used in generation
def char_tensor(string):
    tensor = torch.zeros(len(string)).long().cuda()
    for c in range(len(string)):
        tensor[c] = chars_to_idx[string[c]]
    return Variable(tensor)

a = char_tensor('hello i am mr')
print (a)
print (a.view(1,-1))

Variable containing:
  7
  4
 11
 11
 14
 26
  8
 26
  0
 12
 26
 12
 17
[torch.cuda.LongTensor of size 13 (GPU 0)]

Variable containing:
    7     4    11    11    14    26     8    26     0    12    26    12    17
[torch.cuda.LongTensor of size 1x13 (GPU 0)]



In [11]:
def pred_text(pred_len):
    start_index = random.randint(0, len(text) - window_size - 1)
    hidden = model_test.init_hidden()
    
    for temperature in [0.5, 0.8, 1.0]:
        print()
        print('----- temperature:', temperature)

        textX = text[start_index: start_index + window_size]
        print('----- Generating with seed: "' + textX + '"')
        sys.stdout.write(textX)
        inp = char_tensor(textX)
        inp = torch.unsqueeze(inp, 0)
        
        for i in range(pred_len):
            hidden = repackage_hidden(hidden)
            output, hidden = model_test(inp, hidden)
            output_dist = output.view(-1).div(temperature).exp().cpu()
            top_i = torch.multinomial(output_dist, 1)[0]
            top_num = top_i.data[0]
            pred_char = idx_to_chars[top_num]
            textX += pred_char
            textX = textX[1:]
            inp = char_tensor(textX)
            sys.stdout.write(pred_char)
            sys.stdout.flush()
        print()
        
pred_text(400)


----- temperature: 0.5
----- Generating with seed: "nd. er technical means for saving labour are colossal. evertheless, if to-morrow morning labour gene"
nd. er technical means for saving labour are colossal. evertheless, if to-morrow morning labour generally, the capitalist system and transformation of the commodities have been converted into commodity being less than half of the producer. he superficial elements of the population is at the same time that the producer is the producer of the poor, who had been expropriated, the tate of the poor case not only the movement of the wage labourer is determined by the labourer as the conditions of the 

----- temperature: 0.8
----- Generating with seed: "nd. er technical means for saving labour are colossal. evertheless, if to-morrow morning labour gene"
nd. er technical means for saving labour are colossal. evertheless, if to-morrow morning labour generally, and one of the old systematical economists of the modern medium was produced is th