In [0]:
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

E: Package 'python-software-properties' has no installation candidate
^C


In [0]:
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

In [0]:
!mkdir -p drive
!google-drive-ocamlfuse drive
import sys
sys.path.insert(0,'drive/Fast-Pytorch/Learning_Pytorch')
!ls drive

In [0]:
import torch
import os
import torch.nn as nn
import numpy as np
from torch.nn.utils import clip_grad_norm

In [0]:
class Dictionary(object):
  def __init__(self):
    self.word2idx= {}
    self.idx2word= {}
    self.idx= 0
  
  def add_word(self,word):
    if word not in self.word2idx:
      self.word2idx[word]=self.idx
      self.idx2word[self.idx]=word
      self.idx+=1
      
  def __len__(self):
    return len(self.word2idx)

In [0]:
class TextProcess(object):
  def __init__(self):
    self.dictionary=Dictionary()
    
  def get_data(self, path, batch_size=20):
    with open(path, 'r') as f:
      tokens=0
      for line in f:
        words=line.split()+['<eos>']
        tokens +=len(words)
        for word in words:
          self.dictionary.add_word(word)
    # representation tensor = rep_tensor
    rep_tensor=torch.LongTensor(tokens)
    index=0
    with open(path, 'r') as f:
      for line in f:
        words=line.split()+['<eos>']
        for word in words:
          rep_tensor[index]=self.dictionary.word2idx[word]
          index+=1
    
    num_batches = rep_tensor.shape[0] // batch_size
    rep_tensor= rep_tensor[:num_batches*batch_size]
    rep_tensor = rep_tensor.view(batch_size,-1)
    
    return rep_tensor
    

In [0]:
embed_size =128 # input size to the LSTM
hidden_size = 1024 # number of LSTM unit
num_layers = 1
num_epochs= 20
batch_size= 20
timesteps= 30
learning_rate= 0.002

In [0]:
corpus=TextProcess()
rep_tensor=corpus.get_data('drive/Fast-Pytorch/Learning_Pytorch/datasets/example_ebook.txt', batch_size)

In [0]:
print(rep_tensor.shape)

torch.Size([20, 1488])


In [0]:
vocab_size=len(corpus.dictionary)
print(vocab_size)

5291


In [0]:
num_batches = rep_tensor.shape[1] // timesteps
print(num_batches)

49


In [0]:
class TextGenerator(nn.Module):
  def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
    super(TextGenerator, self).__init__()
    self.embed= nn.Embedding(vocab_size,embed_size)
    self.lstm=nn.LSTM(embed_size,hidden_size,num_layers, batch_first=True)
    self.linear=nn.Linear(hidden_size, vocab_size)
    
  def forward(self,x,h):
    x= self.embed(x)
    # h: hidden_state, c=output
    # x= x.view(batch_size,timesteps,embed_size)
    out, (h,c)=self.lstm(x,h)
    #(batch_size*timesteps, hidden_size)
    #out.size(0):batch_size; out.size(1):timesteps, out.size(2): hidden_size
    out=out.reshape(out.size(0)*out.size(1),out.size(2))
    # decode hidden states of all time steps
    out= self.linear(out)
    return out, (h,c)

In [0]:
model = TextGenerator(vocab_size, embed_size, hidden_size, num_layers)

In [0]:
loss_fn=nn.CrossEntropyLoss()
optimizer= torch.optim.Adam( model.parameters(), lr=learning_rate)

In [0]:
# Train Model
for epoch in range(num_epochs):
  # Set initial hidden and cell states
  states=(torch.zeros(num_layers, batch_size, hidden_size), 
          torch.zeros(num_layers, batch_size, hidden_size))
  
  for i in range(0, rep_tensor.size(1)-timesteps, timesteps):
    
    inputs=rep_tensor[:,i:i+timesteps]
    targets=rep_tensor[:,(i+1):(i+1)+timesteps]
    
    outputs,_ =model(inputs, states)
    loss = loss_fn(outputs, targets.reshape(-1))
    
    model.zero_grad()
    loss.backward()
    
    clip_grad_norm(model.parameters(),0.5)
    optimizer.step()
    
    step= (i+1) // timesteps
    if step % 100 ==0:
      print("Epoch [{}/{}], Loss: {:.4f}".format(epoch+1, num_epochs, loss.item()))    



Epoch [1/20], Loss: 8.5742
Epoch [2/20], Loss: 6.0512
Epoch [3/20], Loss: 5.2896
Epoch [4/20], Loss: 4.7067
Epoch [5/20], Loss: 4.2426
Epoch [6/20], Loss: 3.7330
Epoch [7/20], Loss: 3.3553
Epoch [8/20], Loss: 2.9581
Epoch [9/20], Loss: 2.5796
Epoch [10/20], Loss: 2.2565
Epoch [11/20], Loss: 1.9790
Epoch [12/20], Loss: 1.7229
Epoch [13/20], Loss: 1.4374
Epoch [14/20], Loss: 1.2595
Epoch [15/20], Loss: 1.0084
Epoch [16/20], Loss: 0.7778
Epoch [17/20], Loss: 0.5328
Epoch [18/20], Loss: 0.3433
Epoch [19/20], Loss: 0.2120
Epoch [20/20], Loss: 0.1462


In [0]:
# Test Model
with torch.no_grad():
  with open("drive/Fast-Pytorch/Learning_Pytorch/results/results-RNN-word-embeddings.txt", "w") as f:
    state=(torch.zeros(num_layers,1,hidden_size),
          torch.zeros(num_layers,1,hidden_size))
    
    input=torch.randint(0,vocab_size,(1,)).long().unsqueeze(1)
    
    for i in range(500):
      output, _=model(input,state)
      print(output.shape)
      prob=output.exp()
      word_id=torch.multinomial(prob, num_samples=1).item()
      print(word_id)
      input.fill_(word_id)
      
      word=corpus.dictionary.idx2word[word_id]
      word='\n' if word=="<eos>" else word + ' '
      f.write(word)
      
      if(i+1)%100==0:
        print("Sampled [{}/{}] words and save to {}".format(i+1, 500, 'drive/Fast-Pytorch/Learning_Pytorch/results/results-RNN-word-embeddings.txt'))  
      

torch.Size([1, 5291])
5
torch.Size([1, 5291])
3
torch.Size([1, 5291])
3969
torch.Size([1, 5291])
13
torch.Size([1, 5291])
3
torch.Size([1, 5291])
4413
torch.Size([1, 5291])
4142
torch.Size([1, 5291])
55
torch.Size([1, 5291])
27
torch.Size([1, 5291])
177
torch.Size([1, 5291])
3677
torch.Size([1, 5291])
20
torch.Size([1, 5291])
489
torch.Size([1, 5291])
571
torch.Size([1, 5291])
1308
torch.Size([1, 5291])
5
torch.Size([1, 5291])
9
torch.Size([1, 5291])
3
torch.Size([1, 5291])
78
torch.Size([1, 5291])
110
torch.Size([1, 5291])
666
torch.Size([1, 5291])
270
torch.Size([1, 5291])
202
torch.Size([1, 5291])
491
torch.Size([1, 5291])
110
torch.Size([1, 5291])
251
torch.Size([1, 5291])
34
torch.Size([1, 5291])
334
torch.Size([1, 5291])
44
torch.Size([1, 5291])
2652
torch.Size([1, 5291])
44
torch.Size([1, 5291])
3437
torch.Size([1, 5291])
38
torch.Size([1, 5291])
44
torch.Size([1, 5291])
501
torch.Size([1, 5291])
5
torch.Size([1, 5291])
5
torch.Size([1, 5291])
362
torch.Size([1, 5291])
10
torch.