In [1]:
import torch 
import torch.nn as nn
import numpy as np
from torch.autograd import Variable
from packages.vocab import Vocab
from packages.batch import Batch
from models.languagemodel import RNNLM

In [2]:
# Hyper Parameters

# Network dimensions. The model summary printed after loading the checkpoint
# shows Embedding(99, 128) and LSTM(128, 1024), i.e. the same sizes.
embed_size = 128
hidden_size = 1024
num_layers = 1

# Shape of one evaluation batch: up to `batch_size` rows of `seq_length` tokens.
batch_size = 60
seq_length = 50

# Training / sampling settings; no evaluation cell shown in this notebook reads them.
num_epochs = 10
learning_rate = 0.002
num_samples = 1000   # number of words to be sampled

In [3]:
# Directory that holds the test files and 'list_of_files.txt'.
# Keep the trailing '/': the evaluation cell builds paths by plain string
# concatenation (test_path + name), not os.path.join.
test_path = '/home/irteam/users/data/150kJavaScript/batch_80_seq_50_type/test/'

In [None]:
# # create file list
# out_list = []
# for i in range(25000):
#     out_list.append('file_%d.txt'%(i+1))
# with open(test_path+'list_of_files.txt','w') as f:
#     f.write('\n'.join(out_list))

In [5]:
# NOTE(review): `batch` is not referenced by any other cell visible in this
# notebook; the evaluation cell reads the test files directly. Confirm whether
# constructing it is still needed.
batch = Batch(file_list=test_path,max_in_len=50,max_out_len=50, max_oovs=20)

In [6]:
# Word -> index table; here only its size is needed to construct the Vocab.
# allow_pickle=True is required on NumPy >= 1.16.3, where np.load refuses to
# unpickle Python objects by default.
# NOTE(review): presumably this file holds a pickled dict despite the .txt
# suffix — confirm. Unpickling can execute arbitrary code, so only load
# vocabulary files from a trusted source.
w2i = np.load('vocabs/word2idx_no_ids.txt', allow_pickle=True)
vocab = Vocab(len(w2i))

In [7]:
# Attach the lookup tables to the Vocab object.
# `w2i` was already read from 'vocabs/word2idx_no_ids.txt' in the previous
# cell, so it is reused here instead of loading the same file a second time.
vocab.w2i = w2i
# allow_pickle=True is required on NumPy >= 1.16.3 to load stored Python
# objects; only load files from a trusted source (unpickling can run code).
vocab.i2w = np.load('vocabs/idx2word_no_ids.txt', allow_pickle=True)
vocab.max_size = len(vocab.w2i)

In [8]:
# Path of the checkpoint file to evaluate (a file, despite the `_dir` name).
model_dir = 'models/rnn_lm_types_epoch_9.pckl'
# torch.load unpickles whatever object the file contains; unpickling can run
# arbitrary code, so only load checkpoints from a trusted source.
# NOTE(review): presumably the RNNLM class imported in the first cell must be
# importable for this to succeed — confirm.
model = torch.load(model_dir)
# Move the model to the GPU. As the last expression of the cell, the returned
# module is also what produces the architecture summary in the cell output.
model.cuda()

RNNLM (
  (embed): Embedding(99, 128)
  (lstm): LSTM(128, 1024, batch_first=True)
  (linear): Linear (1024 -> 99)
)

In [9]:
# Truncated Backpropagation
def detach(states):
    """Return a new list holding ``state.detach()`` for each element of ``states``."""
    detached = []
    for state in states:
        detached.append(state.detach())
    return detached

In [12]:
# Next-token accuracy of the loaded model over the test files.
#
# For every file listed in 'list_of_files.txt', the first line is converted to
# vocabulary indices, cut into rows of `seq_length` tokens, and at most
# `batch_size` rows are scored: positions 0..seq_length-2 are the inputs and
# positions 1..seq_length-1 the targets. A prediction is the arg-max over the
# vocabulary at each position.
total_outs = 0      # number of target tokens scored so far
total_correct = 0   # number of those whose arg-max prediction matched

# Inference only: switch the model to evaluation mode. The architecture printed
# when the checkpoint was loaded shows no dropout/batch-norm layers, so this
# does not change the numbers here; it guards against that changing later.
# NOTE(review): autograd history is still recorded during the forward pass; on
# PyTorch >= 0.4 the loop could additionally run under torch.no_grad().
model.eval()

with open(test_path+'list_of_files.txt') as list_file:
    # Drop blank entries so an empty name never resolves to the directory itself.
    test_files = [name.strip() for name in list_file if name.strip()]

for i, file_name in enumerate(test_files):
    with open(test_path+file_name) as token_file:
        # Only the first line of a file is evaluated, so only that line is read
        # and converted. readline() returns '' for an empty file.
        first_line = token_file.readline()
    # NOTE(review): assumes word_list_to_idx_list has no side effects that the
    # remaining lines of the file were needed for — confirm against Vocab.
    token_ids = vocab.word_list_to_idx_list(first_line.split())

    # Number of complete seq_length-token rows available in this file.
    num_lines = len(token_ids) // seq_length

    if num_lines > 0:
        out_lines = np.array(token_ids[:num_lines*seq_length]).reshape([-1,seq_length])
        in_size = min(num_lines,batch_size)
        ids = torch.LongTensor(out_lines[:in_size])

        # Fresh zero (hidden, cell) states: every row is scored independently.
        states = (Variable(torch.zeros(num_layers, in_size, hidden_size)).cuda(),
                  Variable(torch.zeros(num_layers, in_size, hidden_size)).cuda())

        # Inputs are each row without its last token; targets are the same
        # rows shifted one position to the left.
        inputs = Variable(ids[:, :seq_length-1]).cuda()
        targets = Variable(ids[:, 1:seq_length].contiguous()).cuda()

        # Forward pass only (no backward / optimizer step during evaluation).
        states = detach(states)
        outputs, states = model(inputs, states)
        # Index of the highest-scoring vocabulary entry at every position.
        predicted_outs = outputs.view(in_size,-1,len(vocab.w2i)).max(2)[1]
        correct=(targets==predicted_outs).cpu().data.numpy()
        total_outs+=targets.size(0)*targets.size(1)
        total_correct+=np.sum(correct)
    # else: the file is empty or shorter than one sequence; nothing to score.

    # Progress report; skipped while nothing has been scored yet so the
    # accuracy never divides by zero.
    if i%1000==0 and total_outs>0:
        print("%d files calculated\nAccuracy so far:%1.3f\n"
              %(i,total_correct*1.0/total_outs))

0 files calculated
Accuracy so far:0.656



KeyboardInterrupt: 

In [52]:
# Fraction of scored target tokens predicted correctly, over whatever the
# evaluation loop had accumulated when this cell was run.
total_correct*1.0/total_outs

0.73313434217224305

In [46]:
# Copy the tensors left over from the last evaluated batch to host memory as
# NumPy arrays, using the same conversion chain for all three.
inputs_np, pred_np, tar_np = (
    var.data.cpu().numpy() for var in (inputs, predicted_outs, targets)
)

In [47]:
# visualize results
# The .npy file stores a single Python object inside a 0-d array, which
# .item() unwraps. Loading such object arrays needs allow_pickle=True on
# NumPy >= 1.16.3; unpickling can execute code, so only load trusted files.
token_dict = np.load('vocabs/type2token.npy', allow_pickle=True).item()
# Map 'ID' to itself so it is rendered as the literal text ID.
token_dict['ID']='ID'
def to_tokens(arr,token_dict):
    """Translate every element of ``arr`` through ``token_dict``.

    Elements that are present in ``token_dict`` are replaced by the value
    stored for them; all other elements are passed through unchanged.
    The result is always a new list, in the original order.
    """
    return [token_dict[item] if item in token_dict else item for item in arr]

In [48]:
# Print each input row as text: indices are mapped through vocab.i2w first,
# and the resulting entries through token_dict.
for line in inputs_np:
    type_names = to_tokens(line, vocab.i2w)
    print(''.join(to_tokens(type_names, token_dict)))

ID]=ID;ID[ID+0]=ID;ID[ID+0]=ID;}};})();;(function(){ID.ID.ID=function(ID){var
<UNK>.ID()*0,ID=ID.ID,ID=ID.ID,ID;for(ID=0;ID<ID;ID+=0){ID[ID]+=ID;ID[ID+0
<UNK>;ID[ID+0]+=ID;}};ID.ID.ID(ID.ID,"iframe",0,null,ID.ID.ID);})();;(function(){ID
<UNK>ID=function(ID){varID=ID.ID,ID=ID.ID,ID;for(ID=0;ID<ID;ID+=0){ID[ID]=0-ID[ID]
<UNK>ID+0]=0-ID[ID+0];ID[ID+0]=0-ID[ID+0];}};})();;(function(){functionID()
<UNK>ID=0;this.ID=0;this.ID=0;this.ID=0;this.ID=null;}varID=[0,0,0,0,0,0,0,0
<UNK>0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
<UNK>,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
<UNK>0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
<UNK>,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
<UNK>0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
<UNK>,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
<UNK>0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
<UNK>,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
<UNK>0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
<UNK>,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0];va

In [51]:
# For every row print the target sequence, then the model's prediction for the
# same positions, followed by a blank separator.
for i in range(len(tar_np)):
    target_types = to_tokens(tar_np[i], vocab.i2w)
    print(''.join(to_tokens(target_types, token_dict)))
    predicted_types = to_tokens(pred_np[i], vocab.i2w)
    print(''.join(to_tokens(predicted_types, token_dict)))
    print('\n')

]=ID;ID[ID+0]=ID;ID[ID+0]=ID;}};})();;(function(){ID.ID.ID=function(ID){varID
.=ID;}.ID]0]=ID;ID[ID+0]=ID;IDIDIDID,();IDvarfunction(ID{var.ID=ID=function(ID){varID


.ID()*0,ID=ID.ID,ID=ID.ID,ID;for(ID=0;ID<ID;ID+=0){ID[ID]+=ID;ID[ID+0]
classID(ID;0)ID=ID.ID(ID=ID.ID,ID=if(ID=0;ID<ID;ID++0){ID=ID]=ID[}[ID]0]


;ID[ID+0]+=ID;}};ID.ID.ID(ID.ID,"iframe",0,null,ID.ID.ID);})();;(function(){ID.
class}.ID]0]=ID;}}returnID.ID=ID=ID,ID.ID,ID)ID,null)ID)ID);ID););<EOS>IDfunction(ID{var.


ID=function(ID){varID=ID.ID,ID=ID.ID,ID;for(ID=0;ID<ID;ID+=0){ID[ID]=0-ID[ID];
class.ID(ID){varID=ID.ID(ID=ID.ID,ID=if(ID=0;ID<ID;ID++0){ID=ID]=ID;([ID];


ID+0]=0-ID[ID+0];ID[ID+0]=0-ID[ID+0];}};})();;(function(){functionID(){
class."iframe")=ID;ID[ID+0];ID[ID+0]=ID;ID[ID+0];IDelse}ID,;);IDvarfunction(ID{varID(ID{


ID=0;this.ID=0;this.ID=0;this.ID=0;this.ID=null;}varID=[0,0,0,0,0,0,0,0,
class.ID;ID.ID=0;this.ID=0;this.ID=0;this.ID=0;this;ID=ID],0,0,0,0,0,0,0,


0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,

In [53]:
# Raw vocabulary indices of the input batch converted above, before any
# translation back to tokens.
inputs_np

array([[56, 26, 27, ..., 22, 23, 85],
       [ 1,  6, 56, ..., 56, 10, 54],
       [ 1,  8, 56, ..., 22, 23, 56],
       ..., 
       [ 1,  7, 89, ...,  8, 85, 56],
       [ 1,  8, 56, ..., 56,  7, 56],
       [ 1, 56,  6, ..., 26, 11, 56]])