In [1]:
import os
import random
import torch
from torch.autograd import Variable
import numpy as np
from functions import lex

class Batch(object):
    """Streams fixed-length token windows from several files in parallel.

    Each of the ``batch_size`` slots reads one file at a time. Every call to
    ``get_minibatch`` cuts the next ``in_seq`` tokens of each slot into an
    input window and the same window shifted by one token as the target.
    ``next_minibatch`` then refills slots whose file is exhausted, or drops
    them once no unread files remain.

    The parallel lists ``batch_files``, ``batch_data``, ``batch_in``,
    ``batch_out`` and ``eof`` are always kept index-aligned: entry ``i`` of
    each one describes slot ``i``.
    """

    def __init__(self, file_dir, file_list, batch_size, in_seq, out_seq, max_oovs=30):
        """
        Args:
            file_dir: directory that contains the token files.
            file_list: names of all files (relative to ``file_dir``).
            batch_size: number of files read in parallel.
            in_seq: number of tokens per input/target window.
            out_seq: stored but not used by this class (targets are always
                ``in_seq`` tokens long).
            max_oovs: stored but not used by this class.
        """
        self.file_dir = file_dir  # where files are stored
        self.full_list = file_list  # list of the training files
        self.batch_size = batch_size
        self.in_seq = in_seq
        self.out_seq = out_seq
        self.max_oovs = max_oovs

        self.file_list = []  # files not yet assigned to a slot in this epoch
        self.eof = [0] * self.batch_size  # 1 once the slot's file is exhausted
        self.batch_files = []  # which file each slot is currently reading
        self.batch_data = []  # remaining (unconsumed) tokens of each slot
        self.batch_in = []  # input window of each slot
        self.batch_out = []  # target window of each slot

        self.epoch_end = 0  # set to 1 by next_minibatch when nothing is left
        self.states = None  # filled in by initialize_states()

    def load_file(self, file, max_len=2000):
        """Return the tokens of ``file`` prefixed with 'SOS', at most ``max_len`` long.

        The file content is split on single spaces; the 'SOS' marker counts
        towards ``max_len``.
        """
        with open(os.path.join(self.file_dir, file)) as f:
            text = f.read()
        text = ['SOS'] + text.split(' ')
        return text[:max_len]

    def initialize_states(self, num_layers, hidden_size):
        """Store a pair of zero tensors of shape (num_layers, batch_size, hidden_size).

        Both tensors are moved to the GPU, so a CUDA device is required.
        Presumably these are the (hidden, cell) states of an LSTM; confirm
        against the model that consumes ``self.states``.
        """
        self.states = (Variable(torch.zeros(num_layers, self.batch_size, hidden_size)).cuda(),
                       Variable(torch.zeros(num_layers, self.batch_size, hidden_size)).cuda())

    def next_epoch(self, batch_size):
        """Reset all per-epoch state and load the first ``batch_size`` files.

        Files are taken in list order (no shuffling). If fewer than
        ``batch_size`` files exist, ``self.batch_size`` is reduced to the
        number of files so that every slot is backed by real data.
        """
        self.epoch_end = 0
        self.batch_files = list(self.full_list[:batch_size])
        self.file_list = list(self.full_list[batch_size:])  # still unread this epoch
        # Clamp so the parallel lists never contain slots without a file.
        self.batch_size = len(self.batch_files)
        self.eof = [0] * self.batch_size
        self.batch_data = [self.load_file(file) for file in self.batch_files]
        self.batch_in = [0] * self.batch_size
        self.batch_out = [0] * self.batch_size

    def get_minibatch(self, state_list):
        """Fill ``batch_in`` / ``batch_out`` with the next window of every slot.

        A slot with more than ``in_seq`` tokens left yields a full window and
        keeps the rest for the next call (the next window starts at the last
        target token). Otherwise the remaining tokens are closed with '<EOS>',
        padded with '<PAD>' and the slot is flagged in ``eof``.

        ``state_list`` is accepted for interface compatibility and is unused.
        """
        window = self.in_seq
        for i, item in enumerate(self.batch_data):
            if len(item) > window:  # start or middle of a file
                self.batch_data[i] = item[window:]
            else:  # end of a file: terminate and pad to window + 1 tokens
                item = item + ['<EOS>']
                item = item + ['<PAD>'] * (window + 1 - len(item))
                self.eof[i] = 1
            self.batch_in[i] = item[:window]
            self.batch_out[i] = item[1:window + 1]

    def next_minibatch(self, state_list):
        """Refill or drop slots whose file reached EOF.

        * If unread files remain, an exhausted slot gets the next file.
        * Otherwise live slots are swapped towards the front and the
          exhausted slots at the tail are removed, shrinking ``batch_size``.
        * ``epoch_end`` is set to 1 only when every slot is exhausted and no
          slot can be refilled.

        ``state_list`` is accepted for interface compatibility and is unused.
        """
        all_exhausted = 0 not in self.eof
        can_refill = bool(self.eof) and bool(self.file_list)
        if all_exhausted and not can_refill:
            self.epoch_end = 1
            return

        # Every per-slot list must be permuted identically to stay aligned.
        per_slot = (self.batch_files, self.batch_data,
                    self.batch_in, self.batch_out, self.eof)

        # self.eof is modified while iterating; enumerate sees the live values.
        for i, val in enumerate(self.eof):
            if val != 1:
                continue
            if self.file_list:  # an unread file is available for this slot
                self.batch_files[i] = self.file_list.pop()
                self.batch_data[i] = self.load_file(self.batch_files[i])
                self.eof[i] = 0
            else:
                idx = self.last_of_list(self.eof, 0)
                if i >= idx:
                    # Every live slot is already in front of position i.
                    break
                for slots in per_slot:
                    slots[i], slots[idx] = slots[idx], slots[i]

        # Exhausted slots are now contiguous at the tail; drop them.
        while 1 in self.eof:
            for slots in per_slot:
                slots.pop()
            self.batch_size -= 1

    def last_of_list(self, lst, c):
        """Return the index of the last element of ``lst`` equal to ``c``, or -1."""
        for idx in range(len(lst) - 1, -1, -1):
            if lst[idx] == c:
                return idx
        return -1

In [80]:
saved_dir = '/home/irteam/users/data/150kJavaScript/data_preprocessed/'
input_file = 'list_of_files.txt'
# The list file holds one file name per line. Blank entries (for example the
# empty string produced by a trailing newline) are dropped, because an empty
# name would later be joined onto the data directory and opened as a file.
with open(saved_dir+input_file) as f:
    lines = [name for name in f.read().split('\n') if name]

In [92]:
# Stream the first 100 listed files, 10 at a time, in windows of 50 tokens.
batch = Batch(
    file_dir='/home/irteam/users/data/150kJavaScript/data_preprocessed/',
    file_list=lines[:100],
    batch_size=10,
    in_seq=50,
    out_seq=50,
)

In [95]:
import time

# Walk through one full epoch, printing for every slot the file name, the
# first five input tokens and the last five target tokens of each minibatch.
batch.next_epoch(10)
cnt = 0  # number of minibatches produced
start = time.time()
while batch.epoch_end == 0:
    cnt += 1
    batch.get_minibatch(0)
    for i, tokens_in in enumerate(batch.batch_in):
        print(batch.batch_files[i], tokens_in[:5], batch.batch_out[i][-5:])
    print('==========================================')
    batch.next_minibatch(0)
print(cnt)
print(time.time() - start)

file_1.txt ['SOS', 'var', 'gTestfile', '=', 'STRING'] ['(', 'summary', ')', ';', 'jit']
file_2.txt ['SOS', 'STRING', ';', 'var', 'Promise'] ['=', 'function', '(', 'name', ')']
file_3.txt ['SOS', 'CKEDITOR', '.', 'plugins', '.'] ['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
file_4.txt ['SOS', 'STRING', ';', 'describe', '('] ['expect', '(', 'view1Ctrl', ')', '.']
file_5.txt ['SOS', 'STRING', ';', 'angular', '.'] ['STRING', '}', ';', '$provide', '.']
file_6.txt ['SOS', 'var', 'later', '=', 'require'] ['Date', '(', '2008', ',', '0']
file_7.txt ['SOS', 'exports', '.', 'command', '='] ['.', 'ELEMENT', ',', 'function', '(']
file_8.txt ['SOS', 'function', 'hero_Sprite', '(', 'ga'] ['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
file_9.txt ['SOS', 'YUI', '.', 'add', '('] ['host', ';', '}', '}', 'Align']
file_10.txt ['SOS', '(', 'function', '(', '$'] ['66', '}', '}', ',', 'highlight']
file_1.txt ['jit', '(', 'true', ')', ';'] ['exitFunc', '(', 'STRING', ')', ';']
file_2.txt [')', '{', 'return', 't

file_55.txt ['SOS', '$class', '(', 'STRING', ')'] ['(', 'idCommand', ')', '{', 'pane']
file_58.txt [',', 'codebase', ':', '[', '{'] [',', 'name', ':', 'STRING', '}']
file_57.txt [',', 'STRING', ']', ',', '['] ['[', 'STRING', ',', 'STRING', ']']
file_73.txt [',', '0', ',', '1.15', ','] ['{', 'this', '.', '$el', '.']
file_56.txt ['STRING', ',', 'init', ':', 'false'] ['$', '.', 'extend', '(', 'true']
file_64.txt [']', ';', 'this', '.', 'stack'] ['this', ';', '$', '.', 'each']
file_72.txt ['this', '.', 'onShow', '(', ')'] ['.', '_tmpConnect', ')', '{', 'dojo']
file_71.txt ['mfp', '.', 'st', '.', 'closeOnContentClick'] ['(', 'STRING', ')', '.', 'on']
file_67.txt [';', '}', 'assert', '.', 'equal'] ['user', '.', 'name', ',', 'STRING']
file_60.txt ['getUTCDate', '(', ')', ')', '+'] [')', ';', 'for', '(', 'i']
file_55.txt ['pane', '.', 'model', '.', 'widget'] ['collapsePane', '(', 'pane', '.', 'index']
file_58.txt ['}', ']', ',', 'name', ':'] [':', 'ATTRTYPE_EMBED', ',', 'name', ':']
file_57.tx

file_15.txt [')', ';', 'ret', '.', 'context'] ['jQuery', '(', 'context', ')', '.']
file_19.txt [',', '{', 'name', ':', 'STRING'] ['legend', ':', 'STRING', '}', ',']
file_21.txt ['}', 'else', 'if', '(', 'command'] ['context', '.', 'window', '.', 'location']
file_13.txt ['=', 'options', '.', 'delay', '||'] [';', 'this', '.', '_current_state', '=']
file_45.txt ['viewportAnchor', '===', 'ANCHOR_BOTTOM', ')', '{'] ['+=', 'boxHeight', '/', '2', '<EOS>']
file_24.txt [';', '}', 'stream', '.', 'locked'] ['once', '(', 'STRING', ',', 'function']
file_23.txt ['then', '(', 'null', ',', 'done'] ['STRING', '}', ')', '.', 'then']
file_11.txt [':', 'STRING', ',', 'sup1', ':'] [',', 'Auml', ':', 'STRING', ',']
file_15.txt ['.', 'find', '(', 'selector', ')'] [';', 'this', '.', 'context', '=']
file_19.txt [',', '{', 'name', ':', 'STRING'] ['<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
file_21.txt ['location', '=', 'args', '.', 'url'] [';', '}', 'catch', '(', 'exc']
file_13.txt ['=', 'this', '.', '_start_st

In [None]:
def last_of_list(lst,c):
    """Return the index of the last element of ``lst`` equal to ``c``.

    Returns -1 when no element matches (including for an empty list).
    """
    # Scan from the back so the first hit is the last occurrence.
    for position in range(len(lst) - 1, -1, -1):
        if lst[position] == c:
            return position
    return -1

In [None]:
# Experiment: move every entry flagged 1 in lst2 behind the entries flagged 0
# by swapping it with the last 0-flagged position; lst1 is permuted alongside
# so the effect on the payload is visible.
lst1 = list(range(1, 11))
lst2 = [1, 0, 0, 0, 1, 0, 1, 0, 0, 0]
divider = '----------------------'

print(lst1)
print(lst2)
print(divider)
for pos, flag in enumerate(lst2):
    if flag != 1:
        continue
    last_zero = last_of_list(lst2, 0)
    if pos >= last_zero:
        # All 0-flagged entries are already in front of this position.
        break
    lst1[pos], lst1[last_zero] = lst1[last_zero], lst1[pos]
    lst2[pos], lst2[last_zero] = lst2[last_zero], lst2[pos]
    print(lst1)
    print(lst2)
    print(divider)
print(lst1)
print(lst2)

In [None]:
# Index of the last 1 in lst2, or -1 when lst2 contains no 1.
max((pos for pos, flag in enumerate(lst2) if flag == 1), default=-1)

In [None]:
# Time a single removal from the END of a list with list.pop().
# NOTE(review): `lst` is not defined in any visible cell, so this only runs if
# it already exists in the kernel - confirm where it comes from.
import time
start = time.time()
a = lst.pop()
A=time.time()-start  # elapsed seconds for the pop

In [None]:
# Time taking the FIRST element and rebuilding the list without it by slicing.
# NOTE(review): `lst` must already exist in the kernel; this cell rebinds it to
# a copy that is one element shorter, so re-running changes the input.
import time
start = time.time()
a = lst[0]
lst = lst[1:]
B=time.time()-start  # elapsed seconds for index + slice

In [None]:
# Difference between the pop timing (A) and the index-plus-slice timing (B).
A-B

In [2]:
import numpy as np
# 10 x 10 matrix of float zeros.
A = np.zeros((10, 10))

In [4]:
# Number of non-zero entries in each row of A.
(A != 0).astype(int).sum(axis=1)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])