In [1]:
seed = 42
import numpy as np
np.random.seed(seed)
import torch
torch.manual_seed(seed)

<torch._C.Generator at 0x10da99cb0>

# Data loading
The Penn Treebank datafiles are given in the urls below, where we have three different datasets: train, validation and test. Data was downloaded from [train](https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt), [validation](https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt) and [test](https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt). 

In [2]:
def corpus_counts(path, verbose=False): 
    # Open training file 
    corpus = []
    with open(path, 'r') as infile: 
        for line in infile: 
            line = line[:-1].split()
            corpus.append(line)
            

    # Count occurrences
    unique, counts = np.unique(np.array([item for sublist in corpus for item in sublist]), return_counts=True)
    corpus_counts = dict(zip(unique, counts))
    
    if verbose: 
        for v, k in sorted(zip(counts, unique), reverse=True): 
            print('Key is "{0}" with count {1}'.format(k, v))

    return corpus, corpus_counts

In [12]:
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)

## Data loader

In [24]:
### Train_iterator
def train_loader(path, window_size): 
    # Build vocabulary
    train, train_counts = corpus_counts(path, verbose=False)
    vocab_size = len(train_counts)
    word_to_idx = {word: i+1 for i, word in enumerate(train_counts.keys())}
    word_to_idx['padding'] = 0
    
    # Run through each sample
    word_data, word_targets = [], []
    vec_data, vec_targets = [], []
    for line in train: 
        print(line)
        
        # Add padding corresponding to the size of the window on either side
        padding = ['padding']*window_size
        line = padding+line+padding
        
        # Make contexts
        sample, target = [], []
        for i in range(window_size, len(line) - window_size):
            context, c = [], -window_size
            while c <= window_size:
                if c != 0: 
                    context.append(line[i+c])
                c += 1
            target.append(line[i])
            sample.append(context)
            
            # Make context vector
            vec_context = make_context_vector(context=context, word_to_ix=word_to_idx)
            vec_target = make_context_vector(context=target, word_to_ix=word_to_idx)
            vec_data.append(vec_context)
            vec_targets.append(vec_target)
            
        word_data.append(sample)
        word_targets.append(target)
    
    return word_data, word_targets

# Call function
word_data, word_targets = train_loader(path='data/temp.txt', window_size=2)

['hello', 'world', 'this', 'is', 'awesome', 'to', 'be', 'part', 'of']
['today', 'is', 'a', 'nice', 'day']
['deep', 'learning', 'is', 'cool', 'to', 'do']
['have', 'you', 'seen', 'the', 'board', 'today']
['are', 'you', 'feeling', 'okay', 'with', 'this']
['i', 'like', 'trains', 'very', 'much']


In [25]:
for sample, target in zip(word_data, word_targets): 
    print(sample, target, '\n')

[['padding', 'padding', 'world', 'this'], ['padding', 'hello', 'this', 'is'], ['hello', 'world', 'is', 'awesome'], ['world', 'this', 'awesome', 'to'], ['this', 'is', 'to', 'be'], ['is', 'awesome', 'be', 'part'], ['awesome', 'to', 'part', 'of'], ['to', 'be', 'of', 'padding'], ['be', 'part', 'padding', 'padding']] ['hello', 'world', 'this', 'is', 'awesome', 'to', 'be', 'part', 'of'] 

[['padding', 'padding', 'is', 'a'], ['padding', 'today', 'a', 'nice'], ['today', 'is', 'nice', 'day'], ['is', 'a', 'day', 'padding'], ['a', 'nice', 'padding', 'padding']] ['today', 'is', 'a', 'nice', 'day'] 

[['padding', 'padding', 'learning', 'is'], ['padding', 'deep', 'is', 'cool'], ['deep', 'learning', 'cool', 'to'], ['learning', 'is', 'to', 'do'], ['is', 'cool', 'do', 'padding'], ['cool', 'to', 'padding', 'padding']] ['deep', 'learning', 'is', 'cool', 'to', 'do'] 

[['padding', 'padding', 'you', 'seen'], ['padding', 'have', 'seen', 'the'], ['have', 'you', 'the', 'board'], ['you', 'seen', 'board', 'toda

# Notes 
Consider including padding in loading functions