In [1]:
seed = 42
import numpy as np
np.random.seed(seed)
import torch
torch.manual_seed(seed)

<torch._C.Generator at 0x10da99cb0>

# Data loading
The Penn Treebank datafiles are given in the urls below, where we have three different datasets: train, validation and test. Data was downloaded from [train](https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt), [validation](https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt) and [test](https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt). 

In [2]:
def corpus_counts(path, verbose=False): 
    # Open training file 
    corpus = []
    with open(path, 'r') as infile: 
        for line in infile: 
            line = line[:-1].split()
            corpus.append(line)
            

    # Count occurrences
    unique, counts = np.unique(np.array([item for sublist in corpus for item in sublist]), return_counts=True)
    corpus_counts = dict(zip(unique, counts))
    
    if verbose: 
        for v, k in sorted(zip(counts, unique), reverse=True): 
            print('Key is "{0}" with count {1}'.format(k, v))

    return corpus, corpus_counts

In [12]:
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)

## Data loader

In [20]:
### Train_iterator
def train_loader(path, context_size): 
    # Build vocabulary
    train, train_counts = corpus_counts(path, verbose=False)
    vocab_size = len(train_counts)
    word_to_idx = {word: i for i, word in enumerate(train_counts.keys())}
    
    # Make contexts
    word_data, word_targets = [], []
    for line in train: 
        print(line)
        sample, target = [], []
        for i in range(context_size, len(line) - context_size):
            context, c = [], -context_size
            while c <= context_size:
                if c != 0: 
                    context.append(line[i+c])
                c += 1
            target.append(line[i])
            sample.append(context)
            
            # Make context vector
            vec_context = make_context_vector(context=context, word_to_ix=word_to_idx)
            vec_target = make_context_vector(context=target, word_to_ix=word_to_idx)
            
        word_data.append(sample)
        word_targets.append(target)
    
    return word_data, word_targets

# Call function
word_data, word_targets = train_loader(path='data/temp.txt', context_size=2)

['hello', 'world', 'this', 'is', 'awesome', 'to', 'be', 'part', 'of']
['today', 'is', 'a', 'nice', 'day']
['deep', 'learning', 'is', 'cool', 'to', 'do']
['have', 'you', 'seen', 'the', 'board', 'today']
['are', 'you', 'feeling', 'okay', 'with', 'this']
['i', 'like', 'trains', 'very', 'much']


In [21]:
for sample, target in zip(word_data, word_targets): 
    print(sample, target, '\n')

[['hello', 'world', 'is', 'awesome'], ['world', 'this', 'awesome', 'to'], ['this', 'is', 'to', 'be'], ['is', 'awesome', 'be', 'part'], ['awesome', 'to', 'part', 'of']] ['this', 'is', 'awesome', 'to', 'be'] 

[['today', 'is', 'nice', 'day']] ['a'] 

[['deep', 'learning', 'cool', 'to'], ['learning', 'is', 'to', 'do']] ['is', 'cool'] 

[['have', 'you', 'the', 'board'], ['you', 'seen', 'board', 'today']] ['seen', 'the'] 

[['are', 'you', 'okay', 'with'], ['you', 'feeling', 'with', 'this']] ['feeling', 'okay'] 

[['i', 'like', 'very', 'much']] ['trains'] 



# Notes 
Consider including padding in loading functions