In [66]:
import numpy as np
import tensorflow as tf
import collections

### Reading data and converting to bigrams

In [67]:
def read_data(file):
    with open(file, 'r') as f:
        text = f.read()
        text = text.replace('\n', '')
        start_idx = text.find('In Benares')
        end_idx = text.find('FOOTNOTES')
        text = text[start_idx:end_idx]
        text = text.lower().strip()
        
        bigram_text = [''.join(text[char:char + 2]) for char in range(0, len(text) - 2, 2)]
        
    return bigram_text

In [68]:
bigram_text = read_data('2400-0.txt')
print('no. of bigrams:', len(bigram_text))
bigram_text[0:10]

no. of bigrams: 162287


['in', ' b', 'en', 'ar', 'es', ' o', 'nc', 'e ', 're', 'ig']

### Creating Dictionary

In [69]:
def create_dict(bigrams):
    
    count = []
    count.extend(collections.Counter(bigrams).most_common())
    
    dictionary = dict({'UNK':0})
    for char, freq in count:
    
        dictionary[char] = len(dictionary)    
            
    rev_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    
    return dictionary, rev_dictionary, count


In [70]:
dictionary, rev_dictionary, count = create_dict(bigram_text)
vocab_size = len(dictionary)

print('dictionary', list(dictionary)[:10])
print('reverse dictionary', list(rev_dictionary)[:10])
print('most common words:', count[0:5])
print('len of dictionary:', len(dictionary))

dictionary ['37', 'nu', 'mw', 'un', 'p,', 'pu', 'n)', 'b-', '?â€™', 's ']
reverse dictionary [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
most common words: [('e ', 4702), (' t', 4538), ('he', 4474), ('th', 4451), ('d ', 3014)]
len of dictionary: 1045


### Converting from bigrams to int

In [71]:
def bigram_to_int(bigram_text, dictionary):
    
    bigram_int = []
    
    for bigram in bigram_text:
        
        if bigram in dictionary:
            bigram_int.append(dictionary[bigram])
        else:
            bigram_int.append(dictionary['UNK'])

    return bigram_int

In [72]:
bigram_int = bigram_to_int(bigram_text, dictionary)
print(list(bigram_int[0:10]))

[9, 23, 24, 44, 34, 19, 118, 1, 17, 155]


### Generating batches

In [73]:
def next_batch(bigrams, batch_size, num_unroll):
    global vocab_size
    
    segments = len(bigrams) // batch_size
    
    cursor = [(offset * segments) + num_unroll for offset in range(batch_size)]
    
    
    batch_data = np.zeros((batch_size,vocab_size),dtype=np.float32)
    batch_labels = np.zeros((batch_size,vocab_size),dtype=np.float32)

    
    for b in range(batch_size):
        
        if cursor[b]+1>= len(bigrams):
            cursor[b] = b * segments
        
        batch_data[b, bigram_int[cursor[b]]] = 1.0
        batch_labels[b, bigram_int[cursor[b] + 1 ]] = 1.0
        
        cursor[b] = (cursor[b]+1)%len(bigrams)
    
    return batch_data, batch_labels

### Unrolling batches

In [74]:
def unroll_batches(bigram_int, num_unroll, batch_size):
    
    unroll_data = []
    unroll_labels = []
    
    for u_idx in range(num_unroll):
        
        batch_data, batch_labels = next_batch(bigram_int, batch_size, u_idx)
        
        unroll_data.append(batch_data)
        unroll_labels.append(batch_labels)
        
    return unroll_data, unroll_labels

In [75]:
unroll_data, unroll_labels = unroll_batches(bigram_int[:50] , 10, 5)
print('unroll data shape:', np.array(unroll_data).shape)
print('unroll labels shape:', np.array(unroll_labels).shape)

for ui,(dat,lbl) in enumerate(zip(unroll_data,unroll_labels)):   
    print('\n\nUnrolled index %d'%ui)
    dat_ind = np.argmax(dat,axis=1)
    lbl_ind = np.argmax(lbl,axis=1)
    print('\tInputs:')
    for sing_dat in dat_ind:
        print('\t%s (%d)'%(rev_dictionary[sing_dat],sing_dat),end=", ")
    print('\n\tOutput:')
    for sing_lbl in lbl_ind:        
        print('\t%s (%d)'%(rev_dictionary[sing_lbl],sing_lbl),end=", ")

unroll data shape: (10, 5, 1045)
unroll labels shape: (10, 5, 1045)


Unrolled index 0
	Inputs:
	in (9), 	ne (68), 	 b (23), 	t, (175), 	n  (15), 
	Output:
	 b (23), 	d  (5), 	y  (29), 	 t (2), 	va (216), 

Unrolled index 1
	Inputs:
	 b (23), 	d  (5), 	y  (29), 	 t (2), 	va (216), 
	Output:
	en (24), 	a  (59), 	na (176), 	o  (32), 	jr (437), 

Unrolled index 2
	Inputs:
	en (24), 	a  (59), 	na (176), 	o  (32), 	jr (437), 
	Output:
	ar (44), 	mi (142), 	me (57), 	wh (70), 	am (125), 

Unrolled index 3
	Inputs:
	ar (44), 	mi (142), 	me (57), 	wh (70), 	am (125), 
	Output:
	es (34), 	gh (132), 	 p (51), 	os (139), 	uk (429), 

Unrolled index 4
	Inputs:
	es (34), 	gh (132), 	 p (51), 	os (139), 	uk (429), 
	Output:
	 o (19), 	ty (178), 	ra (60), 	ee (102), 	ut (98), 

Unrolled index 5
	Inputs:
	 o (19), 	ty (178), 	ra (60), 	ee (102), 	ut (98), 
	Output:
	nc (118), 	 p (51), 	ta (105), 	ig (155), 	 h (11), 

Unrolled index 6
	Inputs:
	nc (118), 	 p (51), 	ta (105), 	ig (155), 	 h (11), 
	Ou

### Recurrent Neural Network