In [1]:
import tensorflow as tf
tf.enable_eager_execution()
import pandas as pd
import pickle
import numpy as np

In [2]:
import os
dictionary = {'articles':[],'titles':[],'subtitles':[]}
for d in os.listdir('bbc'):
    if d=='README.TXT' or d=='.DS_Store':
        continue
    for f in os.listdir('bbc/' + d):
        if f=='.DS_Store':
            continue
        try:   
            lines = open('bbc/' + d + '/' + f).readlines()
        except UnicodeDecodeError:
            print(f)
            print(d)
            continue
        content = ' '.join(lines[3:]).replace('\n',' ')
        title = lines[0]
        subtitle = lines[2]
        dictionary['articles'].append(content)
        dictionary['titles'].append(title)
        dictionary['subtitles'].append(subtitle)

199.txt
sport


In [3]:
from nltk.tokenize import wordpunct_tokenize as wpt

def build_dictionary(docs):
    uniques = set()
    dictionary = {}
    reverse_dictionary={}
    for doc in docs:
        toks = wpt(doc)
        for tok in toks:
            uniques.add(tok)
    for i, tok in enumerate(list(uniques)):
        dictionary[tok] = i
        reverse_dictionary[i] = tok
    return dictionary,reverse_dictionary

In [4]:
words,rwords = build_dictionary(dictionary['titles']+dictionary['subtitles']+dictionary['articles'])

In [5]:
index_dict = {'articles':[],'titles':[],'subtitles':[]}
def numerify(text):
    toks = wpt(text)
    return np.array(list(map(lambda x: words[x],toks)))

for article,title,subtitle in zip(dictionary['articles'],dictionary['titles'],dictionary['subtitles']):
    index_dict['articles'].append(numerify(article))
    index_dict['titles'].append(numerify(title))
    index_dict['subtitles'].append(numerify(subtitle))

In [6]:
words['<STOP>'] = len(words)
rwords[max(rwords.keys())+1] = '<STOP>'

In [7]:
#we should fill in the excess characters in the titles with stops (11 is max length of a title)
for i in range(len(index_dict['titles'])):
    to_add = []
    title = index_dict['titles'][i]
    for k in range(11-title.shape[0]):
        to_add.append(words['<STOP>'])
    together = np.concatenate([title,np.array(to_add)],axis=0)
    index_dict['titles'][i] = together
assert list(set([x.shape[0] for x in index_dict['titles']])) == [11]

In [8]:
print(index_dict['titles'][100])

[ 7335 10299  8704  3512 17565 33757 33757 33757 33757 33757 33757]


In [10]:
pickle.dump(index_dict,open('index_dict.pkl','wb'))
pickle.dump(words,open('dictionary.pkl','wb'))
pickle.dump(rwords,open('reverse_dictionary.pkl','wb'))

In [2]:
index_dict = pickle.load(open('index_dict.pkl','rb'))
words = pickle.load(open('dictionary.pkl','rb'))
rwords = pickle.load(open('reverse_dictionary.pkl','rb'))

In [4]:
class Encoder(tf.keras.Model):
    def __init__(self,vocab_size,embedding_size,batch_sz,enc_units):
        super(Encoder,self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size,embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
    def call(self,inp,init_state):
        x = self.embedding(inp)
        output,state = self.gru(x,initial_state=init_state)
        return output, state

    def init_hidden(self):
        return tf.zeros(shape=[self.batch_sz,self.enc_units])

In [5]:
class Attention(tf.keras.layers.Layer):
    def __init__(self,window_size,enc_units):
        super(Attention,self).__init__()
        self.enc_units = enc_units
        self.window_size = window_size
        
        self.score_W = tf.keras.layers.Dense(self.enc_units) #must be the same size as hs
        
        self.window_W1 = tf.keras.layers.Dense(10,activation='tanh')
        self.window_W2 = tf.keras.layers.Dense(1,activation='softmax')
        
        
        
    def score(self,ht,hs):
        return tf.matmul(hs,tf.transpose(self.score_W(ht))) #1xenc_units times enc_unitsx1 (I think...)
        
    def call(self,source,ht): #source should be [batch, num_words, dims]
        
        S = tf.cast(source.shape[1],tf.float32)
        pt = self.window_W2(self.window_W1(ht))*S
        rounded = int(np.round(pt))
        unstacked_source = tf.unstack(source,axis=1) #giving us all the "hs" we need
        
        gaussians = []
        alignment_partials = []
        
        lower_bound = rounded-self.window_size-1
        if lower_bound<0:
            lower_bound = 0
        upper_bound = rounded+self.window_size+1
        if upper_bound>len(unstacked_source):
            upper_bound = len(unstacked_source)
        
        for i in range(lower_bound,upper_bound):
            gaussian = tf.math.exp(-tf.math.square(pt-i)/2/tf.math.square(self.window_size/2))
            alignment_partial = tf.math.exp(self.score(ht,unstacked_source[i]))
            alignment_partials.append(alignment_partial)
            gaussians.append(gaussian)
        
        alignment_sum = tf.reduce_sum(alignment_partials)
        attention_vecs = []
        for partial,gauss,hs_ in zip(alignment_partials,gaussians,unstacked_source[rounded-self.window_size:rounded+self.window_size+1]):
            attention_vecs.append(partial/alignment_sum*gauss*hs_)
        return tf.reduce_sum(attention_vecs,axis=0)*(1/(2*self.window_size+1)) #this should have size enc_units
    
    

In [6]:
class Decoder(tf.keras.Model):
    def __init__(self,vocab_size,embedding_dim,dec_units,batch_sz,enc_units):
        super(Decoder,self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size,embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences = True,
                                       return_state = True,
                                       recurrent_initializer = 'glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = Attention(10,enc_units)
    
    def call(self,x,hidden,enc_output):
        context = self.attention(enc_output,hidden)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context, 1), x], axis=-1)
        output,state = self.gru(x)
        
        output = tf.reshape(output, (-1, output.shape[2])) #do this b/c we're doing just 1 timestep
        return self.fc(output),state

In [7]:
batch_size = 1
embedding_dim = 64
enc_units = 256
dec_units = 256
vocab_size = len(words)



In [9]:
encoder = Encoder(vocab_size,embedding_dim,batch_size,enc_units)
decoder = Decoder(vocab_size,embedding_dim,dec_units,batch_size,enc_units)

sample = index_dict['articles'][0]
test_enc = encoder(np.reshape(sample,[1,sample.shape[0]]),encoder.init_hidden())
print(test_enc[0].shape)
print(test_enc[1].shape)

sample_dec = index_dict['titles'][0]
test_dec = decoder(np.reshape(sample_dec[0],[1,1]),test_enc[1],test_enc[0])
print(test_dec[0].shape)
print(test_dec[1].shape)
#woot!!! Everything works here!

(1, 426, 256)
(1, 256)
(1, 33758)
(1, 256)


In [10]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    loss_ = loss_object(real, pred)
    return tf.reduce_mean(loss_)
    

In [33]:
@tf.function
def train_step(batchx,batchy):
    with tf.GradientTape() as tape:
        encoded,state = encoder(batchx,encoder.init_hidden())
        #losses = tf.zeros(shape=batchx.shape[0]-1)
        for i in range(batchx.shape[0]-1):
            pred,state = decoder(np.reshape(batchx[i],[1,1]),state,encoded)
            loss = loss_function(batchy[i+1],pred)
            grad = tape.gradient(loss_sum, encoder.trainable_variables+decoder.trainable_variables)
            print(grad)
#         loss_sum = tf.reduce_sum(losses)
#         batch_loss = loss_sum/10
#         variables = encoder.trainable_variables + decoder.trainable_variables
#         gradients = tape.gradient(loss_sum, variables)
#         print(gradients)
#         optimizer.apply_gradients(zip(gradients, variables))
#     return batch_loss
            

In [35]:
for n in range(len(index_dict['titles'])):
    batchx = index_dict['articles'][n]
    bx = np.reshape(batchx,[1,batchx.shape[0]])
    l=train_step(bx,index_dict['titles'][n])

In [39]:
batch_size=1

losser = loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

trainable_vars = emb_layer.trainable_variables + enc_layer.trainable_variables + dec_gru.trainable_variables + out_W.trainable_variables
optimizer = tf.keras.optimizers.Adam()

        

In [133]:
from tensorflow.keras import layers

class SimpleModel(tf.keras.Model):
    def __init__(self):
        super(SimpleModel,self).__init__()
        #now for the fast, easy version without attention!

        self.seq_len=11
        self.emb_layer = layers.Embedding(len(words),100)
        self.enc_layer = layers.GRU(return_state=True,units=256)
        self.transition = layers.Dense(100)
        self.dec_gru = layers.GRU(return_state=True,return_sequences=True,units=256)
        self.to_embed = layers.Dense(100)
        self.out_W = layers.Dense(len(words),activation='softmax')

    def call(self,article):
        embedded = self.emb_layer(article)
        encoded,state = self.enc_layer(embedded)
        transition = tf.reshape(self.transition(encoded),[1,1,100])
        out = tf.TensorArray(tf.float32,size=11)
        for i in range(11):
            temp,state = self.dec_gru(transition,initial_state = state)
            transition = tf.reshape(self.to_embed(temp),[1,1,100])
            out.write(i,tf.squeeze(self.out_W(transition)))
        return out.stack()
            

In [131]:
art1 = index_dict['articles'][0]
# art2 = index_dict['articles'][1]
art1 = np.reshape(art1,[1,art1.shape[0]])
# art2 = np.reshape(art2,[1,art2.shape[0]])
simp_model = SimpleModel()
print(simp_model(art1).shape)

(11, 33758)


In [13]:
optimizer = tf.keras.optimizers.Adam()
losser = tf.keras.losses.SparseCategoricalCrossentropy()

In [125]:


def train_batch_simple(articlebatch,titlebatch):
    with tf.GradientTape() as tape:
        pred = simp_model(articlebatch)
        loss = losser(titlebatch,pred)
        grad = tape.gradient(loss,simp_model.trainable_variables)
        optimizer.apply_gradients(zip(grad,simp_model.trainable_variables))
        #print(np.sum([tf.reduce_sum(g) for g in grad]))
    return loss

In [135]:
import time
for k in range(10000):
    t = time.time()
    ind = np.random.randint(0,len(index_dict['titles']))
    xbatch = index_dict['articles'][ind]
    xbatch = np.reshape(xbatch,[1,xbatch.shape[0]])
    ybatch = index_dict['titles'][ind]
    print(time.time()-t)
    l = train_batch_simple(xbatch,ybatch)
    print(time.time()-t)
    print(l)

0.016065120697021484
3.8235268592834473
tf.Tensor(10.426876, shape=(), dtype=float32)
0.0008268356323242188
5.693143129348755
tf.Tensor(10.425727, shape=(), dtype=float32)
9.703636169433594e-05
3.6416802406311035
tf.Tensor(10.421902, shape=(), dtype=float32)
9.202957153320312e-05


KeyboardInterrupt: 

In [73]:
simp_model.summary()

Model: "simple_model_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_26 (Embedding)     multiple                  3375800   
_________________________________________________________________
gru_27 (GRU)                 multiple                  274176    
_________________________________________________________________
dense_33 (Dense)             multiple                  25700     
_________________________________________________________________
gru_28 (GRU)                 multiple                  274176    
_________________________________________________________________
dense_34 (Dense)             multiple                  25700     
_________________________________________________________________
dense_35 (Dense)             multiple                  3409558   
Total params: 7,385,110
Trainable params: 7,385,110
Non-trainable params: 0
_________________________________________

In [136]:
simp_model.compile(loss=losser,optimizer=optimizer)
simp_model.fit(np.array(index_dict['articles']),np.array(index_dict['titles']),batch_size=1,verbose=1)

ValueError: ('Error when checking model target: expected no data, but got:', array([[ 2455.,  2377., 28177., ..., 33757., 33757., 33757.],
       [ 3358., 29353., 20227., ..., 33757., 33757., 33757.],
       [14635., 24113., 15114., ..., 33757., 33757., 33757.],
       ...,
       [15250., 28947., 29790., ..., 33757., 33757., 33757.],
       [21014., 26166., 21597., ..., 33757., 33757., 33757.],
       [ 6811., 17129.,  2377., ..., 33757., 33757., 33757.]]))

In [27]:
#let's just try this all with keras
import tensorflow as tf
from tensorflow.keras import layers

inp = tf.keras.Input(shape=[None],dtype=tf.int32)
emb_enc = layers.Embedding(len(words),100)(inp)
enc,state = layers.GRU(return_state=True,return_sequences=True,units=256)(emb_enc)

dec_gru_ = layers.GRU(return_sequences=True,return_state=True,units=256)
out=tf.TensorArray(dtype=tf.float32,size=11)
word_final_ = layers.Dense(len(words),activation='softmax')
for i in range(int(out.size())):
    enc,state = dec_gru_(enc,initial_state=state)
    out.write(i,word_final_(enc))
final = out.stack()

model_comp = tf.keras.models.Model(inputs=inp,outputs=final)


In [28]:
model_comp.compile(loss=losser,optimizer=optimizer)

In [29]:
print(len(index_dict['articles']))
print(len(index_dict['titles']))

2224
2224


In [30]:
X=tf.ragged.constant(index_dict['articles'])
Y=tf.ragged.constant(index_dict['titles'])

In [31]:
model_comp.fit(X,Y,batch_size=1,verbose=1)

TypeError: unhashable type: 'Dimension'