In [17]:
import numpy as np
import pandas as pd

df = pd.read_csv("processed_dataset.csv")

text = list(df['text'])
headlines = list(df['headline'])
temp = []
temp.extend(text)
temp.extend(headlines)

In [18]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

SENLEN = 400
HEADLINES_SENLEN = 20
MAXWORDS = 20000

HEADLINES_MAXWORDS = 3000

text_tokenizer = Tokenizer(num_words = MAXWORDS)
text_tokenizer.fit_on_texts(text)

headline_tokenizer = Tokenizer(num_words = HEADLINES_MAXWORDS)
headline_tokenizer.fit_on_texts(headlines)

def preprocess_sequences(text, seq_type):
    
    if(seq_type == "text"):
        return pad_sequences(text_tokenizer.texts_to_sequences(text), maxlen = SENLEN, padding='pre')        
    elif(seq_type == "headline"):
        return pad_sequences(headline_tokenizer.texts_to_sequences(text), maxlen = HEADLINES_SENLEN, padding='pre')


In [19]:
text_sequences = preprocess_sequences(text, "text")
headline_sequences = preprocess_sequences(headlines, "headline")

In [4]:
embedding_dict = {}

with open("../glove/archive/glove.6B.100d.txt", "r") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:],"float32")
        embedding_dict[word] = vectors
f.close()

embedding_dim = 100
num_words = MAXWORDS + 1
embedding_matrix = np.zeros((MAXWORDS, embedding_dim))

for word, i in text_tokenizer.word_index.items():
    if i < MAXWORDS:
        embedding_vector = embedding_dict.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            
embedding_matrix_headlines = np.zeros((HEADLINES_MAXWORDS, embedding_dim))

for word, i in headline_tokenizer.word_index.items():
    if i < HEADLINES_MAXWORDS:
        embedding_vector = embedding_dict.get(word)
        if embedding_vector is not None:
            embedding_matrix_headlines[i] = embedding_vector

In [16]:
import tensorflow as tf

latent_dim = 300

input_layer = tf.keras.layers.Input(shape = (), dtype = np.int32, name="input_layer")
embedding_layer = tf.keras.layers.Embedding(num_words, embedding_dim, weights = [embedding_matrix], input_length = SENLEN,trainable = False, name = "Embedding_layer_enc")(input_layer)

#Encoder

#encoder lstm 1
encoder_lstm1 = tf.keras.layers.LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.2,recurrent_dropout=0.2)
encoder_output1, state_h1, state_c1 = encoder_lstm1(embedding_layer)

#encoder lstm 2
encoder_lstm2 = tf.keras.layers.LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.2,recurrent_dropout=0.2)
encoder_output2, state_h, state_c = encoder_lstm2(encoder_output1)

decoder_inputs = tf.keras.layers.Input(shape=(None,))

#embedding layer
dec_emb_layer = tf.keras.layers.Embedding(num_words, embedding_dim, weights = [embedding_matrix], input_length = HEADLINES_SENLEN,trainable = False, name = "Embedding_layer_dec")
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = tf.keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True,dropout=0.2,recurrent_dropout=0.2)
decoder_outputs,decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb,initial_state=[state_h, state_c])

#dense layer
decoder_dense =  tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(HEADLINES_SENLEN, activation='relu'))
decoder_outputs = decoder_dense(decoder_outputs)

model = tf.keras.models.Model([input_layer, decoder_inputs], decoder_outputs)
model.summary()


ValueError: Layer Embedding_layer_enc weight shape (20001, 100) is not compatible with provided weight shape (20000, 100).

In [7]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=2)

In [8]:
history = model.fit(text_sequences, headline_sequences,epochs = 50, callbacks=[es], batch_size=32,verbose = 1)

Epoch 1/50


ValueError: in user code:

    File "/Users/sampathroutu/opt/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 1021, in train_function  *
        return step_function(self, iterator)
    File "/Users/sampathroutu/opt/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 1010, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/sampathroutu/opt/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 1000, in run_step  **
        outputs = model.train_step(data)
    File "/Users/sampathroutu/opt/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 859, in train_step
        y_pred = self(x, training=True)
    File "/Users/sampathroutu/opt/anaconda3/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/sampathroutu/opt/anaconda3/lib/python3.8/site-packages/keras/engine/input_spec.py", line 200, in assert_input_compatibility
        raise ValueError(f'Layer "{layer_name}" expects {len(input_spec)} input(s),'

    ValueError: Layer "model_1" expects 2 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None, 400) dtype=int32>]


In [10]:
text_sequences.shape

(2225, 400)