In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,title,text,subject,date
0,Greens say no support for Macron's EZ budget i...,BERLIN (Reuters) - None of the German parties ...,worldnews,"October 25, 2017"
1,Trump faces uphill battle to overcome court's ...,(Reuters) - U.S. President Donald Trump faces ...,politicsNews,"February 6, 2017"
2,Ukraine president denies hampering anti-corrup...,VILNIUS/KIEV (Reuters) - Ukrainian President P...,worldnews,"December 8, 2017"
3,U.S. defense chief: White House shakeup will n...,BRUSSELS (Reuters) - U.S. Defense Secretary Ji...,politicsNews,"February 14, 2017"
4,Irish government set to fall weeks before Brex...,DUBLIN (Reuters) - Ireland s minority governme...,worldnews,"November 24, 2017"


In [3]:
df['subject'].unique()

array(['worldnews', 'politicsNews'], dtype=object)

In [4]:
df_world_news = pd.DataFrame(columns = ["title", "text"])
df_politics_news = pd.DataFrame(columns = ["title", "text"])

for i in range(len(df)):
    if df['subject'].iloc[i] == 'worldnews':
        row = pd.Series({'title' : df['title'].iloc[i], 'text' : df['text'].iloc[i]})
        df_world_news = pd.concat([df_world_news, row.to_frame().T], ignore_index = True)
    else:
        row = pd.Series({'title' : df['title'].iloc[i], 'text' : df['text'].iloc[i]})
        df_politics_news = pd.concat([df_politics_news, row.to_frame().T], ignore_index = True)
        

In [5]:
df_world_news.head()

Unnamed: 0,title,text
0,Greens say no support for Macron's EZ budget i...,BERLIN (Reuters) - None of the German parties ...
1,Ukraine president denies hampering anti-corrup...,VILNIUS/KIEV (Reuters) - Ukrainian President P...
2,Irish government set to fall weeks before Brex...,DUBLIN (Reuters) - Ireland s minority governme...
3,Northern Ireland fears Brexit loss of EU peace...,BELFAST (Reuters) - The European Union has lon...
4,Mexican governor requests leave to run for pre...,MEXICO CITY (Reuters) - The governor of Nuevo ...


In [6]:
df_world_news['text'].iloc[0]

'BERLIN (Reuters) - None of the German parties involved in exploratory coalition talks support French President Emmanuel Macron s idea to create a separate budget for the euro zone, a negotiator for the Greens party told Reuters on Wednesday. Reinhard Buetikofer, who participated in a late-night negotiating session on European policy on Tuesday with German Chancellor Angela Merkel s conservatives and the Free Democrats (FDP), said the Greens supported the idea of more investment in infrastructure but not a new budget.  None of the participating parties support a euro zone budget,  Buetikofer, a member of the European Parliament said.  We Greens fully support the idea of finding ways, within the framework of the existing EU budget, to boost investment in infrastructure. We share Macron s aim of increasing investment.  The news is a blow to Macron, who has called for the creation of a euro zone budget of several hundred billions of euros to help the single currency bloc cope with economi

In [7]:
input_data = df_world_news['text']
target_data = df_world_news['title']

target_data_appended = []

#append sos and eos 
for sentence in target_data:
    target_data_appended.append("sos " + sentence + " eos")

In [8]:
max_text = 1000 
max_title = 10

In [9]:
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences

In [10]:
#text processing

x = input_data
x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(list(x))

x_seq = x_tokenizer.texts_to_sequences(x) 
x_pad_seq = pad_sequences(x_seq, maxlen = max_text, padding='post') 

x_voc_size = len(x_tokenizer.word_index) +1
print(x_voc_size)

43159


In [11]:
#title processing

y = target_data_appended
y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(list(y))

y_seq = y_tokenizer.texts_to_sequences(y) 
y_pad_seq = pad_sequences(y_seq, maxlen = max_title, padding='post') 

y_voc_size = len(y_tokenizer.word_index) +1
print(y_voc_size)

9173


In [12]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed, Bidirectional, Attention

In [48]:
# encoder 
encoder_inputs = Input(shape=(max_text,)) 
enc_emb = Embedding(x_voc_size, 256)(encoder_inputs)

encoder_lstm1 = LSTM(256, return_sequences=True, return_state=True) 
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb) #lstm_1 output 

encoder_lstm2 = LSTM(256,return_sequences=True,return_state=True) 
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1) #lstm_2 output 

encoder_lstm3 = LSTM(256,return_sequences=True,return_state=True) 
encoder_output3, state_h3, state_c3 = encoder_lstm3(encoder_output2) #lstm_3 output 

encoder_lstm4 = LSTM(256,return_sequences=True,return_state=True) 
encoder_output4, state_h4, state_c4 = encoder_lstm3(encoder_output2) #lstm_4 output 

#decoder
decoder_inputs = Input(shape=(None,)) 
dec_emb_layer = Embedding(y_voc_size, 256) 
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = LSTM(256, return_sequences=True, return_state=True) 
decoder_outputs, decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb, initial_state=[state_h4, state_c4]) 

#Attention Layer
attn_layer = Attention() 
attn_out = attn_layer([decoder_outputs, encoder_output4]) 

decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])

decoder_dense = Dense(y_voc_size, activation='softmax') 
decoder_outputs = decoder_dense(decoder_concat_input) 

model = Model([encoder_inputs, decoder_inputs], decoder_outputs) 
model.summary()

Model: "model_9"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_20 (InputLayer)          [(None, 1000)]       0           []                               
                                                                                                  
 embedding_16 (Embedding)       (None, 1000, 256)    11048704    ['input_20[0][0]']               
                                                                                                  
 lstm_28 (LSTM)                 [(None, 1000, 256),  525312      ['embedding_16[0][0]']           
                                 (None, 256),                                                     
                                 (None, 256)]                                                     
                                                                                            

In [42]:
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=["accuracy"])

In [43]:
#pad array of 0's if the length is less than the maximum length 
#en_in_data= pad_sequences(x_train,  maxlen=max_in_len, padding='post') 
#dec_data= pad_sequences(y_train,  maxlen=max_tr_len, padding='post')
 
dec_in = y_pad_seq[:, :-1]
dec_out = y_pad_seq.reshape(len(y_pad_seq),max_title,1)[:,1:]

In [44]:
model.fit( 
    [x_pad_seq, dec_in],
    dec_out, 
    batch_size=128, 
    epochs=5, 
    validation_split=0.1,
    )

Epoch 1/5


2023-01-27 02:56:48.315535: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:689] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" model: "0" num_cores: 8 environment { key: "cpu_instruction_set" value: "ARM NEON" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 16384 l2_cache_size: 524288 l3_cache_size: 524288 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }




2023-01-27 03:04:31.361834: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:689] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" model: "0" num_cores: 8 environment { key: "cpu_instruction_set" value: "ARM NEON" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 16384 l2_cache_size: 524288 l3_cache_size: 524288 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


Epoch 2/5

KeyboardInterrupt: 

In [27]:
model.save("title_predictor")

2023-01-27 02:45:33.085616: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: s2s/assets


INFO:tensorflow:Assets written to: s2s/assets


In [29]:
from tensorflow.keras import models

In [30]:
model = models.load_model("title_predictor")

en_outputs, state_h_enc, state_c_enc = model.layers[6].output
en_states = [state_h_enc,state_c_enc]

en_model = Model(model.input[0], [en_outputs]+en_states)

In [31]:
dec_state_input_h = Input(shape=(5,))
dec_state_input_c = Input(shape=(5,))

dec_hidden_state_input = Input(shape=(max_text, 5))
 
dec_inputs = model.input[1]
dec_emb_layer = model.layers[5]
dec_lstm = model.layers[7]
dec_embedding= dec_emb_layer(dec_inputs)
 
dec_outputs2, state_h2, state_c2 = dec_lstm(dec_embedding, initial_state = [dec_state_input_h, dec_state_input_c])

In [32]:
attention = model.layers[8]
attn_out2 = attention([dec_outputs2, dec_hidden_state_input])
 
merge2 = Concatenate(axis=-1)([dec_outputs2, attn_out2])

In [33]:
dec_dense = model.layers[10]
dec_outputs2 = dec_dense(merge2)
 

dec_model = Model( [dec_inputs] + [dec_hidden_state_input,dec_state_input_h,dec_state_input_c], [dec_outputs2] + [state_h2, state_c2])

In [39]:
reverse_target_word_index = y_tokenizer.index_word
reverse_source_word_index = x_tokenizer.index_word
target_word_index = y_tokenizer.word_index
reverse_target_word_index[0]=' '
 
def decode_sequence(input_seq):
    en_out, en_h, en_c= en_model.predict(input_seq)
 
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index['sos']
 
    stop_condition = False
    decoded_sentence = ""
    
    while not stop_condition: 
        output_words, dec_h, dec_c= dec_model.predict([target_seq] + [en_out,en_h, en_c])
        
        word_index = np.argmax(output_words[0, -1, :])
        text_word = reverse_target_word_index[word_index]
        decoded_sentence += text_word +" "
        
        if text_word == "eos" or len(decoded_sentence) > max_title:
            stop_condition = True

            
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = word_index
        en_h, en_c = dec_h, dec_c
        
    return decoded_sentence

In [40]:
inp_review = input("Enter : ")
inp_x = x_tokenizer.texts_to_sequences([inp_review]) 
inp_x = pad_sequences(inp_x,  maxlen = max_text, padding='post')
 
title = decode_sequence(inp_x.reshape(1,max_text))

if 'eos' in summary :
    title = title.replace('eos','')
    
print("Title : ", title)

Enter : Generate title for this text
Review : Generate title for this text

Predicted summary: to to to to 


