This notebook is specifically for **Abstractive Summarization** only. In this approach, a model generates new sentences that capture the main ideas of the original text, often rephrasing and synthesizing information. This method requires more advanced natural language processing techniques, such as sequence-to-sequence models and attention mechanisms.

In [13]:
# Sample dataset of texts and their summaries
texts = [
    "The climate crisis is worsening more than ever due to increasing CO2 emissions from fossil fuels.",
    "Artificial intelligence and machine learning are revolutionizing technology by automating tasks.",
    "Healthy diet and regular exercise are key to maintaining good health.",
    "Blockchain technology is becoming a foundational element in the new age of the internet.",
    "Renewable energy sources like solar and wind are essential for sustainable development."
]

summaries = [
    "Climate crisis worsens with rising CO2 emissions.",
    "AI and machine learning revolutionize technology.",
    "Good health requires a healthy diet and exercise.",
    "Blockchain is pivotal in the new internet age.",
    "Renewable energy is crucial for sustainability."
]

# Manually prepend 'startseq' and append 'endseq' to each summary for training
summaries = ['startseq ' + summary + ' endseq' for summary in summaries]

# Example of updated summaries
print(summaries[:2])

['startseq Climate crisis worsens with rising CO2 emissions. endseq', 'startseq AI and machine learning revolutionize technology. endseq']


In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts + summaries)

# Convert texts and summaries into sequences of integers
seq_texts = tokenizer.texts_to_sequences(texts)
seq_summaries = tokenizer.texts_to_sequences(summaries)

# Pad sequences
max_text_len = max(len(seq) for seq in seq_texts)
max_summary_len = max(len(seq) for seq in seq_summaries)

padded_texts = pad_sequences(seq_texts, maxlen=max_text_len, padding='post')
padded_summaries = pad_sequences(seq_summaries, maxlen=max_summary_len, padding='post')

vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token


In [15]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# Encoder
encoder_inputs = Input(shape=(max_text_len,))
encoder_embedding = Embedding(input_dim=vocab_size, output_dim=50)(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(100, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=vocab_size, output_dim=50)(decoder_inputs)
decoder_lstm = LSTM(100, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='adam', loss='categorical_crossentropy')
model.summary()


In [16]:
import numpy as np

# Shifting the summaries for decoder input
decoder_input_data = np.zeros_like(padded_summaries)
decoder_input_data[:, 1:] = padded_summaries[:,:-1]
decoder_input_data[:, 0] = 0 # Start token, assuming 0 is not used by tokenizer

# Converting decoder output data to one-hot encoded form
decoder_output_data = np.zeros((len(padded_summaries), max_summary_len, vocab_size), dtype='float32')

for i, seq in enumerate(padded_summaries):
    for t, word_index in enumerate(seq):
        if t > 0:  # decoder_target_data will be ahead by one timestep and will not include the start token.
            decoder_output_data[i, t - 1, word_index] = 1.


In [17]:
model.fit([padded_texts, decoder_input_data], decoder_output_data,
          batch_size=64,
          epochs=100,
          validation_split=0.2)


Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 3.7854 - val_loss: 3.7816
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - loss: 3.7796 - val_loss: 3.7800
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - loss: 3.7738 - val_loss: 3.7782
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - loss: 3.7677 - val_loss: 3.7763
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - loss: 3.7611 - val_loss: 3.7740
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - loss: 3.7538 - val_loss: 3.7711
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - loss: 3.7454 - val_loss: 3.7677
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - loss: 3.7356 - val_loss: 3.7633
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x1fa4e03d210>

In [18]:
# Encoder model for inference
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
decoder_state_input_h = Input(shape=(100,))
decoder_state_input_c = Input(shape=(100,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

# Decoder model for inference
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1))
    # Chose the 'startseq' word as the first word of the target sequence
    target_seq[0, 0] = tokenizer.word_index['startseq']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_word_map.get(sampled_token_index, '?')
        if sampled_word != 'endseq':
            decoded_sentence += ' ' + sampled_word

        # Exit condition: hit max length or find stop word.
        if sampled_word == 'endseq' or len(decoded_sentence.split()) >= max_summary_len - 1:
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence.strip()

from rouge import Rouge

# You may need to adjust this to work with your data preparation steps
def evaluate_summaries(model, texts, actual_summaries):
    input_seqs = tokenizer.texts_to_sequences(texts)
    input_seqs = pad_sequences(input_seqs, maxlen=max_text_len, padding='post')
    
    generated_summaries = [decode_sequence(np.array(seq).reshape(1, max_text_len)) for seq in input_seqs]
    
    rouge = Rouge()
    scores = rouge.get_scores(generated_summaries, actual_summaries, avg=True)

    print("ROUGE-1: ", scores['rouge-1']['f'])
    print("ROUGE-2: ", scores['rouge-2']['f'])
    print("ROUGE-L: ", scores['rouge-l']['f'])

# Calling the evaluation function with our texts and summaries
evaluate_summaries(model, texts, summaries)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

In [22]:
texts = ["I am the best man in the whole world", " Modi is a dictator", "Universal Basic Income should be implemented very soon because then I do not have to work for money but work for pleasure and just be happy"," Abstractive text summarization is working now goodness gracious", " This is just a random text to see how the model summarizes now "]

def display_generated_summaries(model, texts, actual_summaries, num_samples=5):
    input_seqs = tokenizer.texts_to_sequences(texts)
    input_seqs = pad_sequences(input_seqs, maxlen=max_text_len, padding='post')
    
    generated_summaries = []
    for seq in input_seqs[:num_samples]:
        generated_summary = decode_sequence(np.array(seq).reshape(1, max_text_len))
        generated_summaries.append(generated_summary)
    
    # Display the actual and generated summaries
    for i in range(num_samples):
        print(f"Original Text: {texts[i]}")
        print(f"Actual Summary: {actual_summaries[i]}")
        print(f"Generated Summary: {generated_summaries[i]}\n")

# Calling the function to display summaries
display_generated_summaries(model, texts, summaries)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20