Text Summarization



Encoder and decoder without attention


1. Encoder

The encoder processes the input sequence and compresses it into a fixed-size context vector (the final hidden state).

Steps:

a) Embed Input Tokens: Convert input tokens into dense vectors using an embedding layer.

b) Process Through RNN/LSTM/GRU: Pass the embeddings through a recurrent layer.

c) Output Context Vector: Use the final hidden state (and, for an LSTM, the final cell state) as the context for the decoder.


2. Decoder

The decoder generates the output sequence token by token, starting with the initial input token (e.g., <start>).

Steps:

a) Embed Input Token: Convert the current input token into a dense vector.

b) Combine With Context Vector: Initialize the decoder's hidden and cell states with the encoder's final hidden and cell states.

c) Generate Output Token: Predict the next token using the recurrent layer and a linear layer.

In [None]:
# Import required libraries
!pip install rouge-score # Install the 'rouge-score' package
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from rouge_score import rouge_scorer

# Load Dataset
file_path = '/content/Text_summarization.csv'
data = pd.read_csv(file_path)

# Data columns: 'id', 'article', 'highlights'
# NOTE(review): `id` shadows the Python builtin of the same name. The name is
# kept so that any other cell reading it keeps working.
id = data['id'].values
articles = data['article'].values
highlights = data['highlights'].values

# Split data into train and test sets (80/20, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(articles, highlights, test_size=0.2, random_state=42)

# Tokenizer for input and output
max_vocab_size = 20000
max_input_len = 300  # Max length for articles
max_output_len = 50  # Max length for highlights

# Tokenizers are fitted on the training split only, so no test vocabulary leaks in.
input_tokenizer = Tokenizer(num_words=max_vocab_size, oov_token='<UNK>')
input_tokenizer.fit_on_texts(X_train)
output_tokenizer = Tokenizer(num_words=max_vocab_size, oov_token='<UNK>')
output_tokenizer.fit_on_texts(y_train)

# Convert text to sequences of integer token ids
X_train_seq = input_tokenizer.texts_to_sequences(X_train)
X_test_seq = input_tokenizer.texts_to_sequences(X_test)
y_train_seq = output_tokenizer.texts_to_sequences(y_train)
y_test_seq = output_tokenizer.texts_to_sequences(y_test)

# Pad with trailing zeros. truncating='post' keeps the *beginning* of over-long
# texts (the default 'pre' would drop it) and is applied to every array so the
# train and test data are preprocessed identically.
X_train_padded = pad_sequences(X_train_seq, maxlen=max_input_len, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_input_len, padding='post', truncating='post')
y_train_padded = pad_sequences(y_train_seq, maxlen=max_output_len, padding='post', truncating='post')
y_test_padded = pad_sequences(y_test_seq, maxlen=max_output_len, padding='post', truncating='post')

# Vocabulary sizes
# `word_index` lists every word seen, but a Tokenizer built with `num_words`
# never emits an id >= num_words (rarer words map to the OOV id). Capping at
# max_vocab_size therefore covers every id that can occur while keeping the
# embedding and softmax layers no larger than necessary.
input_vocab_size = min(max_vocab_size, len(input_tokenizer.word_index) + 1)
output_vocab_size = min(max_vocab_size, len(output_tokenizer.word_index) + 1)



In [None]:

embedding_dim = 128


# Build Encoder-Decoder Model (Without Attention)
def build_encoder_decoder(latent_dim=256):
    """Build the LSTM encoder-decoder training model (no attention).

    The encoder reads a padded article of length ``max_input_len`` and only its
    final hidden and cell states are kept. Those states initialise the decoder
    LSTM, which reads the teacher-forcing input (``max_output_len - 1`` tokens)
    and emits a softmax over ``output_vocab_size`` at every time step.

    Args:
        latent_dim: Number of units in both the encoder and decoder LSTM.
            They must match because the encoder states seed the decoder.

    Returns:
        An uncompiled ``Model`` taking ``[encoder_input, decoder_input]`` and
        producing per-step token probabilities.
    """
    encoder_input = Input(shape=(max_input_len,))
    # One step shorter than the padded target: the decoder input is the target
    # with its last position dropped.
    decoder_input = Input(shape=(max_output_len - 1,))

    # Embedding (mask_zero=True makes downstream layers skip id 0, the padding id)
    encoder_embedding = Embedding(input_vocab_size, embedding_dim, mask_zero=True)(encoder_input)
    decoder_embedding = Embedding(output_vocab_size, embedding_dim, mask_zero=True)(decoder_input)

    # Encoder: discard the per-step output, keep only the final states
    encoder_lstm = LSTM(latent_dim, return_state=True)
    _, state_h, state_c = encoder_lstm(encoder_embedding)

    # Decoder: full output sequence, started from the encoder's final states
    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
    decoder_output, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

    # Output layer: probability distribution over the target vocabulary per step
    dense = Dense(output_vocab_size, activation='softmax')
    output = dense(decoder_output)

    return Model([encoder_input, decoder_input], output)

# Instantiate the seq2seq network and configure it for training.
# Targets are integer token ids, hence the sparse cross-entropy loss.
model = build_encoder_decoder()
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()


# Teacher-forcing pairs taken from the padded targets:
#   input  = every position except the last
#   target = every position except the first (the input shifted left by one)
# NOTE(review): no explicit start/end marker is added to the highlights in this
# notebook, so the first target token is never predicted - confirm whether
# that is intended.
decoder_input_data, decoder_target_data = y_train_padded[:, :-1], y_train_padded[:, 1:]



In [None]:
# ipython-input-13-79fed74d1c3d
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Re-pad the article sequences to the model's input length (max_input_len).
# truncating='post' keeps the start of over-long articles. The test set is
# re-padded the same way so train and test inputs are truncated consistently.
X_train_padded = pad_sequences(X_train_seq, maxlen=max_input_len, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_input_len, padding='post', truncating='post')

In [None]:
# ipython-input-14-79fed74d1c3d
# Build the teacher-forcing pair from the padded target sequences:
# the decoder input drops the last position, the decoder target drops the first.
decoder_input_data, decoder_target_data = (
    y_train_padded[:, :-1],
    y_train_padded[:, 1:],
)

In [None]:
# ipython-input-25-375bd95fc266
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Raw target texts for the training split.
# `y_train` (returned by train_test_split) already holds the highlight strings
# in the same row order as X_train. The DataFrame has no 'text' column and the
# padded NumPy array has no `.index`, so the texts are taken from `y_train`.
y_train_texts = list(y_train)

# 1. Create a Tokenizer instance
# num_words=output_vocab_size keeps every emitted id below the size of the
# model's output layer.
tokenizer = Tokenizer(num_words=output_vocab_size, oov_token='<OOV>')

# 2. Fit the tokenizer on the training targets
tokenizer.fit_on_texts(y_train_texts)

# Re-tokenize and re-pad the target sequences with this tokenizer
y_train_seq = tokenizer.texts_to_sequences(y_train_texts)
y_train_padded = pad_sequences(y_train_seq, maxlen=max_output_len, padding='post', truncating='post')

# Prepare Decoder Input and Output (teacher forcing: target is the input shifted by one)
decoder_input_data = y_train_padded[:, :-1]
decoder_target_data = y_train_padded[:, 1:]

In [None]:
# Clip out-of-range indices to the maximum valid index.
# Both arrays are clipped: the inputs index the decoder embedding and the
# targets index the softmax output, and each has output_vocab_size entries.
decoder_input_data = np.clip(decoder_input_data, 0, output_vocab_size - 1)
decoder_target_data = np.clip(decoder_target_data, 0, output_vocab_size - 1)

In [None]:
# ipython-input-30-e4a6b6322cd3
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Raw target texts for the training split.
# These must be the rows that train_test_split put into the training set, in
# the same order as X_train. Selecting the first N rows of `data` would pair
# each article with an unrelated highlight, because the split shuffles rows.
# `y_train` is exactly that aligned array of 'highlights' strings.
y_train_texts = list(y_train)

# 1. Create a Tokenizer instance
# num_words=output_vocab_size keeps every emitted id below the size of the
# model's output layer.
tokenizer = Tokenizer(num_words=output_vocab_size, oov_token='<OOV>')

# 2. Fit the tokenizer on the training targets
tokenizer.fit_on_texts(y_train_texts)

# Re-tokenize and re-pad the target sequences with this tokenizer
y_train_seq = tokenizer.texts_to_sequences(y_train_texts)
y_train_padded = pad_sequences(y_train_seq, maxlen=max_output_len, padding='post', truncating='post')

# Prepare Decoder Input and Output (teacher forcing: target is the input shifted by one)
decoder_input_data = y_train_padded[:, :-1]
decoder_target_data = y_train_padded[:, 1:]

# Clip out-of-range indices to the maximum valid index before training.
# Inputs index the decoder embedding, targets index the softmax output.
decoder_input_data = np.clip(decoder_input_data, 0, output_vocab_size - 1)
decoder_target_data = np.clip(decoder_target_data, 0, output_vocab_size - 1)

In [None]:
# Decode sequence
def decode_sequence(sequence, tokenizer):
    """Convert token IDs to text.

    Args:
        sequence: Iterable of integer token ids. Id 0 (padding) is skipped.
        tokenizer: Object exposing a ``word_index`` mapping of word -> id.

    Returns:
        The decoded words joined by single spaces. Ids that are not present in
        ``word_index`` are dropped rather than rendered as empty strings, so
        the result never contains doubled or leading/trailing spaces.
    """
    reverse_vocab = {index: word for word, index in tokenizer.word_index.items()}
    words = (reverse_vocab.get(token) for token in sequence if token != 0)
    return " ".join(word for word in words if word)
