<a href="https://colab.research.google.com/github/modhudeb/seq2seq-text-summarizer-tensorflow/blob/main/seq2seq_Text_summ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# imports

In [None]:
# from zipfile import ZipFile

In [None]:
!pip install contractions

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import re
import pickle
import contractions

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from collections import Counter

# Loading data

In [None]:
# with ZipFile("/content/drive/MyDrive/data/Text_summarize/AmazonReviewSumma.zip") as zf:
#   zf.extractall("/content/drive/MyDrive/data/Text_summarize/")

In [5]:
os.listdir("/content/drive/MyDrive/data/Text_summarize/")

['glove.6B.100d.txt', 'Reviews.csv', 'text_summ_model.h5']

In [6]:
df = pd.read_csv("/content/drive/MyDrive/data/Text_summarize/Reviews.csv")

# Code

In [7]:
df.isnull().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               16
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64

In [8]:
df.dropna(subset=['Summary'], inplace = True)

In [9]:
df = df[['Text', 'Summary']]

In [10]:
print(len(df))
df.head(2)

568427


Unnamed: 0,Text,Summary
0,I have bought several of the Vitality canned d...,Good Quality Dog Food
1,Product arrived labeled as Jumbo Salted Peanut...,Not as Advertised


We don't need all of it. We will take short length texts only

In [11]:
df = df[(df['Text'].apply(lambda x:len(x))<201) & (df['Text'].apply(lambda x:len(x))>150)]
df.reset_index(drop = True, inplace=True)
df = df[(df['Summary'].apply(lambda x:len(x))<100) & (df['Summary'].apply(lambda x:len(x))>20)]
df.reset_index(drop = True, inplace=True)

In [12]:
len(df)   # Now we have this 27726 samples

27726

We need to clean our TEXTs

In [13]:
df['Text'][15000]

"Easy to make & good tasting flavor.<br />The price is  high but you get what you pay for.<br />Hope the Cherrybrook kitchen company doesn't raise the<br />price..."

In [14]:
def clean_text(text):
  """Cleans text by removing extra full stops, punctuations, extra white spaces and html tags."""

  # Remove extra full stops.
  text = re.sub(r'\.+', '.', text)

  # Remove html tags.
  text = re.sub(r'<[^>]*>', '', text)

  # Fixing aporstrophes
  text = contractions.fix(text)

  # Remove other punctuations
  text = re.sub(r'[^\w\s]+', ' ', text)

  # Remove extra white spaces.
  text = re.sub(r'\s+', ' ', text)

  return text.lower()



df['Text'] = df['Text'].apply(clean_text)
df['Summary'] = df['Summary'].apply(clean_text)

In [15]:
df['Text'][15000]

'easy to make good tasting flavor the price is high but you get what you pay for hope the cherrybrook kitchen company does not raise theprice '

We need to get the TEXTS tokenized and index numbers of them. So we will preprocess the texts now...

In [16]:
# Define max sequence lengths for both text and summary
max_text_length = max(df['Text'].apply(lambda x : len(x.split())))
max_summary_length = max(df['Summary'].apply(lambda x : len(x.split())))


print("Text Max len: ",max_text_length, "\nSummary Max len: ",max_summary_length)

Text Max len:  50 
Summary Max len:  21


In [17]:
df['Dec_Summary'] = df['Summary'].apply(lambda x : "<sos> "+ x +" <eos>")
# Tokenize and pad sequences for Text
text_tokenizer = Tokenizer()
text_tokenizer.fit_on_texts(df['Text'])
text_sequences = text_tokenizer.texts_to_sequences(df['Text'])
text_sequences = pad_sequences(text_sequences, maxlen=max_text_length, padding='post')

# Tokenize and pad sequences for Summary
summary_tokenizer = Tokenizer()
summary_tokenizer.fit_on_texts(df['Dec_Summary'])
summary_sequences = summary_tokenizer.texts_to_sequences(df['Dec_Summary'])
summary_sequences = pad_sequences(summary_sequences, maxlen=max_summary_length, value=summary_tokenizer.word_index['eos'], padding='post')
summary_sequences_out = summary_tokenizer.texts_to_sequences(df['Summary'])
summary_sequences_out = pad_sequences(summary_sequences_out, maxlen=max_summary_length, value=summary_tokenizer.word_index['eos'], padding='post')



# Define vocabulary sizes
vocab_size_text = len(text_tokenizer.word_index) + 1
vocab_size_summary = len(summary_tokenizer.word_index)+1


In [18]:
print("Text vocab size: ",vocab_size_text, "\nSummary vocab size: ",vocab_size_summary)

Text vocab size:  16946 
Summary vocab size:  8408


We will fetch GloVe vectors for the each vocabulary...

In [None]:
# Loading GloVe embeddings
glove_path = "/content/drive/MyDrive/data/Text_summarize/glove.6B.100d.txt"
embedding_dim = 100

embedding_matrix = np.zeros((vocab_size_text, embedding_dim))

with open(glove_path, 'r', encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        word = values[0]
        if word in text_tokenizer.word_index:
            idx = text_tokenizer.word_index[word]
            embedding_matrix[idx] = np.array(values[1:], dtype='float32')

# ENCODER-DECODER architecture

In [None]:
latent_dim = 256

# Encoder
encoder_input = layers.Input(shape=(max_text_length,))
encoder_embedding = layers.Embedding(input_dim=vocab_size_text, output_dim=embedding_dim,
                              weights=[embedding_matrix], trainable=False)(encoder_input)
encoder_lstm = layers.LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

# Decoder
decoder_input = layers.Input(shape=(max_summary_length,))
decoder_embedding = layers.Embedding(input_dim=vocab_size_summary, output_dim=embedding_dim)(decoder_input)
decoder_lstm = layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

# Attention Mechanism
attention_layer = layers.Attention()([decoder_outputs, encoder_outputs])
decoder_combined = layers.Concatenate(axis=-1)([decoder_outputs, attention_layer])

# Dense layer for prediction
decoder_dense = layers.Dense(vocab_size_summary, activation='softmax')
output = decoder_dense(decoder_combined)


model = Model([encoder_input, decoder_input], output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


Here we are using the `sparse_categorical_crossentropy`. Because if we use ***categorical_crossentropy*** we need to have <b>one-hot encoded sequence</b>.

But the one-hot sequence requires high config system while training.

In [None]:
model.fit([text_sequences, summary_sequences], summary_sequences_out, epochs=20, batch_size=16, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x79f45a76ee00>

In [None]:
tf.keras.backend.clear_session()

# Model Testing

In [20]:
model = tf.keras.models.load_model("/content/drive/MyDrive/data/Text_summarize/text_summ_model.h5")

In [35]:
def generate_summary(text):
    # Tokenize and pad the input text
    text = clean_text(text).split()
    text = " ".join(text[:50])
    text_sequence = text_tokenizer.texts_to_sequences([text])
    text_sequence = pad_sequences(text_sequence, maxlen=max_text_length, padding='post')

    # Initialize the input for the decoder
    input_seq = np.zeros((1, max_summary_length))
    input_seq[0, 0] = summary_tokenizer.word_index['sos']

    # Generate the summary
    generated_summary = []

    for i in range(1, max_summary_length):
        predictions = model.predict([text_sequence, input_seq], verbose=0)
        predicted_token_idx = np.argmax(predictions[0, i - 1, :])
        if predicted_token_idx == summary_tokenizer.word_index['eos']:
            break
        generated_summary.append(summary_tokenizer.index_word[predicted_token_idx])
        input_seq[0, i] = predicted_token_idx

    return ' '.join(generated_summary)



In [36]:
input_text = "Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted."
predicted_summary = generate_summary(input_text)
print("Predicted Summary:", predicted_summary)

Predicted Summary: received order was wrong


In [37]:
input_text = "I ordered a latte from this cafe, and it was the worst latte I've ever had. The coffee was burnt and bitter, and the milk was frothed to the point where it was just a bunch of bubbles. I tried to drink it, but I couldn't even finish it. It was so bad that I had to throw it away."
predicted_summary = generate_summary(input_text)
print("Predicted Summary:", predicted_summary)

Predicted Summary: mild but still stale


In [39]:
# lets test with some texts from the real full dataset
df = pd.read_csv("/content/drive/MyDrive/data/Text_summarize/Reviews.csv")

In [42]:
ndf = df.sample(10)
ndf.reset_index(drop=True, inplace=True)
for i in range(7):
  txt = ndf['Text'][i]
  print("======================================")
  print('Actual text : ', clean_text(txt))
  print('Real Summary : ', ndf['Summary'][i])
  print('Predicted Summary:', generate_summary(txt))
  print("\n\n")

Actual text :  i just love the senseo coffee maker and pods you practically do not have to be awake to brew a perfect cup of java every morning i can make the fully leaded for myself and the decaf for my husband without dirtying pots finding filters and using anything but our coffee cups and it is perfect every time they are just great for that afternoon cup when you are on the go love my pods 
Real Summary :  Easy Breezy Coffee
Predicted Summary: this is what i like it



Actual text :  back in the day when they were still available in the united states crispy m ms were my favorite m ms so you can imagine my excitement when researching crispy m ms online to see if they might make a comeback in the us when i discovered that i could buy them now from a german company on amazon so i placed an order and they arrived well while they are crispy m ms they do not taste as good as i remember them maybe it is my memory or maybe the chocolate is different in germany they tasted more like malted 

<h3>Bingo !!!😀 It is working really well.</h3>
<h5>We have used very limited size of data and very few vocabulary. Even with this limitations the model is generating quite fine summaries. In some cases the model is predicting better than Actual one.</h5>
</br></br>

<b>Few points should be noted :</b>
>* With high config. system and larger set of data, The model can work very efficiently. <br>
>* Because of small size vocabulary, the model may not build proper sentences.

In [None]:
# save the model
model.save("/content/drive/MyDrive/data/Text_summarize/text_summ_model.h5")

In [None]:
model.save("text_summ_model.h5")