# Abstractive Text Summarization using Deep Learning

## Data Preprocessing and Cleaning

In [1]:
from tqdm import tqdm
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd

import re

In [2]:
data = pd.read_csv('Data/amazon-fine-food-reviews.csv')
data.drop_duplicates(inplace=True) # dropping duplicates
data.dropna(axis=0, inplace = True) # dropping empty rows
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 568411 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568411 non-null  int64 
 1   ProductId               568411 non-null  object
 2   UserId                  568411 non-null  object
 3   ProfileName             568411 non-null  object
 4   HelpfulnessNumerator    568411 non-null  int64 
 5   HelpfulnessDenominator  568411 non-null  int64 
 6   Score                   568411 non-null  int64 
 7   Time                    568411 non-null  int64 
 8   Summary                 568411 non-null  object
 9   Text                    568411 non-null  object
dtypes: int64(5), object(5)
memory usage: 47.7+ MB


In [3]:
data = data[:100000]

In [4]:
data = data.drop(['Id','ProductId','UserId','ProfileName','HelpfulnessNumerator','HelpfulnessDenominator','Score','Time'], 1)

  data = data.drop(['Id','ProductId','UserId','ProfileName','HelpfulnessNumerator','HelpfulnessDenominator','Score','Time'], 1)


In [5]:
# function to carry out contraction expansions
def cleaner(phrase):
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    phrase = re.sub(r"y\'all", "you all", phrase)
    return phrase

In [6]:
def clean(column_name):
    preprocessed_text = []
    for sentence in tqdm(data[column_name].values):
        try:
            sentence = sentence.lower() # converting all letters to lowercase
            sentence = re.sub(r"http\S+", "", sentence) # removing urls
            sentence = BeautifulSoup(sentence, 'lxml').get_text() # removing any html tags
            sentence = re.sub(r'\([^)]*\)', '', sentence) # removing any text inside parantheses
            sentence = cleaner(sentence) # contraction expansions
            sentence = re.sub(r"'s\b","", sentence) # removing apostrophes
            sentence = re.sub("\S*\d\S*", "", sentence).strip() # whitespace
            sentence = re.sub('[^A-Za-z]+', ' ', sentence) # removing special characters, punctuation marks
            preprocessed_text.append(sentence.strip())
        except:
            preprocessed_text.append('')
    return preprocessed_text

In [7]:
data['Cleaned Summary'] = clean('Summary')
print("Summaries have been cleaned.")

100%|██████████| 100000/100000 [00:23<00:00, 4301.19it/s]

Summaries have been cleaned.





In [8]:
data['Cleaned Text'] = clean('Text')
print("Texts have been cleaned.")

100%|██████████| 100000/100000 [00:36<00:00, 2719.58it/s]

Texts have been cleaned.





In [9]:
data.to_csv('Data/Cleaned_Data.csv', index = False) # exporting as CSV for external use

In [10]:
data = pd.read_csv('Data/Cleaned_Data.csv')
data = data.replace(np.nan, '', regex = True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Summary          100000 non-null  object
 1   Text             100000 non-null  object
 2   Cleaned Summary  100000 non-null  object
 3   Cleaned Text     100000 non-null  object
dtypes: object(4)
memory usage: 3.1+ MB


In [11]:
for i in range(5):
    print("Review:",data['Cleaned Text'][i])
    print("Summary:",data['Cleaned Summary'][i])
    print("\n")

Review: i have bought several of the vitality canned dog food products and have found them all to be of good quality the product looks more like a stew than a processed meat and it smells better my labrador is finicky and she appreciates this product better than most
Summary: good quality dog food


Review: product arrived labeled as jumbo salted peanuts the peanuts were actually small sized unsalted not sure if this was an error or if the vendor intended to represent the product as jumbo
Summary: not as advertised


Review: this is a confection that has been around a few centuries it is a light pillowy citrus gelatin with nuts in this case filberts and it is cut into tiny squares and then liberally coated with powdered sugar and it is a tiny mouthful of heaven not too chewy and very flavorful i highly recommend this yummy treat if you are familiar with the story of c s lewis the lion the witch and the wardrobe this is the treat that seduces edmund into selling out his brother and sist

In [12]:
count = 0
for i in data['Cleaned Summary']:
    if(len(i.split()) <= 10):
        count = count + 1
print(count / len(data['Cleaned Summary']))

0.97373


In [13]:
max_summary_len = 10
max_text_len = 50

In [14]:
from sklearn.model_selection import train_test_split
x_tr, x_val, y_tr, y_val = train_test_split(data['Cleaned Text'], data['Cleaned Summary'], test_size=0.2, random_state=0, shuffle=True) 

## Preparing Tokenizers

In [15]:
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences

### Text Tokenizer

In [16]:
# initializing a tokenizer for reviews on training data
x_tokenizer = Tokenizer() 
x_tokenizer.fit_on_texts(list(x_tr))

# any word whose word count falls below 4 would be considered a rare word
threshold = 4
count = 0
total_count = 0
frequency = 0
total_frequency = 0

for key, value in x_tokenizer.word_counts.items():
    total_count = total_count + 1
    total_frequency = total_frequency + value
    if(value < threshold):
        count = count + 1
        frequency = frequency + value
    
print("Total % of rare words in vocabulary: ", (count / total_count) * 100)
print("Total coverage of rare words in vocabulary: ", (frequency / total_frequency) * 100)

# preparing a tokenizer for reviews on training data
x_tokenizer = Tokenizer(num_words = total_count - count) 
x_tokenizer.fit_on_texts(list(x_tr))

# converting text sequences into integer sequences
x_tr_seq = x_tokenizer.texts_to_sequences(x_tr) 
x_val_seq = x_tokenizer.texts_to_sequences(x_val)

# padding zero upto maximum length
x_tr = pad_sequences(x_tr_seq,  maxlen = max_text_len, padding = 'post')
x_val = pad_sequences(x_val_seq, maxlen = max_text_len, padding = 'post')

#size of vocabulary ( +1 for padding token)
x_voc = x_tokenizer.num_words
print("Size of reviews vocabulary: ", x_voc)

Total % of rare words in vocabulary:  61.939539425502474
Total coverage of rare words in vocabulary:  0.6908463142909037
Size of reviews vocabulary:  18709


### Summary Tokenizer

In [17]:
# initialzing a tokenizer for summaries on training data
y_tokenizer = Tokenizer()   
y_tokenizer.fit_on_texts(list(y_tr))

# any word whose word count falls below 4 would be considered a rare word
thresh = 4
count = 0
total_count = 0
frequency = 0
total_frequency = 0

for key, value in y_tokenizer.word_counts.items():
    total_count = total_count + 1
    total_frequency = total_frequency + value
    if(value < thresh):
        count = count + 1
        frequency = frequency + value
    
print("% of rare words in vocabulary:", (count / total_count) * 100)
print("Total Coverage of rare words:", (frequency / total_frequency) * 100)

# preparing a tokenizer for summaries on training data
y_tokenizer = Tokenizer(num_words = total_count-count) 
y_tokenizer.fit_on_texts(list(y_tr))

# converting text sequences into integer sequences
y_tr_seq = y_tokenizer.texts_to_sequences(y_tr) 
y_val_seq = y_tokenizer.texts_to_sequences(y_val) 

# padding zero upto maximum length
y_tr = pad_sequences(y_tr_seq, maxlen = max_summary_len, padding = 'post')
y_val = pad_sequences(y_val_seq, maxlen = max_summary_len, padding = 'post')

# size of vocabulary
y_voc = y_tokenizer.num_words
print("Size of summary vocabulary: ", y_voc)

% of rare words in vocabulary: 66.22066078636567
Total Coverage of rare words: 3.825748591090467
Size of summary vocabulary:  4519


## Model Building

In [18]:
data['Cleaned Summary'] = data['Cleaned Summary'].apply(lambda x : '_START_ '+ x + ' _END_')

In [19]:
print("Review:",data['Cleaned Text'][0])
print("\nSummary:",data['Cleaned Summary'][0])

Review: i have bought several of the vitality canned dog food products and have found them all to be of good quality the product looks more like a stew than a processed meat and it smells better my labrador is finicky and she appreciates this product better than most

Summary: _START_ good quality dog food _END_
