# Abstraction-Based Text Summarization Using Sequence to Sequence Algorithm

In [1]:
import re
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook

import nltk
from nltk.corpus import stopwords

In [2]:
import torch
import torch.nn as nn

In [3]:
# Apply the 'ggplot' style sheet to the figures drawn later in this notebook.
plt.style.use('ggplot')

## Load Datasets

In [4]:
# Dataset source: www.kaggle.com/snap/amazon-fine-food-reviews
# Only the first 100,000 rows of the CSV are read.
df_reviews = pd.read_csv("./datasets/amazon-fine-food-reviews.csv", nrows=100000)
# Shift the row labels up by one so they start at 1 rather than 0.
df_reviews.index += 1
# Preview the two columns this notebook works with: the review body and its summary.
df_reviews[['Text', 'Summary']].head()

Unnamed: 0,Text,Summary
1,I have bought several of the Vitality canned d...,Good Quality Dog Food
2,Product arrived labeled as Jumbo Salted Peanut...,Not as Advertised
3,This is a confection that has been around a fe...,"""Delight"" says it all"
4,If you are looking for the secret ingredient i...,Cough Medicine
5,Great taffy at a great price. There was a wid...,Great taffy


## Preprocess Datasets

#### 1) Dropping duplicates and NaN values

In [None]:
# Keep only the first row for each distinct summary, then discard every row
# that still has a missing value in any column.
df_reviews = df_reviews.drop_duplicates(subset=['Summary']).dropna(axis=0)

In [None]:
# Keep only the first row for each distinct review body, then discard every
# row that still has a missing value in any column.
df_reviews = df_reviews.drop_duplicates(subset=['Text']).dropna(axis=0)

#### 2) Removing stop words, expanding contractions, and stripping non-alphabetic characters

In [None]:
# English stop words from NLTK, held in a set because clean_up_text tests
# membership once per word.
stop_words = set(stopwords.words('english'))

In [None]:
# Load the lookup table whose keys are replaced by their values in the
# clean-up functions below.
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# this file from a trusted source.
# The context manager closes the file handle as soon as the table is loaded
# (previously the handle was left open for the life of the kernel).
with open('../../others/contraction_dictionary.pickle', 'rb') as pickle_file:
    contraction_dict = pickle.load(pickle_file)

In [None]:
def clean_up_text(sentence):
    """Normalise a raw review into a space-separated string of content words.

    Steps, in order: lowercase the input, strip markup with BeautifulSoup,
    remove parenthesised spans and double quotes, substitute every word that
    is a key of ``contraction_dict``, drop possessive ``'s``, turn each
    non-letter into a space, and finally discard stop words and words shorter
    than three characters.

    Returns the surviving words joined by single spaces (possibly ``''``).
    """
    text = BeautifulSoup(sentence.lower(), 'lxml').text
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub('"', '', text)

    # Split on a literal space (not on arbitrary whitespace) so the lookup
    # sees exactly the same tokens as before.
    text = ' '.join(contraction_dict.get(token, token) for token in text.split(' '))

    text = re.sub(r"'s\b", '', text)
    text = re.sub('[^a-zA-Z]', ' ', text)

    kept = [
        token
        for token in text.split()
        if token not in stop_words and len(token) >= 3
    ]
    return ' '.join(kept)

In [None]:
# Clean every review body (tqdm_notebook wraps the iteration) and store the
# results in a new column alongside the raw text.
cleaned_text = [clean_up_text(review) for review in tqdm_notebook(df_reviews['Text'])]
df_reviews['Cleaned Text'] = cleaned_text

In [None]:
def clean_up_summary(sentence):
    """Normalise a raw summary into lowercase words of two or more letters.

    Steps, in order: remove double quotes, substitute every word that is a
    key of ``contraction_dict``, drop possessive ``'s``, turn each non-letter
    into a space, lowercase, and discard single-character words.

    Returns the kept words, each followed by one space, so a non-empty result
    ends with a trailing space; an input with no kept words yields ``''``.

    NOTE(review): unlike clean_up_text, the contraction lookup runs before
    lowercasing - confirm the dictionary keys cover capitalised forms.
    """
    text = re.sub('"', '', sentence)
    text = ' '.join(contraction_dict.get(token, token) for token in text.split(' '))
    text = re.sub(r"'s\b", '', text)
    text = re.sub('[^a-zA-Z]', ' ', text).lower()

    return ''.join(token + ' ' for token in text.split() if len(token) > 1)

In [None]:
# Clean every summary (tqdm_notebook wraps the iteration) and store the
# results in a new column.
cleaned_summary = [clean_up_summary(raw) for raw in tqdm_notebook(df_reviews['Summary'])]
df_reviews['Cleaned Summary'] = cleaned_summary

# Wrap each cleaned summary in the start and end marker strings.
df_reviews['Cleaned Summary'] = df_reviews['Cleaned Summary'].apply(
    lambda summary: f'_START_ {summary} _END_'
)

In [None]:
# Preview the raw and cleaned columns side by side for the first five rows.
df_reviews[['Text', 'Cleaned Text', 'Summary', 'Cleaned Summary']].head()

In [None]:
# Print the first five cleaned (text, summary) pairs.
# Rows are addressed by position with .iloc: earlier cells dropped duplicate
# and NaN rows, so the labels 1..5 are not guaranteed to exist any more and a
# label lookup could raise KeyError. The range is also capped at the frame
# length so a very small frame does not raise IndexError.
for i in range(min(5, len(df_reviews))):
    print("Text:", df_reviews['Cleaned Text'].iloc[i])
    print("Summary:", df_reviews['Cleaned Summary'].iloc[i])
    print("\n")

#### 3) Setting maximum length of the reviews and the summary based on the distribution of sequences

In [None]:
# Number of whitespace-separated words in each cleaned review and each
# cleaned summary (the summary count includes the start/end markers).
text_word_count = [len(review.split()) for review in df_reviews['Cleaned Text']]
summary_word_count = [len(summary.split()) for summary in df_reviews['Cleaned Summary']]

In [None]:
# Put both word-count lists in one frame so each histogram reads a column.
df_dist_sequences = pd.DataFrame({'Text': text_word_count, 'Summary': summary_word_count})

fig = plt.figure(figsize=(15, 5))

# Right-hand panel: lengths of the cleaned reviews.
ax1 = fig.add_subplot(1, 2, 2)
ax1.set_title('Length Distribution of Text Sequences')
ax1.hist(x=df_dist_sequences['Text'], bins=30, color='Orange')

# Left-hand panel: lengths of the cleaned summaries.
ax2 = fig.add_subplot(1, 2, 1)
ax2.set_title('Length Distribution of Summary Sequences')
ax2.hist(x=df_dist_sequences['Summary'], bins=30, color='Blue')

# Write the figure to disk, then display it.
fig.savefig('./images/histogram-length-distribution-of-sequences')
plt.show()

In [None]:
# Fixed sequence lengths (in tokens) passed as `maxlen` to pad_sequences below.
# NOTE(review): presumably chosen by eye from the histograms above - confirm.
MAX_LEN_TEXT = 80
MAX_LEN_SUMMARY = 10

#### 4) Splitting the dataset into training and validation sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the rows for validation. Inputs are the cleaned reviews and
# targets are the cleaned summaries; shuffling uses a fixed seed so the split
# is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(df_reviews['Cleaned Text'], df_reviews['Cleaned Summary'], test_size=0.1, shuffle=True, random_state=10)

#### 5) Preparing The Tokenizer

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# preparing a tokenizer for text on training data
X_tokenizer = Tokenizer()
X_tokenizer.fit_on_texts(list(X_train))

# converting text sequences into integer sequences
X_train = X_tokenizer.texts_to_sequences(X_train)
X_valid = X_tokenizer.texts_to_sequences(X_valid)

# padding zero up to maximum length
X_train = pad_sequences(X_train, maxlen=MAX_LEN_TEXT, padding='post')
X_valid = pad_sequences(X_valid, maxlen=MAX_LEN_TEXT, padding='post')

# calculating vocabulary size
X_vocab_size = len(X_tokenizer.word_index) + 1

In [None]:
# preparing a tokenizer for summary on training data
y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(list(y_train))

# converting summary sequences into integer sequences
y_train = y_tokenizer.texts_to_sequences(y_train)
y_valid = y_tokenizer.texts_to_sequences(y_valid)

# padding zero up to maximum length
y_train = pad_sequences(y_train, maxlen=MAX_LEN_SUMMARY, padding='post')
y_valid = pad_sequences(y_valid, maxlen=MAX_LEN_SUMMARY, padding='post')

# calculating vocabulary size
y_vocab_size = len(y_tokenizer.word_index) + 1

## Build Seq2seq Network

---