In [1]:
import json
import requests
import nltk
import sys
import re
import heapq
import gensim

nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from transformers import pipeline

from math import ceil

from gensim.summarization import summarize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\viral\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
logging = True

In [3]:
def read_reviews():
    raw = requests.get("https://raw.githubusercontent.com/patelviralb/text-summarization/main/dataset/cornell_reviews.json").text.strip()
    corpus = [json.loads(line) for line in raw.split("\n")]
    
    return corpus

In [4]:
def vectorize_input_corpus(documents):
    documents = []
    classes = []
    stop_words = stopwords.words('english')

    for entry in corpus:
        documents.append(entry['text'])
        classes.append(entry['class'])

    vectorizer = CountVectorizer(input=documents, max_df=0.25, token_pattern=r'\b[a-zA-Z0-9]*[a-zA-Z][a-zA-Z0-9]*\b', ngram_range=(1,3), max_features=300000, binary=True)
    count_vector = vectorizer.fit_transform(documents)

    vectors = count_vector.toarray()
#     vocab = vectorizer.get_feature_names()

#     return vectors, classes, vocab
    return vectors, documents, classes

In [5]:
def distribute_test_train_corpus(vectors, documents, classes):
    document_indices = [*range(0, len(documents), 1)]
    test_train_data_indces = train_test_split(document_indices, train_size = 0.75, random_state = 41)

    train_vectors = []
    train_documents = []
    train_classes = []

    for index in test_train_data_indces[0]:
        train_vectors.append(vectors[index])
        train_documents.append(documents[index])
        train_classes.append(classes[index])

    test_vectors = []
    test_documents = []
    test_classes = []

    for index in test_train_data_indces[1]:
        test_vectors.append(vectors[index])
        test_documents.append(documents[index])
        test_classes.append(classes[index])
    
    return train_vectors, train_documents, train_classes, test_vectors, test_documents, test_classes

In [6]:
def get_model(train_vectors, train_classes):
    logistic_regression_model = LogisticRegression(C=0.05, solver='liblinear', max_iter = 1000, penalty="l2")
    logistic_regression_model.fit(train_vectors, train_classes)
    
    return logistic_regression_model

In [7]:
def run_evaluation(model, test_vectors, test_classes):
    accuracy = accuracy_score(test_classes, model.predict(test_vectors))

    return accuracy

### Initial Accuracy Computation

Below code computes the baseline accuracy after dividing the corpus into training and test dataset. This accuracy will be used to compare with the accuracies generated after summarization

In [8]:
corpus = read_reviews()
vectors, documents, classes = vectorize_input_corpus(corpus)

In [9]:
train_vectors, train_documents, train_classes, test_vectors, test_documents, test_classes = distribute_test_train_corpus(vectors, documents, classes)

In [10]:
logistic_regression_model = get_model(train_vectors, train_classes)
baseline_accuracy = run_evaluation(logistic_regression_model, test_vectors, test_classes)

print("baseline_accuracy:\t{}".format(baseline_accuracy))

baseline_accuracy:	0.896


## Summarization

In [11]:
def vectorize_summary(summary_corpus):
    vectorizer = CountVectorizer(input=summary_corpus, max_df=0.25, token_pattern=r'\b[a-zA-Z0-9]*[a-zA-Z][a-zA-Z0-9]*\b', ngram_range=(1,3), max_features=300000, binary=True)
    count_vector = vectorizer.fit_transform(summary_corpus)

    summary_vectors = count_vector.toarray()
    
    return summary_vectors

### Summarize Text using `Weighted Word Frequency`

In [12]:
def get_weighted_word_summary(document, summary_sentence_count = 10, max_words_in_sentence = sys.maxsize):
    original_text = document
    # Preprocessing
    formatted_text = re.sub(r'\s+', ' ',  re.sub('[^a-zA-Z]', ' ', document))
    # Converting Text To Sentences
    sentence_list = sent_tokenize(document)
    
    # Find Weighted Frequency of Occurrence
    stop_words = stopwords.words('english')
    word_frequencies = {}
    for word in word_tokenize(formatted_text):
        if word not in stop_words:
            if word not in word_frequencies.keys():
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1
    
    maximum_frequncy = max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)
    
    # Calculating Sentence Scores
    sentence_scores = {}
    for sentence in sentence_list:
        if len(sentence.split(' ')) < max_words_in_sentence:
            for word in word_tokenize(sentence.lower()):
                if word in word_frequencies.keys():
                    if sentence not in sentence_scores.keys():
                        sentence_scores[sentence] = word_frequencies[word]
                    else:
                        sentence_scores[sentence] += word_frequencies[word]
    
    # Getting the Summary
    summary_sentences = heapq.nlargest(summary_sentence_count, sentence_scores, key=sentence_scores.get)
    summarized_text = ' '.join(summary_sentences)
    
    return summarized_text

In [13]:
def create_weighted_word_summary_corpus(documents, summary_sentence_count = 10, max_words_in_sentence = sys.maxsize):
    document_summary = []
    for index, document in enumerate(documents):
        summary = get_weighted_word_summary(document, summary_sentence_count, max_words_in_sentence)
        document_summary.append(summary)
    
    return document_summary

In [14]:
def compute_weighted_word_summary_accuracy(train_documents, train_classes, test_documents, test_classes, summary_sentence_count = 10, max_words_in_sentence = sys.maxsize):
    train_summary = create_weighted_word_summary_corpus(train_documents, summary_sentence_count, max_words_in_sentence)
    test_summary = create_weighted_word_summary_corpus(test_documents, summary_sentence_count, max_words_in_sentence)
    
    summary_corpus = []
    summary_corpus.extend(train_summary)
    summary_corpus.extend(test_summary)

    summary_vectors = vectorize_summary(summary_corpus)
    train_summary_vectors = summary_vectors[0:1500]
    test_summary_vectors = summary_vectors[1500:]

    logistic_regression_model_after_summary = get_model(train_summary_vectors, train_classes)
    weighted_word_summary_accuracy = run_evaluation(logistic_regression_model_after_summary, test_summary_vectors, test_classes)

    return weighted_word_summary_accuracy

In [15]:
# summary_sentence_count = 7, max_words_in_sentence = 30
weighted_word_summary_accuracy = compute_weighted_word_summary_accuracy(train_documents, train_classes, test_documents, test_classes, summary_sentence_count = 7, max_words_in_sentence = 30)
print("weighted_word_summary_accuracy:\t{}".format(weighted_word_summary_accuracy))

weighted_word_summary_accuracy:	0.742


In [16]:
# summary_sentence_count = 7
weighted_word_summary_accuracy = compute_weighted_word_summary_accuracy(train_documents, train_classes, test_documents, test_classes, summary_sentence_count = 7)
print("weighted_word_summary_accuracy:\t{}".format(weighted_word_summary_accuracy))

weighted_word_summary_accuracy:	0.762


In [17]:
weighted_word_summary_accuracy = compute_weighted_word_summary_accuracy(train_documents, train_classes, test_documents, test_classes)
print("weighted_word_summary_accuracy:\t{}".format(weighted_word_summary_accuracy))

weighted_word_summary_accuracy:	0.784


### Summarize Text using `transformers.pipeline`

In [18]:
def get_transformers_pipeline_summary(original_text, minimum_length = 20, maximum_length = 200):
    original_text_length = len(original_text)
    
    summarization = pipeline("summarization")
    
    original_documents = []
    if original_text_length > 1024:
        total_range = ceil(original_text_length / 1024)
        i = 0
        while i < total_range:
            start = i * 1024
            end = (start + 1024) if i != total_range - 1 else original_text_length
            summarized_text = summarization(original_text[start:end], min_length = minimum_length, max_length = original_text_length if maximum_length > original_text_length else maximum_length)[0]['summary_text']
            original_documents.append(summarized_text)
            i += 1
        
        return " ".join(original_documents)
    
    summarized_text = summarization(original_text, min_length = minimum_length, max_length = maximum_length)[0]['summary_text']
    
    return summarized_text

In [19]:
def create_transformers_pipeline_summary_corpus(documents, min_length = 20, max_length = 200):
    document_summary = []
    for index, document in enumerate(documents):
        summary = get_transformers_pipeline_summary(document, min_length, max_length)
        document_summary.append(summary)
    
    return document_summary

In [20]:
def compute_transformers_pipeline_summary_accuracy(train_documents, train_classes, test_documents, test_classes, min_length = 20, max_length = 200):
    train_summary = create_transformers_pipeline_summary_corpus(train_documents, min_length, max_length)
    if logging == True: print("train_summary completed")
    test_summary = create_transformers_pipeline_summary_corpus(test_documents, min_length, max_length)
    if logging == True: print("test_summary completed")
    
    summary_corpus = []
    summary_corpus.extend(train_summary)
    summary_corpus.extend(test_summary)
    if logging == True: print("summary_corpus created")

    summary_vectors = vectorize_summary(summary_corpus)
    train_summary_vectors = summary_vectors[0:1500]
    test_summary_vectors = summary_vectors[1500:]
    if logging == True: print("summary_corpus distributed")

    logistic_regression_model_after_summary = get_model(train_summary_vectors, train_classes)
    if logging == True: print("logistic_regression_model_after_summary computed")
    transformers_pipeline_summary_accuracy = run_evaluation(logistic_regression_model_after_summary, test_summary_vectors, test_classes)
    if logging == True: print("transformers_pipeline_summary_accuracy computed")

    return transformers_pipeline_summary_accuracy

In [21]:
# min_length = 5, max_length = 20
# transformers_pipeline_summary_accuracy = compute_transformers_pipeline_summary_accuracy(train_documents, train_classes, test_documents, test_classes, 0, 20)
# print("transformers_pipeline_summary_accuracy:\t{}".format(transformers_pipeline_summary_accuracy))

In [22]:
# min_length = 20, max_length = 200
# transformers_pipeline_summary_accuracy = compute_transformers_pipeline_summary_accuracy(train_documents, train_classes, test_documents, test_classes, 0, 200)
# print("transformers_pipeline_summary_accuracy:\t{}".format(transformers_pipeline_summary_accuracy))

### Summarize Text using `TextRank`

In [23]:
def get_textrank_summary(original_text, ratio):
    try:
        summarized_text = summarize(original_text, ratio)
    except ValueError as v:
        return original_text

    return summarized_text

In [24]:
def create_textrank_summary_corpus(documents, ratio):
    document_summary = []
    for index, document in enumerate(documents):
        summary = get_textrank_summary(document, ratio)
        document_summary.append(summary)
    
    return document_summary

In [25]:
def compute_textrank_summary_accuracy(train_documents, train_classes, test_documents, test_classes, ratio=0.2):
    train_summary = create_textrank_summary_corpus(train_documents, ratio)
    if logging == True: print("train_summary completed")
    test_summary = create_textrank_summary_corpus(test_documents, ratio)
    if logging == True: print("test_summary completed")
    
    summary_corpus = []
    summary_corpus.extend(train_summary)
    summary_corpus.extend(test_summary)
    if logging == True: print("summary_corpus created")

    summary_vectors = vectorize_summary(summary_corpus)
    train_summary_vectors = summary_vectors[0:1500]
    test_summary_vectors = summary_vectors[1500:]
    if logging == True: print("summary_corpus distributed")

    logistic_regression_model_after_summary = get_model(train_summary_vectors, train_classes)
    if logging == True: print("logistic_regression_model_after_summary computed")
    transformers_pipeline_summary_accuracy = run_evaluation(logistic_regression_model_after_summary, test_summary_vectors, test_classes)
    if logging == True: print("transformers_pipeline_summary_accuracy computed")

    return transformers_pipeline_summary_accuracy

In [26]:
textrank_summary_accuray = compute_textrank_summary_accuracy(train_documents, train_classes, test_documents, test_classes, 0.2)
print("textrank_summary_accuray:\t{}".format(textrank_summary_accuray))

train_summary completed
test_summary completed
summary_corpus created
summary_corpus distributed
logistic_regression_model_after_summary computed
transformers_pipeline_summary_accuracy computed
textrank_summary_accuray:	0.742


In [30]:
textrank_summary_accuray = compute_textrank_summary_accuracy(train_documents, train_classes, test_documents, test_classes, 0.3)
print("textrank_summary_accuray:\t{}".format(textrank_summary_accuray))

train_summary completed
test_summary completed
summary_corpus created
summary_corpus distributed
logistic_regression_model_after_summary computed
transformers_pipeline_summary_accuracy computed
textrank_summary_accuray:	0.788


In [28]:
textrank_summary_accuray = compute_textrank_summary_accuracy(train_documents, train_classes, test_documents, test_classes, 0.5)
print("textrank_summary_accuray:\t{}".format(textrank_summary_accuray))

train_summary completed
test_summary completed
summary_corpus created
summary_corpus distributed
logistic_regression_model_after_summary computed
transformers_pipeline_summary_accuracy computed
textrank_summary_accuray:	0.818
