In [73]:
import json
import requests
import nltk
import re
import heapq
import numpy as np

nltk.download('stopwords')
# sent_tokenize / word_tokenize (used by get_summary) need the Punkt sentence
# models too. The resource is called 'punkt' on older NLTK releases and
# 'punkt_tab' on newer ones; requesting both keeps a fresh environment working.
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\viral\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def read_reviews():
    """Download the review corpus and parse it into a list of records.

    The remote file is read line by line and every non-blank line is parsed
    as its own JSON document.

    Returns:
        list: one parsed JSON value per non-blank line, in file order.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection failure or timeout.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    url = "https://raw.githubusercontent.com/patelviralb/text-summarization/main/dataset/cornell_reviews.json"

    # A timeout stops the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
    response.raise_for_status()

    # Skip blank lines (e.g. a trailing newline) so they cannot break json.loads.
    corpus = [json.loads(line) for line in response.text.split("\n") if line.strip()]

    return corpus

In [3]:
def vectorize_input_corpus(documents):
    """Turn review records into binary n-gram vectors plus parallel text/label lists.

    Args:
        documents: iterable of mappings, each providing a 'text' entry and a
            'class' entry.

    Returns:
        tuple: (vectors, texts, classes) where
            vectors -- dense array from CountVectorizer, one row per record,
            texts   -- list of the 'text' values,
            classes -- list of the 'class' values,
        all in the order of the input records.
    """
    # Read from the argument. The earlier version reset `documents` to an empty
    # list and looped over the notebook-global `corpus`, ignoring what was passed.
    texts = []
    classes = []
    for entry in documents:
        texts.append(entry['text'])
        classes.append(entry['class'])

    # `input` is left at its default: it selects how the items given to
    # fit_transform are read ('content', 'file' or 'filename') and must not be
    # handed the corpus itself.
    vectorizer = CountVectorizer(
        max_df=0.25,  # drop terms that occur in more than 25% of the documents
        # tokens are alphanumeric runs that contain at least one letter
        token_pattern=r'\b[a-zA-Z0-9]*[a-zA-Z][a-zA-Z0-9]*\b',
        ngram_range=(1, 3),
        max_features=300000,
        binary=True,  # record presence/absence rather than counts
    )
    count_vector = vectorizer.fit_transform(texts)

    # toarray() materialises the full dense matrix; later cells index it row by row.
    vectors = count_vector.toarray()

    return vectors, texts, classes

In [4]:
def distribute_test_train_corpus(vectors, documents, classes):
    """Split vectors, documents and classes into aligned train and test lists.

    The positions 0..len(documents)-1 are divided by train_test_split
    (train_size=0.75, random_state=41); the same positions are then used to
    pick items from all three inputs, so the resulting lists stay aligned.

    Returns:
        tuple: (train_vectors, train_documents, train_classes,
                test_vectors, test_documents, test_classes), each a list.
    """
    positions = list(range(len(documents)))
    train_positions, test_positions = train_test_split(
        positions, train_size=0.75, random_state=41
    )

    # Training portion.
    train_vectors = [vectors[position] for position in train_positions]
    train_documents = [documents[position] for position in train_positions]
    train_classes = [classes[position] for position in train_positions]

    # Held-out portion.
    test_vectors = [vectors[position] for position in test_positions]
    test_documents = [documents[position] for position in test_positions]
    test_classes = [classes[position] for position in test_positions]

    return (
        train_vectors,
        train_documents,
        train_classes,
        test_vectors,
        test_documents,
        test_classes,
    )

In [5]:
def get_model(train_vectors, train_classes):
    """Fit a logistic-regression classifier on the training data and return it.

    The model uses an L2 penalty with C=0.05, the liblinear solver and at most
    500 iterations.
    """
    hyperparameters = {
        "penalty": "l2",
        "C": 0.05,
        "solver": 'liblinear',
        "max_iter": 500,
    }
    classifier = LogisticRegression(**hyperparameters)
    classifier.fit(train_vectors, train_classes)

    return classifier

In [6]:
def run_evaluation(model, test_vectors, test_classes):
    """Return the accuracy of `model` on the held-out vectors.

    The model's predictions for `test_vectors` are compared with
    `test_classes` through accuracy_score.
    """
    predicted_classes = model.predict(test_vectors)
    return accuracy_score(test_classes, predicted_classes)

### Initial Accuracy Computation

The code below computes the baseline accuracy after splitting the corpus into training and test sets. This accuracy is the reference against which the accuracies obtained after summarization are compared.

In [7]:
# Fetch the review records, then vectorize them. The second call returns the
# feature vectors together with the parallel lists of texts and class labels.
corpus = read_reviews()
vectors, documents, classes = vectorize_input_corpus(corpus)

# print("type(vectors):\t{}".format(type(vectors)))
# print("type(documents):\t{}".format(type(documents)))
# print("type(classes):\t{}".format(type(classes)))

# print("len(vectors):\t{}".format(len(vectors)))
# print("len(documents):\t{}".format(len(documents)))
# print("len(classes):\t{}".format(len(classes)))

In [8]:
# Split vectors, raw texts and labels into aligned train/test lists; the raw
# texts are kept so the summarization cells can work on the same split.
train_vectors, train_documents, train_classes, test_vectors, test_documents, test_classes = distribute_test_train_corpus(vectors, documents, classes)

In [9]:
# Train on the full-text vectors and score on the held-out split; this is the
# reference figure for the summary-based runs further down.
logistic_regression_model = get_model(train_vectors, train_classes)
baseline_accuracy = run_evaluation(logistic_regression_model, test_vectors, test_classes)

print("baseline_accuracy:\t{}".format(baseline_accuracy))

baseline_accuracy:	0.896


### Summarize Text using Weighted Word Frequency

###### Preprocessing

In [68]:
def get_summary(document, max_sentences=10, max_sentence_words=1000):
    """Build an extractive summary of `document` by weighted word frequency.

    Each non-stop-word gets a weight equal to its count divided by the count
    of the most frequent word. A sentence's score is the sum of the weights of
    its tokens, and the highest-scoring sentences form the summary.

    Args:
        document (str): text to summarize.
        max_sentences (int): maximum number of sentences kept in the summary.
        max_sentence_words (int): sentences with this many or more
            space-separated words are not scored.

    Returns:
        tuple: (summary, sentence_count). `summary` is the selected sentences
        joined by single spaces, highest score first; it is '' when no
        sentence could be scored (for example an empty document or one made
        only of stop words). `sentence_count` is the number of sentences
        sent_tokenize found in `document`.
    """
    # Keep letters only, collapse whitespace and lower-case the result so the
    # vocabulary matches both NLTK's lower-case stop-word list and the
    # lower-cased sentence tokens that are scored below.
    formatted_text = re.sub(r'\s+', ' ', re.sub('[^a-zA-Z]', ' ', document)).lower()
    sentence_list = sent_tokenize(document)

    # A set gives constant-time membership tests inside the loop.
    stop_words = set(stopwords.words('english'))

    # Raw counts of every non-stop-word.
    word_frequencies = {}
    for word in word_tokenize(formatted_text):
        if word not in stop_words:
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

    # Normalize by the largest count. Guarded because max() raises ValueError
    # on an empty table.
    if word_frequencies:
        maximum_frequency = max(word_frequencies.values())
        word_frequencies = {
            word: count / maximum_frequency
            for word, count in word_frequencies.items()
        }

    # Sum the weights of the known words in each sufficiently short sentence.
    # Scores are keyed by sentence text, and a sentence without any weighted
    # word never enters the table.
    sentence_scores = {}
    for sentence in sentence_list:
        if len(sentence.split(' ')) >= max_sentence_words:
            continue
        for word in word_tokenize(sentence.lower()):
            weight = word_frequencies.get(word)
            if weight is not None:
                sentence_scores[sentence] = sentence_scores.get(sentence, 0.0) + weight

    # nlargest yields the sentences ordered from highest to lowest score.
    summary_sentences = heapq.nlargest(max_sentences, sentence_scores, key=sentence_scores.get)
    summary = ' '.join(summary_sentences)

    return summary, len(sentence_list)

In [78]:
def _summarize_all(documents_to_summarize):
    """Run get_summary on every document; return (summaries, sentence_counts)."""
    summaries = []
    sentence_counts = []
    for document in documents_to_summarize:
        summary, sentence_count = get_summary(document)
        summaries.append(summary)
        sentence_counts.append(sentence_count)
    return summaries, sentence_counts


def _report_sentence_counts(label, source_documents, summaries, sentence_counts):
    """Print sentence-count statistics and the document with the fewest sentences."""
    # Look the shortest document up instead of hard-coding its position, so the
    # report stays correct when the split or the data changes.
    shortest = int(np.argmin(sentence_counts))

    print("len({}_sentence_count):\t{}".format(label, len(sentence_counts)))
    print("min({}_sentence_count):\t{}".format(label, min(sentence_counts)))
    print("argmin({}_sentence_count):\t{}".format(label, shortest))
    print("max({}_sentence_count):\t{}".format(label, max(sentence_counts)))
    print("argmax({}_sentence_count):\t{}".format(label, np.argmax(sentence_counts)))

    print("-"*50)
    print("MIN {}_documents[{}]:\n{}".format(label, shortest, source_documents[shortest]))
    print("MIN {}_summary[{}]:\n{}".format(label, shortest, summaries[shortest]))


# Summaries of the training documents (consumed by the vectorization cells below).
train_summary, train_sentence_count = _summarize_all(train_documents)
_report_sentence_counts("train", train_documents, train_summary, train_sentence_count)
print("*"*50)

# Summaries of the held-out documents.
test_summary, test_sentence_count = _summarize_all(test_documents)
_report_sentence_counts("test", test_documents, test_summary, test_sentence_count)

# print("len(test_summary):\n{}".format(len(test_summary)))
# print("test_summary:\n{}".format(test_summary))

len(train_sentence_count):	1500
min(train_sentence_count):	1
argmin(train_sentence_count):	655
max(train_sentence_count):	188
argmax(train_sentence_count):	289
--------------------------------------------------
MIN train_documents[655]:
this film is extraordinarily horrendous and i'm not going to waste any more words on it . 
MIN train_summary[655]:
this film is extraordinarily horrendous and i'm not going to waste any more words on it .
**************************************************
len(test_sentence_count):	500
min(test_sentence_count):	5
argmin(test_sentence_count):	247
max(test_sentence_count):	172
argmax(test_sentence_count):	88
--------------------------------------------------
MIN test_documents[247]:
deserves recognition for : making this relatively youthful critic feel extremely old and crotchety20 capsule review : this is what feel-good family entertainment has morphed into in the 90's : an hour-and-a-half commercial , disguised as an unnecessary remake , in which the def

In [61]:
def vectorize_summary(summary_corpus):
    """Vectorize summary texts as binary 1- to 3-gram features.

    Args:
        summary_corpus: sequence of summary strings.

    Returns:
        Dense array from CountVectorizer with one row per summary, in input
        order.
    """
    # `input` is left at its default: it selects how the items given to
    # fit_transform are read ('content', 'file' or 'filename') and must not be
    # handed the corpus itself.
    vectorizer = CountVectorizer(
        max_df=0.25,  # drop terms that occur in more than 25% of the summaries
        # tokens are alphanumeric runs that contain at least one letter
        token_pattern=r'\b[a-zA-Z0-9]*[a-zA-Z][a-zA-Z0-9]*\b',
        ngram_range=(1, 3),
        max_features=300000,
        binary=True,  # record presence/absence rather than counts
    )
    count_vector = vectorizer.fit_transform(summary_corpus)

    summary_vectors = count_vector.toarray()

    return summary_vectors

In [58]:
# < 30 words in sentence, 7 top sentences
# NOTE(review): the limits named above are applied inside get_summary, not in
# this cell; train_summary / test_summary must have been rebuilt with them
# before this cell runs for the printed accuracy to correspond to the label.

# Train summaries first, test summaries after; the vectorizer is fitted on both
# together, so the two parts share one vocabulary.
summary_corpus = train_summary + test_summary

summary_vectors = vectorize_summary(summary_corpus)
# Split at the actual train/test boundary instead of a literal 1500 so the
# slices stay aligned with train_classes / test_classes if the split changes.
train_summary_vectors = summary_vectors[:len(train_summary)]
test_summary_vectors = summary_vectors[len(train_summary):]

logistic_regression_model_after_summary = get_model(train_summary_vectors, train_classes)
weighted_average_summary_accuracy = run_evaluation(logistic_regression_model_after_summary, test_summary_vectors, test_classes)

print("weighted_average_summary_accuracy:\t{}".format(weighted_average_summary_accuracy))

weighted_average_summary_accuracy:	0.742


In [62]:
# < 1000 words in sentence, 7 top sentences
# NOTE(review): the limits named above are applied inside get_summary, not in
# this cell; train_summary / test_summary must have been rebuilt with them
# before this cell runs for the printed accuracy to correspond to the label.

# Train summaries first, test summaries after; the vectorizer is fitted on both
# together, so the two parts share one vocabulary.
summary_corpus = train_summary + test_summary

summary_vectors = vectorize_summary(summary_corpus)
# Split at the actual train/test boundary instead of a literal 1500 so the
# slices stay aligned with train_classes / test_classes if the split changes.
train_summary_vectors = summary_vectors[:len(train_summary)]
test_summary_vectors = summary_vectors[len(train_summary):]

logistic_regression_model_after_summary = get_model(train_summary_vectors, train_classes)
weighted_average_summary_accuracy = run_evaluation(logistic_regression_model_after_summary, test_summary_vectors, test_classes)

print("weighted_average_summary_accuracy:\t{}".format(weighted_average_summary_accuracy))

weighted_average_summary_accuracy:	0.762


0.762

In [66]:
# < 1000 words in sentence, 10 top sentences
# NOTE(review): the limits named above are applied inside get_summary, not in
# this cell; train_summary / test_summary must have been rebuilt with them
# before this cell runs for the printed accuracy to correspond to the label.

# Train summaries first, test summaries after; the vectorizer is fitted on both
# together, so the two parts share one vocabulary.
summary_corpus = train_summary + test_summary

summary_vectors = vectorize_summary(summary_corpus)
# Split at the actual train/test boundary instead of a literal 1500 so the
# slices stay aligned with train_classes / test_classes if the split changes.
train_summary_vectors = summary_vectors[:len(train_summary)]
test_summary_vectors = summary_vectors[len(train_summary):]

logistic_regression_model_after_summary = get_model(train_summary_vectors, train_classes)
weighted_average_summary_accuracy = run_evaluation(logistic_regression_model_after_summary, test_summary_vectors, test_classes)

print("weighted_average_summary_accuracy:\t{}".format(weighted_average_summary_accuracy))

weighted_average_summary_accuracy:	0.784
