In [1]:
import json
import requests
import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\viral\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def read_reviews(
    url="https://raw.githubusercontent.com/patelviralb/text-summarization/main/dataset/cornell_reviews.json",
    timeout=30,
):
    """Download the review corpus and parse it as JSON Lines.

    Parameters
    ----------
    url : str
        Location of the corpus file; one JSON object per line.
    timeout : float
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    list
        One parsed JSON value per non-blank line, in file order.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of handing an error page to json.loads.
    response.raise_for_status()

    raw = response.text.strip()
    # Skip blank lines: json.loads('') would raise on a stray empty line.
    corpus = [json.loads(line) for line in raw.split("\n") if line.strip()]

    return corpus

In [3]:
def vectorize_input(documents):
    """Turn the review corpus into binary n-gram presence vectors.

    Parameters
    ----------
    documents : iterable of dict
        Corpus entries; each must provide a 'text' value (the review body)
        and a 'class' value (its label).

    Returns
    -------
    vectors
        Dense array produced by ``CountVectorizer.fit_transform(...).toarray()``
        over the review texts.
    texts : list
        The 'text' value of every entry, in input order.
    classes : list
        The 'class' value of every entry, in input order.
    """
    # Read from the argument, not from a notebook-level global, so the
    # function works on whatever corpus the caller passes in.
    entries = list(documents)
    texts = [entry['text'] for entry in entries]
    classes = [entry['class'] for entry in entries]

    # `input` is left at its default: it selects how the raw documents are
    # read (content / file / filename) and is not where the data is passed.
    vectorizer = CountVectorizer(
        max_df=0.25,
        token_pattern=r'\b[a-zA-Z0-9]*[a-zA-Z][a-zA-Z0-9]*\b',
        ngram_range=(1, 3),
        max_features=300000,
        binary=True,
    )
    count_vector = vectorizer.fit_transform(texts)

    vectors = count_vector.toarray()

    return vectors, texts, classes

In [4]:
def distribute_test_train_corpus(vectors, documents, classes):
    """Split vectors, documents and classes into aligned train/test parts.

    Positions 0..len(documents)-1 are partitioned by ``train_test_split``
    with 75% going to training and a fixed ``random_state`` of 41; the same
    partition is then applied to all three parallel sequences.

    Returns a 6-tuple of lists:
    (train_vectors, train_documents, train_classes,
     test_vectors, test_documents, test_classes).
    """
    positions = list(range(len(documents)))
    train_positions, test_positions = train_test_split(positions, train_size = 0.75, random_state = 41)

    def pick(sequence, chosen):
        # Gather the items of `sequence` found at the chosen positions, in order.
        return [sequence[position] for position in chosen]

    return (
        pick(vectors, train_positions),
        pick(documents, train_positions),
        pick(classes, train_positions),
        pick(vectors, test_positions),
        pick(documents, test_positions),
        pick(classes, test_positions),
    )

In [5]:
def get_model(train_vectors, train_classes):
    """Fit and return a logistic-regression classifier on the training split.

    The estimator is configured with the liblinear solver, an L2 penalty,
    C=0.05 and at most 500 iterations.
    """
    hyperparameters = {
        'C': 0.05,
        'solver': 'liblinear',
        'max_iter': 500,
        'penalty': "l2",
    }
    classifier = LogisticRegression(**hyperparameters)
    classifier.fit(train_vectors, train_classes)

    return classifier

In [6]:
def run_evaluation(model, test_vectors, test_classes):
    """Return the accuracy of `model` on the held-out split.

    `model.predict` is called on `test_vectors` and the predictions are
    scored against `test_classes` with `accuracy_score`.
    """
    predicted_classes = model.predict(test_vectors)
    return accuracy_score(test_classes, predicted_classes)

### Initial Accuracy Computation

The code below computes the baseline accuracy after splitting the corpus into training and test sets. This accuracy is later compared with the accuracies obtained after summarization.

In [7]:
# Fetch the corpus, then vectorize it. `documents` holds the raw review
# texts and `classes` their labels, aligned with the rows of `vectors`.
corpus = read_reviews()
vectors, documents, classes = vectorize_input(corpus)

# Executable sanity check in place of the former commented-out debug prints:
# every document must have exactly one vector and one label.
assert len(vectors) == len(documents) == len(classes)

In [8]:
# Split vectors, raw texts and labels in parallel into train and test parts.
(
    train_vectors,
    train_documents,
    train_classes,
    test_vectors,
    test_documents,
    test_classes,
) = distribute_test_train_corpus(vectors, documents, classes)

In [9]:
# Fit on the training split, then score on the held-out split to obtain the
# reference accuracy for the un-summarized reviews.
logistic_regression_model = get_model(train_vectors, train_classes)
baseline_accuracy = run_evaluation(
    logistic_regression_model,
    test_vectors,
    test_classes,
)

print(f"baseline_accuracy:\t{baseline_accuracy}")

baseline_accuracy:	0.896


### Summarize Text using Weighted Word Frequency

###### Preprocessing

In [33]:
import re

# NOTE(review): nltk.sent_tokenize needs NLTK's sentence-tokenizer data; the
# visible setup cell only downloads 'stopwords' — confirm it is installed.


def _preprocess_documents(raw_documents):
    """Return (letters-only texts, sentence lists) for the given documents.

    For each document, every character outside a-z/A-Z is replaced by a
    space and runs of whitespace are collapsed to a single space; separately,
    the untouched document is split into sentences with nltk.sent_tokenize.
    Both returned lists are in input order.
    """
    letters_only_texts = []
    sentence_lists = []
    for document in raw_documents:
        letters_only = re.sub('[^a-zA-Z]', ' ', document)
        letters_only_texts.append(re.sub(r'\s+', ' ', letters_only))
        # Sentences come from the original text: punctuation is needed to
        # find sentence boundaries.
        sentence_lists.append(nltk.sent_tokenize(document))
    return letters_only_texts, sentence_lists


# One shared helper for both splits instead of two copy-pasted loops.
formatted_train_text, train_text_sentence_list = _preprocess_documents(train_documents)
formatted_test_text, test_text_sentence_list = _preprocess_documents(test_documents)

# In each container below, index 0 is the training split and index 1 the test split.
original_text = [train_documents, test_documents]
formatted_text = [formatted_train_text, formatted_test_text]
original_text_sentence_list = [train_text_sentence_list, test_text_sentence_list]

500

In [34]:
import bs4 as bs
import urllib.request
import re

# Download the page. The context manager closes the HTTP response as soon as
# the body has been read, and the timeout keeps a stalled connection from
# hanging the notebook.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence', timeout=30) as scraped_data:
    article = scraped_data.read()

parsed_article = bs.BeautifulSoup(article,'lxml')

paragraphs = parsed_article.find_all('p')

# Concatenate all paragraph texts in one pass instead of growing a string
# with += inside a loop.
article_text = "".join(paragraph.text for paragraph in paragraphs)

# Drop bracketed numeric markers such as "[12]", then collapse whitespace.
article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)
article_text = re.sub(r'\s+', ' ', article_text)

# Letters-only copy for word-level processing.
formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text )
formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)

# Sentences are taken from the punctuated text, words from the letters-only text.
sentence_list = nltk.sent_tokenize(article_text)
nltk.word_tokenize(formatted_article_text)

['Artificial',
 'intelligence',
 'AI',
 'is',
 'intelligence',
 'demonstrated',
 'by',
 'machines',
 'unlike',
 'the',
 'natural',
 'intelligence',
 'displayed',
 'by',
 'humans',
 'and',
 'animals',
 'which',
 'involves',
 'consciousness',
 'and',
 'emotionality',
 'The',
 'distinction',
 'between',
 'the',
 'former',
 'and',
 'the',
 'latter',
 'categories',
 'is',
 'often',
 'revealed',
 'by',
 'the',
 'acronym',
 'chosen',
 'Strong',
 'AI',
 'is',
 'usually',
 'labelled',
 'as',
 'artificial',
 'general',
 'intelligence',
 'AGI',
 'while',
 'attempts',
 'to',
 'emulate',
 'natural',
 'intelligence',
 'have',
 'been',
 'called',
 'artificial',
 'biological',
 'intelligence',
 'ABI',
 'Leading',
 'AI',
 'textbooks',
 'define',
 'the',
 'field',
 'as',
 'the',
 'study',
 'of',
 'intelligent',
 'agents',
 'any',
 'device',
 'that',
 'perceives',
 'its',
 'environment',
 'and',
 'takes',
 'actions',
 'that',
 'maximize',
 'its',
 'chance',
 'of',
 'successfully',
 'achieving',
 'its',
 