In [1]:
!pip install PyPDF2
!pip install docx2txt



In [18]:
import numpy as np
import PyPDF2
import docx2txt
import sys
import matplotlib.pyplot as plt
%matplotlib inline 
import networkx as nx
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest

In [6]:
from nltk.tokenize.punkt import PunktSentenceTokenizer

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [9]:
# we are going to show an example of how the method is working
# first let's take the document as an input
def readDoc():
    name = input('Please input a file name: ') 
    print('You have asked for the document {}'.format(name))

    # now read the type of document
    if name.lower().endswith('.txt'):
        choice = 1
    elif name.lower().endswith('.pdf'):
        choice = 2
    else:
        choice = 3
        # print(name)
    print(choice)
    # Case 1: if it is a .txt file
        
    if choice == 1:
        f = open(name, 'r')
        document = f.read()
        f.close()
            
    # Case 2: if it is a .pdf file
    elif choice == 2:
        pdfFileObj = open(name, 'rb')
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        pageObj = pdfReader.getPage(0)
        document = pageObj.extractText()
        pdfFileObj.close()
    
    # Case 3: none of the format
    else:
        print('Failed to load a valid file')
        print('Returning an empty string')
        document = ''
    
    print(type(document))
    return document

In [15]:
def abstractive_summarize(text,per):
    nlp = spacy.load('en_core_web_sm')
    doc= nlp(text)
    tokens=[token.text for token in doc]
    word_frequencies={}
    for word in doc:
        if word.text.lower() not in list(STOP_WORDS):
            if word.text.lower() not in punctuation:
                if word.text not in word_frequencies.keys():
                    word_frequencies[word.text] = 1
                else:
                    word_frequencies[word.text] += 1
    max_frequency=max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word]=word_frequencies[word]/max_frequency
    sentence_tokens= [sent for sent in doc.sents]
    sentence_scores = {}
    for sent in sentence_tokens:
        for word in sent:
            if word.text.lower() in word_frequencies.keys():
                if sent not in sentence_scores.keys():                            
                    sentence_scores[sent]=word_frequencies[word.text.lower()]
                else:
                    sentence_scores[sent]+=word_frequencies[word.text.lower()]
    select_length=int(len(sentence_tokens)*per)
    summary=nlargest(select_length, sentence_scores,key=sentence_scores.get)
    final_summary=[word.text for word in summary]
    summary=''.join(final_summary)
    return summary
    # Add your abstractive summarization code here
    pass

 

In [20]:
import numpy as np
from nltk.tokenize.punkt import PunktSentenceTokenizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
import networkx as nx

def extractive_summarize(text):
    def tokenize(document):
        doc_tokenizer = PunktSentenceTokenizer()
        return doc_tokenizer.tokenize(document)
    
    def process_document(document):
        sentences_list = tokenize(document)
        cv = CountVectorizer()
        cv_matrix = cv.fit_transform(sentences_list)
        normal_matrix = TfidfTransformer().fit_transform(cv_matrix)
        res_graph = normal_matrix * normal_matrix.T
        nx_graph = nx.from_scipy_sparse_matrix(res_graph)
        ranks = nx.pagerank(nx_graph)
        sentence_array = sorted(((ranks[i], s) for i, s in enumerate(sentences_list)), reverse=True)
        sentence_array = np.asarray(sentence_array)
        rank_max = float(sentence_array[0][0])
        rank_min = float(sentence_array[len(sentence_array) - 1][0])
        temp_array = []
        
        flag = 0
        if rank_max - rank_min == 0:
            temp_array.append(0)
            flag = 1
        
        if flag != 1:
            for i in range(0, len(sentence_array)):
                temp_array.append((float(sentence_array[i][0]) - rank_min) / (rank_max - rank_min))
        
        threshold = (sum(temp_array) / len(temp_array)) + 0.2
        sentence_list = []
        if len(temp_array) > 1:
            for i in range(0, len(temp_array)):
                if temp_array[i] > threshold:
                        sentence_list.append(sentence_array[i][1])
        else:
            sentence_list.append(sentence_array[0][1])
        
        summary = " ".join(str(x) for x in sentence_list)
        return summary
    
    summary = process_document(text)
    return summary


In [25]:

def main():
    print("Choose a summarization method:")
    print("1. Abstractive Summarization")
    print("2. Extractive Summarization")
    
    choice = input("Enter your choice (1 or 2): ")

    if choice == '1':
        text = readDoc()
        summary = abstractive_summarize(text,0.09)
        print("\nAbstractive Summary:")
        print(summary)
    elif choice == '2':
        text = readDoc()
        summary = extractive_summarize(text)
        print("\nExtractive Summary:")
        print(summary)
    else:
        print("Invalid choice. Please enter 1 or 2.")

if __name__ == "__main__":
    main()


Choose a summarization method:
1. Abstractive Summarization
2. Extractive Summarization


Enter your choice (1 or 2):  1
Please input a file name:  story1.txt


You have asked for the document story1.txt
1
<class 'str'>

Abstractive Summary:
People went to Panditji and asked him to
give the order for the old woman's oven to be rebuilt and the fire once more
lighted, but he paid no attention to them.The village folk customarily have one meal a day of
parched grains, so there was always a crowd around Bhungi's oven.
Then two servants arrived, each carrying a
heaped basket of grain from Pandit Udaybhan with the order to parch it right
away.Now when he
looked toward the old woman's oven he fell into a violent rage: it was being
made again.They began to band together to put out the fire
but the sprinkle of water acted like oil on it and the flames kept mounting
higher.The names leapt towards the
sky, the blaze spread wildly in all directions till the villagers came clustering 
around this mountain of fire.Pandit Udaybhan's splendid mansion was swallowed up; while he
watched, it tossed like a ship amid wild waves and disappeared in the sea of
fire.I

In [24]:
if __name__ == "__main__":
    main()

Choose a summarization method:
1. Abstractive Summarization
2. Extractive Summarization


Enter your choice (1 or 2):  2
Please input a file name:  story1.txt


You have asked for the document story1.txt
1
<class 'str'>

Extractive Summary:
But on the days when she had to parch grain for Pandit
Udaybhan Pandey, the owner of the village, she went to bed hungry. People went to Panditji and asked him to
give the order for the old woman's oven to be rebuilt and the fire once more
lighted, but he paid no attention to them. He kicked at the trough again but she ran in front of it
and took the kick in her side. 'If youâ€™re going to stay in the village you'll have to do my chores. She had to
keep stopping from the parching in order to keep the oven fire going. Bhungi was energetically rebuilding it with balls of clay Most
likely she'd spent the night at this work and wanted to finish it before the sun
was high. With this command the servants went away and Bhungi began to parch the
grain. Here she had
known the sorrows and pleasures of life; she could not give it up now in the
last days. â€˜To his attendants he said, 'Go get a pile of leaves right
awa


The scipy.sparse array containers will be used instead of matrices
in Networkx 3.0. Use `from_scipy_sparse_array` instead.
  nx_graph = nx.from_scipy_sparse_matrix(res_graph)
