In [1]:
!pip install PyPDF2
!pip install docx2txt



In [2]:
import numpy as np
from PyPDF2 import PdfReader
import docx2txt
import sys
import matplotlib.pyplot as plt
%matplotlib inline 
import networkx as nx
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest

In [3]:
from nltk.tokenize.punkt import PunktSentenceTokenizer

In [4]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [5]:
from PyPDF2 import PdfReader

def read_doc():
    name = input('Please input a file name: ')
    print(f'You have asked for the document {name}')

    # now read the type of document
    if name.lower().endswith('.txt'):
        choice = 1
    elif name.lower().endswith('.pdf'):
        choice = 2
    else:
        choice = 3
        # print(name)
    print(choice)
    
    # Case 1: if it is a .txt file
    if choice == 1:
        with open(name, 'r') as file:
            document = file.read()
    
    # Case 2: if it is a .pdf file
    elif choice == 2:
        pdf_reader = PdfReader(name)
        page_obj = pdf_reader.pages[0]
        document = page_obj.extract_text()
    
    # Case 3: none of the formats
    else:
        print('Failed to load a valid file')
        print('Returning an empty string')
        document = ''
    
    print(type(document))
    return document

 


In [6]:
def abstractive_summarize(text,per):
    nlp = spacy.load('en_core_web_sm')
    doc= nlp(text)
    tokens=[token.text for token in doc]
    word_frequencies={}
    for word in doc:
        if word.text.lower() not in list(STOP_WORDS):
            if word.text.lower() not in punctuation:
                if word.text not in word_frequencies.keys():
                    word_frequencies[word.text] = 1
                else:
                    word_frequencies[word.text] += 1
    max_frequency=max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word]=word_frequencies[word]/max_frequency
    sentence_tokens= [sent for sent in doc.sents]
    sentence_scores = {}
    for sent in sentence_tokens:
        for word in sent:
            if word.text.lower() in word_frequencies.keys():
                if sent not in sentence_scores.keys():                            
                    sentence_scores[sent]=word_frequencies[word.text.lower()]
                else:
                    sentence_scores[sent]+=word_frequencies[word.text.lower()]
    select_length=int(len(sentence_tokens)*per)
    summary=nlargest(select_length, sentence_scores,key=sentence_scores.get)
    final_summary=[word.text for word in summary]
    summary=''.join(final_summary)
    return summary
    

 

In [7]:
import numpy as np
from nltk.tokenize.punkt import PunktSentenceTokenizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
import networkx as nx

def extractive_summarize(text):
    def tokenize(document):
        doc_tokenizer = PunktSentenceTokenizer()
        return doc_tokenizer.tokenize(document)
    
    def process_document(document):
        sentences_list = tokenize(document)
        cv = CountVectorizer()
        cv_matrix = cv.fit_transform(sentences_list)
        normal_matrix = TfidfTransformer().fit_transform(cv_matrix)
        res_graph = normal_matrix * normal_matrix.T
        nx_graph = nx.from_scipy_sparse_matrix(res_graph)
        ranks = nx.pagerank(nx_graph)
        sentence_array = sorted(((ranks[i], s) for i, s in enumerate(sentences_list)), reverse=True)
        sentence_array = np.asarray(sentence_array)
        rank_max = float(sentence_array[0][0])
        rank_min = float(sentence_array[len(sentence_array) - 1][0])
        temp_array = []
        
        flag = 0
        if rank_max - rank_min == 0:
            temp_array.append(0)
            flag = 1
        
        if flag != 1:
            for i in range(0, len(sentence_array)):
                temp_array.append((float(sentence_array[i][0]) - rank_min) / (rank_max - rank_min))
        
        threshold = (sum(temp_array) / len(temp_array)) + 0.2
        sentence_list = []
        if len(temp_array) > 1:
            for i in range(0, len(temp_array)):
                if temp_array[i] > threshold:
                        sentence_list.append(sentence_array[i][1])
        else:
            sentence_list.append(sentence_array[0][1])
        
        summary = " ".join(str(x) for x in sentence_list)
        return summary
    
    summary = process_document(text)
    return summary


In [11]:
def main():
    print("Choose a summarization method:")
    print("1. Abstractive Summarization")
    print("2. Extractive Summarization")
    
    choice = input("Enter your choice (1 or 2): ")

    if choice == '1':
        text = read_doc()
        summary = abstractive_summarize(text,0.09)
        print("\nAbstractive Summary:")
        print(summary)
    elif choice == '2':
        text = read_doc()
        summary = extractive_summarize(text)
        print("\nExtractive Summary:")
        print(summary)
    else:
        print("Invalid choice. Please enter 1 or 2.")

if __name__ == "__main__":
    main()


Choose a summarization method:
1. Abstractive Summarization
2. Extractive Summarization


Enter your choice (1 or 2):  2
Please input a file name:  3.pdf


You have asked for the document 3.pdf
2
<class 'str'>

Extractive Summary:
Under no c ircumstances 
shall McGraw-Hill and/or its licensors be liable for any indirect, incidental, special, punitive, consequential or similar damages that result from the use of or inability to use the work, even if any of them has been advised of the possibility of such damages. However, because of the possibility of human or 
mechanical error by our sources, McGraw-Hill, or others, McGraw-Hill does not guarantee the accuracy, adequacy, or completeness of any information and is not responsible for any errors or omissions or the results obtained from the use of such information. THE WORK IS PROVIDED “AS IS.” McGRAW-HILL AND ITS LICENSORS MAKE NO GUARANTEES OR WARRANTIES AS 
TO THE ACCURACY , ADEQUACY OR COMPLETENESS OF OR RESULTS TO BE OBTAINED FROM USING THE WORK, INCLUDING ANY INFORMATION THAT CAN BE ACCESSED THROUGH THE WORK VIA HYPERLINK OR OTHERWISE, AND EXPRESSLY DISCLAIM ANY WARRANTY , EXPRESS OR IM


The scipy.sparse array containers will be used instead of matrices
in Networkx 3.0. Use `from_scipy_sparse_array` instead.
  nx_graph = nx.from_scipy_sparse_matrix(res_graph)


In [None]:
if __name__ == "__main__":
    main()