In [1]:
from pypdf import PdfReader
from pypdf.errors import PdfReadError
from transformers import pipeline
import pandas as pd
import torch
import sentencepiece
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

In [4]:
#nltk.download('punkt') # only uncomment if not already downloaded
#nltk.download('stopwords')

In [24]:
ArticleToSummarize = "pdf_database\pdf3.pdf"

def pdfTE(pdfFile, version=1, start=0, end=0): # Function for text extraction from PDF. # Version states which version to use, with arguments(start, end) being the starting page and ending page.
    if version==1:
        with open(pdfFile, "rb") as file: # Read in binary to handle breakline statements better (\n)
            pdfReader = PdfReader(file)
            for page in pdfReader.pages:
                yield page.extract_text() # Use of generator as keeping the whole article in the memory results in memory error.
    else:
        with open(pdfFile, "rb") as file: # Read in binary to handle breakline statements better (\n)
            pdfReader = PdfReader(file)
            if end==0:
                end=len(pdfReader.pages)-1
            for num in range(start,end): # Iterator over page (num) 
                yield pdfReader.pages[num].extract_text() # Text extraction of page (num)
            
def articleC(pdfFile, version=1,start=0,end=0): # Function to return the article in one string # Version and start, end arguments necessary for pdfTE integration.
    if version==1:
        textCombiner = pdfTE(pdfFile)
    else:
        textCombiner = pdfTE(pdfFile,2,start,end) # Choosing for version 2 if sumArticle2 failed on version 1
    textCombined = ""
    for text in textCombiner: # Loop over generator object to sum text of pages into one string
        textCombined += text
    if textCombined == "":
        raise Exception("Oops! The task failed as the PDF file is empty.") # Exception that handles empty files
    elif len(textCombined) < 10:
        raise Exception("Oops! The task failed as the PDF file has too little characters.") # Exception that handles files with too little characters for a summary
    else:
        return textCombined

def sumArticle2(pdfFile): # Function to summarize article as a whole
    if pdfFile.endswith(".pdf"): # First check if file extension is ".pdf" format.
        try:
            summarizer = pipeline("summarization", model="pszemraj/led-large-book-summary") # Model used from the huggingface hub (https://huggingface.co/pszemraj/led-large-book-summary)
            articleCombined = articleC(pdfFile) # Iterate over text combiner function
            lengthArticle = len(articleCombined)
            summarizedPage = summarizer(articleCombined,max_length=1000, min_length=200, do_sample=True) # Max/min length for length of summarization in characters.
            return summarizedPage[0]["summary_text"] # Return summary as string
        except PdfReadError as PRE: # Second check if file extension is ".pdf" format when first check fails.
            return f"Oops! a PDF Read Error {PRE} happened. Please retry the task once the issue has been resolved."
        except OSError as OS: # Exception handling of OS errors, as PdfReadError doesn't catch all file extension errors.
            return f"Oops! an OS Error {OS} happened. Please retry the task once the issue has been resolved."
        except (RuntimeError, IndexError) as RIE: # Exception check if pdf fails to be summarized due to it having too many tokens (> 16384)
            print(f"Oops! {RIE}, The file was too big to configure.")
            try:
                print("Warning, this summary may take a while to produce.\n") # Give feedback to the user that because of the large pdf, the models computation will be larger
                start = 0
                summary = "" # Empty string to merge all the separate summaries in
                length = len(PdfReader(ArticleToSummarize).pages) # Amount of pages in article
                amountPages = (length//(lengthArticle//65536+1))+1 # Optimalization to determine best end to achieve least amount of computation (65536 amount of chars model can handle)
                print(amountPages)
                end = amountPages
                lastRun = False
                while True: # Loop till all the pages have been summarized
                    print(f"Beginning is {start}, {end}")
                    articleCombined = articleC(pdfFile,2,start,end)
                    print("ArticleCombined done")
                    summarizedPage = summarizer(articleCombined,max_length=1000, min_length=200, do_sample=False) # Max/min length for length of summarization  per amount of pages in characters.
                    print("SummarizedPage done")
                    summary += f"{summarizedPage[0]['summary_text']} "
                    print("Summary added")
                    start += amountPages # Calculations to determine next start/end point of iteration.
                    print(f"Start is {start}")
                    end += amountPages
                    print(f"End is {end}")
                    print("Incrementer done")
                    if lastRun or start>=length: # Checker if program needs to stop executing
                        print("Start>=length check done")
                        break
                    if end>length or end==length: # Once summarizer reaches last page in summarization, it tells program to stop after one last run
                        end=length-1
                        print("End>length check done")
                        lastRun = True
                        print("lastRun set to True")
                print("While loop done")        
                summaryF = summarizer(summary,max_length=2000, min_length=200, do_sample=False)        
                return summaryF[0]['summary_text']
            except Exception as E: # Exception that handles all different errors and asks user to feedback the error to the devs to enhance the model.
                return f"Oops! An unexpected error occured, {E}. Please report the error to the team."
    else:
        raise Exception("The file format is not a valid pdf.")

def pdfKE(pdfFile, language='english'): # Function to extract keywords from the PDF, language input needed as language needs to be part of stopwords folder. Standard keyword is english
    if pdfFile.endswith(".pdf"): # First check if file extension is ".pdf" format.
        try:
            articleCombined = articleC(pdfFile).lower() # Iterate over generator
            tokens = word_tokenize(articleCombined) # Tokenize all words in the article
            punctuations = ["(",")",";",":","[","]",",","!","=","==","<",">","@","#","$","%","^","&","*",".","//","{","}","...","``","+","\'\'","-","~","\"","’",]
            stopWords = stopwords.words(f'{language}')
            keywords = [word for word in tokens if word not in stopWords and word not in punctuations] # Filter the words so that mostlikely keywords will be extracted
            keywordExtracted = pd.Series(keywords).value_counts().index[:5] # Keywords formatting as a list
            keywordDict = {i+1:keywordExtracted[i] for i in range(5)}
            return keywordDict
        except PdfReadError as PRE: # Second check if file extension is ".pdf" format when first check fails.
            return f"Oops! a PDF Read Error {PRE} happened. Please retry the task once the issue has been resolved."
        except OSError as OS: # Exception handling of OS errors, as PdfReadError doesn't catch all file extension errors.
            return f"Oops! an OS Error {OS} happened. Please retry the task once the issue has been resolved."
        except Exception as E: # Exception that handles all different errors and asks user to feedback the error to the devs to enhance the model.
            return f"Oops! An unexpected error occured, {E}. Please report the error to the team."
    else:
        raise Exception("The file format is not a valid pdf.")

In [25]:
# Summarize text
text = sumArticle2(ArticleToSummarize)
print(text)

Token indices sequence length is longer than the specified maximum sequence length for this model (28510 > 16384). Running this sequence through the model will result in indexing errors


Oops! index out of range in self, The file was too big to configure.

17
Beginning is 0, 17
ArticleCombined done
SummarizedPage done
Summary added
Start is 17
End is 34
Incrementer done
End>length check done
Beginning is 17, 31
ArticleCombined done


Your max_length is set to 2000, but your input_length is only 436. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=218)


SummarizedPage done
Summary added
Start is 34
End is 48
Incrementer done
Start>=length check done
While loop done
This paper describes the robot Stanley, which won the 2005 DARPA Grand Challenge. Stanley is an autonomous, machine-learning robot designed to explore unre-heared, off-road terrain. It uses state-of-the-art artificial intelligence to predict future obstacles and make decisions quickly. The distinguishing feature of this study is the use of multiple machine learning algorithms to develop a prediction engine that outperforms previous estimates by more than a factor of ten. In both the 2004 and 2005 races, Stanley outperforms all other vehicles in terms of speed and accuracy. For the 2005 race, the safety of the vehicle poses a problem, so the team uses a combination of traditional radar, chemical/metabolic acid sensors, and several different types of sensing. These sensing devices are coupled with a "learning algorithm" that uses statistical methods such as averaging and batc

In [26]:
# Extract keywords and output them in dictionary format
keywords = pdfKE(ArticleToSummarize)
print(keywords)

{1: 'stanley', 2: 'vehicle', 3: 'figure', 4: '/h20850', 5: 'online'}


In [27]:
# Summarization translation
translation = pipeline("translation", model="Helsinki-NLP/opus-mt-en-nl") # Model used for the translation, imported from Huggingface (https://huggingface.co/Helsinki-NLP/opus-mt-en-nl)
translatedText = translation(text)[0]['translation_text']
print(translatedText)

Dit document beschrijft de robot Stanley, die de 2005 DARPA Grand Challenge won. Stanley is een autonome, machine-learning robot ontworpen om onherhorende, off-road terrein te verkennen. Het maakt gebruik van state-of-the-art kunstmatige intelligentie om toekomstige obstakels te voorspellen en snel beslissingen te nemen. Het onderscheidende kenmerk van deze studie is het gebruik van meerdere machine learning algoritmen om een voorspelling motor te ontwikkelen die eerder schattingen overtreft met meer dan een factor tien. In zowel de 2004 en 2005 races, Stanley overtreft alle andere voertuigen in termen van snelheid en nauwkeurigheid. Voor de 2005 race, de veiligheid van het voertuig vormt een probleem, dus het team maakt gebruik van een combinatie van traditionele radar, chemische / metabolische zure sensoren, en verschillende soorten sensors. Deze sensoren zijn gekoppeld aan een "learning algoritme" dat gebruik maakt van statistische methoden zoals gemiddelde en batching. Het doel van