In [None]:
!pip install pyodbc

In [2]:
from pypdf import PdfReader
from pypdf.errors import PdfReadError
from transformers import pipeline
import pandas as pd
import torch
import sentencepiece
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import pyodbc
import json
import random

In [4]:
#nltk.download('punkt') # only uncomment if not already downloaded
#nltk.download('stopwords')

In [12]:
#Translation & Summarization functions
ArticleToSummarize = "pdf_database\pdf3.pdf"

def pdfTE(pdfFile, version=1, start=0, end=0): # Function for text extraction from PDF. # Version states which version to use, with arguments(start, end) being the starting page and ending page.
    if version==1:
        with open(pdfFile, "rb") as file: # Read in binary to handle breakline statements better (\n)
            pdfReader = PdfReader(file)
            for page in pdfReader.pages:
                yield page.extract_text() # Use of generator as keeping the whole article in the memory results in memory error.
    else:
        with open(pdfFile, "rb") as file: # Read in binary to handle breakline statements better (\n)
            pdfReader = PdfReader(file)
            if end==0:
                end=len(pdfReader.pages)-1
            for num in range(start,end): # Iterator over page (num) 
                yield pdfReader.pages[num].extract_text() # Text extraction of page (num)
            
def articleC(pdfFile, version=1,start=0,end=0): # Function to return the article in one string # Version and start, end arguments necessary for pdfTE integration.
    if version==1:
        textCombiner = pdfTE(pdfFile)
    else:
        textCombiner = pdfTE(pdfFile,2,start,end) # Choosing for version 2 if sumArticle2 failed on version 1
    textCombined = ""
    for text in textCombiner: # Loop over generator object to sum text of pages into one string
        textCombined += text
    if textCombined == "":
        raise Exception("Oops! The task failed as the PDF file is empty.") # Exception that handles empty files
    elif len(textCombined) < 10:
        raise Exception("Oops! The task failed as the PDF file has too little characters.") # Exception that handles files with too little characters for a summary
    else:
        return textCombined

def sumArticle2(pdfFile): # Function to summarize article as a whole
    if pdfFile.endswith(".pdf"): # First check if file extension is ".pdf" format.
        try:
            summarizer = pipeline("summarization", model="pszemraj/led-large-book-summary") # Model used from the huggingface hub (https://huggingface.co/pszemraj/led-large-book-summary)
            articleCombined = articleC(pdfFile) # Iterate over text combiner function
            lengthArticle = len(articleCombined)
            summarizedPage = summarizer(articleCombined,max_length=1000, min_length=200, do_sample=True) # Max/min length for length of summarization in characters.
            return summarizedPage[0]["summary_text"] # Return summary as string
        except PdfReadError as PRE: # Second check if file extension is ".pdf" format when first check fails.
            return f"Oops! a PDF Read Error {PRE} happened. Please retry the task once the issue has been resolved."
        except OSError as OS: # Exception handling of OS errors, as PdfReadError doesn't catch all file extension errors.
            return f"Oops! an OS Error {OS} happened. Please retry the task once the issue has been resolved."
        except (RuntimeError, IndexError) as RIE: # Exception check if pdf fails to be summarized due to it having too many tokens (> 16384)
            print(f"Oops! {RIE}, The file was too big to configure.")
            try:
                print("Warning, this summary may take a while to produce.\n") # Give feedback to the user that because of the large pdf, the models computation will be larger
                start = 0
                summary = "" # Empty string to merge all the separate summaries in
                length = len(PdfReader(ArticleToSummarize).pages) # Amount of pages in article
                amountPages = (length//(lengthArticle//65536+1))+1 # Optimalization to determine best end to achieve least amount of computation (65536 amount of chars model can handle)
                end = amountPages
                lastRun = False
                while True: # Loop till all the pages have been summarized
                    articleCombined = articleC(pdfFile,2,start,end)
                    summarizedPage = summarizer(articleCombined,max_length=1000, min_length=200, do_sample=False) # Max/min length for length of summarization  per amount of pages in characters.
                    summary += f"{summarizedPage[0]['summary_text']} "
                    start += amountPages # Calculations to determine next start/end point of iteration.
                    end += amountPages
                    if lastRun or start>=length: # Checker if program needs to stop executing
                        break
                    if end>length or end==length: # Once summarizer reaches last page in summarization, it tells program to stop after one last run
                        end=length-1
                        lastRun = True
    
                summaryF = summarizer(summary,max_length=2000, min_length=200, do_sample=False)        
                return summaryF[0]['summary_text']
            except Exception as E: # Exception that handles all different errors and asks user to feedback the error to the devs to enhance the model.
                return f"Oops! An unexpected error occured, {E}. Please report the error to the team."
    else:
        raise Exception("The file format is not a valid pdf.")

def pdfKE(pdfFile, language='english'): # Function to extract keywords from the PDF, language input needed as language needs to be part of stopwords folder. Standard keyword is english
    if pdfFile.endswith(".pdf"): # First check if file extension is ".pdf" format.
        try:
            articleCombined = articleC(pdfFile).lower() # Iterate over generator
            tokens = word_tokenize(articleCombined) # Tokenize all words in the article
            punctuations = ["(",")",";",":","[","]",",","!","=","==","<",">","@","#","$","%","^","&","*",".","//","{","}","...","``","+","\'\'","-","~","\"","’",]
            stopWords = stopwords.words(f'{language}')
            keywords = [word for word in tokens if word not in stopWords and word not in punctuations] # Filter the words so that mostlikely keywords will be extracted
            keywordExtracted = pd.Series(keywords).value_counts().index[:5] # Keywords formatting as a list
            keywordDict = {i+1:keywordExtracted[i] for i in range(5)}
            return keywordDict
        except PdfReadError as PRE: # Second check if file extension is ".pdf" format when first check fails.
            return f"Oops! a PDF Read Error {PRE} happened. Please retry the task once the issue has been resolved."
        except OSError as OS: # Exception handling of OS errors, as PdfReadError doesn't catch all file extension errors.
            return f"Oops! an OS Error {OS} happened. Please retry the task once the issue has been resolved."
        except Exception as E: # Exception that handles all different errors and asks user to feedback the error to the devs to enhance the model.
            return f"Oops! An unexpected error occured, {E}. Please report the error to the team."
    else:
        raise Exception("The file format is not a valid pdf.")

def translArticle(textToTranslate):
    try:
        translation = pipeline("translation", model="Helsinki-NLP/opus-mt-en-nl") # Model used for the translation, imported from Huggingface (https://huggingface.co/Helsinki-NLP/opus-mt-en-nl)
        translatedText = translation(textToTranslate)[0]['translation_text']
        return translatedText
    except Exception as E: # Exception that handles all different errors and asks user to feedback the error to the devs to enhance the model.
        return f"Oops! An unexpected error occured, {E}. Please report the error to the team."

In [None]:
#Database functions
import pyodbc
import json
import random

# Replace 'your_connection_string' with your actual connection string
connection_string = r'DRIVER={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=Summarticle_database.accdb;'

def save_paper(json_data):
    connection = pyodbc.connect(connection_string)
    cursor = connection.cursor()

    paper_title = json_data.get("paper", {}).get("title")
    
    # Check if the paper title is not already in the Paper table
    select_query = "SELECT * FROM papers WHERE title = ?"
    cursor.execute(select_query, (paper_title,))
    
    if not cursor.fetchone():
        # Paper title is not in the Paper table, save the paper
        insert_query = "INSERT INTO papers (title, authors, DOI, keywords) VALUES (?, ?, ?, ?)"
        cursor.execute(insert_query, (
            paper_title,
            json_data.get("paper", {}).get("authors"),
            json_data.get("paper", {}).get("DOI"),
            ', '.join(json_data.get("paper", {}).get("keywords", []))
        ))
        
        connection.commit()

    cursor.close()
    connection.close()

def save_summary(json_data):
    connection = pyodbc.connect(connection_string)
    cursor = connection.cursor()

    paper_title = json_data.get("summary", {}).get("title")
    
    # Check if the paper title is not already in the Summary table
    select_query = "SELECT * FROM summary WHERE title = ?"
    cursor.execute(select_query, (paper_title,))
    
    if not cursor.fetchone():
        # Paper title is not in the Summary table, save the summary
        insert_query = "INSERT INTO summary (title, summary_en, rating_en) VALUES (?, ?, ?)"
        cursor.execute(insert_query, (
            paper_title,
            json_data.get("summary", {}).get("summary_en"),
            json_data.get("summary", {}).get("rating_en", 0)
        ))
        
        connection.commit()
    else:
        # Paper title is already in the Summary table, save summary in a new row
        insert_query = "INSERT INTO summary (title, summary_en, rating_en) VALUES (?, ?, ?)"
        cursor.execute(insert_query, (
            paper_title,
            json_data.get("summary", {}).get("summary_en"),
            json_data.get("summary", {}).get("rating_en", 0)
        ))

        connection.commit()

    cursor.close()
    connection.close()

def read_summary(json_data):
    connection = pyodbc.connect(connection_string)
    cursor = connection.cursor()

    paper_title = json_data.get("title")
    language = json_data.get("language", "en")

    # Check whether JSON GET request is from summary_en or summary_nl
    if language == "en":
        select_query = "SELECT summary_en, rating_en FROM summary WHERE title = ? AND summary_en IS NOT NULL"
    elif language == "nl":
        select_query = "SELECT summary_nl, rating_nl FROM summary WHERE title = ? AND summary_nl IS NOT NULL"
    else:
        return None  # Invalid language
    
    cursor.execute(select_query, (paper_title,))
    result = cursor.fetchone()

    if result:
        # If there is a summary in the requested language, return it based on the rating
        summary, rating = result
        return summary if random.random() < rating / 10.0 else None  # Weighted random based on rating
    else:
        return None  # Create a new summary (you already have code for this)

    cursor.close()
    connection.close()

def read_keyword(text):
    connection = pyodbc.connect(connection_string)
    cursor = connection.cursor()

    keyword = text

    # Get keyword from JSON request
    select_query = "SELECT title FROM papers WHERE keywords LIKE ?"
    cursor.execute(select_query, ('%' + keyword + '%',))
    result = cursor.fetchall()

    if result:
        # If there are papers with the specified keyword, return the titles
        return [row[0] for row in result]
    else:
        return None

    cursor.close()
    connection.close()

def update_rating(json_data):
    connection = pyodbc.connect(connection_string)
    cursor = connection.cursor()

    paper_title = json_data.get("title")
    rating_type = json_data.get("rating_type")

    # If JSON-rating == thumbs-up, corresponding rating in table += 1
    # If JSON-rating == thumbs-down, corresponding rating in table -= 1
    if rating_type == "thumbs-up":
        update_query = "UPDATE summary SET rating_en = CASE WHEN rating_en + 1 > 10 THEN 10 ELSE rating_en + 1 END WHERE title = ?"
    elif rating_type == "thumbs-down":
        update_query = "UPDATE summary SET rating_en = CASE WHEN rating_en - 1 < 1 THEN 1 ELSE rating_en - 1 END WHERE title = ?"
    else:
        return None  # Invalid rating type

    cursor.execute(update_query, (paper_title,))
    connection.commit()

    cursor.close()
    connection.close()

def delete_summary():
    connection = pyodbc.connect(connection_string)
    cursor = connection.cursor()

    # If summary_rating is lower than 3, delete from Summary table
    delete_query = "DELETE FROM summary WHERE rating_en < 3"
    cursor.execute(delete_query)
    connection.commit()

    cursor.close()
    connection.close()

In [None]:
# Communicate the summary to the orchestrator with a POST request
orchestrator_url = "http://ORCHESTRATOR/summary"
request_data = {"paper_id": orch_data.get("paper_id"), "summary": text}
response = requests.post(orchestrator_url, json=request_data)

# Check the response status
if response.status_code == 200:
    print("Summary successfully communicated to the orchestrator.")
else:
    print(f"Error communicating summary to the orchestrator. Status code: {response.status_code}")

In [4]:
# IF GET REQUEST IS FOR ENGLISH SUMMARY, USE THE FOLLOWING CODE

#If summary in database --> get that
summary = read_summary(orch_data)
if summary != None:
    print summary

else: #If there is no summary for this article
    text = sumArticle2(ArticleToSummarize) #Summarize it using the AI model
    print(text) #return the summary
    save_paper(orch_data) #Save the paper
    save_summary(orch_data) #Save the summary

if orch_data.get("rating_type") != None:
    update_rating(orch_data) #update the rating, if there is some in the JSON
    delete_summary() #delete summaries with low ratings
    
#Communicate the summary to the orchestrator with a POST request 

Token indices sequence length is longer than the specified maximum sequence length for this model (28510 > 16384). Running this sequence through the model will result in indexing errors


Oops! index out of range in self, The file was too big to configure.



Your max_length is set to 2000, but your input_length is only 436. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=218)


This paper describes the robot Stanley, which won the 2005 DARPA Grand Challenge. Stanley is an autonomous, machine-learning robot designed to explore unre-heared, off-road terrain. It uses state-of-the-art artificial intelligence to predict future obstacles and make decisions quickly. The distinguishing feature of this study is the use of multiple machine learning algorithms to develop a prediction engine that outperforms previous estimates by more than a factor of ten. In both the 2004 and 2005 races, Stanley outperforms all other vehicles in terms of speed and accuracy. For the 2005 race, the safety of the vehicle poses a problem, so the team uses a combination of traditional radar, chemical/metabolic acid sensors, and several different types of sensing. These sensing devices are coupled with a "learning algorithm" that uses statistical methods such as averaging and batching. The goal of this approach is to predict when the vehicle will be closest to the object it is trying to reach

In [5]:
# IF GET REQUEST IS FOR RELATED ARTICLES, USE THE FOLLOWING CODE

# Extract keywords and output them in dictionary format
keywords = pdfKE(ArticleToSummarize)
print(keywords)

#Check if keywords exist in 
related = read_keyword(keywords)
if related != None:
    print(related)
    
if orch_data.get("rating_type") != None:
    update_rating(orch_data) #update the rating, if there is some in the JSON
    delete_summary() #delete summaries with low ratings
    
#Communicate the summary to the orchestrator with a POST request 

{1: 'stanley', 2: 'vehicle', 3: 'figure', 4: '/h20850', 5: 'online'}


In [13]:
# IF GET REQUEST IS FOR DUTCH SUMMARY, USE THE FOLLOWING CODE

#If there is a Dutch summary in the database --> get that
summary = read_summary(orch_data)
if summary != None:
    print summary
else: #Summarize and translate
    text = sumArticle2(ArticleToSummarize) #Summarize it using the AI model
    translation = translArticle(text)
    print(translation)

Dit document beschrijft de robot Stanley, die de 2005 DARPA Grand Challenge won. Stanley is een autonome, machine-learning robot ontworpen om onherhorende, off-road terrein te verkennen. Het maakt gebruik van state-of-the-art kunstmatige intelligentie om toekomstige obstakels te voorspellen en snel beslissingen te nemen. Het onderscheidende kenmerk van deze studie is het gebruik van meerdere machine learning algoritmen om een voorspelling motor te ontwikkelen die eerder schattingen overtreft met meer dan een factor tien. In zowel de 2004 en 2005 races, Stanley overtreft alle andere voertuigen in termen van snelheid en nauwkeurigheid. Voor de 2005 race, de veiligheid van het voertuig vormt een probleem, dus het team maakt gebruik van een combinatie van traditionele radar, chemische / metabolische zure sensoren, en verschillende soorten sensors. Deze sensoren zijn gekoppeld aan een "learning algoritme" dat gebruik maakt van statistische methoden zoals gemiddelde en batching. Het doel van