## Summarizing a set of scientific papers


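## Extracting information from scientific PDF papers

This notebook uses the Science Parse Python API (https://github.com/UCREL/science_parse_py_api/tree/master/). Install the client and start the Science Parse server before running the cells below: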

> pip install science_parse_api

> docker run -p 127.0.0.1:8080:8080 --rm --init ucrel/ucrel-science-parse:3.0.1


In [3]:
from pathlib import Path
import pandas as pd
import json

from tqdm import tqdm

# Directory that is scanned for the PDF papers to summarize.
# NOTE: this is a machine-specific absolute path; point it at your own folder before running.
papers_dir = r"C:\Users\elba_ro\Documents\projects\github\dissertation\chapter2_related_work\subfolder"

# Single-paper path for trying the parser on one file (currently disabled).
#temp_path = Path(papers_dir,  'kiesel2022-identifying-the-human-values-behind-arguments.pdf').resolve()

In [4]:
import pprint
from science_parse_api.api import parse_pdf
from langchain import LLMChain

# Address of the Science Parse server; these values match the port mapping of the
# `docker run` command listed at the top of this notebook.
host = 'http://127.0.0.1'
port = '8080'
# Ad-hoc check of the parser output for a single paper (currently disabled).
#output_dict = parse_pdf(host, temp_path, port=port)

#pp = pprint.PrettyPrinter(indent=4)
#pp.pprint(output_dict)

In [5]:
def content_only(_dict: dict) -> str :
    """Flatten the sections of a parsed paper into one plain-text string.

    Every section contributes its heading (when present and non-empty) on its
    own line, immediately followed by its text. Sections that end up with no
    content at all are skipped, and the remaining section blocks are joined
    with a single newline.

    Args:
        _dict: Parsed-paper dictionary holding a "sections" list. Each entry
            is a dict with optional "heading" and "text" values.

    Returns:
        The concatenated content, or "" when no section has any content.

    Raises:
        KeyError: If ``_dict`` has no "sections" key.
    """
    blocks = []
    for section in _dict["sections"]:
        # `or ""` treats a missing key and an explicit None identically, so a
        # null heading/text is skipped instead of crashing on len(None).
        heading = section.get("heading") or ""
        text = section.get("text") or ""

        # A heading always ends with a newline, even if the section has no text.
        block = f"{heading}\n{text}" if heading else f"{text}"
        if block:
            blocks.append(block)

    # Single join instead of repeatedly rebuilding the accumulated string.
    return "\n".join(blocks)


In [6]:
#text = content_only(output_dict)
#print(text)

In [7]:
import openai
import os

from dotenv import load_dotenv, find_dotenv
# Load variables from the .env file located by find_dotenv() into the process environment.
_ = load_dotenv(find_dotenv())

# The OpenAI key is read from the environment rather than being hard-coded in the notebook.
openai.api_key  = os.getenv('OPENAI_API_KEY')

def get_completion(prompt, model="gpt-3.5-turbo"):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the content of the only (user) chat message.
        model: Name of the chat model passed to the completion call.

    Returns:
        The "content" entry of the first choice's message in the response.
    """
    chat_request = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0,  # this is the degree of randomness of the model's output
    }
    completion = openai.ChatCompletion.create(**chat_request)
    first_choice = completion.choices[0]
    return first_choice.message["content"]

In [8]:
# Create a new OpenAI instance
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

# Shared chat model instance (gpt-3.5-turbo, temperature 0) handed to the LLMChain in the summarization function.
_ChatOpenAI_ = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)


def create_prompt_template(prompt):
    """Build the message templates for the chat prompt.

    Args:
        prompt: Instruction text used as the template of the system message.

    Returns:
        A two-element list: the system message template built from ``prompt``,
        followed by a human message template whose only content is the
        ``{scientific_content}`` placeholder.
    """
    return [
        SystemMessagePromptTemplate.from_template(prompt),
        HumanMessagePromptTemplate.from_template("{scientific_content}"),
    ]

In [18]:
from glob import glob
import json
import pandas as pd

def summarize_scientific_papers(source_path:str, max_papers: "int | None" = 3) -> pd.DataFrame:
    """Summarize the PDF papers found directly inside ``source_path``.

    Each PDF is parsed with ``parse_pdf`` (using the module-level ``host`` and
    ``port``), flattened to text with ``content_only``, truncated, and sent
    through an ``LLMChain`` built on ``_ChatOpenAI_``. The JSON answer is
    merged with the parser output into one record per paper. Papers that fail
    at any step are reported on stdout and skipped.

    Args:
        source_path: Directory that is searched (non-recursively) for ``*.pdf``.
        max_papers: Stop after this many papers were summarized successfully.
            The default of 3 keeps the previous hard-coded limit; pass ``None``
            to process every PDF.

    Returns:
        A DataFrame with one row per successfully summarized paper. The same
        frame is also written to ``chatgpt_summary.csv`` inside ``source_path``.
    """
    # Sorted so that the subset selected by `max_papers` is the same on every
    # run; glob() itself gives no ordering guarantee.
    pdfs: list[str] = sorted(glob(f"{source_path}/*.pdf"))
    print(f"Summarizing papers from the dir {source_path} which contains {len(pdfs)}.")

    # The instructions are identical for every paper, so the prompt and the
    # chain are built once instead of once per loop iteration.
    prompt = """
            Your task is to provide a comprehensive summary of a scientific paper.

            The result is intended for academic researchers.

            1. Create a short summary for the paper.
            2. Define the type of datasets used in the experiments, if any. Otherwise NA.
            3. Extract the top 5 topics of this text
            4. Infer the top 5 1-3words topics of this text
            5. Define in one sentence the main task tackled in this text
            6. Does this paper mention argumentation? answer by True or False.

            Format your response as a JSON object with \
            "summary", "dataset_type", "top_5_topics", "top_short_topics", "task" and "is_argumentative".

            text: 
            """
    lc_prompt = ChatPromptTemplate.from_messages(create_prompt_template(prompt))
    llm_chain = LLMChain(llm=_ChatOpenAI_, prompt=lc_prompt)

    # Budget for the paper text. NOTE: this is counted in characters, not
    # tokens, so it is only a rough guard for the model's context limit.
    max_chars = 4097 - len(prompt)

    chatgpt_outputs = []
    for paper in tqdm(pdfs):
        # Only successful summaries count towards the limit.
        if max_papers is not None and len(chatgpt_outputs) >= max_papers:
            break
        try:
            path = Path(paper).resolve()
            output_dict = parse_pdf(host, path, port=port)
            text = content_only(output_dict)

            response = llm_chain.run(scientific_content=text[:max_chars])
            response_dict = json.loads(response)
            # Attach the parser output to the LLM answer; on a key collision
            # the parser's value replaces the LLM's.
            response_dict.update(output_dict)

            chatgpt_outputs.append(response_dict)
        except Exception as e:
            # Best effort: report the failure and continue with the next paper.
            print(e)
            print(f"Failed to get a response for the paper: {paper}")

    df = pd.DataFrame(chatgpt_outputs)
    df.to_csv(f"{source_path}/chatgpt_summary.csv")
    return df



In [19]:
# Summarize the PDFs in papers_dir; this also writes chatgpt_summary.csv into that directory.
df = summarize_scientific_papers(papers_dir)

Summarizing papers from the dir C:\Users\elba_ro\Documents\projects\github\dissertation\chapter2_related_work\subfolder which contains 26.


  8%|▊         | 2/26 [00:52<10:30, 26.26s/it]


In [20]:
from IPython.display import display, HTML

# Show the summary table using IPython's rich display.
display(df)

Unnamed: 0,summary,dataset_type,top_5_topics,top_short_topics,task,is_argumentative,abstractText,authors,id,references,sections,title,year
0,The paper discusses the challenges in assessin...,,"[argumentation quality, computational argument...","[argumentation, quality, computational, resear...",Defining a common ground for assessing argumen...,True,Research on computational argumentation faces ...,"[{'affiliations': [], 'name': 'Henning Wachsmu...",SP:183a8c7eff441560158c7ebee5cfc11554362b54,"[{'authors': ['Richard Correnti', 'Lindsay Cla...",[{'text': 'Proceedings of the 15th Conference ...,Computational Argumentation Quality Assessment...,2017
1,The paper benchmarks the quality dimensions of...,English debate portal arguments on 16 topics s...,"[argument quality assessment, computational li...","[argument, quality, text, supervised, NLP]",The main task tackled in this text is to bench...,True,Several quality dimensions of natural language...,"[{'affiliations': [], 'name': 'Henning Wachsmu...",SP:393ae96a3463be78f1f1e195a180de093698e119,"[{'authors': ['Charu C. Aggarwal', 'ChengXiang...",[{'text': 'Proceedings of the 28th Internation...,Intrinsic Quality Assessment of Arguments,2020
2,The paper discusses the need for adaptive argu...,Student-written persuasive pitches,"[Adaptive argumentation support systems, Argum...","[Argumentation support, Mining, Linguistics, P...",The main task tackled in this text is the deve...,True,We introduce an argumentation annotation appro...,"[{'affiliations': [], 'name': 'Thiemo Wambsgan...",SP:a37de7234771ca8baf10262bbd764dcd5c82378a,"[{'authors': ['Albert Bandura.'], 'title': 'So...",[{'text': 'Proceedings of the 60th Annual Meet...,Modeling Persuasive Discourse to Adaptively Su...,2022


In [16]:
from IPython.core.interactiveshell import InteractiveShell
# "all" makes IPython display every top-level expression of a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# NOTE(review): this cell's execution count (16) is lower than the cells above it,
# so its saved output likely comes from an earlier run — re-run to refresh.
df


Unnamed: 0,summary,dataset_type,top_5_topics,top_short_topics,task,is_argumentative,abstractText,authors,id,references,sections,title,year
0,The paper discusses the importance of assessin...,,"[Assessing argumentation quality, Computationa...","[Argumentation quality, Computational approach...",Defining a common ground for assessing argumen...,True,Research on computational argumentation faces ...,"[{'affiliations': [], 'name': 'Henning Wachsmu...",SP:06a08af6efb871cd8a94b483e071914e9c859046,"[{'authors': ['Richard Correnti', 'Lindsay Cla...",[{'text': 'Proceedings of the 15th Conference ...,Computational Argumentation Quality Assessment...,2017
