# Paper Generator
This notebooks demonstrates the paper generation papeline we propose. We use langchain for interaction with the OpenAI API.

## Prequisits


In [37]:
%pip install langchain langchain-core langchain-community langchain_openai ipywidgets pydantic



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [37]:
import ipywidgets as widgets
from IPython.display import display, Markdown, display_pretty, JSON

In [1]:

OPENAI_API_KEY = input("Enter your OpenAI API key: ")


In [2]:


NOUGAT_URL = input("Enter Nougat URL: ")


In [3]:
if not OPENAI_API_KEY:
    raise ValueError("Please provide OpenAI API Key")
if not NOUGAT_URL:
    raise ValueError("Please provide Nougat URL")

## File Upload


In [4]:
uploader_base = widgets.FileUpload(
    accept='.pdf',  # Accepted file extension e.g. '.txt', '.pdf', 'image/*', 'image/*,.pdf'
    multiple=False,  # True to accept multiple files upload else False
    description="Upload Base"
)
display(uploader_base)

uploader_similar = widgets.FileUpload(
    accept='.pdf',  # Accepted file extension e.g. '.txt', '.pdf', 'image/*', 'image/*,.pdf'
    multiple=True,  # True to accept multiple files upload else False
    description="Upload Similar"
)
display(uploader_similar)



FileUpload(value=(), accept='.pdf', description='Upload Base')

FileUpload(value=(), accept='.pdf', description='Upload Similar', multiple=True)

## Nougat 

In [5]:
from requests import Response, post, get
file = uploader_base.value[0]
display(f'NOUGAT_URL: {NOUGAT_URL}')
display(f'Uploaded file: {file.name}')
headers = {
    "Accept": "application/json",
}
response = post(NOUGAT_URL + "/predict",
                            files={"file": file.content}, headers=headers)
response.raise_for_status()
if not response.ok:
    raise Exception("Error parsing PDF to Markdown")

response = response.json()


similar_papers = []
sim_files = uploader_similar.value
for file in sim_files:
    response = post(NOUGAT_URL + "/predict",
                                files={"file": file["content"]}, headers=headers)
    response.raise_for_status()
    if not response.ok:
        raise Exception("Error parsing PDF to Markdown")
    similar_papers.append(response.json())


'NOUGAT_URL: http://137.226.232.15:8503'

'Uploaded file: sulayman_corona.pdf'

In [6]:
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(api_key=OPENAI_API_KEY, model="gpt-4-turbo")

## Table of content generation

In [7]:
topic = "Research in time of COVID-19"
context = "\n\n\n\n".join(similar_papers)
base_chapter = response

In [105]:
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel
from typing import List, Optional
from langchain_core.output_parsers import JsonOutputParser

class Section(BaseModel):
    title: str
    description: Optional[str] = ""
    subsections: Optional[List['Section']] = None
class ToC(BaseModel):
    sections: List[Section]
    

parser = JsonOutputParser(pydantic_object=ToC)

toc_template = """
You are a scientific researcher, an expert in crafting high-quality scientific documents.
You're trained across a wide range of scientific disciplines, enabling you to provide
specialized assistance across various topics.


Context: {context}


Base: {base_chapter}

Write the Table of Content for a paper with  the following topic: {topic}.
You can use the base as a starting point.
Use the context for information about the topic.


Only output the generated section, no additional information.
Make the output match the following object to generate a json later.
class Section(BaseModel):
    title: str
    description: Optional[str] = None
    subsections: Optional[List['Section']] = None
class ToC(BaseModel):
    sections: List[Section]
"""

toc_prompt = PromptTemplate.from_template(toc_template)

toc_chain =  toc_prompt | llm | parser

In [106]:
toc_response = toc_chain.invoke(input={"topic": topic, "context": context, "base_chapter": base_chapter})

In [107]:

def format_to_markdown(section, level=0):
    # Markdown header levels
    header = '#' * (level + 1)
    # Start with the section title
    markdown = f"{header} {section['title']}\n"
    # Add the section content if available
    if section.get('description'):
        markdown += f"{section['description']}\n"
    # Recursively format subsections if they exist
    if section.get('subsections'):
        for subsection in section['subsections']:
            markdown += format_to_markdown(subsection, level + 2)
    return markdown

# Generate markdown for each top-level section
markdown_output = ""
for top_section in toc_response['sections']:
    markdown_output += format_to_markdown(top_section)

display(toc_response)
display(Markdown(markdown_output))

{'sections': [{'title': 'Introduction',
   'description': 'Overview of the impacts of the COVID-19 pandemic on the global research landscape, setting the scene for the detailed analyses in subsequent sections.'},
  {'title': 'Data and Methods',
   'description': 'Description of the data sources, collection methods, and analytical techniques used to evaluate the impact of COVID-19 on research activities.'},
  {'title': 'Impact on Research Output',
   'subsections': [{'title': 'General Trends in Research Output',
     'description': 'Analysis of overall changes in research output volume and quality across different scientific fields during the pandemic.'},
    {'title': 'Field-Specific Impacts',
     'description': 'Detailed examination of how specific scientific fields were affected in terms of research output and collaboration.'}]},
  {'title': 'Changes in Research Collaboration',
   'subsections': [{'title': 'Collaboration Patterns',
     'description': 'Exploration of changes in rese

# Introduction
Overview of the impacts of the COVID-19 pandemic on the global research landscape, setting the scene for the detailed analyses in subsequent sections.
# Data and Methods
Description of the data sources, collection methods, and analytical techniques used to evaluate the impact of COVID-19 on research activities.
# Impact on Research Output
### General Trends in Research Output
Analysis of overall changes in research output volume and quality across different scientific fields during the pandemic.
### Field-Specific Impacts
Detailed examination of how specific scientific fields were affected in terms of research output and collaboration.
# Changes in Research Collaboration
### Collaboration Patterns
Exploration of changes in research collaboration patterns, including intramural, national, and international collaborations.
### Influence of Mobility Restrictions
Assessment of how travel and mobility restrictions influenced collaborative research efforts globally.
# Role of Digital and Open Access Platforms
Investigation into how digital platforms and open access policies facilitated continued research dissemination and collaboration during lockdowns.
# Gender and Geographical Disparities
Analysis of how the pandemic affected research productivity differently across gender lines and geographic locations.
# Policy Implications and Recommendations
Discussion of the implications of study findings for science policy and recommendations for supporting research during global emergencies.
# Conclusions
Summary of key findings, limitations of the study, and potential areas for future research.


## Generate Sections
We will now generate the sections based on the table of content above

In [108]:
section_template = """
You are a scientific researcher, an expert in crafting high-quality scientific documents.
You're trained across a wide range of scientific disciplines, enabling you to provide
specialized assistance across various topics.

Context: {context}

Base: {base_chapter}

Already generated content: {generated_text}

Write an {section} section for the following topic: {topic}.
You can use the base section and the already generated content as a starting point.
Use the context for information about the topic.

Make the generated output latex. Only output the generated section, no additional information.
"""

section_prompt = PromptTemplate.from_template(toc_template)

section_chain =  toc_prompt | llm | parser

In [111]:
sections = toc_response['sections']

for i, section in enumerate(sections):
    print(section)
    subsections = []
    if hasattr(object, 'subsections'):
        for j, subsection in enumerate(section["subsections"]):
            generated_text = subsection["generated_text"] or ""
            subsection_response = section_chain.invoke(input={"topic": topic, "context": context, "base_chapter": base_chapter, "generated_text": generated_text, "section": subsection["title"]})
            subsection["generated_text"] = subsection_response
            subsections.append(subsection)
    section_resonse = section_chain.invoke(input={"topic": topic, "context": context, "base_chapter": base_chapter, "generated_text": "\n\n".join(subsections), "section": section["title"]})
    section["generated_text"] = section_resonse
    

{'title': 'Introduction', 'description': 'Overview of the impacts of the COVID-19 pandemic on the global research landscape, setting the scene for the detailed analyses in subsequent sections.'}


NameError: name 'subsections' is not defined

In [None]:
JSON(sections)