# PDF Acquisition

In [None]:
#@title Depandances
%%capture
!pip3 install -U scidownl
# Langchain and groq
!pip install langchain
!pip install langchain_groq

#installing marker
!pip install marker-pdf

In [4]:
from scidownl import scihub_download
import pandas as pd
import os
from google.colab import files

def download_paper():
    """Prompt for a DOI and download the matching paper to ``./paper.pdf``.

    The DOI is read interactively, stripped of surrounding whitespace and
    handed to ``scihub_download`` with ``paper_type="doi"``.

    Raises:
        ValueError: if nothing (or only whitespace) was entered, so the
            downloader is never called with an empty identifier.
    """
    # Pasted DOIs frequently carry stray spaces or a trailing newline.
    paper = input("Please enter the DOI of the paper here: ").strip()
    if not paper:
        raise ValueError("No DOI was entered.")
    paper_type = "doi"
    out = "./paper.pdf"
    # Uncomment and configure the proxies if needed
    # proxies = {
    #     'http': 'socks5://127.0.0.1:7890'
    # }
    scihub_download(paper, paper_type=paper_type, out=out)#, proxies=proxies)

def upload_paper():
    """Ask the user to upload a PDF and store it as ``paper.pdf``.

    Only the first entry of the mapping returned by ``files.upload()`` is
    used; any further uploaded files are left untouched.

    Raises:
        ValueError: if the upload returned no file at all (for example the
            dialog was cancelled), or if the first uploaded file does not
            have a ``.pdf`` extension (checked case-insensitively).
    """
    uploaded = files.upload()
    if not uploaded:
        # An empty mapping previously surfaced as an opaque IndexError.
        raise ValueError('No file was uploaded.')
    filename = next(iter(uploaded))
    # Accept "Paper.PDF" as well as "paper.pdf".
    if not filename.lower().endswith('.pdf'):
        raise ValueError('Uploaded file is not a PDF file.')
    os.rename(filename, 'paper.pdf')
    print("Paper uploaded successfully as 'paper.pdf'")

def main():
    """Ask how the paper should be acquired and run the matching routine.

    Entering ``1`` triggers the DOI download and ``2`` the PDF upload; any
    other answer only prints a hint and returns.
    """
    selection = input("Choose an option (1 for DOI download, 2 for PDF upload): ")
    # Reject anything that is not one of the two menu entries up front.
    if selection not in ('1', '2'):
        print("Invalid choice. Please enter 1 or 2.")
        return
    if selection == '1':
        download_paper()
    else:
        upload_paper()

# Start the interactive acquisition flow only when this code is the entry
# point, i.e. when __name__ is "__main__".
if __name__ == "__main__":
    main()


KeyboardInterrupt: 

# Markdown Extraction with Marker

In [None]:
#@title extracting markdown
# Runs marker's single-file CLI on /content/paper.pdf with /content/output/ as
# the second positional argument (presumably the output directory - confirm).
# NOTE(review): the cells below open 'manuscript.md' relative to the working
# directory; nothing here obviously writes a file at that path - confirm how the
# Markdown produced under /content/output/ is meant to reach that name.
!marker_single /content/paper.pdf /content/output/ --batch_multiplier 2 --max_pages 20 --langs English

# Extracting the Methodology Section
**Objective:** Isolate the methodology section from the segmented markdown for further analysis.
- **Action Items:**
  - From the parsed sections, specifically extract the text under the "Methodology" heading.
  - Handle variations in heading titles like "Methods" or "Materials and Methods."


In [None]:
import re

def extract_outline(file_path):
    """Collect every Markdown ATX heading found in a file.

    A heading is a line that starts with one or more ``#`` characters followed
    by whitespace and the title text.

    Args:
        file_path: Path of the Markdown file to scan.

    Returns:
        A list of ``(hashes, title)`` tuples in document order, where
        ``hashes`` is the run of ``#`` characters (its length is the heading
        level) and ``title`` is the rest of the line without the newline.
    """
    outline = []
    # Read as UTF-8 explicitly so non-ASCII titles do not depend on the
    # platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            header_match = re.match(r'^(#+)\s+(.*)', line)
            if header_match:
                outline.append((header_match.group(1), header_match.group(2)))
    return outline

def print_outline(outline):
    """Print each ``(hashes, title)`` pair of an outline as ``hashes title``.

    One heading is written per line, in the order given.
    """
    for entry in outline:
        marker, heading = entry
        print(' '.join((marker, heading)))

# Build the heading outline of the converted manuscript and show it.
# `outline` is reused further down as the input for the heading-selection prompt.
# NOTE(review): 'manuscript.md' is resolved against the current working
# directory - confirm this is where the Markdown conversion result ends up.
file_path = 'manuscript.md'
outline = extract_outline(file_path)
print_outline(outline)


In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from google.colab import userdata
# Look the Groq API key up through google.colab's `userdata` helper under the
# name 'GROQ_API_KEY' rather than writing the key into the notebook.
GROQ_API_KEY = userdata.get('GROQ_API_KEY')

In [None]:


# Ask a small model, deterministically (temperature 0), which heading of the
# outline is the methods section. Only a single heading line is expected back,
# hence the small token budget.
chat = ChatGroq(
    temperature=0,
    model_name="llama3-8b-8192",
    max_tokens=50,
    groq_api_key=GROQ_API_KEY  # can be omitted when GROQ_API_KEY is set as an environment variable
)

system = """
###instructions###
You will receive an outline of a scientific paper in the format shown in the expected input. Extract and present only the highest level heading corresponding to the methods section, with no other output.

**Expected Input:**
#abstract
#introduction
#method
##method 1
#results
#conclusion

**Expected Output:**
#method

"""

# Send the outline as one "<hashes> <title>" line per heading - the shape the
# system prompt describes and the shape the reply is later compared against -
# instead of the repr of a Python list of tuples.
text = "\n".join(f"{hashes} {title}" for hashes, title in outline)
# The outline is passed as a template *variable*: when it was used as the
# template itself, any "{" or "}" in a heading (e.g. LaTeX) was parsed as a
# placeholder and made invoke({}) fail.
prompt = ChatPromptTemplate.from_messages([("system", system), ("human", "{outline}")])

chain = prompt | chat
response = chain.invoke({"outline": text})
print(response.content)

# Chapter Two: Materials and Methods


In [None]:
#@title Extracting the method section from the Markdown file
import re

def _normalize_heading(text):
    """Return a heading as ``'<hashes> <title>'`` with single spaces only."""
    stripped = text.strip()
    parts = re.match(r'^(#+)\s*(.*)$', stripped)
    if not parts:
        return stripped
    # Collapse inner whitespace so "##  Methods " and "##Methods" compare equal.
    return parts.group(1) + ' ' + ' '.join(parts.group(2).split())


def extract_section(file_path, section_title):
    """Return the text of one Markdown section, including its heading line.

    The section starts at the first heading equal to ``section_title`` and
    ends just before the next heading of the same or a higher level (or at the
    end of the file). Nested sub-headings are part of the section.

    Args:
        file_path: Path of the Markdown file to read.
        section_title: Full heading line to look for, e.g. ``"## Methods"``.
            Surrounding whitespace, a trailing newline and the amount of
            spacing after the ``#`` run are ignored, because the value
            usually comes verbatim from an LLM reply.

    Returns:
        The section as a single string, or ``''`` when no heading matches.
    """
    wanted = _normalize_heading(section_title)
    section_content = []
    section_level = None  # None until the requested heading has been seen

    # UTF-8 explicitly: do not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            header_match = re.match(r'^(#+)\s+(.*)', line)
            if header_match:
                level = len(header_match.group(1))
                if section_level is not None:
                    # A heading at the same or a higher level closes the section.
                    if level <= section_level:
                        break
                elif _normalize_heading(header_match.group(0)) == wanted:
                    section_level = level

            if section_level is not None:
                section_content.append(line)

    return ''.join(section_content)


# Use the heading chosen by the model (response.content) to cut the matching
# section out of the manuscript; the result is kept in `method`.
# NOTE(review): the comparison against the file's headings is textual, so the
# model reply has to reproduce the heading line as it appears in the file.
file_path = 'manuscript.md'
section_title = response.content
method = extract_section(file_path, section_title)

# Creating Methodology Graphs with Graphviz
**Objective:** Visualize the methodology section as a workflow graph.
- **Action Items:**
  - Install Graphviz in your Python environment.
  - Analyze the methodology text to identify key steps and their interconnections.
  - Design and implement a Graphviz script to create a diagram that represents these steps. This may involve defining nodes for each step and edges to show the sequence or relationship between them.
  - Ensure that the generated graph can be easily displayed within your Streamlit app or Google Colab notebook.

In [1]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq

import os
from getpass import getpass

# SECURITY: a Groq API key used to be hardcoded on this line. Treat that key as
# leaked - revoke it in the Groq console and remove it from version history.
# The key is now taken from the GROQ_API_KEY environment variable, falling back
# to an interactive prompt that does not echo the value.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY") or getpass("Enter your Groq API key: ")


In [25]:
from prompts import system_prompt_1, system_prompt_2
from input import method_1, method_2

In [20]:
#@title creation of nodes and edges

# Step 1 of the graph pipeline: send the methodology text to the model together
# with system_prompt_1 and keep the reply in `nande`.
chat = ChatGroq(
    temperature=0,
    model_name="mixtral-8x7b-32768",
    max_tokens=1024,
    groq_api_key=GROQ_API_KEY  # can be omitted when GROQ_API_KEY is set as an environment variable
)

system = system_prompt_1

text = str(method_2)
# The methodology text is passed as a template *variable*: used as the template
# itself, any "{" or "}" in it (LaTeX, set notation, ...) was parsed as a
# placeholder and made invoke({}) fail.
prompt = ChatPromptTemplate.from_messages([("system", system), ("human", "{method_text}")])

chain = prompt | chat
nande = chain.invoke({"method_text": text})


In [21]:
from prompts import system_prompt_1, system_prompt_2
from input import method_1, method_2

#@title writing the code for the graphviz

# Step 2 of the graph pipeline: hand the previous model reply (nande.content)
# to a second model together with system_prompt_2 and keep the reply in
# `llm_response`.
chat = ChatGroq(
    temperature=0,
    model_name="gemma2-9b-it",
    max_tokens=1024,
    groq_api_key=GROQ_API_KEY  # can be omitted when GROQ_API_KEY is set as an environment variable
)

system = system_prompt_2

text = str(nande.content)
# The previous reply is passed as a template *variable*: model output routinely
# contains "{" / "}" (dicts, JSON-like lists), which broke invoke({}) when the
# text itself was used as the template.
prompt = ChatPromptTemplate.from_messages([("system", system), ("human", "{nodes_and_edges}")])

chain = prompt | chat
llm_response = chain.invoke({"nodes_and_edges": text})


In [22]:
#@title extract the code from the response and run it
import re

def extract_code(llm_response):
    """Return the body of the first ```python fenced block in a reply.

    Args:
        llm_response: Full text of the model reply.

    Returns:
        The code between the first ```python fence and the next closing
        fence, with surrounding whitespace removed. When the reply has no
        such block, the sentence "No Python code found in the response." is
        returned instead.
    """
    # DOTALL lets the non-greedy group span several lines of code.
    fenced = re.search(r'```python(.*?)```', llm_response, flags=re.DOTALL)
    if fenced is None:
        return "No Python code found in the response."
    return fenced.group(1).strip()

# Pull the generated Graphviz script out of the model reply.
code = extract_code(llm_response.content)
# extract_code reports a missing code block with a plain-English sentence;
# passing that sentence to exec() would only raise a confusing SyntaxError.
if code == "No Python code found in the response.":
    raise ValueError("The LLM reply contains no ```python code block; nothing to execute.")
# SECURITY: exec() runs model-generated code with the full privileges of this
# kernel. Review `code` (e.g. print it) before executing it on anything you
# care about.
exec(code)

In [26]:
# Show the system prompt used for the Graphviz code-generation step, for reference.
print(system_prompt_2)


### instructions ###
Your job is to write Python code that:
1. Reads the provided list of nodes and edges.
2. Utilizes Graphviz to create a directed graph.
3. Adds nodes to the graph with appropriate labels.
4. Adds edges between the nodes with descriptions of their relationships.
5. Sets aesthetic properties such as layout direction, size, shape, or other visual attributes.
7. Very important to output the results to high quality png image name "graph.png". 


