# Preparations

In [2]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate

from prompts import system_prompt_1, system_prompt_2

import re
import os 
import importlib
from IPython.display import Markdown

from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq



# Set up the Google Gemini and Groq API keys
import os
from dotenv import load_dotenv
import platform

# Choose the secret store by probing for Colab directly instead of by OS name:
# platform.system() == "Linux" is also true on a local Linux machine (where the
# google.colab import fails), and macOS used to hit the OSError branch even
# though reading a .env file works there.
try:
    from google.colab import userdata
except ImportError:
    # Not running in Colab: read the keys from the environment / a .env file.
    load_dotenv()
    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
    GROQ_API_KEY = os.getenv("GROQ_API_KEY")
else:
    # Running in Colab: read the keys from the notebook's stored secrets.
    GOOGLE_API_KEY = userdata.get('GOOGLE_AI_STUDIO2')
    GROQ_API_KEY = userdata.get('GROQ_API_KEY')


# Two Gemini clients that differ only in model name and temperature.
pro = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    temperature=0.4,
    google_api_key=GOOGLE_API_KEY,
    convert_system_message_to_human=True,
)
flash = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0.3,
    google_api_key=GOOGLE_API_KEY,
    convert_system_message_to_human=True,
)

# Groq chat model. Alternative model names (previously listed here as
# commented-out options): mixtral-8x7b-32768, llama3-70b-8192,
# llama3-8b-8192, llama-3.1-8b-instant, gemma2-9b-it.
llm = ChatGroq(
    model_name="llama-3.1-70b-versatile",
    groq_api_key=GROQ_API_KEY,
    temperature=0,
)

# PDF Acquisition

In [None]:
#@title Dependencies
%%capture
!pip3 install -U scidownl
# LangChain and Groq
!pip install langchain
!pip install langchain_groq

# Installing marker
!pip install marker-pdf

In [4]:
from scidownl import scihub_download
import pandas as pd
import os
from google.colab import files

def download_paper(out="paper.pdf"):
    """Prompt for a DOI and download the matching paper with ``scihub_download``.

    Args:
        out: Output path handed to ``scihub_download``. Defaults to
            ``"paper.pdf"``, the same file name ``upload_paper`` stores its
            upload under.

    Raises:
        ValueError: If the entered DOI is empty or only whitespace.
    """
    # The original body passed an undefined global ``out``, so every call
    # failed with NameError; it is now a parameter with a default.
    doi = input("Please enter the DOI of the paper here: ").strip()
    if not doi:
        raise ValueError("No DOI was entered.")
    # (A proxies=... argument was left disabled in the original call.)
    scihub_download(doi, paper_type="doi", out=out)

def upload_paper():
    """Upload a PDF through Colab's ``files.upload`` and store it as ``paper.pdf``.

    Only the first uploaded file is considered.

    Raises:
        ValueError: If nothing was uploaded, or the first uploaded file does
            not have a ``.pdf`` extension (compared case-insensitively).
    """
    uploaded = files.upload()
    # The upload mapping can be empty (e.g. the dialog was dismissed);
    # indexing [0] on its keys used to surface that as a bare IndexError.
    filename = next(iter(uploaded), None)
    if filename is None:
        raise ValueError('No file was uploaded.')
    # Compare in lower case so names such as "Paper.PDF" are accepted too.
    if not filename.lower().endswith('.pdf'):
        raise ValueError('Uploaded file is not a PDF file.')
    os.rename(filename, 'paper.pdf')
    print("Paper uploaded successfully as 'paper.pdf'")

def main():
    """Ask how the paper should be obtained and run the matching step.

    "1" downloads by DOI, "2" uploads a PDF; any other answer prints a hint
    and returns without doing anything.
    """
    selection = input("Choose an option (1 for DOI download, 2 for PDF upload): ")
    # Guard clause: reject anything that is not one of the two menu entries.
    if selection not in ('1', '2'):
        print("Invalid choice. Please enter 1 or 2.")
        return
    if selection == '1':
        download_paper()
    else:
        upload_paper()


if __name__ == "__main__":
    main()


KeyboardInterrupt: 

# Markdown Extraction with Marker

In [None]:
#@title extracting markdown
# Run marker on /content/paper.pdf and write the result under /content/output/
# (--max_pages 20, --langs English, --batch_multiplier 2).
!marker_single /content/paper.pdf /content/output/ --batch_multiplier 2 --max_pages 20 --langs English

# Workflow Graph generation feature

In [5]:
# Groq chat model used for the workflow graph. Alternative model names
# (previously listed here as commented-out options): mixtral-8x7b-32768,
# llama3-70b-8192, llama-3.1-70b-versatile, llama3-8b-8192, gemma2-9b-it.
llm = ChatGroq(
    model_name="llama-3.1-8b-instant",
    groq_api_key=GROQ_API_KEY,
    temperature=0,
)

In [18]:
# This function generates a graph.png that contains the workflow of a paper,
# extracted from its method section.

import functions
importlib.reload(functions)  # re-import so recent edits to the functions module take effect

paper_path = 'Examples/pocket-similarity-approach/pocket-similarity-approach.md'
functions.get_workflow(paper_path,llm)

import graphviz

# Define nodes and edges
nodes = [
    "Drug Retrieval - Obtain approved drugs from drugbank.ca.",
    "HET Code Acquisition - Obtain HET codes of drugs from ligand-expo.rcsb.org.",
    "HET Code Filtering - Filter HET codes to exclude compounds appearing in more than 100 structures.",
    "RCSB Database Search - Search RCSB database using filtered HET codes.",
    "PDB Retrieval - Retrieve pdbs from RCSB database.",
    "PDB Cleaning - Clean pdbs using Python script to leave only drug and binding chain.",
    "Drug Separation - Separate drugs in single pdb structures using Python script.",
    "Pocket Detection - Detect pockets in pdbs using Fpocket.",
    "Pocket Analysis - Analyze pockets using Python script to calculate center of mass.",
    "Drug Center of Mass Calculation - Calculate center of mass of each drug.",
    "Distance Calculation - Calculate distance between pocket center of mass and drug center of mass.",
    "Smallest Distance Selection - Select small

# Comprehensive summary

In [24]:
# Groq chat model used for the summary. Alternative model names
# (previously listed here as commented-out options): mixtral-8x7b-32768,
# llama3-70b-8192, llama-3.1-70b-versatile, llama3-8b-8192, gemma2-9b-it.
llm = ChatGroq(
    model_name="llama-3.1-8b-instant",
    groq_api_key=GROQ_API_KEY,
    temperature=0,
)

In [3]:
## Generate a comprehensive summary from each section of the paper
import summary
import importlib
importlib.reload(summary)  # re-import so recent edits to the summary module take effect

file_path = 'Examples/systematic_review/systematic_review.md'
results = summary.summarize(file_path, llm)
# Last expression of the cell: render the returned text as Markdown.
Markdown(results)


##  Background and Objectives   
The COVID-19 pandemic, caused by the SARS-CoV-2 virus, has become a global health crisis. The study aims to investigate the potential of repurposing FDA-approved drugs against SARS-CoV-2 virus using molecular docking and pharmacophore modeling.
##  Methods                     
**Methodology Summary:**

* Design: Systematic search of scientific studies on Google Scholar
* Sample: All studies published by the end of 2020/12/30
* Tools/Techniques: 
  - Keywords: (COVID19 OR SARS-CoV-2) AND (Drug repurposing OR Drug repositioning OR Drug re-profling OR Drug rediscovery) AND (Docking AND Molecular dynamic)
  - Database: Google Scholar
##  Main Findings               
* 1,444 studies on repurposing drugs for COVID-19 were collected from Google Scholar by the end of 2020/12/30.
##  Analysis and Interpretation 
The authors used a systematic search strategy in Google Scholar to collect studies on drug repurposing for COVID-19, using a combination of keywords. The search was limited to studies published by the end of 2020/12/30.

The authors did not mention any specific statistical methods used in the analysis. However, the results of the search strategy are likely to be presented in the form of a list of studies, which may be analyzed using descriptive statistics (e.g., frequency, percentage) to summarize the findings.

The implications of the results are not explicitly stated, but the authors likely aim to provide an overview of the existing literature on drug repurposing for COVID-19, which can inform future research and clinical decisions.
##  Contributions and Novelty   
The paper contributes to the field of COVID-19 research by providing a comprehensive search strategy for identifying studies on repurposing drugs for COVID-19. The novel approach proposed is the use of a combination of keywords in the Google Scholar database to efficiently collect relevant studies.
##  Limitations                 
The study may be limited by the reliance on Google Scholar, which may not be comprehensive in capturing all relevant studies, particularly those published in non-English languages or in non-indexed journals. Additionally, the search strategy may be biased towards studies that used docking and molecular dynamics, potentially overlooking other repurposing methods.
##  Conclusions and Recommendations 
The researchers conclude that antiviral drugs, particularly those targeting the main protease, show promise in the fight against COVID-19. Tetracyclines and antivirals, originally protease inhibitors, are recommended for further investigation. The models developed in this study can be used for virtual screening to aid in finding effective therapeutic agents against COVID-19.
##  Key References and Citations 
Number of citaiton: 108

# Criticism

In [7]:
## Create a critique of the method section
### Criticism on 3 areas
#### Clarity and reproducibility
#### 
import crq
import importlib
importlib.reload(crq)  # re-import so recent edits to the crq module take effect
file_path = 'Examples/pocket-similarity-approach/pocket-similarity-approach.md'
Markdown(crq.Critique(file_path, llm))


##  **Clarity and Reproducibility**
### Critique of the Method Section

**1. Clarity and Reproducibility**

- **Overall Evaluation**: The method section lacks detailed descriptions of procedures, specifics about materials and equipment, and clear step-by-step protocols. While the abstract provides a general overview of the study's aim and approach, the method section does not offer sufficient information for replication.

- **Detailed Findings**:
  - **Procedure Descriptions**: The method section does not provide detailed descriptions of the procedures used. For example, it mentions using PocketMatch to test the similarity of approved drugs' binding sites against all of the binding sites found on SARS CoV2 proteins, but it does not explain how PocketMatch was used, what parameters were set, or how the results were interpreted. Similarly, the docking process is mentioned, but the specifics of the docking protocol, such as the software used, parameters, and how the results were validated, are not provided.
  - **Materials and Equipment**: The method section does not list the necessary materials and equipment used in the study. For example, it does not specify the software versions, hardware specifications, or any other tools that were used.
  - **Step-by-Step Protocols**: The method section does not provide clear step-by-step protocols for the experiments or analyses. The order of the steps is not logical, and it is unclear how the different stages of the methodology are connected.
  - **Ambiguities Identified**: Several ambiguous terms and vague descriptions were identified, such as "all of the resolved SARS CoV2 crystal structures" (which specific structures were used?), "binding sites found on SARS CoV2 proteins" (how were these sites identified?), and "docking was used to confirm the results" (what specific docking protocol was used?).

# Graphs criticism

In [7]:
## Criticism of the graphs
import importlib 
import img_crq 
importlib.reload(img_crq)  # re-import so recent edits to the img_crq module take effect
img_path = 'Examples/pocket-similarity-approach/1_image_0.png'  # NOTE(review): not used anywhere in this cell
folder_path = 'Examples/pocket-similarity-approach/'
results = img_crq.process_graphs_in_folder(folder_path)
# Last expression of the cell: render the returned text as Markdown.
Markdown(results)


 **image file:**
***11_image_0.png*** 

**Figure (5) shows the steroidal pocket with the matching of secondary side chains**
   - **Relevance:** The image is relevant to the paper, as it illustrates the steroidal pocket and the matching of secondary side chains. 
   - **Clarity:** The image is cluttered and difficult to interpret. The different colors and shapes are not clearly labeled or explained, making it hard to understand what is being shown. The image is also quite small, which makes it difficult to see the details. 
   - **Accuracy:** The accuracy of the image cannot be assessed without more information.
   - **Presentation:** The image would benefit from a larger size, clearer labeling, and a more focused view. A simplified representation with clearer labeling of the different components would improve clarity.
   - **Annotation:**  The image lacks annotation. It is not clear what the different colors and shapes represent.
   - **Consistency:** The image is inconsistent with other figures in the paper.
   - **Statistical Representation:**  Not applicable.

**Conclusion:**

The image provided is a 3D model of a protein structure. It shows the steroidal pocket and the matching of secondary side chains. The image is relevant to the paper but lacks clarity and annotation. It is unclear what the different colors and shapes represent. A simplified representation with clearer labeling and a larger size would improve the image's clarity. 

 **image file:**
***1_image_0.png*** 

## Figure (1) showing the workflow of the study.

**- Relevance:** This figure is highly relevant as it effectively visualizes the workflow of the study, providing a clear overview of the steps involved in identifying potential drug targets for SARS-CoV-2.

**- Clarity:** The figure is clear and well-organized. The use of boxes and arrows to represent the different steps and their connections is intuitive and easy to follow. However, the lack of labels for the arrows makes it slightly less clear what each step involves.

**- Accuracy:** The figure accurately depicts the steps of the study based on the information provided in the text. The numbers indicating the steps are consistent with the description in the caption.

**- Presentation:** The figure is aesthetically pleasing, with a simple and consistent design. The use of black boxes on a white background creates a clear visual hierarchy. However, the grey background on the right side of the figure is distracting and unnecessary. 

**- Annotation:** The figure lacks sufficient annotation. While the numbers provide a basic outline, it would be more informative to include short descriptions or labels for each box explaining what each step entails. 

**- Consistency:** The style of the figure is consistent with the overall presentation of the paper.

**- Statistical Representation:** This figure doesn't require statistical representation as it primarily visualizes the steps of the workflow. 

**Conclusion:**

Overall, Figure (1) is a valuable addition to the paper, effectively illustrating the workflow of the study. However, it could be further improved by adding labels to the arrows and providing more detailed annotations for each step. Additionally, removing the distracting grey background on the right side of the figure would enhance the visual clarity. 

 **image file:**
***5_image_0.png*** 

## Graph Analysis of Molecular Interactions:

This figure displays 3D molecular structures, providing a visual representation of interactions between various compounds and the 'Main Protease' protein. However, as this is a structural representation rather than a graph in the traditional sense, it is not feasible to apply standard graph criticism methods.

Instead, we can assess the figure based on its clarity, information conveyed, and effectiveness in supporting the research:

**Overall:**

- **Clarity:** The figure is well-organized and visually appealing. The individual molecules are distinct, and the color scheme helps differentiate components.
- **Information:** Each image effectively shows the spatial orientation of the molecule within the active site of the protease, highlighting potential interactions. 
- **Support:**  The figure supports the research by providing visual evidence of the interactions between the compounds and the protein. However, it lacks specific annotations or quantitative data to further analyze these interactions.

**Strengths:**

- **Visual clarity:** The 3D representation makes it easy to grasp the spatial relationship between the molecules and the protein.
- **Color scheme:** The use of color differentiates atoms and functional groups, aiding understanding of the chemical environment.
- **Organization:** The figure layout is clear and facilitates comparison of interactions across different compounds.

**Areas for Improvement:**

- **Annotations:**  Adding annotations to identify key amino acid residues and/or specific interaction types (e.g., hydrogen bonds, hydrophobic interactions) would greatly enhance the figure's informative value.
- **Quantitative data:**  Including quantitative measures such as distances between interacting groups or binding affinities would provide a more complete picture of the interactions.
- **Legend:** Providing a legend explaining the color coding for atoms and functional groups would improve accessibility.

**Conclusion:**

The figure effectively showcases the 3D interactions between the compounds and the 'Main Protease' protein. However, further annotations and quantitative information are needed to fully exploit the potential of this visual representation and provide more substantial support for the research conclusions. 

 **image file:**
***7_image_0.png*** 

**Figure 1: Nsp12.**

- **Relevance:**  This figure depicts molecular interactions of the Nsp12 protein with various drugs. While relevant to the paper, it lacks context.  Without information about the research question and the significance of these interactions, their relevance remains unclear.
- **Clarity:**  The figure is clear. The labels are legible, and the 3D representations of the molecules are easily distinguishable. However, the figure would be even more informative if it included details about the binding site or specific interactions.
- **Accuracy:**  It's impossible to assess the accuracy of the molecular depictions without more information about the methodology used.  The paper should provide details about the software used, the structure of the protein, and the chemical structures of the drugs.
- **Presentation:** The figure's layout is organized.  However, it would benefit from a more informative caption.  The caption should briefly describe the figure's content, the significance of the interactions, and the purpose of the figure.
- **Annotation:** There are minimal annotations beyond the drug names.  More detailed annotations, such as highlighting specific interactions, would greatly enhance the figure's explanatory power.
- **Consistency:**  There is no indication of inconsistency.  
- **Statistical Representation:**  Statistical representation is not applicable to this type of figure.

**Conclusion:**

The figure is visually clear and provides a preliminary representation of drug-protein interactions. However, its relevance and accuracy are difficult to assess without additional information provided in the paper. The lack of detailed annotations and a comprehensive figure caption limits its overall effectiveness.  The authors should include a more in-depth explanation of the figure's content, its significance in relation to the research question, and the methods used to generate the visualizations. 

 **image file:**
***8_image_0.png*** 

**6w6y_A_1_Indinavir**
   - **Relevance:** Relevant to the paper's main points as it demonstrates the predicted pocket and docked compound.
   - **Clarity:** The graph is well-organized, with clear labels for each structure and the compound.  
   - **Accuracy:** It's difficult to assess accuracy without further context or data. The image appears to be accurate in its representation of the docked compound within the pocket.
   - **Presentation:**  The use of 3D representation is appropriate for showcasing the interaction between the compound and the pocket. 
   - **Annotation:**  Each structure is labeled correctly and clearly. 
   - **Consistency:** Consistency in style, colors, and labels across the figures. 
   - **Statistical Representation:** Not applicable.

**6w6y_A_1_Trifluoperazine**
   - **Relevance:** Relevant to the paper's main points. 
   - **Clarity:** The graph is well-organized, with clear labels for each structure and the compound. 
   - **Accuracy:** Difficult to assess without more information, but the visual representation is appropriate.
   - **Presentation:**  3D representation is suitable.
   - **Annotation:**  The figure is well-annotated.
   - **Consistency:** Consistent with other figures.
   - **Statistical Representation:** Not applicable.

**6wcF_A_1_Lasofoxifene**
   - **Relevance:** Relevant to the paper's main points. 
   - **Clarity:** The graph is well-organized, with clear labels for each structure and the compound. 
   - **Accuracy:** Difficult to assess without more information, but the visual representation is appropriate.
   - **Presentation:**  3D representation is suitable.
   - **Annotation:**  The figure is well-annotated.
   - **Consistency:** Consistent with other figures.
   - **Statistical Representation:** Not applicable.

**6wcF_A_1_Raloxifene**
   - **Relevance:** Relevant to the paper's main points. 
   - **Clarity:** The graph is well-organized, with clear labels for each structure and the compound. 
   - **Accuracy:** Difficult to assess without more information, but the visual representation is appropriate.
   - **Presentation:**  3D representation is suitable.
   - **Annotation:**  The figure is well-annotated.
   - **Consistency:** Consistent with other figures.
   - **Statistical Representation:** Not applicable.

**6wey_A_3_Ouabain**
   - **Relevance:** Relevant to the paper's main points. 
   - **Clarity:** The graph is well-organized, with clear labels for each structure and the compound. 
   - **Accuracy:** Difficult to assess without more information, but the visual representation is appropriate.
   - **Presentation:**  3D representation is suitable.
   - **Annotation:**  The figure is well-annotated.
   - **Consistency:** Consistent with other figures.
   - **Statistical Representation:** Not applicable.

**6vxs_A_1_Amprenavir**
   - **Relevance:** Relevant to the paper's main points. 
   - **Clarity:** The graph is well-organized, with clear labels for each structure and the compound. 
   - **Accuracy:** Difficult to assess without more information, but the visual representation is appropriate.
   - **Presentation:**  3D representation is suitable.
   - **Annotation:**  The figure is well-annotated.
   - **Consistency:** Consistent with other figures.
   - **Statistical Representation:** Not applicable.

**6vxs_A_1_Nelfinavir**
   - **Relevance:** Relevant to the paper's main points. 
   - **Clarity:** The graph is well-organized, with clear labels for each structure and the compound. 
   - **Accuracy:** Difficult to assess without more information, but the visual representation is appropriate.
   - **Presentation:**  3D representation is suitable.
   - **Annotation:**  The figure is well-annotated.
   - **Consistency:** Consistent with other figures.
   - **Statistical Representation:** Not applicable.

**6vxs_A_1_Lopinavir**
   - **Relevance:** Relevant to the paper's main points. 
   - **Clarity:** The graph is well-organized, with clear labels for each structure and the compound. 
   - **Accuracy:** Difficult to assess without more information, but the visual representation is appropriate.
   - **Presentation:**  3D representation is suitable.
   - **Annotation:**  The figure is well-annotated.
   - **Consistency:** Consistent with other figures.
   - **Statistical Representation:** Not applicable.

**Conclusion:**
The graphs in the paper are visually appealing, well-organized, and clearly labeled.  The use of 3D representation is appropriate for visualizing the interactions of the docked compounds within the predicted pocket. However, without additional data or context, it's difficult to assess the accuracy of the individual figures. 


# API

In [29]:
import summary
import crq
import img_crq
import functions
import os

def paper_analyzer(paper_path, llm, *, include_graphs=False):
    """Run the critique, summary and workflow steps on one paper.

    Args:
        paper_path: Path to the paper file (the example call uses a ``.md``
            file).
        llm: Chat model passed through to ``crq.Critique``,
            ``summary.summarize`` and ``functions.get_workflow``.
        include_graphs: When true, also run
            ``img_crq.process_graphs_in_folder`` on the folder containing
            ``paper_path``. Defaults to ``False``, matching the previous
            behaviour where that call was commented out.

    Returns:
        A ``(graphs_crq, method_crq, summarization)`` tuple. ``graphs_crq`` is
        an empty string when ``include_graphs`` is false.
    """
    graphs_crq = ''
    if include_graphs:
        # NOTE(review): dirname() yields no trailing slash, unlike the
        # hard-coded folder used in the standalone graph cell — confirm
        # process_graphs_in_folder accepts both forms.
        folder_path = os.path.dirname(paper_path)
        graphs_crq = img_crq.process_graphs_in_folder(folder_path)
    method_crq = crq.Critique(paper_path, llm)
    summarization = summary.summarize(paper_path, llm)
    # Called without using its return value.
    functions.get_workflow(paper_path, llm)
    return graphs_crq, method_crq, summarization

paper_path = 'Examples/pocket-similarity-approach/pocket-similarity-approach.md'

# graphs_crq is '' for this call (paper_analyzer does not run the graph critique here).
graphs_crq, method_crq, summarization = paper_analyzer(paper_path, llm)

import graphviz

# Define nodes and edges
nodes = [
    "Drug Retrieval - Obtain approved drugs from drugbank.ca.",
    "HET Code Acquisition - Obtain HET codes of drugs from ligand-expo.rcsb.org.",
    "HET Code Filtering - Filter HET codes to exclude compounds appearing in more than 100 structures.",
    "RCSB Database Search - Search RCSB database using filtered HET codes.",
    "PDB Retrieval - Retrieve pdbs from RCSB database.",
    "PDB Cleaning - Clean pdbs using Python script to leave only drug and binding chain.",
    "Drug Separation - Separate drugs in single pdb structures using Python script.",
    "Pocket Detection - Detect pockets in pdbs using Fpocket.",
    "Pocket Analysis - Analyze pockets using Python script to calculate center of mass.",
    "Drug Center of Mass Calculation - Calculate center of mass of each drug.",
    "Distance Calculation - Calculate distance between pocket center of mass and drug center of mass.",
    "Smallest Distance Selection - Select small