In [1]:
!pip install requests



In [2]:
!python -m pip install requests




[notice] A new release of pip available: 22.3 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import datetime




In [6]:
import datetime
import sqlite3
from time import sleep
file=open(f'errors{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}.log', 'w')
# Connect to the database
conn = sqlite3.connect('DRUG_DB.db')
cursor = conn.cursor()

# Get the names of all tables in the database
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
table_names = cursor.fetchall()
pmid_procesados = set()

# Iterate over all tables
for table in table_names:
    table_name = table[0]

    # Check if the columns "species" and "diseases" do not exist in the current table
    cursor.execute(f"""PRAGMA table_info("{table_name}");""")
    columns = cursor.fetchall()
    has_species_column = any(col[1] == 'species' for col in columns)
    has_diseases_column = any(col[1] == 'diseases' for col in columns)

    # If they don't exist, add the "species" and "diseases" columns to the table
    if not has_species_column:
        cursor.execute(f"""ALTER TABLE "{table_name}" ADD COLUMN species TEXT;""")
    if not has_diseases_column:
        cursor.execute(f"""ALTER TABLE "{table_name}" ADD COLUMN diseases TEXT;""")

    # Get the values of "PMID" from the current table
    cursor.execute(f"""SELECT PMID FROM "{table_name}";""")
    pmid_values = [row[0] for row in cursor.fetchall()]

    # Update the data in the database for each PMID
    for i, pmid in enumerate(pmid_values):
        if pmid in pmid_procesados:
            print("Skipping already processed PMID:", pmid)
            continue  # If already processed, skip to the next iteration
        pmid_arr = [pmid]
        # Make the API call to get arrays of diseases and species
        try:
            species_mentions, diseases_mentions = get_species_diseases(pmid_arr)  # Function to get species and diseases data
        except Exception as e:
            file.write(f"\nERROR: {str(e)}.In PMID {pmid}")  
        else:  
            # Convert the arrays into strings separated by "|"
            diseases_str = "|".join(diseases_mentions)
            species_str = "|".join(species_mentions)

            # Update the row corresponding to the PMID with the new "species" and "diseases" columns
            cursor.execute(f"""UPDATE "{table_name}" SET species = ?, diseases = ? WHERE PMID = ?;""", (species_str, diseases_str, pmid))
                
            # Add the PMID to the set of processed PMIDs
            pmid_procesados.add(pmid)
                
            # Add a delay to avoid overwhelming the API (2 seconds in this case)
            sleep(2)

# Commit the changes to the database
conn.commit()

# Close the database connection
conn.close()
file.close()


Skipping already processed PMID: 25236765
Skipping already processed PMID: 1219629
Skipping already processed PMID: 8478997
Skipping already processed PMID: 16129921
Skipping already processed PMID: 25236765
Skipping already processed PMID: 1219629
Skipping already processed PMID: 8478997
Skipping already processed PMID: 16129921
Skipping already processed PMID: 25236765
Skipping already processed PMID: 1219629
Skipping already processed PMID: 8478997
Skipping already processed PMID: 16129921
Skipping already processed PMID: 25236765
Skipping already processed PMID: 1219629
Skipping already processed PMID: 8478997
Skipping already processed PMID: 16129921
Skipping already processed PMID: 25236765
Skipping already processed PMID: 1219629
Skipping already processed PMID: 8478997
Skipping already processed PMID: 16129921
Skipping already processed PMID: 25236765
Skipping already processed PMID: 1219629
Skipping already processed PMID: 8478997
Skipping already processed PMID: 16129921
Skip

In [5]:
import requests
def get_species_diseases(pmid):
    """
    Function to retrieve species and diseases mentions from BERN2 data for a given list of PMIDs.

    Parameters:
        pmid (list): A list of PubMed ID (PMID).

    Returns:
        tuple: A tuple containing two lists: species_mentions and diseases_mentions.
            - species_mentions (list): A list of species mentions extracted from the PubMed data.
            - diseases_mentions (list): A list of disease mentions extracted from the PubMed data.
    """

    def is_nan(value):
        """
        Helper function to check if a value is NaN.

        Parameters:
            value: The value to check for NaN.

        Returns:
            bool: True if the value is NaN, False otherwise.
        """
        return value != value


    def query_pmid(pmids, url="http://bern2.korea.ac.kr/pubmed"):
        """
        Function to make an API call to BERN2 and retrieve PubMed data for given PMIDs.

        Parameters:
            pmid (list or str): A PubMed ID (PMID) or a single string representing a PMID.
            url (str): The base URL of the PubMed API. Default is "http://bern2.korea.ac.kr/pubmed".

        Returns:
            dict: A dictionary containing the PubMed data for the given PMID.
        """
        return requests.get(url + "/" + ",".join(pmids)).json()

    # Query BERN2 data for the given PMID
    try:
        data = query_pmid(pmid)
    except Exception as e:
        print(f"ERROR: {str(e)}.In PMID {pmid}")
        file.write(f"\nERROR: {str(e)}.In PMID {pmid}")
    else:
        # Sets to store unique species and diseases mentions
        species_mentions_set = set()
        diseases_mentions_set = set()
        # Iterate through the BERN2 data to extract species and diseases mentions
        for item in data:
            annotations = item.get("annotations", [])  # Get the list of annotations for each item
            for annotation in annotations:
                # Check if the annotation is related to species and the probability is not NaN
                if annotation.get("obj") == "species" and not is_nan(annotation.get("prob")):
                    species_mentions_set.add(annotation.get("mention").lower())
                # Check if the annotation is related to disease and the probability is not NaN
                if annotation.get("obj") == "disease" and not is_nan(annotation.get("prob")):
                    diseases_mentions_set.add(annotation.get("mention").lower())

        # Convert sets to lists
        species_mentions = list(species_mentions_set)
        diseases_mentions = list(diseases_mentions_set)

        # Return the lists of species and diseases mentions
        return species_mentions, diseases_mentions
