# Extract descriptors

## Packages

In [1]:
from mordred import Calculator, descriptors
from rdkit import Chem
import pandas as pd
import sqlite3
import json
import datetime
import pubchempy as pcp

In [2]:
import ssl

# WARNING: replaces the factory used for default HTTPS contexts with one that
# performs no certificate or hostname verification, for the whole process.
# NOTE(review): presumably a workaround for certificate errors when contacting
# PubChem -- confirm it is still needed; it leaves HTTPS requests open to
# man-in-the-middle attacks.
ssl._create_default_https_context = ssl._create_unverified_context

## Code

### Main

In [15]:
if __name__ == "__main__":
    # Script entry point: compute molecular descriptors for the drugs found in
    # the initial SQLite database and store them in the final SQLite database.
    #
    # Problems encountered along the way are recorded in a timestamped
    # ``errors<YYYYmmdd_HHMMSS>.log`` file in the current working directory.

    # Set the file paths for the initial and final SQLite databases
    db_file_initial = "DRUG_DB.db"  # Replace with the path to your initial SQLite database
    db_file_final = "BD_DESCRIPTORS.db"  # Replace with the path to your final SQLite database

    log_name = f'errors{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}.log'

    # ``file`` has to remain a module-level name: update_descriptors_column
    # writes its error messages to it. Using ``with`` guarantees the log is
    # flushed and closed even when the update raises part-way through.
    with open(log_name, 'w') as file:
        # Call the 'update_descriptors_column' function to compute and store the descriptors
        update_descriptors_column(db_file_initial, db_file_final)


No SMILE found for DRUGBANK_ID DB00072
No SMILE found for DRUGBANK_ID DB00083
CN1CCC[C@H]1C2=CN=CC=C2
CN1CC[C@]23[C@@H]4[C@H]1CC5=C2C(=C(C=C5)O)O[C@H]3[C@H](C=C4)O
CC1=CC=CC=C1N2CCN(CC2)CCC3=NN=C4N3CCCC4
C1CCC(CC1)C(CCN2CCCC2)(C3=CC=CC=C3)O
CC1=C2COC(=O)C2=C(C(=C1OC)C/C=C(\C)/CCC(=O)OCCN3CCOCC3)O
CCOC(=O)C(C)(C)OC1=CC=C(C=C1)Cl
C[N+]1([C@H]2CC(C[C@H]1C3C2O3)OC(=O)[C@H](CO)C4=CC=CC=C4)C.[Br-]
CC1=C2[C@H](C(=O)[C@@]3([C@H](C[C@@H]4[C@]([C@H]3[C@@H]([C@@](C2(C)C)(C[C@@H]1OC(=O)[C@@H]([C@H](C5=CC=CC=C5)NC(=O)C6=CC=CC=C6)O)O)OC(=O)C7=CC=CC=C7)(CO4)OC(=O)C)O)C)OC(=O)C
CC(C)NCC(COC1=CC=CC2=C1C=CN2)O
CC1=C(C=C(C=C1)NC2=NC=CC(=N2)N(C)C3=CC4=NN(C(=C4C=C3)C)C)S(=O)(=O)N
CC[C@H](C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C@H](C=C2)C)CC[C@@H]3C[C@H](CC(=O)O3)O)C
C1=C2C(=CC(=C1Cl)S(=O)(=O)N)S(=O)(=O)N=CN2
CC1=C(C(=O)N2CCCCC2=N1)CCN3CCC(CC3)C4=NOC5=C4C=CC(=C5)F
CC1=CC=CC=C1S(=O)(=O)NC(=O)C2=CC(=C(C=C2)CC3=CN(C4=C3C=C(C=C4)NC(=O)OC5CCCC5)C)OC
CCCNC(=O)NS(=O)(=O)C1=CC=C(C=C1)Cl
CN1CC[C@@]23CCCC[C@@H]2[C@@H]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


CC1=C(C(=CC=C1)[C@H](C)C2=CN=CN2)C
COC1=CC2=C(C=CN=C2C=C1)[C@H]([C@@H]3C[C@@H]4CCN3C[C@@H]4C=C)O
COC1=CC=C(C=C1)CCN2CCC(CC2)NC3=NC4=CC=CC=C4N3CC5=CC=C(C=C5)F
CC(C1=CC(=CC=C1)OC2=CC=CC=C2)C(=O)O
C[C@@H](CC1=CC=CC=C1)NC(=O)[C@H](CCCCN)N
No SMILE found for DRUGBANK_ID DB01269
CC1=CC(=NO1)C(=O)NNCC2=CC=CC=C2
CCOC(=O)C1=C(NC(=C(C1C2=C(C(=CC=C2)Cl)Cl)C(=O)OC)C)C
CCCCC(C)(C/C=C/[C@H]1[C@@H](CC(=O)[C@@H]1CCCCCCC(=O)OC)O)O
CC#CCN1C2=C(N=C1N3CCC[C@H](C3)N)N(C(=O)N(C2=O)CC4=NC5=CC=CC=C5C(=N4)C)C
CC1=CC2=C(N1)C=CC=C2OCC(CNC(C)(C)C)OC(=O)C3=CC=CC=C3
No SMILE found for DRUGBANK_ID DB00060
No SMILE found for DRUGBANK_ID DB00099
CN1C=NC2=C1C(=O)N(C(=O)N2C)C
COC1=CC(=C(C=C1)OC)C(CNC(=O)CN)O
CC[C@@]1(C[C@@H]2C[C@@](C3=C(CCN(C2)C1)C4=CC=CC=C4N3)(C5=C(C=C6C(=C5)[C@]78CCN9[C@H]7[C@@](C=CC9)([C@H]([C@@]([C@@H]8N6C)(C(=O)N)O)O)CC)OC)C(=O)OC)O
C1=CN(C=N1)CC(O)(P(=O)(O)O)P(=O)(O)O
CCC(=O)N(C1=CC=CC=C1)C2(CCN(CC2)CCC3=CC=CS3)COC
CC(C)[N+](C)(CCOC(=O)C1C2=CC=CC=C2OC3=CC=CC=C13)C(C)C
C1[C@H]([C@@H]([C@H]([C@@H](O

### calculate_molecular_descriptors

In [11]:
def calculate_molecular_descriptors(smiles):
    """
    Calculate molecular descriptors for a given SMILES representation of a molecule.

    Parameters:
        smiles (str): The SMILES representation of the molecule.

    Returns:
        The result object produced by calling the mordred ``Calculator`` on the
        parsed molecule. Callers in this file iterate it with ``.items()`` to
        obtain (descriptor, value) pairs.

    Raises:
        ValueError: If RDKit cannot parse ``smiles`` into a molecule.
    """

    # Create an RDKit Molecule object from the SMILES representation
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        # MolFromSmiles reports a parse failure by returning None rather than
        # raising; fail here with a clear message instead of handing None to
        # the descriptor calculator.
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    # Create a descriptor calculator (3D descriptors are skipped because the
    # molecule is built from SMILES and carries no 3D coordinates)
    calc = Calculator(descriptors, ignore_3D=True)

    # Calculate the descriptors for the molecule
    return calc(molecule)


### get_uniprot_ac_for_pubchem_substance

In [12]:
# Parsed cross-reference workbooks keyed by path, so the spreadsheet is read
# from disk once per process instead of once per lookup. Edits made to the
# file after the first read are therefore not picked up until the cache is
# cleared (or this cell/module is reloaded).
_CROSSREFS_CACHE = {}


def _load_drugbank_crossrefs(path="drugbank_crossrefs.xlsx"):
    """Return the cross-reference table stored at ``path``, reading the Excel file only on first use."""
    if path not in _CROSSREFS_CACHE:
        _CROSSREFS_CACHE[path] = pd.read_excel(path, engine='openpyxl')
    return _CROSSREFS_CACHE[path]


def get_uniprot_ac_for_pubchem_substance(drugbank_id):
    """
    Look up the PubChem cross-reference stored for a DrugBank ID.

    The value is read from the ``UNIPROT_AC`` column of ``drugbank_crossrefs.xlsx``.
    Rows whose ``FIELD_VALUE`` is "PubChem Compound" are preferred; rows whose
    ``FIELD_VALUE`` is "PubChem Substance" are used only when no compound row
    exists for the DrugBank ID.

    Parameters:
        drugbank_id (str): The DrugBank ID of the substance.

    Returns:
        tuple: ``(uniprot_ac, use_substance)`` where

            * ``uniprot_ac`` is the ``UNIPROT_AC`` value of the first matching
              row, or None when there is no matching row or the cell is empty;
            * ``use_substance`` is True when no "PubChem Compound" row exists,
              i.e. the value (if any) came from a "PubChem Substance" row.
    """

    # Load (or reuse) the Excel table containing the DrugBank cross-references
    df = _load_drugbank_crossrefs()
    is_requested_drug = df["DRUGBANK_ID"] == drugbank_id

    # Prefer the "PubChem Compound" cross-reference ...
    matches = df[is_requested_drug & (df["FIELD_VALUE"] == "PubChem Compound")]
    use_substance = matches.empty
    if use_substance:
        # ... and fall back to the "PubChem Substance" cross-reference
        matches = df[is_requested_drug & (df["FIELD_VALUE"] == "PubChem Substance")]

    if matches.empty:
        return None, use_substance

    # Get the value of UNIPROT_AC from the first matching row
    uniprot_ac = matches["UNIPROT_AC"].iloc[0]
    if pd.isna(uniprot_ac):
        # An empty spreadsheet cell is read as NaN, which is truthy; report it
        # as "no value" so callers testing truthiness do not treat it as an ID.
        uniprot_ac = None

    return uniprot_ac, use_substance


### create_table_for_descriptor

In [13]:
def create_table_for_descriptor(cursor, drugbank_id):
    """
    Create a table to store molecular descriptors for a specific DrugBank ID.

    The table is named after the DrugBank ID and has two columns:
    ``descriptor_name`` (TEXT, primary key) and ``descriptor_value`` (REAL).
    Nothing happens if a table with that name already exists.

    Parameters:
        cursor (sqlite3.Cursor): The cursor object to execute SQLite commands.
        drugbank_id (str): The DrugBank ID for which the table needs to be created.

    Returns:
        None
    """

    # A table name cannot be bound as a SQL parameter, so it has to be
    # interpolated. Doubling any embedded double quote keeps the whole value a
    # single quoted identifier instead of letting it terminate the name early.
    table_identifier = str(drugbank_id).replace('"', '""')

    # Create a table for the given DrugBank ID if it does not exist yet
    cursor.execute(
        f"""
    CREATE TABLE IF NOT EXISTS "{table_identifier}" (
        descriptor_name TEXT PRIMARY KEY,
        descriptor_value REAL
    );
    """
    )


### update_descriptors_column

In [14]:
def update_descriptors_column(db_file_initial, db_file_final, log_file=None):
    """
    Compute molecular descriptors for the drugs of the initial SQLite database
    and store them in the final SQLite database.

    For every table of the initial database, the DRUGBANK_ID of its first row
    is taken as the table's representative. Its PubChem cross-reference is
    looked up, the compound is fetched from PubChem, descriptors are computed
    from the compound's isomeric SMILES, and they are written to a table named
    after the DrugBank ID in the final database (one row per descriptor).

    Drugs without a cross-reference, or whose PubChem lookup fails, are
    reported on stdout and in the log and then skipped.

    Parameters:
        db_file_initial (str): The file path of the initial SQLite database containing DrugBank data.
        db_file_final (str): The file path of the final SQLite database to store molecular descriptors.
        log_file (file-like, optional): Writable text stream receiving the error
            messages. Defaults to the module-level ``file`` handle opened by the
            script entry point.

    Returns:
        None
    """
    from contextlib import closing

    # Keep the historical behaviour (module-level ``file``) unless a stream is passed in.
    log = file if log_file is None else log_file

    # ``closing`` releases both connections even if a lookup or a descriptor
    # calculation raises part-way through the run.
    with closing(sqlite3.connect(db_file_initial)) as conn, \
            closing(sqlite3.connect(db_file_final)) as conn_final:
        cursor = conn.cursor()
        cursor_final = conn_final.cursor()

        # Get the names of all tables in the initial database
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        table_names = cursor.fetchall()

        # Iterate over all tables in the initial database
        for (table_name,) in table_names:
            # Identifiers cannot be bound as parameters; double embedded quotes
            # so the name stays a single quoted identifier.
            quoted_table = table_name.replace('"', '""')

            # Get the representative DRUGBANK_ID for this table
            cursor.execute(f'SELECT DRUGBANK_ID FROM "{quoted_table}" LIMIT 1;')
            first_row = cursor.fetchone()
            if not first_row:
                # Empty table: nothing to process
                continue
            drugbank_id_representative = first_row[0]

            # Get the PubChem cross-reference for the representative DRUGBANK_ID
            pubchem_id, use_substance = get_uniprot_ac_for_pubchem_substance(drugbank_id_representative)

            # NaN (an empty spreadsheet cell) is truthy, so it is checked explicitly.
            if not pubchem_id or pd.isna(pubchem_id):
                log.write(f"\nNo UNIPROT_AC value found for DRUGBANK_ID {drugbank_id_representative}")
                print(f"No UNIPROT_AC value found for DRUGBANK_ID {drugbank_id_representative}.")
                continue

            try:
                if use_substance:
                    # The cross-reference is a substance ID: resolve it to its standardized compound ID
                    pubchem_id = pcp.Substance.from_sid(pubchem_id).standardized_cid

                compound = pcp.Compound.from_cid(pubchem_id)
            except Exception as e:
                # Best-effort: any failure of the PubChem lookup skips this drug.
                # The cause is kept in the log so network problems can be told
                # apart from genuinely missing records.
                log.write(f"\nNo SMILE found for DRUGBANK_ID {drugbank_id_representative}: {e!r}")
                print(f"No SMILE found for DRUGBANK_ID {drugbank_id_representative}")
                continue

            smiles = compound.isomeric_smiles
            print(smiles)
            drug_descriptors = calculate_molecular_descriptors(smiles)

            # Create a table for the representative DRUGBANK_ID if it does not exist yet in the final database
            create_table_for_descriptor(cursor_final, drugbank_id_representative)

            # Insert or update the descriptors in the corresponding table.
            # Names and values are stored via str(), as before.
            quoted_drug = str(drugbank_id_representative).replace('"', '""')
            insert_query = (
                f'INSERT OR REPLACE INTO "{quoted_drug}" '
                "(descriptor_name, descriptor_value) VALUES (?, ?);"
            )
            cursor_final.executemany(
                insert_query,
                ((str(name), str(value)) for name, value in drug_descriptors.items()),
            )

            # Commit per drug so that a failure later in the run does not
            # discard the descriptors already computed; the upserts make a
            # re-run safe.
            conn_final.commit()
