In [1]:
import pandas as pd
from rdflib import Graph, RDF, Namespace

In [2]:
# Namespaces used to mint the IRIs of subjects, predicates/classes and objects.
# NOTE(review): `phenotype` and `compound_type` point at the same host and path
# but differ in scheme (https vs http) -- confirm which one is intended.
compound = Namespace("https://identifiers.org/lincs.smallmolecule:")
compound_type = Namespace("http://purl.obolibrary.org/obo/")
disease = Namespace("https://www.orpha.net/ORDO/")
phenotype = Namespace("https://purl.obolibrary.org/obo/")
relationship = Namespace("https://semanticscience.org/resource/")

# Empty graph that the following cells populate and finally serialize.
g = Graph()

In [3]:
# Register a readable prefix for every namespace used in the graph.
for prefix, namespace in (
    ('compound', compound),
    ('disease', disease),
    ('relationship', relationship),
    ('phenotype', phenotype),
    ('type', compound_type),
):
    g.bind(prefix, namespace)

In [4]:
# Tab-separated mapping table, indexed by its first column; its `hp` column
# is looked up by term name when the GMT file is processed.
defects_map = pd.read_csv(
    'defects_hpo_mappings.tsv',
    sep='\t',
    index_col=0,
)

In [19]:
# Drug metadata table; the `pert_iname`, `pert_id` and `LSM_id` columns are
# used below to build the name -> LSM and id -> LSM lookups.
l1000fwd_meta = pd.read_csv(
    'Drugs_metadata.csv',
)

In [20]:
# pert_iname -> LSM_id (for repeated names the last row wins).
lincs_dict = {
    name: lsm_id
    for name, lsm_id in zip(l1000fwd_meta['pert_iname'], l1000fwd_meta['LSM_id'])
}
# pert_id -> LSM_id (for repeated ids the last row wins).
lincs_dict_brd_lsm = {
    pert_id: lsm_id
    for pert_id, lsm_id in zip(l1000fwd_meta['pert_id'], l1000fwd_meta['LSM_id'])
}

In [21]:
# Small-molecule table fetched over the network, indexed by its first column.
drug_alias_table = pd.read_csv(
    'https://s3.amazonaws.com/lincs-dcic/sigcom-lincs-metadata/LINCS_small_molecules.tsv',
    sep='\t',
    index_col=0,
)

# Rows whose `compound_aliases` value is exactly '-' are left out
# (presumably a "no alias" marker -- confirm against the data source).
is_placeholder = drug_alias_table['compound_aliases'] == '-'
alias_by_index = drug_alias_table.loc[~is_placeholder, 'compound_aliases'].to_dict()

# Invert index -> alias into alias -> index so an alias string can be
# resolved back to the table's index value.
alias_lookup = {alias: index_value for index_value, alias in alias_by_index.items()}

In [42]:
def _resolve_lsm(drug_name):
    """Return the LSM identifier for an already-stripped drug name, or None.

    The name is first looked up directly in ``lincs_dict``; if that does not
    yield a usable identifier, it is resolved through ``alias_lookup`` and
    ``lincs_dict_brd_lsm``. Only string identifiers are returned, so a
    missing (NaN) ``LSM_id`` never ends up as a malformed compound IRI.
    """
    lsm = lincs_dict.get(drug_name)
    if not isinstance(lsm, str):
        # presumably alias_lookup maps an alias to a pert_id-style key of
        # lincs_dict_brd_lsm -- verify against the alias table's index.
        brd = alias_lookup.get(drug_name)
        lsm = lincs_dict_brd_lsm.get(brd)
    return lsm if isinstance(lsm, str) else None


def _add_drug_associations(subject, subject_class, drug_names):
    """Type ``subject`` as ``subject_class`` and link it to every resolvable drug."""
    g.add((subject, RDF.type, subject_class))
    for drug_name in drug_names:
        lsm = _resolve_lsm(drug_name)
        if lsm is not None:
            g.add((subject, relationship['SIO_000983'], compound[lsm]))


# Each line of the GMT file is tab-separated: field 0 is the term name,
# field 1 is skipped, and every remaining field is a drug name.
with open('drugshot_birth_defects.gmt', 'r') as gmt_file:
    for line in gmt_file:
        fields = line.split('\t')
        term = fields[0].strip()
        hp_id = defects_map.hp.get(term)
        # Skip terms without a mapping. A NaN cell is truthy, so a plain
        # truthiness check would let it through and crash on .startswith().
        if not isinstance(hp_id, str) or not hp_id:
            continue

        # Strip once, up front: the last field of a line still carries the
        # trailing newline, which would otherwise make every lookup miss.
        drug_names = [name.strip() for name in fields[2:] if name.strip()]

        if hp_id.startswith('ORPHA'):
            local_id = hp_id.replace(':', '_').replace('ORPHA', 'Orphanet')
            subject = disease[local_id]
            subject_class = relationship['SIO_010299']
        else:
            subject = phenotype[hp_id.replace(':', '_')]
            subject_class = relationship['SIO_010056']

        _add_drug_associations(subject, subject_class, drug_names)

In [43]:
# Write the populated graph to disk in Turtle syntax.
g.serialize(
    destination="Drugshot_BirthDefects_Drug_Associations.ttl",
    format="turtle",
)

<Graph identifier=N083cf396fb9d4bf49087c5d4918eb11e (<class 'rdflib.graph.Graph'>)>