# Properties of drugs
Find various properties of the individual drugs  
  
1.) ATC  
2.) GO Annotations  
3.) Disease   
4.) KeGG Pathways  
5.) SIDER (known effects)  
6.) Offsides (known off-label side effects)  
7.) TwoSides  
8.) Drug Properties (physico-chemical properties)  
9.) Enzymes, Transporters and Carriers  
10.) Chemical_Genetic Perturbations (MSigDB)

## 1. ATC 
Extract information about the anatomical as well as the therapeutic group a drug is associated with, using DrugBank as the main source

In [None]:
import networkx as nx

# Load the ATC classification from DrugBank (see notebook: 2a_Create_DrugBank_Network.ipynb)
DrugBankInfo = nx.read_gml('../data/Drug_Properties/Drugbank_2018-07-03_CLOUD_Only.gml')
print('DrugBank Network loaded')

# Dictionary containing DrugBank to CLOUD identifier
DrugBank_to_CLOUD = {}

# Parse through all CLOUD drugs and check for ATC code annotations in DrugBank.
# Only the first level (one character) and the second level (first three
# characters) are kept; third level and below are too specific.
# Both files are handled by `with` so they are closed even if a row fails.
with open('../data/Drug_Properties/CLOUD_DrugBank_PubChem_Chembl.csv', 'r') as fp, \
        open('../results/Drug_Properties/CLOUD_to_ATC.csv', 'w') as fp_out:
    fp_out.write('CLOUD,DrugBankID,First_Level_ATCs,Second_Level_ATCs\n')

    next(fp)  # skip the header row
    for line in fp:
        tmp = line.strip().split(',')
        DrugBank_to_CLOUD[tmp[1]] = tmp[0]

        first_level = set()
        first_second_level = set()
        if DrugBankInfo.has_node(tmp[1]) and 'ATCcode' in DrugBankInfo.node[tmp[1]]:
            for atc in DrugBankInfo.node[tmp[1]]['ATCcode'].split(','):
                atc = atc.strip()
                # Skip empty / whitespace-only entries; indexing atc[0] on
                # them would raise an IndexError.
                if not atc:
                    continue
                first_level.add(atc[0])
                first_second_level.add(atc[0:3])

        # Sort the codes so the output file is reproducible between runs.
        fp_out.write(tmp[0] + ',' + tmp[1] + ','
                     + ';'.join(sorted(first_level)) + ','
                     + ';'.join(sorted(first_second_level)) + '\n')

print('Finished ATC annotations')

## 2. GO Annotations
Extract GO annotations from GeneOntology for the targets of the individual drugs. Not only leaf but also upstream term information is collected for the three branches (i) Function, (ii) Component, (iii) Process

In [None]:
# The in-house database and its helper module build the upward ontology for
# every leaf GO term (all upstream terms get included).
# Download (http://www.geneontology.org/page/downloads)
import gene2terms_addupstream as GO

# All three GO branches are processed
go_branches = ['Function', 'Process', 'Component']

# Read the targets of every CLOUD drug (column 0: CLOUD id, column 2: ';'-separated targets)
cloud_targets = {}
fp = open('../data/Drug_Properties/CLOUD_All_Targets.csv', 'r')
next(fp)  # header row
for row in fp:
    fields = row.strip().split(',')
    cloud_targets[fields[0]] = fields[2].split(';')
fp.close()

# Sorted list of all CLOUD identifiers
all_clouds = sorted(cloud_targets.keys())

# Per branch, collect the GO terms of a drug via: Drug --> Targets --> Associated GO-Terms
drug_to_GO = {}
for go_branch in go_branches:
    print(go_branch)
    drug_to_GO[go_branch] = {}
    GO_Association_UP, GO_genes_annotation = GO.getAllGene_Annotation(go_branch)

    for drug in all_clouds:
        collected_terms = []
        for target in cloud_targets[drug]:
            collected_terms.extend(GO_Association_UP[target])
        # De-duplicate the terms gathered over all targets of this drug
        drug_to_GO[go_branch][drug] = list(set(collected_terms))

# Write one row per CLOUD drug with its GO terms for each branch
fp_out = open('../results/Drug_Properties/CLOUD_to_GOterms.csv', 'w')
fp_out.write('CLOUD,GO_Function,GO_Process,GO_Component\n')
for cloud in all_clouds:
    function_terms = ';'.join(drug_to_GO['Function'][cloud])
    process_terms = ';'.join(drug_to_GO['Process'][cloud])
    component_terms = ';'.join(drug_to_GO['Component'][cloud])
    fp_out.write(','.join([cloud, function_terms, process_terms, component_terms]) + '\n')
fp_out.close()

print('Finished GO')

## 3. Diseases
Extract disease annotations from DiseaseOntology for the targets of the individual drugs. Not only leaf but also upstream term information is collected.

In [None]:
# Download from http://www.disgenet.org/web/DisGeNET/menu/downloads and http://disease-ontology.org/downloads/
# Again use inhouse database (manually curated), and corresponding scripts
import gene2terms_addupstream as GO

# Get all cloud drug targets (column 0: CLOUD id, column 2: ';'-separated targets).
# The dictionary is created here so the cell does not depend on a previous cell.
cloud_targets = {}
with open('../data/Drug_Properties/CLOUD_All_Targets.csv', 'r') as fp:
    next(fp)  # skip the header row
    for line in fp:
        tmp = line.strip().split(',')
        cloud_targets[tmp[0]] = tmp[2].split(';')

all_clouds = sorted(cloud_targets.keys())

# Extract the upward disease ontology (all disease associated leaf plus upstream ontology terms for a specific gene)
Disease_Association_UP, d_diseases_annotation = GO.getAllGene_Disease_Annotation()

# Gene identifiers are sorted numerically for the output file
all_proteins = sorted(int(x) for x in Disease_Association_UP.keys())

with open('../results/Drug_Properties/Gene_to_Disease.csv', 'w') as fp_out:
    fp_out.write('Gene,Disease_ID\n')
    for protein in all_proteins:
        fp_out.write(str(protein) + ',' + ';'.join(Disease_Association_UP[str(protein)]) + '\n')

# NOTE: a bare `break` statement used to follow here. Outside of a loop that is
# a SyntaxError, which prevented the whole cell (including the part above) from
# being compiled and run. It has been removed so the drug annotation below runs.

# Associate drugs with diseases via: Drug --> Targets --> Diseases
drug_to_Diseases = {}
for drug in all_clouds:
    diseases = set()
    for target in cloud_targets[drug]:
        # Targets without any disease annotation contribute nothing
        # instead of aborting the run with a KeyError.
        diseases.update(Disease_Association_UP.get(target, []))
    drug_to_Diseases[drug] = list(diseases)

with open('../results/Drug_Properties/CLOUD_to_Disease.csv', 'w') as fp_out:
    fp_out.write('CLOUD,Disease_ID\n')
    for cloud in all_clouds:
        fp_out.write(cloud + ',' + ';'.join(drug_to_Diseases[cloud]) + '\n')

print('Finished Diseases')

## 4. KeGG Pathways
Extract information about pathways being annotated to (i) the drug itself, as well as (ii) pathways associated to the target of drugs

In [None]:
'''
Extract direct drug <--> pathway annotations
'''

# Get KeGG pathways via the biopython.KEGG REST
from Bio.KEGG import REST

# Find the KeGG identifiers via the drugbank annotations
DrugBankInfo = nx.read_gml('../data/Drug_Properties/Drugbank_2018-07-03_CLOUD_Only.gml')
print('DrugBank Network loaded')

drug_to_pathways = {}          # CLOUD id -> list of pathway ids listed on the KeGG drug entry
all_targeted_Pathways = set()  # every pathway id seen for any drug
all_clouds = []                # CLOUD ids in file order (sorted at the end)
kegg_IDs = {}                  # CLOUD id -> KeGG drug id

# Parse through all CLOUD drugs; for each one with a KeGG drug id, fetch the
# KeGG entry and read its PATHWAY section (direct drug to pathway).
with open('../data/Drug_Properties/CLOUD_DrugBank_PubChem_Chembl.csv', 'r') as fp:
    next(fp)  # skip the header row
    for line in fp:
        tmp = line.strip().split(',')

        drug_to_pathways[tmp[0]] = []
        all_clouds.append(tmp[0])

        if not (DrugBankInfo.has_node(tmp[1]) and 'KEGGDrug' in DrugBankInfo.node[tmp[1]]):
            continue

        kegg_ID = DrugBankInfo.node[tmp[1]]['KEGGDrug']
        kegg_IDs[tmp[0]] = kegg_ID
        drug_file = REST.kegg_get(kegg_ID).read()

        # Reset per entry so a section name never carries over from the
        # previously processed drug.
        current_section = None
        for entry_line in drug_file.rstrip().split("\n"):
            section = entry_line[:12].strip()  # section names are within 12 columns
            if section != "":
                current_section = section
            if current_section == "PATHWAY":
                # Take the text before the first double space, then drop
                # anything from the first '(' onwards.
                pathwayID = entry_line[12:].split('  ')[0].split('(')[0]
                drug_to_pathways[tmp[0]].append(pathwayID)
                all_targeted_Pathways.add(pathwayID)

print('Number of pathways directed targeted: %d' % len(all_targeted_Pathways))

all_clouds.sort()


In [None]:
'''
Additonally to finding the direct annotations, also find drug <--> targets <--> pathways associated to those target annotations
'''
# NOTE: this cell continues the previous KeGG cell; it uses `REST`, `all_clouds`,
# `kegg_IDs` and `drug_to_pathways` defined there.

# Get all targets (column 0: CLOUD id, column 2: ';'-separated targets)
cloud_targets = {}
with open('../data/Drug_Properties/CLOUD_All_Targets.csv', 'r') as fp:
    next(fp)  # skip the header row
    for line in fp:
        tmp = line.strip().split(',')
        cloud_targets[tmp[0]] = tmp[2].split(';')

# Find human pathways
human_pathways = REST.kegg_list("pathway", "hsa").read()

# Register every human pathway; the placeholder value is replaced by the
# pathway's gene id list in the loop below.
pathways = {}
for line in human_pathways.rstrip().split("\n"):
    entry, description = line.split("\t")
    pathways[entry] = {'Description': description, 'IDs': None, 'Symbols': None}

print(len(pathways))

# Get the genes for each pathway
for pathway in list(pathways.keys()):
    pathway_file = REST.kegg_get(pathway).read()  # query and read each pathway

    # Iterate through the KEGG pathway file, keeping track of which section
    # of the file we're in; only the GENE section is read.
    current_section = None
    genesIDs = []
    seen_gene_ids = set()  # O(1) duplicate check, genesIDs keeps the order
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section != "":
            current_section = section

        if current_section == "GENE" and ';' in line:
            # Only the part before the first ';' holds "<gene id> <symbol>".
            # Splitting once avoids a ValueError when the description itself
            # contains another "; ".
            identifiers = line[12:].split(';', 1)[0].split()
            if not identifiers:
                continue
            gene_id = identifiers[0]
            if gene_id not in seen_gene_ids:
                seen_gene_ids.add(gene_id)
                genesIDs.append(gene_id)

    pathways[pathway] = genesIDs

# Build the gene sets once instead of once per (drug, pathway) pair
pathway_gene_sets = dict((p, set(genes)) for p, genes in pathways.items())

via_target_assigned_Pathways = {}
second_assigned_pathways = set()
for cloud in all_clouds:
    # Drugs that are missing from the target file simply have no targets
    targets = set(cloud_targets.get(cloud, []))
    via_target_assigned_Pathways[cloud] = [p for p in pathways if targets & pathway_gene_sets[p]]
    second_assigned_pathways.update(via_target_assigned_Pathways[cloud])

print('Number of pathways indirected targeted: %d' % len(second_assigned_pathways))

with open('../results/Drug_Properties/CLOUD_to_KeGG_Pathways.csv', 'w') as fp_out:
    fp_out.write('CLOUD,KeGG_DrugID,KeGG_Assigned_Pathways,Via_Target_Assigned\n')
    for cloud in all_clouds:
        # The KeGG drug id column stays empty when DrugBank lists none
        fp_out.write(cloud + ',' + kegg_IDs.get(cloud, '') + ','
                     + ';'.join(drug_to_pathways[cloud]) + ','
                     + ';'.join(via_target_assigned_Pathways[cloud]) + '\n')

print('Finished Pathways')

## 5. SIDER
Extract information about known adverse reactions of drugs using the SIDER database

In [None]:
def ATC_To_PubChem(isOffsides = 'None'):
    '''
    Map the CLOUD drugs (DrugBank ids) to PubChem ids via their ATC codes.

    SIDER offers a direct conversion from ATC code to the internally used
    PubChem id, which offers a better coverage.

    Download: http://sideeffects.embl.de/download/ [Nov. 2018] drug_atc.tsv file
    (here named: Pubchem_To_ATC)

    Parameters:
        isOffsides: when equal to 'offsides', the first character of every
            PubChem id is replaced by '0'; any other value leaves the id
            unchanged.

    Returns:
        (cloud_to_Pubchem, PubChem_to_cloud, found_PubChems) where
        cloud_to_Pubchem maps DrugBank id -> PubChem id, PubChem_to_cloud maps
        PubChem id -> DrugBank id and found_PubChems is the list of all
        PubChem ids found (one entry per matching ATC code, so it may contain
        duplicates).
    '''

    # ATC code -> identifier from the first column of the mapping file
    dic_ATc_To_Pubchem = {}
    with open('../data/Drug_Properties/Pubchem_To_ATC.tsv') as fp:
        for line in fp:
            tmp = line.strip().split('\t')
            # Blank or incomplete rows carry no mapping
            if len(tmp) < 2:
                continue
            dic_ATc_To_Pubchem[tmp[1]] = tmp[0]

    cloud_drugs = nx.read_gml('../data/Drug_Properties/Drugbank_2018-07-03_CLOUD_Only.gml')

    # Find pubchem identifiers via ATC identifiers (as pubchem identifiers are
    # sometimes not unique, neither SID nor CID)
    cloud_to_Pubchem = {}
    PubChem_to_cloud = {}
    found_PubChems = []
    for drugBankID in cloud_drugs.nodes():
        if 'ATCcode' not in cloud_drugs.node[drugBankID]:
            continue

        all_codes = [x.strip() for x in cloud_drugs.node[drugBankID]['ATCcode'].split(',') if x.strip()]
        for code in all_codes:
            if code not in dic_ATc_To_Pubchem:
                continue

            # Drop the three-character prefix of the identifier
            pubChemID = dic_ATc_To_Pubchem[code][3:]
            if isOffsides == 'offsides':
                # Replace the leading character by '0'
                pubChemID = '0' + pubChemID[1:]

            cloud_to_Pubchem[drugBankID] = pubChemID
            PubChem_to_cloud[pubChemID] = drugBankID
            found_PubChems.append(pubChemID)

    return cloud_to_Pubchem, PubChem_to_cloud, found_PubChems


In [None]:
'''
Download SIDER.tsv from http://sideeffects.embl.de/download/ [Nov. 2018] 
'''

# Get the different identifiers of a drug
DrugBank_To_CLOUD = {}
CLOUD_To_DrugBank = {}
all_clouds = []
with open('../data/Drug_Properties/CLOUD_DrugBank_PubChem_Chembl.csv') as fp:
    next(fp)  # skip the header row
    for line in fp:
        tmp = line.strip().split(',')
        all_clouds.append(tmp[0])
        DrugBank_To_CLOUD[tmp[1]] = tmp[0]
        CLOUD_To_DrugBank[tmp[0]] = tmp[1]

all_clouds.sort()

# Extract pubchem identifiers via ATC codes
DrugBank_to_Pubchem_viaATC, PubChem_to_cloud_viaATC, found_PubChems_viaATC = ATC_To_PubChem()

# Further use drugbank to find additional pubchem identifiers for the cloud drugs
cloud_drugs = nx.read_gml('../data/Drug_Properties/Drugbank_2018-07-03_CLOUD_Only.gml')

# Associate cloud drugs with the different pubchem identifiers
pubchemCompound_To_DrugBank = {}
DrugBank_to_PubChem = {}
pubchemCompound = []
pubchemSubstance = []
for node in cloud_drugs.nodes():
    if 'PubChemCompound' in cloud_drugs.node[node]:
        compound_id = cloud_drugs.node[node]['PubChemCompound']
        # NOTE(review): the id is used verbatim here, whereas the Offsides and
        # TwoSides cells zero-pad it to 9 digits before comparing - confirm
        # that this matches the id format in SIDER.tsv.
        pubchemCompound.append(compound_id)
        pubchemCompound_To_DrugBank[compound_id] = node
        DrugBank_to_PubChem[node] = compound_id

# Combine both dictionaries (ATC-derived ids take precedence)
DrugBank_to_PubChem.update(DrugBank_to_Pubchem_viaATC)

# Check the SIDER database for side effects of a given drug (once via the ATC
# to pubchem identifiers; once via drugbank to pubchem). The dictionaries are
# used for the membership test: they hold exactly the ids collected in
# found_PubChems_viaATC / pubchemCompound but allow O(1) lookups.
compund_sideEffect = {}
with open('../data/Drug_Properties/SIDER.tsv', 'r') as fp:
    for line in fp:
        tmp = line.strip().split('\t')
        side_effect = tmp[3]
        # Columns 1 and 2 both hold an id; the first three characters are a prefix
        for pubchem_id in (tmp[1][3:], tmp[2][3:]):
            for id_to_drugbank in (PubChem_to_cloud_viaATC, pubchemCompound_To_DrugBank):
                if pubchem_id in id_to_drugbank:
                    compund_sideEffect.setdefault(id_to_drugbank[pubchem_id], []).append(side_effect)

##
# Save results
##

with open('../results/Drug_Properties/CLOUD_to_SIDER.csv', 'w') as fp:
    fp.write('CLOUD,PubChem,SIDER_Ids\n')
    for key in all_clouds:
        drugbank_id = CLOUD_To_DrugBank[key]
        # Unique side effects, sorted so the output is reproducible; drugs
        # without PubChem id or side effects get empty columns.
        side_effects = sorted(set(compund_sideEffect.get(drugbank_id, [])))
        fp.write(key + ',' + DrugBank_to_PubChem.get(drugbank_id, '') + ',' + ';'.join(side_effects) + '\n')

print('Finish with SIDER')

## 6. Offsides
Extract information about known adverse reactions of drugs using the Offsides database (Tatonetti)

In [None]:
'''
Download Offsides.tsv from http://tatonettilab.org/resources/tatonetti-stm.html [Nov. 2018] 
'''
# Get the different identifiers of a drug. The list of CLOUD ids is built here
# as well, so the output loop does not rely on a list left over from another cell.
DrugBank_To_CLOUD = {}
CLOUD_To_DrugBank = {}
all_clouds = []
with open('../data/Drug_Properties/CLOUD_DrugBank_PubChem_Chembl.csv') as fp:
    next(fp)  # skip the header row
    for line in fp:
        tmp = line.strip().split(',')
        all_clouds.append(tmp[0])
        DrugBank_To_CLOUD[tmp[1]] = tmp[0]
        CLOUD_To_DrugBank[tmp[0]] = tmp[1]

all_clouds.sort()

# Extract pubchem identifiers via ATC codes
DrugBank_to_Pubchem_viaATC, PubChem_to_cloud_viaATC, found_PubChems_viaATC = ATC_To_PubChem('offsides')

# Further use drugbank to find additional pubchem identifiers for the cloud drugs
cloud_drugs = nx.read_gml('../data/Drug_Properties/Drugbank_2018-07-03_CLOUD_Only.gml')

# Associate cloud drugs with the different pubchem identifiers
pubchemCompound_To_DrugBank = {}
DrugBank_to_PubChem = {}
pubchemCompound = []
pubchemSubstance = []
for node in cloud_drugs.nodes():
    if 'PubChemCompound' in cloud_drugs.node[node]:
        # Zero-pad to 9 digits before comparing with the ids in Offsides.tsv
        compound_id = cloud_drugs.node[node]['PubChemCompound'].zfill(9)
        pubchemCompound.append(compound_id)
        pubchemCompound_To_DrugBank[compound_id] = node
        DrugBank_to_PubChem[node] = compound_id

# Combine both dictionaries (ATC-derived ids take precedence)
DrugBank_to_PubChem.update(DrugBank_to_Pubchem_viaATC)

# Check the OFFSIDES database for side effects of a given drug (once via the
# ATC to pubchem identifiers; once via drugbank to pubchem). The dictionaries
# hold exactly the ids collected in found_PubChems_viaATC / pubchemCompound
# and give O(1) membership tests.
compund_sideEffect = {}
with open('../data/Drug_Properties/Offsides.tsv', 'r') as fp:
    next(fp)  # skip the header row
    for line in fp:
        tmp = line.strip().split('\t')

        # Fields are quoted; the id additionally carries a 3-character prefix
        id1 = tmp[0].replace('"', '')[3:]
        sideEffect = tmp[2].replace('"', '')

        for id_to_drugbank in (PubChem_to_cloud_viaATC, pubchemCompound_To_DrugBank):
            if id1 in id_to_drugbank:
                compund_sideEffect.setdefault(id_to_drugbank[id1], []).append(sideEffect)

with open('../results/Drug_Properties/CLOUD_to_Offsides.csv', 'w') as fp:
    fp.write('CLOUD,PubChem,OFFSIDE_Ids\n')
    for key in all_clouds:
        drugbank_id = CLOUD_To_DrugBank[key]
        # Unique side effects, sorted so the output is reproducible; drugs
        # without PubChem id or side effects get empty columns.
        side_effects = sorted(set(compund_sideEffect.get(drugbank_id, [])))
        fp.write(key + ',' + DrugBank_to_PubChem.get(drugbank_id, '') + ',' + ';'.join(side_effects) + '\n')

print('Finish with OFFSIDES')

## 7. TwoSides
Extract information about side effects for drug combinations using TwoSides (Tatonetti)

In [None]:
'''
Download TwoSides.tsv from http://tatonettilab.org/resources/tatonetti-stm.html [Nov. 2018] 
'''
# Get the different identifiers of a drug
DrugBank_To_CLOUD = {}
CLOUD_To_DrugBank = {}
with open('../data/Drug_Properties/CLOUD_DrugBank_PubChem_Chembl.csv') as fp:
    next(fp)  # skip the header row
    for line in fp:
        tmp = line.strip().split(',')
        DrugBank_To_CLOUD[tmp[1]] = tmp[0]
        CLOUD_To_DrugBank[tmp[0]] = tmp[1]

# Extract pubchem identifiers via ATC codes
DrugBank_to_Pubchem_viaATC, PubChem_to_cloud_viaATC, found_PubChems_viaATC = ATC_To_PubChem('offsides')

# Further use drugbank to find additional pubchem identifiers for the cloud drugs
cloud_drugs = nx.read_gml('../data/Drug_Properties/Drugbank_2018-07-03_CLOUD_Only.gml')

pubchemCompound_To_DrugBank = {}
DrugBank_to_PubChem = {}
pubchemCompound = []
pubchemSubstance = []
for node in cloud_drugs.nodes():
    if 'PubChemCompound' in cloud_drugs.node[node]:
        # Zero-pad to 9 digits before comparing with the ids in TwoSides.tsv
        compound_id = cloud_drugs.node[node]['PubChemCompound'].zfill(9)
        pubchemCompound.append(compound_id)
        pubchemCompound_To_DrugBank[compound_id] = node
        DrugBank_to_PubChem[node] = compound_id

# Combine both dictionaries (ATC-derived ids take precedence)
DrugBank_to_PubChem.update(DrugBank_to_Pubchem_viaATC)


def _pubchem_to_drugbank(pubchem_id):
    '''Return the DrugBank id for a PubChem id (ATC-derived mapping first), or None if unknown.'''
    if pubchem_id in PubChem_to_cloud_viaATC:
        return PubChem_to_cloud_viaATC[pubchem_id]
    if pubchem_id in pubchemCompound_To_DrugBank:
        return pubchemCompound_To_DrugBank[pubchem_id]
    return None


# Check the TwoSides database for side effects of drug pairs (ids resolved once
# via the ATC to pubchem identifiers; once via drugbank to pubchem). Every pair
# of known drugs becomes an edge whose 'SideEffect' attribute is the
# comma-separated list of side effects reported for that pair.
TwoSide_Network = nx.Graph()
with open('../data/Drug_Properties/TwoSides.tsv', 'r') as fp:
    next(fp)  # skip the header row
    for line in fp:
        tmp = line.strip().split('\t')

        # Columns 0 and 1 hold the two drug ids (3-character prefix), column 4 the side effect
        id1 = tmp[0][3:]
        id2 = tmp[1][3:]
        sideEffect = tmp[4]

        found_id1 = _pubchem_to_drugbank(id1)
        if found_id1 is None:
            continue
        found_id2 = _pubchem_to_drugbank(id2)
        if found_id2 is None:
            continue

        if not TwoSide_Network.has_edge(found_id1, found_id2):
            TwoSide_Network.add_edge(found_id1, found_id2)
            TwoSide_Network[found_id1][found_id2]['SideEffect'] = sideEffect
        else:
            TwoSide_Network[found_id1][found_id2]['SideEffect'] += ',' + sideEffect

nx.write_gml(TwoSide_Network, '../results/Drug_Properties/TwoSide_CLOUDs.gml')

print('Finish with TwoSides')


## 8. Drug Properties
Extract Physicochemical properties of the drugs e.g. Lipinski Rule of 5, LogS, LogP etc. Use DrugBank as main source of information

In [None]:
'''
Physicochemical properties (calculated) offered by DrugBank
'''

# List of interesting physicochemical properties (continuous)
Continuesfeatures = ['Polarizability','logS','logP','NumberofRings','PhysiologicalCharge',
            'PolarSurfaceAreaPSA','pKastrongestbasic','pKastrongestacidic',
            'Refractivity','MonoisotopicWeight','HBondDonorCount',
            'RotatableBondCount','WaterSolubility']

# List of interesting physicochemical properties (discrete)
discreteFeatures = ['DrugSubClass','DrugClass','Family']

# Drugbank file
DrugBankInfo = nx.read_gml('../data/Drug_Properties/Drugbank_2018-07-03_CLOUD_Only.gml')
print('DrugBank Network loaded')

# Parse through all cloud drugs and find their physicochemical properties.
# Every drug gets an entry for 'DrugBankID' and for each feature; anything
# that is not available is stored as the string 'None'.
CLOUD_Chemical_properties = {}
all_clouds = []
with open('../data/Drug_Properties/CLOUD_DrugBank_PubChem_Chembl.csv', 'r') as fp:
    next(fp)  # skip the header row
    for line in fp:
        tmp = line.strip().split(',')
        all_clouds.append(tmp[0])

        if DrugBankInfo.has_node(tmp[1]):
            node_data = DrugBankInfo.node[tmp[1]]
            properties = {'DrugBankID': tmp[1]}
        else:
            # Drug unknown to DrugBank: every feature falls back to 'None'
            node_data = {}
            properties = {'DrugBankID': 'None'}

        for feature in Continuesfeatures + discreteFeatures:
            if feature in node_data:
                properties[feature] = str(node_data[feature])
            else:
                properties[feature] = 'None'

        CLOUD_Chemical_properties[tmp[0]] = properties

##
# Save results
##

with open('../results/Drug_Properties/CLOUD_to_ChemicalProperties.tsv', 'w') as fp:
    fp.write('CLOUD\tDrugBankID\t')
    fp.write('\t'.join(Continuesfeatures) + '\t' + '\t'.join(discreteFeatures) + '\n')

    for cloud in all_clouds:
        fp.write(cloud + '\t' + CLOUD_Chemical_properties[cloud]['DrugBankID'])
        for feature in Continuesfeatures + discreteFeatures:
            fp.write('\t' + CLOUD_Chemical_properties[cloud][feature])
        fp.write('\n')

print('Finish with Chemical Properties')


## 9. Targets, Enzymes, Transporters and Carriers
Split the full list of targets into targets, enzymes, transporters and carriers.
To do so, use the DrugBank annotations of what a target, transporter, carrier and enzyme is. Go through all DrugBank targets and take the corresponding annotations.
Then go through the CLOUD targets and assign the targets accordingly. If DrugBank does not show any annotation, the gene is assumed to be a target.

Enzymes: e.g. CYP3A1  
Transporter: e.g. MDR5  
Carriers: e.g. ALB

In [None]:
# Read the DrugBank network from the GML file (the file without the
# "_CLOUD_Only" suffix) and report once loading has finished.
DrugBankInfo = nx.read_gml(
    '../data/Drug_Properties/Drugbank_2018-07-03.gml'
)
print('Full DrugBank Network loaded')

In [None]:
annotated_enzyme_symbols = set()
annotated_transporters_symbols = set()
annotated_carriers_symbols = set()

# Node attribute in DrugBank -> set that collects the symbols found under it
symbol_sets_by_attribute = [
    ('Enzymes', annotated_enzyme_symbols),
    ('Transporters', annotated_transporters_symbols),
    ('Carriers', annotated_carriers_symbols),
]

# Go through all drugs in drugbank and bin the annotated entries into one of
# the three classes. Each attribute is a comma-separated string; of every
# entry only the part before the first '_' is kept.
for drug in list(DrugBankInfo.nodes()):
    annotations = DrugBankInfo.node[drug]
    for attribute, symbol_set in symbol_sets_by_attribute:
        if attribute not in annotations:
            continue
        for entry in annotations[attribute].strip().split(','):
            if entry != '':
                symbol_set.add(entry.split('_')[0])

# Print the number of found Enzymes, Transporters, Carriers
for symbol_set in (annotated_enzyme_symbols, annotated_transporters_symbols, annotated_carriers_symbols):
    print(len(symbol_set))

In [None]:
'''
Parse the enzyme, carriers and transporter SYMBOLS to EntrezIDs using mygeneinfo
'''

import mygene
mg = mygene.MyGeneInfo()


def _symbols_to_entrez(symbols):
    '''
    Look up gene symbols with mygene and keep the hits that have an Entrez gene id.

    Parameters:
        symbols: iterable of gene symbols.

    Returns:
        (found_symbols, found_ids): two parallel lists with the 'symbol' and
        the stringified '_id' of every hit carrying an 'entrezgene' field.
    '''
    # `scopes` (plural) is the keyword documented by mygene for restricting the
    # lookup to the symbol field; the symbols are sorted so the query order is
    # reproducible.
    query = mg.querymany(sorted(symbols), scopes='symbol', species='human', verbose=False)
    found_symbols = []
    found_ids = []
    for result in query:
        if 'entrezgene' in result:
            found_symbols.append(result['symbol'])
            found_ids.append(str(result['_id']))
    return found_symbols, found_ids


# Enzymes
final_annotated_enzyme_symbols, final_annotated_enzyme_IDs = _symbols_to_entrez(annotated_enzyme_symbols)

# Transporters
final_annotated_transporters_symbols, final_annotated_transporters_IDs = _symbols_to_entrez(annotated_transporters_symbols)

# Carriers
final_annotated_carriers_symbols, final_annotated_carriers_IDs = _symbols_to_entrez(annotated_carriers_symbols)


print(len(final_annotated_enzyme_IDs))
print(len(final_annotated_transporters_IDs))
print(len(final_annotated_carriers_IDs))
        
        

In [None]:
'''
Create an output file with the various transporters/enzymes/targets etc. being split.
'''

# Get the DrugBank targets (column 0: CLOUD id, column 2: ';'-separated targets)
cloud_DrugBanktargets = {}
with open('../data/Drug_Properties/CLOUD_DrugBank_Targets_ONLY.csv', 'r') as fp:
    next(fp)  # skip the header row
    for line in fp:
        tmp = line.strip().split(',')
        cloud_DrugBanktargets[tmp[0]] = tmp[2].split(';')

# Get all targets associated to the individual CLOUDS (including CYP etc.)
cloud_targets = {}
with open('../data/Drug_Properties/CLOUD_All_Targets.csv', 'r') as fp:
    next(fp)  # skip the header row
    for line in fp:
        tmp = line.strip().split(',')
        cloud_targets[tmp[0]] = tmp[2].split(';')

# List containing all CLOUD identifiers
all_clouds = sorted(cloud_targets.keys())

# Sets give O(1) membership tests for the classification below
enzyme_ids = set(final_annotated_enzyme_IDs)
transporter_ids = set(final_annotated_transporters_IDs)
carrier_ids = set(final_annotated_carriers_IDs)

# Per drug counts of CLOUD drug annotations
targets_number = []
enzymes_number = []
transporters_number = []
carriers_number = []

# Distinct targets, enzymes etc. targeted by any CLOUD drug
different_targets = set()
different_enzymes = set()
different_transporters = set()
different_carriers = set()

# Total amount of targets found
all_targets = 0

with open('../results/Drug_Properties/CLOUD_to_TargetsSplit.csv', 'w') as fp_out:
    fp_out.write('CLOUD,Targets,Transporters,Enzymes,Carriers\n')

    # Go through all CLOUDS
    for cloud in all_clouds:

        targets = []
        enzymes = []
        carriers = []
        transporters = []

        # Drugs that are missing from the DrugBank target file have no
        # DrugBank-annotated targets.
        drugbank_targets = set(cloud_DrugBanktargets.get(cloud, []))

        for target in cloud_targets[cloud]:
            # An empty target column yields an empty string; it is not a gene
            if not target:
                continue

            # First check if the target is annotated in DrugBank to be a target
            # of this drug (sometimes CYP or others can be main targets).
            if target in drugbank_targets:
                targets.append(target)
                continue

            # Otherwise bin it according to the DrugBank classes; a gene may
            # fall into more than one class.
            is_enzyme = target in enzyme_ids
            is_transporter = target in transporter_ids
            is_carrier = target in carrier_ids
            if is_enzyme:
                enzymes.append(target)
            if is_transporter:
                transporters.append(target)
            if is_carrier:
                carriers.append(target)

            # Genes without any DrugBank class are assumed to be targets
            if not (is_enzyme or is_transporter or is_carrier):
                targets.append(target)

        fp_out.write(cloud + ',' + ';'.join(targets) + ',' + ';'.join(transporters) + ','
                     + ';'.join(enzymes) + ',' + ';'.join(carriers) + '\n')

        # Save the results
        all_targets += len(targets)
        targets_number.append(len(targets))
        enzymes_number.append(len(enzymes))
        transporters_number.append(len(transporters))
        carriers_number.append(len(carriers))

        different_targets.update(targets)
        different_enzymes.update(enzymes)
        different_transporters.update(transporters)
        different_carriers.update(carriers)



In [None]:
'''
CREATE OUTPUT OVERVIEW OVER DRUG TARGETS/ANNOTATIONS
'''


import numpy as np
from matplotlib import pylab as plt


# Per-drug averages of the split target annotations
mean_targets = np.mean(targets_number)
print('Mean number of targets: %.2f' % mean_targets)
print('Median number of targets: %.2f' % np.median(targets_number))
print('Mean number of enzymes: %.2f' % np.mean(enzymes_number))
print('Mean number of carriers: %.2f' % np.mean(carriers_number))
print('Mean number of transporters: %.2f' % np.mean(transporters_number))

# Totals over all CLOUD drugs
print('Total number of targets: %d' % all_targets)
print('Number of distinct targets: %d' % len(different_targets))
print('Number of distinct  enzymes: %d' % len(different_enzymes))
print('Number of distinct  carriers: %d' % len(different_carriers))
print('Number of distinct  transporters: %d' % len(different_transporters))


# Histogram of targets per drug, with the mean marked by a dashed grey line
plt.hist(targets_number, bins=22, color='#40B9D4')
plt.axvline(mean_targets, ls='--', color='grey')
plt.savefig('../results/Drug_Properties/CLOUD_TargetsFiltered.pdf')
plt.close()

## 10. Chemical Genetic perturbations
Use the MSigDB Chemical_Genetic_Perturbations set to annotate the CLOUD targets respectively

In [None]:
'''
Download from http://software.broadinstitute.org/gsea/msigdb/collections.jsp#C5 [December 17. 2018]
'''

# Get all CLOUD targets (column 0: CLOUD id, column 2: ';'-separated targets)
cloud_targets = {}
with open('../data/Drug_Properties/CLOUD_All_Targets.csv', 'r') as fp:
    next(fp)  # skip the header row
    for line in fp:
        tmp = line.strip().split(',')
        cloud_targets[tmp[0]] = tmp[2].split(';')

# Derive the CLOUD ids from the targets just read, so the output loop does not
# depend on a list left over from a previously executed cell.
all_clouds = sorted(cloud_targets.keys())

# Find the gene to perturbation associations (one gene can have various
# associated perturbations). In each tab-separated row, column 0 is used as
# the perturbation name and everything from column 2 onwards as its genes.
gene_to_perturbation = {}
with open('../data/Drug_Properties/Msig_ChemGen_Perturbation.gmt', 'r') as fp:
    for line in fp:
        tmp = line.strip().split('\t')
        for gene in tmp[2:]:
            gene_to_perturbation.setdefault(gene, []).append(tmp[0])


# Find cloud associations via CLOUD --> Targets ===> Perturbations associated with certain targets
with open('../results/Drug_Properties/CLOUD_to_Perturbations.csv', 'w') as fp_out:
    fp_out.write('CLOUD,Perturbations\n')
    for cloud in all_clouds:
        perturbations = []
        for gene in cloud_targets[cloud]:
            # Targets that appear in no gene set contribute nothing
            perturbations.extend(gene_to_perturbation.get(gene, []))
        fp_out.write(cloud + ',' + ';'.join(perturbations) + '\n')