# Create the CLOUD annotation file
### Drug Pairs
Create the annotation file for the ca. 35k CLOUD drug pairs. Each row describes one pair, e.g. whether the two drugs have overlapping transporters, their PPI distances, etc.  
Feature types and data sources:  

1.) Chemical Similarity  
2.) PPI Distances (various metrics)  
3.) MsigDB  
4.) KeGG  
5.) Sider, Offsides and TwoSides (Side effects)  
6.) ATC  
7.) DrugBank (i.e. transporters, enzymes, carriers)   
8.) GO  
9.) Disease
10.) Create final file

--------------

### Single CLOUD drugs
1.) Targets (i.e. num targets)  
2.) PPI  
3.) MsigDB  
4.) KeGG  
5.) GO  
6.) Sideeffects  
7.) ATC  
8.) Drugbank (i.e. transporters, enzymes, carriers)  
9.) Disease  
10.) Create final file


In [1]:
import os
import networkx as nx
import gene2terms_addupstream as GO
import numpy as np

In [2]:
fp = open('../data/Create_CLOUD_Pair_Annotation/Annotations/CLOUD_All_Targets.csv','r')
fp.next()
all_Clouds = set()
CLOUD_Targets = {}
for line in fp:
    tmp = line.strip().split(',')
    all_Clouds.add(tmp[0])
    #CLOUD_Targets[tmp[0]] = tmp[2].split(';')
fp.close()    
all_Clouds = list(all_Clouds)
all_Clouds.sort()
print 'Number of drug pairs: %d' %(len(all_Clouds)*(len(all_Clouds)-1)/2)



DrugBank_To_CLOUD = {}
CLOUD_To_DrugBank = {}
fp = open('../data/Create_CLOUD_Pair_Annotation/Annotations/CLOUD_DrugBank_PubChem_Chembl.csv')
fp.next()
for line in fp:
    tmp = line.strip().split(',')
    DrugBank_To_CLOUD[tmp[1]] = tmp[0]
    CLOUD_To_DrugBank[tmp[0]] = tmp[1]
fp.close()



#####
## Standard CLOUD targets = Combined Filtered
CLOUD_Targets = {}
fp = open('../data/Create_CLOUD_Pair_Annotation/TargetSets/CLOUD_to_TargetsSplit.csv')
fp.next()
for line in fp:
    tmp = line.strip().split(',')
    CLOUD_Targets[tmp[0]] = [x for x in tmp[1].split(';') if x != '']

Number of drug pairs: 35511


### 1. Chemical Similarity

In [3]:
#Load previously computed chemical similarities
Chemical_Similarity = {}
fp = open('../data/Create_CLOUD_Pair_Annotation/Annotations/PairwiseChemicalSimilarity.csv')
fp.next()
for line in fp:
    tmp = line.strip().split(',')
    Chemical_Similarity[tmp[0]+','+tmp[1]] = float(tmp[2])

#Put into final dictionary, ad 'nan' if not possible chemical similarity e.g. for a biologic
chemSim_final = {}
for d1 in all_Clouds:
    for d2 in all_Clouds:
        if d1 < d2:
            if Chemical_Similarity.has_key(d1+','+d2):
                chemSim_final[d1+','+d2] = Chemical_Similarity[d1+','+d2]
            elif Chemical_Similarity.has_key(d2+','+d1):
                chemSim_final[d1+','+d2] = Chemical_Similarity[d2+','+d1]
            else:
                chemSim_final[d1+','+d2] = 'nan'
print 'Finished Loading Chemical Similarity'

Finished Loading Chemical Similarity


### 2. PPI Distances
Metrics: D_AB, S_AB, Min_AB and Mean_AB  
TargetSet: DrugBank, PubChem, Chembl, Combined (filtered, non-filtered)

In [4]:
targetLists = [f for f in os.listdir('../data/Create_CLOUD_Pair_Annotation/Annotations/PPI_Distances/') if os.path.isfile(os.path.join('../data/Create_CLOUD_Pair_Annotation/Annotations/PPI_Distances/', f)) and '.csv' in f]
distance_metric = {'D_AB':4, 'S_AB':5, 'Min_AB':6, 'Mean_AB':7}

#PPI_Final contains 3 metrics and 8 targets sets (3 sources, 1 combined and all 4 once filtered once complete) ==> 24 distances
PPI_final = {}

for metric in distance_metric.keys():
    PPI_final[metric] = {}
    for targetList in targetLists:

        #Go through all drug pairs and add the corresponding value to the result dictionary
        fp = open('../results/CheckBestTargetSet/' + targetList,'r')
        fp.next()
        drugpairs = {}
        for line in fp:
            tmp = line.strip().split(',')
            value = tmp[distance_metric[metric]]
            
            if value == "None":
                value = 'nan'
            
            drugpairs[tmp[0]+','+tmp[1]] = value
            drugpairs[tmp[1]+','+tmp[0]] = value

        PPI_final[metric][targetList.split('.')[0]] = drugpairs
print 'Finished Loading PPI distances'



Finished Loading PPI distances


### 3. MsigDB

In [5]:
msigDB_Final = {}

msigDBFiles = [f for f in os.listdir('../data/Create_CLOUD_Pair_Annotation/Annotations/MSigDB/') if os.path.isfile(os.path.join('../data/Create_CLOUD_Pair_Annotation/Annotations/MSigDB/', f)) and '.gmt' in f]

for file in msigDBFiles:
    print file
    dataname = file.split('.')[0]
    

    
    #Find the gene to perturbation associated (one gene can have various associated perturbations)
    fp = open('../data/Create_CLOUD_Pair_Annotation/Annotations/MSigDB/' + file,'r')
    gene_to_association = {}
    for line in fp:
        tmp = line.strip().split('\t')

        for gene in [x for x in tmp[2:] if x != '']:
            if gene_to_association.has_key(gene):
                gene_to_association[gene].append(tmp[0])
            else:
                gene_to_association[gene] = [tmp[0]]
    fp.close()
    
    
    #print gene_to_association

    cloud_to_associations = {}
    for cloud in all_Clouds:
        cloud_to_associations[cloud] = []
        for gene in CLOUD_Targets[cloud]:
            if gene_to_association.has_key(gene):
                cloud_to_associations[cloud].extend(gene_to_association[gene])
    

    
    drugpairs = {}
    drugpairs_num = {}
    drugpairs_hasOverlap = {}
    for d1 in all_Clouds:
        
        for d2 in all_Clouds:
            if d1 < d2:
                #print d1
                #print d2
                #print CLOUD_Targets[d1]
                #print CLOUD_Targets[d2]
                
                
                associations1 = set(cloud_to_associations[d1])
                associations2 = set(cloud_to_associations[d2])
                #print associations1
                #print associations2
                #print '--'
                #print associations1.union(associations2)
                union = float(len(associations1.union(associations2)))
                #print union
                if union == 0.0:
                    result = 0
                else:
                    result = len(associations1.intersection(associations2))/union
                    #print result
                drugpairs[d1+','+d2] = result
                #print '--'
                
                drugpairs_num[d1+','+d2] = len(associations1.intersection(associations2))
                
                if len(associations1.intersection(associations2)) > 0:
                    drugpairs_hasOverlap[d1+','+d2] = 1
                else:
                    drugpairs_hasOverlap[d1+','+d2] = 0
        
     
    msigDB_Final[dataname] = drugpairs
    msigDB_Final[dataname+'_num'] = drugpairs_num
    msigDB_Final[dataname+'_Overlap'] = drugpairs_hasOverlap
print 'Finished Loading msigDB annotations'

MsigD_MF.gmt
MsigD_KeGG.gmt
MsigD_BP.gmt
MsigD_CC.gmt
Msig_ChemGen_Perturbation.gmt
Finished Loading msigDB annotations


In [6]:
# Sanity check: list the feature tables created above (three per .gmt file: plain, '_num', '_Overlap')
print msigDB_Final.keys()

['MsigD_BP', 'Msig_ChemGen_Perturbation_Overlap', 'Msig_ChemGen_Perturbation_num', 'MsigD_CC_num', 'MsigD_MF_Overlap', 'Msig_ChemGen_Perturbation', 'MsigD_BP_Overlap', 'MsigD_BP_num', 'MsigD_CC_Overlap', 'MsigD_MF', 'MsigD_KeGG_Overlap', 'MsigD_MF_num', 'MsigD_CC', 'MsigD_KeGG_num', 'MsigD_KeGG']


### 4. KeGG

In [7]:
CLOUD_to_Kegg = {'Direct':{},'Indirect':{}}
fp = open('../data/Create_CLOUD_Pair_Annotation/Annotations/CLOUD_to_KeGG_Pathways.csv')
fp.next()
for line in fp:
    tmp = line.strip().split(',')
    CLOUD_to_Kegg['Direct'][tmp[0]] = [x for x in tmp[2].split(';') if x != '']
    CLOUD_to_Kegg['Indirect'][tmp[0]] = [x for x in tmp[3].split(';') if x != '']

    
KeGG_final = {}
for associationType in CLOUD_to_Kegg:
    
    drugpairs = {}
    drugpairs_num = {}
    drugpairs_hasOverlap = {}
    for d1 in all_Clouds:
        for d2 in all_Clouds:
            if d1 < d2:
                
                associations1 = set(CLOUD_to_Kegg[associationType][d1])
                associations2 = set(CLOUD_to_Kegg[associationType][d2])
                
                union = float(len(associations1.union(associations2)))

                if union == 0.0:
                    result = 0
                else:
                    result = len(associations1.intersection(associations2))/union
                    
                drugpairs[d1+','+d2] = result
                
                drugpairs_num[d1+','+d2] = len(associations1.intersection(associations2))
                
                if len(associations1.intersection(associations2)) > 0:
                    drugpairs_hasOverlap[d1+','+d2] = 1
                else:
                    drugpairs_hasOverlap[d1+','+d2] = 0
                
    KeGG_final[associationType] = drugpairs
    KeGG_final[associationType+'_num'] = drugpairs_num
    KeGG_final[associationType+'_Overlap'] = drugpairs_hasOverlap
    
print 'Finished KeGG'

Finished KeGG


### 5. Side Effects (Sider, Offsides, TwoSides)

In [8]:
SideEffect_final = {}

sideEffect_dir = '../data/Create_CLOUD_Pair_Annotation/Annotations/Side_Effects/'
SideEffectFiles = [f for f in os.listdir(sideEffect_dir) if os.path.isfile(os.path.join(sideEffect_dir, f)) and '.csv' in f]

# One table per .csv file: drug (column 0) -> list of side effects (column 2, ';'-separated)
CLOUD_To_SideEffects = {}
for csvFile in SideEffectFiles:
    name = csvFile.split('.')[0]
    CLOUD_To_SideEffects[name] = {}
    fp = open(sideEffect_dir + csvFile,'r')
    fp.next()  # skip header line
    for line in fp:
        tmp = line.strip().split(',')
        effects = []
        for x in tmp[2].split(';'):
            if x != '':
                effects.append(x)
        CLOUD_To_SideEffects[name][tmp[0]] = effects
    fp.close()


# Per source: Jaccard index, '_num' (shared count) and '_Overlap' (0/1 flag) for every pair
for associationType in CLOUD_To_SideEffects:
    per_drug = CLOUD_To_SideEffects[associationType]
    drugpairs = {}
    drugpairs_num = {}
    drugpairs_hasOverlap = {}
    for d1 in all_Clouds:
        for d2 in all_Clouds:
            if d1 >= d2:
                continue

            associations1 = set(per_drug[d1])
            associations2 = set(per_drug[d2])
            shared = associations1 & associations2
            combined = associations1 | associations2
            pair = d1+','+d2

            # Jaccard index; 0 when neither drug has a recorded side effect
            if len(combined) == 0:
                drugpairs[pair] = 0
            else:
                drugpairs[pair] = len(shared)/float(len(combined))

            drugpairs_num[pair] = len(shared)
            drugpairs_hasOverlap[pair] = 1 if len(shared) > 0 else 0

    SideEffect_final[associationType] = drugpairs
    SideEffect_final[associationType+'_num'] = drugpairs_num
    SideEffect_final[associationType+'_Overlap'] = drugpairs_hasOverlap

In [9]:

SideEffectNetworks = [f for f in os.listdir('../data/Create_CLOUD_Pair_Annotation/Annotations/Side_Effects/SideEffectNetworks/') if os.path.isfile(os.path.join('../data/Create_CLOUD_Pair_Annotation/Annotations/Side_Effects/SideEffectNetworks/', f)) and '.gml' in f]


CLOUD_To_SideEffects_networks = {}
for network in SideEffectNetworks:
    name = network.split('.')[0]
    CLOUD_To_SideEffects_networks[name] = {}
    Network = nx.read_gml('../data/Create_CLOUD_Pair_Annotation/Annotations/Side_Effects/SideEffectNetworks/' + network,'r')
    
    
    drugpairs = {}
    for d1 in all_Clouds:
        for d2 in all_Clouds:
            if d1 < d2:
                drugBank_ID1 = CLOUD_To_DrugBank[d1]
                drugBank_ID2 = CLOUD_To_DrugBank[d2]
            
                if Network.has_edge(drugBank_ID1,drugBank_ID2):
                    drugpairs[d1+','+d2] = 1
                else:
                    drugpairs[d1+','+d2] = 0

    SideEffect_final[name] = drugpairs
                    
    
print 'Finished side effects'
    

Finished side effects


### 6. ATC

In [10]:
ATC_final = {}

CLOUD_to_ATC = {'FirstLevel':{},'SecondLevel':{}}
fp = open('../data/Create_CLOUD_Pair_Annotation/Annotations/CLOUD_to_ATC.csv')
fp.next()
for line in fp:
    tmp = line.strip().split(',')
    CLOUD_to_ATC['FirstLevel'][tmp[0]] = set([ x for x in tmp[2].split(';') if x != ''])
    CLOUD_to_ATC['SecondLevel'][tmp[0]] = set([ x for x in tmp[3].split(';') if x != ''])
    
for level in CLOUD_to_ATC:
    drugpairs = {}
    drugpairs_num = {}
    drugpairs_hasOverlap = {}
    for d1 in all_Clouds:
        for d2 in all_Clouds:
            if d1 < d2:
                
                associations1 = set(CLOUD_to_ATC[level][d1])
                associations2 = set(CLOUD_to_ATC[level][d2])
                
                union = float(len(associations1.union(associations2)))

                if union == 0.0:
                    result = 0
                else:
                    result = len(associations1.intersection(associations2))/union
                    
                drugpairs[d1+','+d2] = result
                drugpairs_num[d1+','+d2] = len(associations1.intersection(associations2))
                
                if len(associations1.intersection(associations2)) > 0:
                    drugpairs_hasOverlap[d1+','+d2] = 1
                else:
                    drugpairs_hasOverlap[d1+','+d2] = 0
                            
                
    ATC_final[level] = drugpairs
    ATC_final[level+'_num'] = drugpairs_num
    ATC_final[level+'_Overlap'] = drugpairs_hasOverlap
print 'Finished ATC associations'

Finished ATC associations


### 7. DrugBank (Transporters, Enzymes, Carriers)
Use information from DrugBank to find Transporters, Carriers and Enzymes

In [11]:
DrugBank_final = {}

#####
## Combined Filtered
CLOUD_Complete_Associations = {'Transporters':{},'Enzymes':{},'Carriers':{}}
fp = open('../data/Create_CLOUD_Pair_Annotation/Annotations/CLOUD_to_TargetsSplit.csv')
fp.next()
for line in fp:
    tmp = line.strip().split(',')
    CLOUD_Targets[tmp[0]] = tmp[1].split(';')
    CLOUD_Complete_Associations['Transporters'][tmp[0]] = [x for x in tmp[2].split(';') if x != '']
    CLOUD_Complete_Associations['Enzymes'][tmp[0]] = [x for x in tmp[3].split(';') if x != '']
    CLOUD_Complete_Associations['Carriers'][tmp[0]] = [x for x in tmp[4].split(';') if x != '']
fp.close()

for ProteinType in CLOUD_Complete_Associations:
    drugpairs = {}
    drugpairs_num = {}
    drugpairs_hasOverlap = {}
    for d1 in all_Clouds:
        for d2 in all_Clouds:
            if d1 < d2:
                
                associations1 = set(CLOUD_Complete_Associations[ProteinType][d1])
                associations2 = set(CLOUD_Complete_Associations[ProteinType][d2])
                
                union = float(len(associations1.union(associations2)))

                if union == 0.0:
                    result = 0
                else:
                    result = len(associations1.intersection(associations2))/union
                    
                drugpairs[d1+','+d2] = result
                drugpairs_num[d1+','+d2] = len(associations1.intersection(associations2))
                
                if len(associations1.intersection(associations2)) > 0:
                    drugpairs_hasOverlap[d1+','+d2] = 1
                else:
                    drugpairs_hasOverlap[d1+','+d2] = 0
                
    DrugBank_final[ProteinType] = drugpairs
    DrugBank_final[ProteinType+'_num'] = drugpairs_num
    DrugBank_final[ProteinType+'_Overlap'] = drugpairs_hasOverlap

print  'Finished DrugBank'

Finished DrugBank


### 8. GO 
Use whole ontology from GO (not just leaves from msigDB)

In [12]:
def Calculate_Similarity_MaxSpecificity(targets1, targets2, GO_genes_annotation, GO_Association_UP,isSeparation = False):
    """Return the mean pairwise term-based similarity between two target lists.

    For a pair of targets the score is 2.0 divided by the smallest
    ``len(GO_genes_annotation[term])`` over all terms the two targets share
    in ``GO_Association_UP``; a pair without a shared term scores 0.

    targets1, targets2   -- iterables of target identifiers
    GO_genes_annotation  -- mapping term -> sized collection
    GO_Association_UP    -- mapping target -> iterable of terms
    isSeparation         -- if True every (t1, t2) combination is scored;
                            otherwise only those with t1 > t2

    Returns the mean of all pair scores, or 0 if no pair was scored.
    """
    pair_scores = []
    for first in targets1:
        for second in targets2:
            if not (isSeparation or first > second):
                continue
            shared_terms = set(GO_Association_UP[first]).intersection(GO_Association_UP[second])
            term_sizes = [len(GO_genes_annotation[term]) for term in shared_terms]
            if term_sizes:
                pair_scores.append(2.0 / min(term_sizes))
            else:
                pair_scores.append(0)
    if not pair_scores:
        return 0
    return np.mean(pair_scores)

In [14]:
#GO_Terms
GO_Similarities_final = {}
for go_branch in ['Component','Function','Process']:
    print '\t' + go_branch
    GO_Similarities_final[go_branch] = {}
    
    print 'Load GO Associations:'
    GO_Association_UP,GO_genes_annotation = GO.getAllGene_Annotation(go_branch)
    print 'Done'
    
    
    for d1 in all_Clouds:
        for d2 in all_Clouds:
            if d1 < d2:
                targets1 = CLOUD_Targets[d1]
                targets2 = CLOUD_Targets[d2]
                if len(targets1) > 0 and len(targets2) > 0:
                    sim = Calculate_Similarity_MaxSpecificity(targets1, targets2, GO_genes_annotation,GO_Association_UP, isSeparation=True)
                    GO_Similarities_final[go_branch][d1+','+d2] = sim
                else:
                    GO_Similarities_final[go_branch][d1+','+d2] = 0
                    
print  'Finished GO ontology associations'

	Component
Load GO Associations:
Done
	Function
Load GO Associations:
Done
	Process
Load GO Associations:
Done
Finished GO ontology associations


### 9. Disease Ontology

In [15]:
#Disease
Disease_Similarities_final = {}
print 'Load Disease Associations:'
Disease_Association_UP,d_diseases_annotation = GO.getAllGene_Disease_Annotation()
print 'Done'
for d1 in all_Clouds:
    for d2 in all_Clouds:
        if d1 < d2:
            targets1 = CLOUD_Targets[d1]
            targets2 = CLOUD_Targets[d2]
            if len(targets1) > 0 and len(targets2) > 0:
                sim = Calculate_Similarity_MaxSpecificity(targets1, targets2, d_diseases_annotation,Disease_Association_UP, isSeparation=True)
                Disease_Similarities_final[d1+','+d2] = sim
            else:
                Disease_Similarities_final[d1+','+d2] = 0
print  'Finished Disease associations'

Load Disease Associations:
Done
Finished Disease associations


### 10. Create Final Combined File

In [16]:
#Load known interactions
#read the interactions, if multiple interactions between two drugs append with ;
Interactions = {}
fp = open('../data/Create_CLOUD_Pair_Annotation/InteractionsOverview_DPI_iS3_pS7_abMAD2_gP100.csv','r')
fp.next()
for line in fp:
    tmp = line.strip().split(',')

    if Interactions.has_key(tmp[0]+','+tmp[1]):
        Interactions[tmp[0]+','+tmp[1]] = Interactions[tmp[0]+','+tmp[1]]+';'+tmp[3]
    else:
        Interactions[tmp[0]+','+tmp[1]] = tmp[3]
fp.close()
print 'Number of Interactions (Interacting drug pairs): %d' %len(Interactions)




PPI_Metrics = list(PPI_final.keys())
PPI_TargetSets = list(PPI_final[PPI_Metrics[0]])
MsigAssociations = list(msigDB_Final.keys())
KeGGHierarchies = list(KeGG_final.keys())
SideEffectTypes = list(SideEffect_final.keys())
ATCLevels = list(ATC_final.keys())
DrugBankProteins = list(DrugBank_final.keys())
GO_Types = list(GO_Similarities_final.keys())

header = 'Drug1,Drug2,IsInteracting,InteractionTypes'
header =  header +',' + 'ChemicalSimilarity'
for m in PPI_Metrics:
    for t in PPI_TargetSets:
        header = header +',PPI_' +m+'_'+t
header = header +',' +','.join(MsigAssociations)
header = header +',' +','.join(['KeGG_'+x for x in KeGGHierarchies])
header = header +',' +','.join(['SideEffects_'+x for x in SideEffectTypes])
header = header +',' +','.join(['ATC_'+x for x in ATCLevels])
header = header +',' +','.join(DrugBankProteins)
header =  header +',' +','.join(['GO_'+x for x in GO_Types])
header =  header +',' + 'Disease\n'


fp_out = open('../results/Create_CLOUD_Pair_Annotation/DrugPair_Feature_Overview.csv','w')
fp_out.write(header)

for d1 in all_Clouds:
    for d2 in all_Clouds:
        if d1 < d2:
            
            fp_out.write(d1+','+d2)
            
            #check if in either direction the drug pair has found interactions
            is_interacting = 0
            Inttype = ['None']
            if d1+','+d2 in Interactions.keys():
                is_interacting = 1
                Inttype.extend(Interactions[d1+','+d2].split(';'))
            if  d2+','+d1 in Interactions.keys():
                is_interacting = 1
                Inttype.extend(Interactions[d2+','+d1].split(';'))
            if len(Inttype) > 1:
                Inttype.remove('None')
            Inttype.sort()
                
            
            fp_out.write(','+str(is_interacting)+','+';'.join(Inttype))
            
            # 1. Chemical Similarity
            chem = chemSim_final[d1+','+d2]
            fp_out.write(','+str(chem))
            
            # 2. PPI Distances
            for metric in PPI_Metrics:
                for targetSet in PPI_TargetSets:
                    Distance = PPI_final[metric][targetSet][d1+','+d2]
                    fp_out.write(','+str(Distance))

            # 3. msigDB
            for association in MsigAssociations:
                overlap = msigDB_Final[association][d1+','+d2]
                fp_out.write(','+str(overlap))

            # 4. KeGG
            for hierarchy in KeGGHierarchies:
                pathwayOverlap = KeGG_final[hierarchy][d1+','+d2]
                fp_out.write(','+str(pathwayOverlap))
                
            # 5. Side Effects
            for sideEffectType in SideEffectTypes:
                sideEffectOverlap = SideEffect_final[sideEffectType][d1+','+d2]
                fp_out.write(','+str(sideEffectOverlap))
                
            # 6. ATC 
            for level in ATCLevels:
                ATC_overlap = ATC_final[level][d1+','+d2]
                fp_out.write(','+str(ATC_overlap))
            
            # 7. DrugBank
            for proteinType in DrugBankProteins:
                proteinTypeOverlap = DrugBank_final[proteinType][d1+','+d2]
                fp_out.write(','+str(proteinTypeOverlap))
            
            # 8. GO
            for GO_Type in GO_Types:
                GO_Overlap = GO_Similarities_final[GO_Type][d1+','+d2]
                fp_out.write(','+str(GO_Overlap))
                
            # 8. Disease
            Disease_Overlap = Disease_Similarities_final[d1+','+d2]
            fp_out.write(','+str(Disease_Overlap) +'\n')
                
            
fp_out.close()
print 'Created Drug Pair Annotation File'

Number of Interactions (Interacting drug pairs): 1817
Created Drug Pair Annotation File


## Single Drug Properties

### 1. Targets

In [17]:
# cloud_targets[targetSet][drug] -> list of targets; 'All' comes from the complete
# target table, 'Filtered' from the split table.
cloud_targets = {'All':{},'Filtered':{}}

all_Clouds = set()
with open('../data/Create_CLOUD_Pair_Annotation/Annotations/CLOUD_All_Targets.csv','r') as fp:
    next(fp)  # skip header line
    for line in fp:
        tmp = line.strip().split(',')
        # de-duplicate and drop empty entries, so that a drug without targets gets []
        # (consistent with the 'Filtered' set below) instead of being counted as one target
        cloud_targets['All'][tmp[0]] = [x for x in set(tmp[2].split(';')) if x != '']
        all_Clouds.add(tmp[0])
all_Clouds = sorted(all_Clouds)



#####
## Standard CLOUD targets = Combined Filtered
with open('../data/Create_CLOUD_Pair_Annotation/Annotations/CLOUD_to_TargetsSplit.csv') as fp:
    next(fp)  # skip header line
    for line in fp:
        tmp = line.strip().split(',')
        cloud_targets['Filtered'][tmp[0]] = [x for x in tmp[1].split(';') if x != '']
    
    
    

### 2. PPI

In [18]:
# Load the interactome graph used for the degree / clustering statistics in the next cell
PPI = nx.read_gml('../data/Create_CLOUD_Pair_Annotation/Annotations/Human_Interactome.gml')

In [19]:
# PPI_results[descriptor][targetSet][drug] -> mean degree / average clustering of the
# drug's targets in the PPI graph, or 'nan' when no degree value is returned for them
PPI_results = {'AvgDegree':{},'AvgClustering':{}}

for TargetSet in cloud_targets:
    degree_by_cloud = {}
    clustering_by_cloud = {}

    for cloud in all_Clouds:
        targets = cloud_targets[TargetSet][cloud]
        degrees = [entry[1] for entry in nx.degree(PPI,targets)]

        if len(degrees) > 0:
            degree_by_cloud[cloud] = np.mean(degrees)
            clustering_by_cloud[cloud] = nx.average_clustering(PPI,targets)
        else:
            degree_by_cloud[cloud] = 'nan'
            clustering_by_cloud[cloud] = 'nan'

    PPI_results['AvgDegree'][TargetSet] = degree_by_cloud
    PPI_results['AvgClustering'][TargetSet] = clustering_by_cloud

### 3. MsigDB

In [20]:
# msigDB_results[targetSet][gmt name][drug] -> number of distinct gene sets hit by the drug's targets
# NOTE(review): the GO and Disease count cells further down read the globals
# gene_to_association and cloud_to_associations left over from this cell.
msigDB_results = {}

msigDBFiles = [f for f in os.listdir('../data/Create_CLOUD_Pair_Annotation/Annotations/MSigDB/') if os.path.isfile(os.path.join('../data/Create_CLOUD_Pair_Annotation/Annotations/MSigDB/', f)) and '.gmt' in f]

for TargetSet in cloud_targets:
    msigDB_results[TargetSet] = {}
    for file in msigDBFiles:
        #print file
        dataname = file.split('.')[0]
       

        
        #Map each gene to the names (first column) of all gene sets that list it (one gene can belong to several sets)
        fp = open('../data/Create_CLOUD_Pair_Annotation/Annotations/MSigDB/' + file,'r')
        gene_to_association = {}
        for line in fp:
            tmp = line.strip().split('\t')
            # genes start at the third tab-separated column
            for gene in [x for x in tmp[2:] if x != '']:
                if gene_to_association.has_key(gene):
                    gene_to_association[gene].append(tmp[0])
                else:
                    gene_to_association[gene] = [tmp[0]]
        fp.close()



        # gene sets reachable through each drug's targets (may contain duplicates)
        cloud_to_associations = {}
        for cloud in all_Clouds:
            cloud_to_associations[cloud] = []
            for gene in cloud_targets[TargetSet][cloud]:
                if gene_to_association.has_key(gene):
                    cloud_to_associations[cloud].extend(gene_to_association[gene])
        
        # count distinct gene sets per drug
        cloud_to_associations_count = {}
        for cloud in cloud_to_associations:
            cloud_to_associations_count[cloud] = len(set(cloud_to_associations[cloud]))
    

        msigDB_results[TargetSet][dataname] = cloud_to_associations_count

### 4. KeGG

In [21]:
# Number of pathways per drug: column 2 -> 'Direct', column 3 -> 'Indirect' (';'-separated, empty entries ignored)
CLOUD_to_Kegg_Results = {'Direct':{},'Indirect':{}}
with open('../data/Create_CLOUD_Pair_Annotation/Annotations/CLOUD_to_KeGG_Pathways.csv') as fp:
    next(fp)  # skip header line
    for line in fp:
        tmp = line.strip().split(',')
        CLOUD_to_Kegg_Results['Direct'][tmp[0]] = len([x for x in tmp[2].split(';') if x != ''])
        CLOUD_to_Kegg_Results['Indirect'][tmp[0]] = len([x for x in tmp[3].split(';') if x != ''])

### 5. GO

In [22]:
GO_Results_Count = {}

for TargetSet in cloud_targets:
    GO_Results_Count[TargetSet] = {}
    
    #GO_Terms
    for go_branch in ['Component','Function','Process']:
        print '\t' + go_branch

        print 'Load GO Associations:'
        GO_Association_UP,GO_genes_annotation = GO.getAllGene_Annotation(go_branch)
        print 'Done'

        for cloud in all_Clouds:
            cloud_to_associations[cloud] = []
            for gene in cloud_targets[TargetSet][cloud]:
                if gene_to_association.has_key(gene):
                    cloud_to_associations[cloud].extend(GO_Association_UP[gene])
        
        cloud_to_associations_count = {}
        for cloud in cloud_to_associations:
            cloud_to_associations_count[cloud] = len(set(cloud_to_associations[cloud]))
    

        GO_Results_Count[TargetSet][go_branch] = cloud_to_associations_count
        

	Component
Load GO Associations:
Done
	Function
Load GO Associations:
Done
	Process
Load GO Associations:
Done
	Component
Load GO Associations:
Done
	Function
Load GO Associations:
Done
	Process
Load GO Associations:
Done


### 6. Disease

In [23]:
Disease_Results_Count = {}

for TargetSet in cloud_targets: 
    print 'Load Disease Associations:'
    Disease_Association_UP,d_diseases_annotation = GO.getAllGene_Disease_Annotation()
    print 'Done'
    
    for cloud in all_Clouds:
        cloud_to_associations[cloud] = []
        for gene in cloud_targets[TargetSet][cloud]:
            if gene_to_association.has_key(gene):
                cloud_to_associations[cloud].extend(Disease_Association_UP[gene])

    cloud_to_associations_count = {}
    for cloud in cloud_to_associations:
        cloud_to_associations_count[cloud] = len(set(cloud_to_associations[cloud]))


    Disease_Results_Count[TargetSet] = cloud_to_associations_count


Load Disease Associations:
Done
Load Disease Associations:
Done


### 7. SideEffects

In [24]:
# SideEffect_results[source][drug] -> number of side effects listed for the drug (column 2, ';'-separated)
SideEffect_results = {}

sideEffect_dir = '../data/Create_CLOUD_Pair_Annotation/Annotations/Side_Effects/'
SideEffectFiles = [f for f in os.listdir(sideEffect_dir) if os.path.isfile(os.path.join(sideEffect_dir, f)) and '.csv' in f]

for csvFile in SideEffectFiles:
    name = csvFile.split('.')[0]
    SideEffect_results[name] = {}
    fp = open(sideEffect_dir + csvFile,'r')
    fp.next()  # skip header line
    for line in fp:
        tmp = line.strip().split(',')
        count = 0
        for x in tmp[2].split(';'):
            if x != '':
                count += 1
        SideEffect_results[name][tmp[0]] = count
    fp.close()
    


In [25]:
# Side-effect network files (.gml); for each one, count the edges touching every drug
SideEffectNetworks = [f for f in os.listdir('../data/Create_CLOUD_Pair_Annotation/Annotations/Side_Effects/SideEffectNetworks/') if os.path.isfile(os.path.join('../data/Create_CLOUD_Pair_Annotation/Annotations/Side_Effects/SideEffectNetworks/', f)) and '.gml' in f]


for network in SideEffectNetworks:
    name = network.split('.')[0]
    # relies on CLOUD_To_SideEffects_networks having been created in the drug-pair section above
    CLOUD_To_SideEffects_networks[name] = {}
    # NOTE(review): 'r' is passed as the second positional argument of nx.read_gml --
    # confirm what that parameter means for the installed networkx version
    Network = nx.read_gml('../data/Create_CLOUD_Pair_Annotation/Annotations/Side_Effects/SideEffectNetworks/' + network,'r')
    
    
    # number of edges reported for the drug's DrugBank id in this network
    drug_results = {}
    for cloud in all_Clouds:
        drugBank_ID = CLOUD_To_DrugBank[cloud]
        drug_results[cloud] = len(Network.edges(drugBank_ID))
        

    # NOTE(review): keyed by the full file name (including '.gml'), whereas the .csv sources
    # are keyed without extension -- the output columns therefore contain '.gml'.
    # Switching to `name` would rename those columns; confirm downstream users first.
    SideEffect_results[network] = drug_results
                    

In [26]:
# Sanity check: list the side-effect sources collected for the single-drug table
print SideEffect_results.keys()

['TwoSide_CLOUDs.gml', 'CLOUD_to_Offsides', 'Drugbank_2018-07-03_CLOUD_Only.gml', 'CLOUD_to_SIDER']


### 8. ATC

In [27]:
# ATC_results[level][drug] -> number of distinct ATC codes (column 2 = first level, column 3 = second level)
ATC_results = {'FirstLevel':{},'SecondLevel':{}}
with open('../data/Create_CLOUD_Pair_Annotation/Annotations/CLOUD_to_ATC.csv') as fp:
    next(fp)  # skip header line
    for line in fp:
        tmp = line.strip().split(',')
        ATC_results['FirstLevel'][tmp[0]] = len(set([ x for x in tmp[2].split(';') if x != '']))
        ATC_results['SecondLevel'][tmp[0]] = len(set([ x for x in tmp[3].split(';') if x != '']))

### 9. Kinetics

In [28]:
kinetics_final = {}

#####
## Combined Filtered
kinetics_final = {'Transporters':{},'Enzymes':{},'Carriers':{}}
fp = open('../data/Create_CLOUD_Pair_Annotation/Annotations/CLOUD_to_TargetsSplit.csv')
fp.next()
for line in fp:
    tmp = line.strip().split(',')
    CLOUD_Targets[tmp[0]] = tmp[1].split(';')
    kinetics_final['Transporters'][tmp[0]] = len([x for x in tmp[2].split(';') if x != ''])
    kinetics_final['Enzymes'][tmp[0]] = len([x for x in tmp[3].split(';') if x != ''])
    kinetics_final['Carriers'][tmp[0]] = len([x for x in tmp[4].split(';') if x != ''])
fp.close()

print  'Finished Kinetics'

Finished Kinetics


### 10. Single Drug Property File

In [29]:
# Drug-drug interaction network; the edge attribute 'Type' is tallied per drug below
DPI = nx.read_gml('../data/Create_CLOUD_Pair_Annotation/DPI_Network_Complete.gml')


# Column orders are fixed once here and reused for the header and for every row
TargetSets = list(cloud_targets.keys())
PPI_Decscriptor = list(PPI_results.keys())
MsigAssociations = list(msigDB_results[TargetSets[0]].keys())
KeGGHierarchies = list(CLOUD_to_Kegg_Results.keys())
SideEffectTypes = list(SideEffect_results.keys())
ATCLevels = list(ATC_results.keys())
DrugBankProteins = list(kinetics_final.keys())
GO_Types = list(GO_Results_Count[TargetSets[0]].keys())

header = 'Drug,NumberInteractions,NumberIncreasing,NumberDecreasing,NumberEmergent'
header = header +',' +','.join(TargetSets)

for t in TargetSets:
    for m in MsigAssociations:
        header = header +',' + t+'_'+m

for p in PPI_Decscriptor:
    for t in TargetSets:
        header = header +',' + t+'_'+p

for t in TargetSets:
    for g in GO_Types:
        header = header +',' + t+'_'+g

for t in TargetSets:
    header = header +',' + t+'_Disease'

header = header +',' +','.join(['KeGG_'+x for x in KeGGHierarchies])
header = header +',' +','.join(['SideEffects_'+x for x in SideEffectTypes])
header = header +',' +','.join(['ATC_'+x for x in ATCLevels])
header = header +',' +','.join(DrugBankProteins) +'\n'


fp_out = open('../results/Create_CLOUD_Pair_Annotation/SingleDrug_Feature_Overview.csv','w')
fp_out.write(header)

for cloud in all_Clouds:

    # Interaction statistics: total number of edges plus a tally of their 'Type' attribute
    interactions = DPI.edges(cloud)
    interactions_types = []
    for source, target in set(interactions):
        edge_data = DPI[source][target]
        for key in edge_data:
            interactions_types.append(edge_data[key]['Type'])
    number_interactions = len(interactions)

    row = [cloud,
           str(number_interactions),
           str(interactions_types.count('Increasing')),
           str(interactions_types.count('Decreasing')),
           str(interactions_types.count('Emergent'))]

    # number of targets per target set
    row.extend([str(len(cloud_targets[t][cloud])) for t in TargetSets])

    for t in TargetSets:
        row.extend([str(msigDB_results[t][m][cloud]) for m in MsigAssociations])

    for p in PPI_Decscriptor:
        row.extend([str(PPI_results[p][t][cloud]) for t in TargetSets])

    for t in TargetSets:
        row.extend([str(GO_Results_Count[t][g][cloud]) for g in GO_Types])

    row.extend([str(Disease_Results_Count[t][cloud]) for t in TargetSets])
    row.extend([str(CLOUD_to_Kegg_Results[k][cloud]) for k in KeGGHierarchies])
    row.extend([str(SideEffect_results[k][cloud]) for k in SideEffectTypes])
    row.extend([str(ATC_results[k][cloud]) for k in ATCLevels])
    row.extend([str(kinetics_final[k][cloud]) for k in DrugBankProteins])

    fp_out.write(','.join(row))
    fp_out.write('\n')
fp_out.close()
        