In [14]:
import networkx as nx
import numpy as np
from matplotlib import pylab as plt
import os
import seaborn as sns
from scipy.stats import mannwhitneyu as mu

### Load PPI and Targets

In [15]:
PPI = nx.read_gml('../data/CheckBestTargetSet/Human_Interactome.gml')

Load all the different drug targets from the various sources

In [16]:
#Dictionary with the CLOUD : targets
targets_DrugBank = {}
targets_DrugBank_Filtered = {}
targets_Pubchem = {}
targets_Pubchem_Filtered = {}
targets_Chembl = {}
targets_Chembl_Filtered = {}
targets_All_Filtered = {}
targets_All = {}

#Get all extracted targets (with the DrugBank target split)
targets_only = set()
fp = open('../data/CheckBestTargetSet/TargetSets/CLOUD_to_TargetsSplit.csv')
fp.next()
for line in fp:
    tmp = line.strip().split(',')
    targets_All_Filtered[tmp[0]] = [x for x in tmp[1].split(';') if x != '']
    
    targets_only.update([x for x in tmp[1].split(';') if x != ''])
    
    targets_All[tmp[0]] = [x for x in tmp[1].split(';') if x != '']
    targets_All[tmp[0]].extend([x for x in tmp[2].split(';') if x != ''])
    targets_All[tmp[0]].extend([x for x in tmp[3].split(';') if x != ''])
    targets_All[tmp[0]].extend([x for x in tmp[4].split(';') if x != ''])
fp.close()
  
#
# DRUGBANK
#
fp = open('../data/CheckBestTargetSet/TargetSets/CLOUD_DrugBank_Targets.csv')
fp.next()
for line in fp:
    tmp = line.strip().split(',')
    targets_DrugBank[tmp[0]] = [x for x in tmp[2].split(';') if x != '']
    targets_DrugBank_Filtered[tmp[0]] = [x for x in tmp[2].split(';') if x != '' and x in targets_All_Filtered[tmp[0]]]
fp.close()


#
# PUBCHEM
#
fp = open('../data/CheckBestTargetSet/TargetSets/CLOUD_PubChem_Targets.csv')
fp.next()
for line in fp:
    tmp = line.strip().split(',')
    targets_Pubchem[tmp[0]] = [x for x in tmp[2].split(';') if x != '']
    targets_Pubchem_Filtered[tmp[0]] = [x for x in tmp[2].split(';') if x != '' and x in targets_All_Filtered[tmp[0]]]
fp.close()

#
# CHEMBL
#
fp = open('../data/CheckBestTargetSet/TargetSets/CLOUD_ChEMBL_Targets.csv')
fp.next()
for line in fp:
    tmp = line.strip().split(',')
    targets_Chembl[tmp[0]] =[x for x in tmp[2].split(';') if x != '']
    targets_Chembl_Filtered[tmp[0]] = [x for x in tmp[2].split(';') if x != '' and x in targets_All_Filtered[tmp[0]]]
fp.close()  

    
#Make a list with all clouds
all_Clouds = targets_All.keys()
all_Clouds.sort()

### Calculate the various distance measurements

In [17]:
saved_distances = {}

def Check_Drug_Module_Diameter(PPI,targets):
    '''
    Extract the min path between targets (=Diameter)
    This is always the minimum path between one target and any other target of the same set.
    Returns Mean of all paths (d_d) as well as paths (min_paths)
    
    This function uses only one set hence calulcates the intra drug distance or drug_module diamter
    
    '''
    filtered_targets = []
    for t in targets:
        if PPI.has_node(t):
            filtered_targets.append(t)

    min_paths = []
    if len(filtered_targets) > 1:
        try:
            for t1 in filtered_targets:
                min_distances = []
                for t2 in filtered_targets:
                    if t1 != t2:
                        #print nx.shortest_path(PPI,t1,t2)
                        if saved_distances.has_key(t1+','+t2):
                            min_distances.append(saved_distances[t1+','+t2])
                        elif saved_distances.has_key(t2+','+t1):
                            min_distances.append(saved_distances[t2+','+t1])
                        elif nx.has_path(PPI,t1,t2):
                            dist_path_length = len(nx.shortest_path(PPI,t1,t2))-1
                            min_distances.append(dist_path_length)
                            saved_distances[t1+','+t2] = dist_path_length
   
                min_paths.append(min(min_distances))
            d_d = sum(min_paths)/float(len(filtered_targets))

            return d_d
        except:
            return "None"
    else:
        return 0

In [18]:
def Check_Shortest_DistancesBetween(PPI, targets1, targets2):
    '''
    Extract the min path between targets.
    This is always the minimum path between one target and any other target of the other set.
    Returns Mean of all paths (d_d) as well as paths (min_paths)
    
    This function uses two sets hence calulcates the inter drug distance
    
    '''
    filtered_targets = []
    for t in targets1:
        if PPI.has_node(t):
            filtered_targets.append(t)

    filtered_targets2 = []
    for t in targets2:
        if PPI.has_node(t):
            filtered_targets2.append(t)

    min_paths = []
    if len(filtered_targets) >= 1 and len(filtered_targets2) >= 1:
        try:
            for t1 in filtered_targets:
                min_distances = []
                for t2 in filtered_targets2:
                    # print nx.shortest_path(PPI,t1,t2)
                    if saved_distances.has_key(t1+','+t2):
                        min_distances.append(saved_distances[t1+','+t2])
                    elif saved_distances.has_key(t2+','+t1):
                        min_distances.append(saved_distances[t2+','+t1])
                    elif nx.has_path(PPI,t1,t2):
                        dist_path_length = len(nx.shortest_path(PPI,t1,t2))-1
                        min_distances.append(dist_path_length)
                        saved_distances[t1+','+t2] = dist_path_length  
                if len(min_distances) != 0:
                    min_paths.append(min(min_distances))
            return min_paths
        except:
            return 'None'
    else:
        return 'None'

In [19]:
def calculate_ClosestDistance(PPI,targets1, targets2 ):
    '''
    Add information here
    '''
    filtered_targets = []
    for t in targets1:
        if PPI.has_node(t):
            filtered_targets.append(t)

    filtered_targets2 = []
    for t in targets2:
        if PPI.has_node(t):
            filtered_targets2.append(t)
    
    
    distances = []
    if len(filtered_targets) > 0 and len(filtered_targets2) > 0:
        for t1 in filtered_targets:
            tmp = []
            for t2 in filtered_targets2:
                
                if saved_distances.has_key(t1+','+t2):
                    tmp.append(saved_distances[t1+','+t2])
                elif saved_distances.has_key(t2+','+t1):
                    tmp.append(saved_distances[t2+','+t1])
                elif nx.has_path(PPI,t1,t2):
                    dist_path_length = len((nx.shortest_path(PPI, source=t1, target=t2))) - 1
                    tmp.append(dist_path_length)
                    saved_distances[t1+','+t2] = dist_path_length
            if len(tmp) != 0:
                distances.append(min(tmp))

    if len(distances) == 0:
        result = 'None'
    else:
        result = np.mean(distances)
        
    return result

In [20]:
def calculate_MeanDistance(PPI,targets1, targets2 ):
    '''
    Add information here
    '''
    filtered_targets = []
    for t in targets1:
        if PPI.has_node(t):
            filtered_targets.append(t)

    filtered_targets2 = []
    for t in targets2:
        if PPI.has_node(t):
            filtered_targets2.append(t)
    



    distances = []
    for t1 in filtered_targets:
        for t2 in filtered_targets2:
            
            
            if saved_distances.has_key(t1+','+t2):
                distances.append(saved_distances[t1+','+t2])
            elif saved_distances.has_key(t2+','+t1):
                distances.append(saved_distances[t2+','+t1])
            elif nx.has_path(PPI,t1,t2):
                dist_path_length = len((nx.shortest_path(PPI, source=t1, target=t2))) - 1
                distances.append(dist_path_length)
                saved_distances[t1+','+t2] = dist_path_length
    if len(distances) > 0:
        result = np.mean(distances)
    else:
        result = 'None'
        
    return result


# Calculate All Distances

In [21]:
dic_target_sets = {'DrugBank':targets_DrugBank, 'PubChem':targets_Pubchem, 'Chembl':targets_Chembl,'DrugBank_Filtered':targets_DrugBank_Filtered, 'PubChem_Filtered':targets_Pubchem_Filtered, 'Chembl_Filtered':targets_Chembl_Filtered, 'All_Filtered':targets_All_Filtered, 'All':targets_All}

for key in dic_target_sets:
    print key
    
    
    #Open corresponding result file
    fp_out = open('../results/CheckBestTargetSet/'+key+'.csv','w')
    fp_out.write('Drug1,Drug2,d_A,d_B,d_AB,s_AB,AB_Min,AB_Mean\n')

    #Go though all pairs
    for cloud1 in all_Clouds:
        print cloud1
        #Targets of drug A
        targets1 = dic_target_sets[key][cloud1]

        #Diameter of drug A
        d_A =  Check_Drug_Module_Diameter(PPI, targets1)

        for cloud2 in all_Clouds:

            #only calculate the half matrix
            if cloud1 < cloud2:

                #targets of drug B
                targets2 = dic_target_sets[key][cloud2]

                #Diameter of drug B
                d_B =  Check_Drug_Module_Diameter(PPI, targets2)

                #Min distance from A to B
                distances1 = Check_Shortest_DistancesBetween(PPI, targets1, targets2)
                #Min distance from B to A
                distances2 = Check_Shortest_DistancesBetween(PPI, targets2, targets1)


                if distances1 != "None" and distances2 != 'None':
                    #Dab
                    between_Distance = (sum(distances1)+sum(distances2))/float((len(distances1)+len(distances2)))
                else:
                    between_Distance = "None"

                if d_A != "None" and d_B != 'None' and between_Distance != "None":
                    #Sab
                    separation = between_Distance - (d_A+d_B)/2.0
                else:
                    separation = 'None'

                #Create AB_Min
                min_Distance = calculate_ClosestDistance(PPI, targets1, targets2)

                #Create AB_Mean
                mean_Distance = calculate_MeanDistance(PPI, targets1, targets2)

                #Save results
                fp_out.write(cloud1+','+cloud2+','+str(d_A)+','+str(d_B)+','+str(between_Distance)+','+str(separation)+','+str(min_Distance)+','+str(mean_Distance)+'\n')

    fp_out.close()    

        
        

DrugBank
CLOUD001
CLOUD002
CLOUD003
CLOUD004
CLOUD005
CLOUD006
CLOUD007
CLOUD008
CLOUD009
CLOUD010
CLOUD011
CLOUD012
CLOUD013
CLOUD014
CLOUD015
CLOUD016
CLOUD017
CLOUD018
CLOUD019
CLOUD020
CLOUD021
CLOUD022
CLOUD023
CLOUD024
CLOUD025
CLOUD026
CLOUD027
CLOUD028
CLOUD029
CLOUD030
CLOUD031
CLOUD032
CLOUD033
CLOUD034
CLOUD035
CLOUD036
CLOUD037
CLOUD038
CLOUD039
CLOUD040
CLOUD041
CLOUD042
CLOUD043
CLOUD044
CLOUD045
CLOUD046
CLOUD047
CLOUD048
CLOUD049
CLOUD050
CLOUD051
CLOUD052
CLOUD053
CLOUD054
CLOUD055
CLOUD056
CLOUD057
CLOUD058
CLOUD059
CLOUD060
CLOUD061
CLOUD062
CLOUD063
CLOUD064
CLOUD065
CLOUD066
CLOUD067
CLOUD068
CLOUD069
CLOUD070
CLOUD071
CLOUD072
CLOUD073
CLOUD074
CLOUD075
CLOUD076
CLOUD077
CLOUD078
CLOUD079
CLOUD080
CLOUD081
CLOUD082
CLOUD083
CLOUD084
CLOUD085
CLOUD086
CLOUD087
CLOUD088
CLOUD089
CLOUD090
CLOUD091
CLOUD092
CLOUD093
CLOUD094
CLOUD095
CLOUD096
CLOUD097
CLOUD098
CLOUD099
CLOUD100
CLOUD101
CLOUD102
CLOUD103
CLOUD104
CLOUD105
CLOUD106
CLOUD107
CLOUD108
CLOUD109
CLOUD110
C

## Calculate the different metrics for the different target sets
TargetSets: All, Chembl, PubChem, DrugBank (all associations and target only filtered)  
Metrics: S_AB, D_AB, Min_AB and Mean_AB

In [132]:
#network = nx.read_gml('../data/Check_Features/DrugPairFeature_Files/DPI_iS3_pS7_abMAD2_gP100/Networks/DPI_Network_CoreToPeriphery.gml')

In [22]:
targetLists = [f for f in os.listdir('../results/CheckBestTargetSet/') if os.path.isfile(os.path.join('../results/CheckBestTargetSet/', f)) and '.csv' in f]
distance_metric = {'D_AB':4, 'S_AB':5, 'Min_AB':6, 'Mean_AB':7}
interaction_colors = {'Increasing':'#ACD900','Decreasing':'#F70020','Emergent':'#0096FF','All':'grey'}
network_parts = ['Complete','Core','CoreToPeriphery','Periphery']

for part in network_parts:
    print part
    network = nx.read_gml('../data/CheckBestTargetSet/DrugPairFeature_Files/DPI_iS3_pS7_abMAD2_gP100/Networks/DPI_Network_'+part+'.gml')

    
     #create the directory if not existing
    directory = os.path.dirname('../results/CheckBestTargetSet/Results/'+part +'/')
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    
    fp_out = open('../results/CheckBestTargetSet/Results/'+part+'/StatisticResult.csv','w')
    fp_out.write('Metric,TargetSet,Type1,Type2,Foldchange,Pvalue,IsSignificant\n')

    #Go through all metrics and target sets
    print 'Calculate Metrics:'
    for metric in distance_metric.keys():

        for targetList in targetLists:


            #check if S_AB (as only sab has negative values)
            if metric != 'S_AB':
                distance_cutoffs = [5,4,3,2,1,0]
            else:
                distance_cutoffs = [3.5,2.5,1.5,0.5,-0.5,-1.5]


            #remove .csv from file name
            targetName = targetList.split('.')[0]

            #create the directory if not existing
            directory = os.path.dirname('../results/CheckBestTargetSet/Results/'+part +'/'+ targetName + '/')
            if not os.path.exists(directory):
                os.makedirs(directory)

            #create a dictionary with the respective distance for a given drug pair
            #all values contains all durg pair values (needed for normalization later)
            all_values = []
            fp = open('../results/CheckBestTargetSet/' + targetList,'r')
            fp.next()
            drugpairs = {}
            for line in fp:
                tmp = line.strip().split(',')
                value = tmp[distance_metric[metric]]
                #print tmp

                drugpairs[tmp[0]+','+tmp[1]] = value
                drugpairs[tmp[1]+','+tmp[0]] = value

                if value != "None":
                    all_values.append(float(value))

            #Split info into the various interaction types
            interaction_types = ['Increasing','Decreasing','Emergent','All']
            interaction_type_results = {}
            for it in  interaction_types:
                
                #binarize the data into the correspodning bins; normalize is used to later take care of the fact that most interaction have a distance around 2
                results = {}
                to_normalize = {}
                interaction_type_results[it] = []
                
                #Go through the cutoffs
                for i in range(1, len(distance_cutoffs)):

                    #this will contain the actual results; integer later number of interaction within this distance
                    results[distance_cutoffs[i]] = 0
                    
                    #get the corresponding results
                    to_normalize[distance_cutoffs[i]] = len([x for x in all_values if x < distance_cutoffs[i-1] and x >= distance_cutoffs[i]]) 

                    
                    #Go though all edges of the certain network and add to bin if existing
                    for edge in network.edges():
                        for key in network[edge[0]][edge[1]]:

                            if network[edge[0]][edge[1]][key]['Type'] != it and it != 'All' :
                                continue


                            value = drugpairs.get(edge[0]+','+edge[1],'None')
                            if value != "None":
                                value = float(value)
                                interaction_type_results[it].append(value)
                                if value >= distance_cutoffs[i] and value < distance_cutoffs[i-1]:
                                    results[distance_cutoffs[i]] += 1


                                    
                '''
                PLOT OUTPUT
                '''
                                    
                sorted_distance_cutOffs = list(distance_cutoffs)
                sorted_distance_cutOffs.sort()

                #PLOT THE INDIVDIUAL BAR PLOT WITH X-AXIS = PPI DISTANCE AND Y-AXIS FREQUENCY
                plt.bar([i for i in sorted_distance_cutOffs[:-1] if to_normalize[i] != 0],[results[i]/float(to_normalize[i]) for i in sorted_distance_cutOffs[:-1] if to_normalize[i] != 0], color=interaction_colors[it])
                plt.xlabel('PPI ' + metric)
                plt.ylabel('Percent of all drug pairs within this distance')
                plt.savefig('../results/CheckBestTargetSet/Results/'+part+'/' + targetName + '/'+metric+'_'+it+'_PPI_Distances.pdf', bbox_inches = "tight")
                plt.close()
                #plt.show()


            #quick bug solution (only happens once in the periphery part and not important)
            if len(interaction_type_results['Decreasing']) == 0:
                interaction_type_results['Decreasing'].append(2)
                
            #PLOT A BOX PLOT WITH THE VARIOUS INTERACTION TYPES AS DIFFERENCE
            bplot = sns.boxplot(data=[all_values,interaction_type_results['All'],interaction_type_results['Increasing'],interaction_type_results['Decreasing'],interaction_type_results['Emergent']],orient='h', showfliers = False)

            interaction_types_2 = ['All','Interacting','Increasing','Decreasing','Emergent']
            interaction_colors_2 = ['grey','#F8B301','#ACD900','#F70020','#0096FF']
            color_dict = dict(zip(interaction_types_2, interaction_colors_2))
            for i in range(0,5):
                mybox = bplot.artists[i]
                mybox.set_facecolor(color_dict[interaction_types_2[i]])


            interaction_type_results['AllPairs'] = all_values
            for key1 in interaction_type_results:
                for key2 in interaction_type_results:
                    if key1 > key2:
                        pval = mu(interaction_type_results[key2],interaction_type_results[key1])[1]
                        is_significant = pval < 0.05
                        foldchange = np.mean(interaction_type_results[key2])/np.mean(interaction_type_results[key1])
                        fp_out.write(metric+','+targetName+','+key1+',' +key2 +','+str(foldchange)+',' + str(pval)+','+str(is_significant) + '\n')


            plt.yticks(range(0,5),['All','Interacting','Increasing','Decreasing','Emergent'])
            plt.ylabel('Interaction Type')
            plt.tick_params(axis = 'y', which = 'major', labelsize = 5)
            plt.xlabel(metric)
            plt.savefig('../results/CheckBestTargetSet/Results/'+part +'/'+ targetName + '/'+metric+'_InteractionDifference.pdf', bbox_inches = "tight")
            plt.close()
    fp_out.close()
    print 'Done'

Complete
Calculate Metrics:
Done
Core
Calculate Metrics:
Done
CoreToPeriphery
Calculate Metrics:
Done
Periphery
Calculate Metrics:
Done


## Analyse the result file

In [24]:
interaction_types = ['Increasing','Decreasing','Emergent']
network_parts = ['Complete','Core','CoreToPeriphery','Periphery']


for part in network_parts:
    print part
    results = {}

    fp =  open('../results/CheckBestTargetSet/Results/'+part+'/StatisticResult.csv','r')
    fp.next()
    for line in fp:
        tmp = line.strip().split(',')

        if results.has_key(tmp[0]) == False:
            results[tmp[0]] = {}

        if results[tmp[0]].has_key(tmp[1]) == False:
            results[tmp[0]][tmp[1]] = 0

        if tmp[2] in interaction_types and tmp[3] in interaction_types:
            if tmp[6] == 'True':
                results[tmp[0]][tmp[1]] += 1
        #print tmp
    for metric in results:
        print '\t' + metric
        for targetSet in results[metric]:
            if results[metric][targetSet] == 3:
                print '\t\t' + targetSet

Complete
	Min_AB
		DrugBank_Filtered
	Mean_AB
		PubChem_Filtered
	D_AB
		Chembl_Filtered
		Chembl
		PubChem_Filtered
	S_AB
		PubChem
Core
	Min_AB
		DrugBank
	Mean_AB
		Chembl_Filtered
		Chembl
	D_AB
	S_AB
CoreToPeriphery
	Min_AB
		All_Filtered
		All
		DrugBank
		PubChem
		Chembl_Filtered
		Chembl
		PubChem_Filtered
	Mean_AB
		All_Filtered
		All
		PubChem
		Chembl_Filtered
		Chembl
		PubChem_Filtered
	D_AB
		All_Filtered
		All
		PubChem
		Chembl_Filtered
		DrugBank_Filtered
		Chembl
		PubChem_Filtered
	S_AB
		Chembl_Filtered
		Chembl
Periphery
	Min_AB
	Mean_AB
		All_Filtered
		DrugBank
		PubChem
		DrugBank_Filtered
		Chembl
	D_AB
		All_Filtered
	S_AB
		DrugBank
		PubChem
		PubChem_Filtered


### Plot S_AB distribution

In [2]:
import seaborn as sns

In [11]:
targetLists = [f for f in os.listdir('../results/Check_Features/CheckBestTargetSet/') if os.path.isfile(os.path.join('../results/Check_Features/CheckBestTargetSet/', f)) and '.csv' in f]
distance_metric = {'D_AB':4, 'S_AB':5, 'Min_AB':6, 'Mean_AB':7}


metric = 'S_AB'
for targetList in targetLists:
    fp = open('../results/Check_Features/CheckBestTargetSet/' + targetList,'r')
    fp.next()
  
    all_values = []
    for line in fp:
        tmp = line.strip().split(',')
        value = tmp[distance_metric[metric]]


        if value != "None":
            all_values.append(float(value))
        
    print np.mean(all_values)

    plt.title(targetList.split('.')[0])
    #plt.yscale('log')
    #
    plt.fill([0, 0, max(all_values), max(all_values)], [0, 0.625, 0.625, 0], color='lightgrey', alpha=0.4)
    plt.hist(all_values,bins=12, density= True, color='#40B9D4',edgecolor="#40B9D4", linewidth=0.0, alpha=0.5)
   
    plt.xlabel('S_AB')
    plt.ylabel('Frequency')
    #plt.ylim([0.00000001,1])
    #plt.yscale('log', nonposy='clip')
    #plt.xscale('log')
    #plt.show()
    plt.yscale('log')
    plt.savefig('../results/Check_Features/CheckBestTargetSet/Results/S_AB_Distributions/'+targetList.split('.')[0]+'.pdf', format = 'pdf', dpi=800)
    plt.close()

0.6722009834273841
1.3609922810737909
0.6663973106768771
1.4210949885061646
0.515554244097155
0.6616415751265295
0.2801638381785182
1.4125882193782637
