# Analyse Perturbations over different Concentrations

## Libraries 

In [4]:
import pandas as pd
import numpy as np
from matplotlib import pylab as plt
from scipy import spatial
import os
from sklearn.metrics import cohen_kappa_score
import seaborn as sns
from scipy.stats import mannwhitneyu

## Functions and definitions

In [5]:
# Some Easy Outlier detection
def reject_outliers_2(data, m=6.):
    '''
    Filter out values that lie too far away from the median.

    Every value's absolute deviation from the median is divided by the median
    of those deviations (or by 1 if that median is zero); a value is kept when
    the resulting score is smaller than m.

    :param data: indexable collection of numeric values
    :param m: exclusive upper limit for the score of a kept value
    :return: list of the kept values in their original order
    '''
    deviations = np.abs(data - np.median(data))
    median_deviation = np.median(deviations)
    scale = median_deviation if median_deviation else 1.
    scores = deviations / scale
    kept_positions = [pos for pos in range(0, len(data)) if scores[pos] < m]
    return [data[pos] for pos in kept_positions]


def ensure_dir(file_path):
    '''
    Make sure the directory that is meant to contain file_path exists.

    Only the directory part of file_path (os.path.dirname) is created,
    including missing parent directories; the file itself is never touched.

    :param file_path: path of a file; its last component is treated as the file name
    :return: None
    :raises OSError: if the directory is missing and cannot be created
    '''
    directory = os.path.dirname(file_path)

    # A bare file name has no directory part and os.makedirs('') would raise;
    # an already existing directory needs no work either.
    if not directory or os.path.isdir(directory):
        return

    try:
        os.makedirs(directory)
    except OSError:
        # The directory may have been created by someone else between the
        # check above and makedirs; only a genuinely failed creation is an error.
        if not os.path.isdir(directory):
            raise
        
        
########################################
##Actual Math for calculating the DDIs##
########################################


def unit_vector(vector):
    """ Divides 'vector' by its norm (np.linalg.norm) and returns the result.  """
    length = np.linalg.norm(vector)
    return vector / length
def angle_between(v1, v2):
    """ Computes the angle (in radians) enclosed by the vectors 'v1' and 'v2'::

            angle_between((1, 0, 0), (0, 1, 0))
            1.5707963267948966
            angle_between((1, 0, 0), (1, 0, 0))
            0.0
            angle_between((1, 0, 0), (-1, 0, 0))
            3.141592653589793
    """
    cosine = np.dot(unit_vector(v1), unit_vector(v2))
    # Keep the cosine inside [-1, 1] before it is handed to arccos
    bounded_cosine = np.clip(cosine, -1.0, 1.0)
    return np.arccos(bounded_cosine)
def calculate_vector_math_v2(a, b, c):
    '''
    Calculate the amount of single a, single b, and the 'surprise factor'.

    The combination c is approximated by alpha * a + beta * b; gamma is the
    Euclidean norm of the difference between that approximation and c.

    :param a: vector a (single)
    :param b: vector b (single)
    :param c: vector c (combination)
    :return: alpha, beta and gamma (part of vector a, b and surprise), each as
             a string; ('Error', 'Error', 'Error') if the calculation fails
    '''
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    c = np.asarray(c, dtype=float)

    # A vector only counts as zero if every component is zero. Testing
    # sum(v) == 0 would also match non-zero vectors whose components cancel.
    a_is_zero = not np.any(a)
    b_is_zero = not np.any(b)
    c_is_zero = not np.any(c)

    if not a_is_zero and not b_is_zero:
        angle = angle_between(a, b)

        # Two special cases are handled without solving the 2x2 system:
        # 1. the angle between a and b is too narrow (<= 0.5 rad, both point
        #    in the same direction)
        # 2. the angle is EXACTLY 180 degrees (arccos(-1) == pi)
        # In case 1 the dot product of a and b is positive, so taking its
        # absolute value (needed for case 2) changes nothing there.
        if angle <= 0.5 or angle == np.pi:
            a_dot_a = np.dot(a, a)
            b_dot_b = np.dot(b, b)
            a_dot_b = abs(np.dot(a, b))

            alpha = np.dot(a, c) / (a_dot_a + a_dot_b)
            beta = np.dot(b, c) / (a_dot_b + b_dot_b)
            gamma = np.linalg.norm(alpha * a + beta * b - c)
            return str(alpha), str(beta), str(gamma)

    try:
        # Check if all or any of the 3 vectors is zero
        if a_is_zero and b_is_zero:
            if c_is_zero:
                return '1', '1', '0'
            # Nothing can be explained: the surprise is the full length of c
            return '1', '1', str(np.linalg.norm(c))
        if c_is_zero and a_is_zero:
            return '1', '0', '0'
        if c_is_zero and b_is_zero:
            return '0', '1', '0'

        # Normal equations of the least-squares fit c ~ alpha * a + beta * b
        gram = np.array([[np.dot(a, a), np.dot(a, b)],
                         [np.dot(a, b), np.dot(b, b)]])
        rhs = np.array([np.dot(a, c), np.dot(b, c)])

        if gram[0][0] == 0 and rhs[0] == 0:
            # a is zero, so the combination can only be explained by b (1dim)
            beta = rhs[1] / gram[1][1]
            gamma = np.linalg.norm(beta * b - c)
            # b is only trusted if it has more than two non-zero components
            # or a norm above 0.5
            if np.count_nonzero(b) > 2 or np.linalg.norm(b) > 0.5:
                return '1.0', str(beta), str(gamma)
            return '1', '1', str(np.linalg.norm(c))

        if gram[1][1] == 0 and rhs[1] == 0:
            # b is zero, so the combination can only be explained by a (1dim)
            alpha = rhs[0] / gram[0][0]
            gamma = np.linalg.norm(alpha * a - c)
            if np.count_nonzero(a) > 2 or np.linalg.norm(a) > 0.5:
                return str(alpha), '1', str(gamma)
            return '1', '1', str(np.linalg.norm(c))

        if rhs[0] == 0 and rhs[1] == 0:
            # c is orthogonal to both a and b: nothing is explained
            return '0.0', '0.0', str(np.linalg.norm(c))

        ##########################
        # The ideal normal case with 3 vectors in correct angle and not being zero
        ##########################
        p = np.linalg.solve(gram, rhs)
        # part of c that is orthogonal to the plane spanned by a and b
        n = p[0] * a + p[1] * b - c
        distance = np.linalg.norm(n)
        return str(p[0]), str(p[1]), str(distance)
    except Exception:
        # Callers expect the string marker rather than an exception
        # (e.g. for a singular system in np.linalg.solve)
        return 'Error', 'Error', 'Error'


## Main functions

### Extract further metadata

In [6]:
# Per-well results table; the Plate, Well and CellCount columns are used further down
per_well_CellCount = pd.read_csv('../results/IsabelCLOUPAC_Per_Image/CellCount/Individual_Well_Results.csv')
# Plate is left as parsed by read_csv (a bare `.astype(str)` call without
# assignment has no effect); it is converted with pd.to_numeric before the merge.
per_well_CellCount.head()

Unnamed: 0,ID_A,Plate,Well,CellCount,TransferOK
0,A2169,1829001,A01,698,True
1,A6011,1829002,A01,566,True
2,A9256,1829003,A01,521,True
3,A8981,1829004,A01,498,True
4,C9911,1829005,A01,211,True


In [7]:
# Read the usable drug IDs, one per line; the context manager closes the file
# even if reading fails.
with open('../results/IsabelCLOUPAC_Per_Image/List_of_CLOUD_Drugs/List_of_usable_IDs.csv', 'r') as fp:
    list_of_clouds = [line.strip() for line in fp]
print(len(list_of_clouds))

267


In [8]:
# Load the perturbation table: a 'Perturbation' label column plus the feature columns
perturbations = pd.read_csv('../results/IsabelCLOUPAC_Per_Image/PerturbationVectors/Vectors.csv')

In [9]:
# Feature names = every column except the first one (presumably the
# 'Perturbation' label - verify against the CSV). This has to run before the
# Drug/Concentrations/Well/Plate columns are appended, otherwise they would
# be picked up as features as well.
features = perturbations.columns[1:].values

In [10]:
# The 'Perturbation' label is read as '<drug>|<concentration>_<well>_<plate>':
# split it once at '|' and the right-hand part once at '_'.
label_parts = perturbations['Perturbation'].str.split('|')
well_info = label_parts.str[1].str.split('_')

perturbations['Drug'] = label_parts.str[0]
perturbations['Concentrations'] = well_info.str[0]
perturbations['Well'] = well_info.str[1]
perturbations['Plate'] = well_info.str[2]

In [11]:
# Show the first rows, including the metadata columns split off the label
perturbations.head()

Unnamed: 0,Perturbation,Mean_Cytoskeleton_AreaShape_Eccentricity,Mean_Cytoskeleton_AreaShape_MaximumRadius,Mean_Cytoskeleton_AreaShape_Zernike_3_1,Mean_Cytoskeleton_AreaShape_Zernike_4_2,Mean_Cytoskeleton_Granularity_4_BetaTubulin,Mean_Cytoskeleton_Granularity_5_BetaTubulin,Mean_Cytoskeleton_Granularity_6_BetaTubulin,Mean_Cytoskeleton_Granularity_8_BetaTubulin,Mean_Cytoskeleton_Intensity_IntegratedIntensity_Mitotracker,...,Mean_Nuclei_AreaShape_Zernike_8_8,Mean_Nuclei_Granularity_2_DAPI,Mean_Nuclei_Granularity_3_DAPI,Mean_Nuclei_Intensity_MassDisplacement_DAPI,Mean_Nuclei_Intensity_MinIntensity_DAPI,Mean_Nuclei_Intensity_StdIntensityEdge_DAPI,Drug,Concentrations,Well,Plate
0,120693|0.02_G13_1829017,0.063411,-0.049249,0.044255,0.015243,0.097591,0.145258,0.037609,0.074874,-0.001857,...,0.195424,0.233047,0.111228,0.010828,-0.044514,0.072864,120693,0.02,G13,1829017
1,120693|0.2_B06_1829017,0.050786,-0.079571,-0.016638,-0.019028,0.104362,0.127354,0.143605,0.320105,-0.02239,...,0.038636,0.056243,0.081023,-0.014735,-0.045253,0.135346,120693,0.2,B06,1829017
2,120693|0.2_L04_1829001,-0.022872,-0.018703,-0.02307,-0.006471,-0.009668,-0.005219,-0.007479,-0.011005,-0.099382,...,-0.013884,-0.072578,-0.054055,0.022848,0.058465,0.017875,120693,0.2,L04,1829001
3,120693|2.0_D23_1829001,-0.065445,-0.015356,0.087342,-0.014087,-0.040125,-0.081301,-0.07329,-0.077236,0.164138,...,0.014734,0.213475,0.023246,-0.050042,-0.137266,-0.118359,120693,2.0,D23,1829001
4,120693|2.0_M03_1829017,-0.069925,0.052093,0.193876,-0.049382,-0.038983,-0.04367,-0.157009,-0.115575,-0.000306,...,-0.098884,-0.00098,-0.080114,-0.006963,0.00594,-0.020755,120693,2.0,M03,1829017


In [12]:
# Both tables need a numeric Plate column so that the join keys compare equal
for frame in (perturbations, per_well_CellCount):
    frame["Plate"] = pd.to_numeric(frame["Plate"])

# Attach the per-well columns to every perturbation (left join on well and plate)
perturbations = perturbations.merge(per_well_CellCount, how='left', on=['Well', 'Plate'])
perturbations.head(10)

Unnamed: 0,Perturbation,Mean_Cytoskeleton_AreaShape_Eccentricity,Mean_Cytoskeleton_AreaShape_MaximumRadius,Mean_Cytoskeleton_AreaShape_Zernike_3_1,Mean_Cytoskeleton_AreaShape_Zernike_4_2,Mean_Cytoskeleton_Granularity_4_BetaTubulin,Mean_Cytoskeleton_Granularity_5_BetaTubulin,Mean_Cytoskeleton_Granularity_6_BetaTubulin,Mean_Cytoskeleton_Granularity_8_BetaTubulin,Mean_Cytoskeleton_Intensity_IntegratedIntensity_Mitotracker,...,Mean_Nuclei_Intensity_MassDisplacement_DAPI,Mean_Nuclei_Intensity_MinIntensity_DAPI,Mean_Nuclei_Intensity_StdIntensityEdge_DAPI,Drug,Concentrations,Well,Plate,ID_A,CellCount,TransferOK
0,120693|0.02_G13_1829017,0.063411,-0.049249,0.044255,0.015243,0.097591,0.145258,0.037609,0.074874,-0.001857,...,0.010828,-0.044514,0.072864,120693,0.02,G13,1829017,120693,375,True
1,120693|0.2_B06_1829017,0.050786,-0.079571,-0.016638,-0.019028,0.104362,0.127354,0.143605,0.320105,-0.02239,...,-0.014735,-0.045253,0.135346,120693,0.2,B06,1829017,120693,341,True
2,120693|0.2_L04_1829001,-0.022872,-0.018703,-0.02307,-0.006471,-0.009668,-0.005219,-0.007479,-0.011005,-0.099382,...,0.022848,0.058465,0.017875,120693,0.2,L04,1829001,120693,768,True
3,120693|2.0_D23_1829001,-0.065445,-0.015356,0.087342,-0.014087,-0.040125,-0.081301,-0.07329,-0.077236,0.164138,...,-0.050042,-0.137266,-0.118359,120693,2.0,D23,1829001,120693,515,True
4,120693|2.0_M03_1829017,-0.069925,0.052093,0.193876,-0.049382,-0.038983,-0.04367,-0.157009,-0.115575,-0.000306,...,-0.006963,0.00594,-0.020755,120693,2.0,M03,1829017,120693,380,True
5,120693|20.0_C07_1829001,0.02185,-0.05937,-0.033831,-0.011805,0.000626,-0.000523,-0.002613,0.01273,-0.008282,...,-0.007008,0.01003,-0.008575,120693,20.0,C07,1829001,120693,568,True
6,120693|20.0_E13_1829017,0.033735,-0.018476,-0.03912,0.030701,0.035227,0.022832,0.053039,0.135522,0.097436,...,0.010814,-0.010193,-0.031036,120693,20.0,E13,1829017,120693,419,True
7,144509|0.02_D06_1829018,0.035463,0.051001,-0.035585,0.056229,0.018553,0.009505,-0.001158,0.016618,0.001813,...,0.012725,-0.018673,0.013646,144509,0.02,D06,1829018,144509,790,True
8,144509|0.02_E10_1829001,-0.053438,-0.097794,-0.085296,-0.006524,0.002123,-0.004304,-0.016878,-0.062485,-0.115,...,-0.013924,-0.105886,-0.05194,144509,0.02,E10,1829001,144509,778,True
9,144509|0.2_A20_1829018,-0.020105,0.077641,0.002737,0.078359,-0.055669,-0.028792,-0.020073,-0.072952,0.018186,...,-0.008797,0.024222,-0.100698,144509,0.2,A20,1829018,144509,845,True


In [13]:
# INCLUDE ONLY CLOUDS
# Keep only rows whose drug ID is in the list of usable IDs
###
perturbations = perturbations.loc[perturbations['Drug'].isin(list_of_clouds)]
# print() with a single argument behaves the same on Python 2 and Python 3
print(len(perturbations))

2057


In [14]:
# INCLUDE Wells with more than 25 cells
# (rows without a CellCount value compare as False and are dropped as well)
###
# .copy() makes the filtered table an independent frame, so that adding
# columns to it later does not operate on a slice of the unfiltered data.
perturbations = perturbations.loc[perturbations['CellCount'] > 25].copy()
print(len(perturbations))

2030


### Calculate vector norms

In [15]:
#CLOUD139 had a transfer problem (hence excluded from the very beginning)

# Report every usable drug ID that has no row left after filtering
found_CLOUDS = perturbations['Drug'].unique()
for cloud in list_of_clouds:
    #if len(perturbations.loc[perturbations['Drug'] ==cloud]) < 8:
    #    print 'CLOUD with not all replicates: %s' %cloud
    if cloud not in found_CLOUDS:
        print('Not found: %s' % cloud)

Not found: CLOUD139


In [16]:
# Show the first ten rows that survived the filters
perturbations.head(10)

Unnamed: 0,Perturbation,Mean_Cytoskeleton_AreaShape_Eccentricity,Mean_Cytoskeleton_AreaShape_MaximumRadius,Mean_Cytoskeleton_AreaShape_Zernike_3_1,Mean_Cytoskeleton_AreaShape_Zernike_4_2,Mean_Cytoskeleton_Granularity_4_BetaTubulin,Mean_Cytoskeleton_Granularity_5_BetaTubulin,Mean_Cytoskeleton_Granularity_6_BetaTubulin,Mean_Cytoskeleton_Granularity_8_BetaTubulin,Mean_Cytoskeleton_Intensity_IntegratedIntensity_Mitotracker,...,Mean_Nuclei_Intensity_MassDisplacement_DAPI,Mean_Nuclei_Intensity_MinIntensity_DAPI,Mean_Nuclei_Intensity_StdIntensityEdge_DAPI,Drug,Concentrations,Well,Plate,ID_A,CellCount,TransferOK
220,A129|0.02_K18_1829019,0.089451,0.016575,0.041551,0.015026,0.025984,0.013652,0.011043,-0.017022,0.014251,...,0.000379,-0.0349,-0.042187,A129,0.02,K18,1829019,A129,711,True
221,A129|0.02_M02_1829002,-0.050783,0.153671,-0.02009,-0.001405,-0.015921,-0.01466,-0.057848,-0.077645,0.163583,...,-0.071771,-0.005105,-0.018747,A129,0.02,M02,1829002,A129,691,True
222,A129|0.2_A06_1829002,-0.053192,0.026584,-0.043148,-0.102708,-0.005831,-0.02937,-0.036359,-0.061846,0.11911,...,0.002921,-0.165102,-0.077421,A129,0.2,A06,1829002,A129,521,True
223,A129|0.2_F09_1829019,0.006099,0.00329,-0.003624,0.033667,-0.00268,-0.019738,-0.002385,0.050129,0.025878,...,-0.052117,0.013234,0.046052,A129,0.2,F09,1829019,A129,766,True
224,A129|2.0_G21_1829019,-0.001191,-0.031533,-0.021605,0.02325,-0.005175,0.000218,-0.004566,-0.062285,-0.022585,...,0.026292,-0.02338,0.01797,A129,2.0,G21,1829019,A129,903,True
225,A129|2.0_P13_1829002,-0.020484,-0.018663,0.000177,0.080541,-0.029704,-0.026959,-0.057215,-0.078781,-0.041621,...,-0.001303,0.011422,0.026368,A129,2.0,P13,1829002,A129,729,True
226,A129|20.0_E15_1829019,0.031995,-0.059124,0.002897,0.04709,0.021394,0.015197,0.017175,0.143626,-0.00746,...,-0.177759,0.177294,0.070658,A129,20.0,E15,1829019,A129,828,True
227,A129|20.0_G15_1829002,-0.065363,-0.005066,0.021843,-0.030966,-0.033195,-0.02642,-0.049347,0.000616,-0.065476,...,-0.102926,0.236401,-0.005218,A129,20.0,G15,1829002,A129,766,True
308,A178|0.02_D06_1829001,0.062219,0.033968,0.002124,0.022384,0.025393,0.022614,0.045668,0.069393,-0.028211,...,-0.051214,0.171684,0.092619,A178,0.02,D06,1829001,A178,628,True
309,A178|0.02_M08_1829018,0.030187,-0.002124,-0.056485,0.031341,-0.014888,-0.007783,0.005808,-0.044889,0.019966,...,0.031109,-0.019696,-0.013962,A178,0.02,M08,1829018,A178,792,True


In [17]:
# Calculate the vector norm of every row (np.linalg.norm along axis 1 of the
# feature columns) and store it in the new 'VectorNorm' column
##

perturbations['VectorNorm'] = np.linalg.norm(perturbations[features].values,axis=1)

### Find significant perturbations
- Use the mean over all vector norms + 0.5 standard deviations as a mild threshold for including only significant drug perturbations

In [129]:
# Calculate threshold: mean vector norm plus half a standard deviation
##
threshold_significant = perturbations['VectorNorm'].mean() + 0.5 * perturbations['VectorNorm'].std()
#threshold_significant = 0.8

# Create a histogram of all vector norms with the threshold marked as a dashed line
###
significance_plot = '../results/IsabelCLOUPAC_Per_Image/Analyse_Perturbations_over_Concentrations/Significant_Perturbations.pdf'
# savefig does not create missing directories
ensure_dir(significance_plot)

plt.hist(perturbations['VectorNorm'].values, bins='auto', color='#3AB9D1')
plt.axvline(threshold_significant, ls='--', color='red')
plt.xlabel('Vector Norm')
plt.ylabel('Frequency')
plt.savefig(significance_plot)
plt.close()

# Show Output
##
print('Number significant perturbations: %d' % len(perturbations.loc[perturbations['VectorNorm'] > threshold_significant]))
print('Number non significant perturbations: %d' % len(perturbations.loc[perturbations['VectorNorm'] <= threshold_significant]))

# The threshold is printed once under its own label; the count of significant
# perturbations is already reported above.
print('Significance Threshold: %.2f' % threshold_significant)

Number significant perturbations: 301
Number non significant perturbations: 1729
Significance Threshold: 0.79
Number significant Perturbations: 0.79


In [130]:
# Number of valid drugs (distinct drug IDs left after filtering)
##
all_drugs = perturbations['Drug'].unique()
print('Number of drugs: %d' % len(all_drugs))

Number of drugs: 266


## Check if result for same drug concentrations is more similar than what expected from random
The idea is that if only the concentration of the same drug is increased, the effect should change linearly. This means that the two vectors should still point in the same direction (cosine similarity / alpha/beta) but with an increased effect size (delta vector norm).

- Check the Cosine similarity (measure of similarity/linearity)
- Delta vector norm  (measure of increase of effect size)
- alpha/beta contribution (another measure of linearity given by the novel mathematical framework)

### Calculate the real results

In [162]:
## Real draws
####


# save results in vectors (consecutive concentrations)
all_AlphaBetas = []  # alpha/beta
all_Gammas = []  # gamma
all_cosine_distances = []  # cosine similarity (1 - cosine distance, see below)
all_delta_VectorNorms = []  # delta vector norm

### additional test variable
all_gamma_explained = []

# save results in vectors (two replicates of the same drug and concentration)
all_AlphaBetas_replicates = []  # alpha/beta
all_Gammas_replicates = []  # gamma
all_cosine_distances_replicates = []  # cosine similarity (1 - cosine distance, see below)
all_delta_VectorNorms_replicates = []  # delta vector norm
all_Gammas_explained_replicates = []

#save the amount of significant concentrations for each drug
num_concentrations = []


#go through all drugs
for d in all_drugs:

    # all rows of this drug, selected once
    drug_rows = perturbations.loc[perturbations['Drug'] == d]

    #Find the concentrations screened for this drug (sometimes the concentration are not exactly 0.02 but 0.01999)
    #The values are strings, so they are sorted by their numeric value; a plain
    #string sort would e.g. put '19.99' in front of '2.0'.
    used_concentrations = sorted(drug_rows['Concentrations'].unique(), key=float)

    ##find the valid vectors i.e. vectors with norm bigger than previously introduced threshold
    valid_vectors = {}
    valid_vector_norms = {}
    for conc in used_concentrations:

        conc_rows = drug_rows.loc[drug_rows['Concentrations'] == conc]
        mean_norm = conc_rows['VectorNorm'].mean()

        if not mean_norm > threshold_significant:
            continue

        valid_vectors[conc] = np.array(conc_rows[features].mean().values)
        valid_vector_norms[conc] = mean_norm

        # Make calculations for same replicates (only if there are exactly two)
        if len(conc_rows) != 2:
            continue

        vector1, vector2 = conc_rows[features].values
        vectorNorm1, vectorNorm2 = conc_rows['VectorNorm'].values

        if vectorNorm1 > threshold_significant and vectorNorm2 > threshold_significant:

            #calculate cosine similarity results
            cosine_similarity = 1 - spatial.distance.cosine(vector1, vector2)

            #calculate vector norm result
            delta_VectorNorm = vectorNorm1 - vectorNorm2

            #add results to result lists
            all_cosine_distances_replicates.append(cosine_similarity)
            all_delta_VectorNorms_replicates.append(delta_VectorNorm)

            #perform vector math, use half vectors to not extend to feature values that might be not possible (e.g. Eccentricity 0.6 + 0.6 > 1)
            a, b, g = calculate_vector_math_v2(vector1/2, vector1/2, vector2)

            #add results to result lists
            #(float() raises ValueError if the 'Error' marker was returned)
            all_AlphaBetas_replicates.append(float(a))
            all_Gammas_replicates.append(float(g))
            all_Gammas_explained_replicates.append(float(g)/vectorNorm2)

    #add the number of valid vectors to num_concentration result list
    num_concentrations.append(len(valid_vectors))

    # go through the concentrations and always pick the consecutive ones
    for conc1, conc2 in zip(used_concentrations[:-1], used_concentrations[1:]):

        #only calculate results if both concentrations are within the valid vectors for this drug
        if conc1 in valid_vectors and conc2 in valid_vectors:

            #calculate cosine similarity results
            cosine_similarity = 1 - spatial.distance.cosine(valid_vectors[conc1], valid_vectors[conc2])

            #calculate vector norm result
            delta_VectorNorm = valid_vector_norms[conc2] - valid_vector_norms[conc1]

            #add results to result lists
            all_cosine_distances.append(cosine_similarity)
            all_delta_VectorNorms.append(delta_VectorNorm)

            #perform vector math, use half vectors to not extend to feature values that might be not possible (e.g. Eccentricity 0.6 + 0.6 > 1)
            a, b, g = calculate_vector_math_v2(valid_vectors[conc1]/2, valid_vectors[conc1]/2, valid_vectors[conc2])

            #add results to result lists
            all_AlphaBetas.append(float(a))
            all_Gammas.append(float(g))
            all_gamma_explained.append(float(g)/valid_vector_norms[conc2])
#Show output
print('Finished real calculation')
print('Number of real entries: %d' % len(all_cosine_distances))

Finished real calculation
Number of real entries: 23


In [132]:
# Show output for significant concentrations per drug
##
# number of drugs with 0, 1, 2, 3 and 4 significant concentrations
concentration_counts = [num_concentrations.count(k) for k in range(5)]
for k, count in enumerate(concentration_counts):
    print('%d concentrations significant: %d' % (k, count))
print('-----')
print('More than 1 significant: %d' % len([x for x in num_concentrations if x > 1]))

# Make bar plot
##
per_drug_plot = '../results/IsabelCLOUPAC_Per_Image/Analyse_Perturbations_over_Concentrations/Significant_Perturbations_perDrug.pdf'
# savefig does not create missing directories
ensure_dir(per_drug_plot)

plt.bar([0, 1, 2, 3, 4], concentration_counts, color='#3AB9D1')
plt.xlabel('Significant Concentrations')
plt.ylabel('Frequency')
plt.savefig(per_drug_plot)
plt.close()

0 concentrations significant: 161
1 concentrations significant: 81
2 concentrations significant: 18
3 concentrations significant: 6
4 concentrations significant: 0
-----
More than 1 significant: 24


### Calculate random results
1. Go through all drugs
2. Find significant concentration for the first drug
3. Find all significant consecutive concentrations of all but the first drug
4. Do this for all drugs

In [158]:
# list of concentrations (each concentration mapped to the next higher one)
next_concentrations = {'0.02': '0.2', '0.2': '2.0', '2.0': '20.0'}

#Random result lists
all_AlphaBetas_random = []
all_Gammas_random = []
all_cosine_distances_random = []
all_delta_VectorNorms_random = []


all_Gammas_random_explained = []


# Mean vector norm per drug/concentration. Only the numeric VectorNorm column
# is averaged: the table also holds string columns (e.g. 'Perturbation',
# 'Well'), which cannot be averaged.
drug_conc = perturbations.groupby(['Drug', 'Concentrations'], as_index=False)['VectorNorm'].mean()
drug_conc = drug_conc.loc[drug_conc['VectorNorm'] > threshold_significant][['Drug', 'Concentrations']]

# Mean norm and mean feature vector of every significant drug/concentration,
# computed once instead of once per compared pair
mean_profiles = {}
for drug, conc in zip(drug_conc['Drug'], drug_conc['Concentrations']):
    rows = perturbations.loc[(perturbations['Drug'] == drug) & (perturbations['Concentrations'] == conc)]
    mean_profiles[(drug, conc)] = (rows['VectorNorm'].mean(), rows[features].mean().values)

#go through all significant drug/concentration combinations
for drug1, conc1_orig in zip(drug_conc['Drug'], drug_conc['Concentrations']):

    Vector_Length1, vector1 = mean_profiles[(drug1, conc1_orig)]
    if not Vector_Length1 > threshold_significant:
        continue

    #sometimes concentrations are not exactly on the four points (adjust to range)
    if float(conc1_orig) > 7:
        conc1 = '20.0'
    elif float(conc1_orig) > 0.7:
        conc1 = '2.0'
    elif float(conc1_orig) > 0.07:
        conc1 = '0.2'
    else:
        conc1 = '0.02'

    # if already max concentration go to next row
    if conc1 == '20.0':
        continue

    #find next concentration (consecutive)
    next_concentration = next_concentrations[conc1]

    #find all other valid drug/concentrations (must be not the same drug, and the EXACT following concentration)
    other_valid_drugs = drug_conc.loc[(drug_conc['Concentrations'] == next_concentration) & (drug_conc['Drug'] != drug1)]

    #Go through all the rows of other significant drugs
    for drug2, conc2 in zip(other_valid_drugs['Drug'], other_valid_drugs['Concentrations']):

        Vector_Length2, vector2 = mean_profiles[(drug2, conc2)]
        if not Vector_Length2 > threshold_significant:
            continue

        #calculate cosine similarity and delta vector norm
        cosine_similarity = 1 - spatial.distance.cosine(vector1, vector2)
        delta_VectorNorm = Vector_Length2 - Vector_Length1

        #add results to result lists
        all_cosine_distances_random.append(cosine_similarity)
        all_delta_VectorNorms_random.append(delta_VectorNorm)

        #perform vector math (half vectors, same as for the real draws)
        a, b, g = calculate_vector_math_v2(vector1/2, vector1/2, vector2)

        #add results to result lists
        #(float() raises ValueError if the 'Error' marker was returned)
        all_AlphaBetas_random.append(float(a))
        all_Gammas_random.append(float(g))
        all_Gammas_random_explained.append(float(g)/Vector_Length2)

#Show output
print('Finished Randomization')
print('Number of random entries: %d' % len(all_cosine_distances_random))


Finished Randomization
Number of random entries: 1503


## Plot Results

### Cosine similarity and Delta vector norm

In [134]:
# Compare replicate, consecutive-concentration ("real") and random pairs.
# NOTE(review): the default `alternative` of mannwhitneyu is not the same in
# all SciPy releases - confirm which test the reported p-values belong to.

# The stored values are 1 - cosine distance, i.e. cosine similarities
print('Cosine Similarity')
print('Mean replicate results: %.2f' % np.mean(all_cosine_distances_replicates))
print('Mean real results: %.2f' % np.mean(all_cosine_distances))
print('Mean random results: %.2f' % np.mean(all_cosine_distances_random))
print(mannwhitneyu(all_cosine_distances, all_cosine_distances_random))  #Concentration/Random
print(mannwhitneyu(all_cosine_distances, all_cosine_distances_replicates))  #Concentration/Same
print(mannwhitneyu(all_cosine_distances_replicates, all_cosine_distances_random))  #Same/Random
print('--\n')
# The stored values are differences between two vector norms
print('Delta Vector Norm')
print('Mean replicate results: %.2f' % np.mean(all_delta_VectorNorms_replicates))
print('Mean real results: %.2f' % np.mean(all_delta_VectorNorms))
print('Mean random results: %.2f' % np.mean(all_delta_VectorNorms_random))
print(mannwhitneyu(all_delta_VectorNorms, all_delta_VectorNorms_random))  #Concentration/Random
print(mannwhitneyu(all_delta_VectorNorms, all_delta_VectorNorms_replicates))  #Concentration/Same
print(mannwhitneyu(all_delta_VectorNorms_replicates, all_delta_VectorNorms_random))  #replicates/Random

Cosine Distance
Mean replicate results: 0.68
Mean real results: 0.54
Mean random results: 0.16
MannwhitneyuResult(statistic=7036.0, pvalue=5.141338295336648e-07)
MannwhitneyuResult(statistic=547.0, pvalue=0.08815964643318747)
MannwhitneyuResult(statistic=6500.0, pvalue=4.3065848831173283e-29)
--

Vector Distance
Mean replicate results: 0.06
Mean real results: 0.61
Mean random results: 0.04
MannwhitneyuResult(statistic=12152.0, pvalue=0.007204751952009813)
MannwhitneyuResult(statistic=454.0, pvalue=0.010385638164073925)
MannwhitneyuResult(statistic=43694.0, pvalue=0.42485350532438704)


In [136]:
# Make sure the output folder exists. ensure_dir() only looks at the directory
# part of the path, so the file name used here is irrelevant.
ensure_dir('../results/IsabelCLOUPAC_Per_Image/Analyse_Perturbations_over_Concentrations/Barplot_CosineSimilarity.pdf')

# Scatter plot: cosine similarity against delta vector norm (REAL pairs)
##
scatter_real_pdf = '../results/IsabelCLOUPAC_Per_Image/Analyse_Perturbations_over_Concentrations/Scatter_RealPoints.pdf'
zero_line_style = dict(color='grey', ls='--')

plt.scatter(all_delta_VectorNorms, all_cosine_distances, alpha=0.4, color='#3AB9D1')
# dashed reference lines through the origin
plt.axvline(0, **zero_line_style)
plt.axhline(0, **zero_line_style)
plt.xlabel('Delta Vector Norm')
plt.ylabel('Cosine Similarity')
plt.savefig(scatter_real_pdf)
plt.close()

In [137]:
# Scatter plot: cosine similarity against delta vector norm (RANDOM pairs).
# A very low alpha is used so that the many random points show up as density.
##
scatter_random_pdf = '../results/IsabelCLOUPAC_Per_Image/Analyse_Perturbations_over_Concentrations/Scatter_RandomPoints.pdf'
zero_line_style = dict(color='grey', ls='--')

plt.scatter(all_delta_VectorNorms_random, all_cosine_distances_random, alpha=0.05, color='#3AB9D1')
# dashed reference lines through the origin
plt.axvline(0, **zero_line_style)
plt.axhline(0, **zero_line_style)
plt.xlabel('Delta Vector Norm')
plt.ylabel('Cosine Similarity')
plt.savefig(scatter_random_pdf)
plt.close()

In [138]:
# Overlay plot:
#   background -> 2D KDE (density) of the RANDOM pairs, drawn in grey
#   foreground -> scatter of the REAL pairs (cosine similarity against delta vector norm)
##
density_pdf = '../results/IsabelCLOUPAC_Per_Image/Analyse_Perturbations_over_Concentrations/Density_RandomPoints.pdf'
kde_options = dict(shade=True, shade_lowest=False, gridsize=200, bw=0.25, color='grey')
zero_line_style = dict(color='grey', ls='--')

sns.kdeplot(all_delta_VectorNorms_random, all_cosine_distances_random, **kde_options)
plt.scatter(all_delta_VectorNorms, all_cosine_distances, alpha=0.8, color='#3AB9D1')
# dashed reference lines through the origin
plt.axvline(0, **zero_line_style)
plt.axhline(0, **zero_line_style)
plt.xlabel('Delta Vector Norm')
plt.ylabel('Cosine Similarity')
plt.ylim([-1, 1])
# The x-range is hard-coded; the original author flagged it to be checked
# against the data before use.
plt.xlim([-2, 3])
plt.savefig(density_pdf)
plt.close()

In [139]:
# Bar plot: mean cosine similarity per group.
# Error bars are 1.96 * std / sqrt(n), i.e. a normal-approximation 95% CI
# half-width of the mean (np.std with its default settings).
##
cosine_groups = [all_cosine_distances_replicates, all_cosine_distances, all_cosine_distances_random]
cosine_means = [np.mean(group_values) for group_values in cosine_groups]
cosine_ci95 = [1.96 * np.std(group_values) / np.sqrt(len(group_values)) for group_values in cosine_groups]
bar_positions = [0, 1, 2]

plt.bar(bar_positions, cosine_means, yerr=cosine_ci95, color='#3AB9D1')
plt.xlabel('Group')
plt.xticks(bar_positions, ['Replicates', 'Real', 'Random'])
plt.ylabel('Cosine Similarity [95% CI]')
plt.savefig('../results/IsabelCLOUPAC_Per_Image/Analyse_Perturbations_over_Concentrations/Barplot_CosineSimilarity.pdf')
plt.close()

In [140]:
# Bar plot: mean delta vector norm per group.
# Error bars are 1.96 * std / sqrt(n), i.e. a normal-approximation 95% CI
# half-width of the mean (np.std with its default settings).
##
delta_norm_groups = [all_delta_VectorNorms_replicates, all_delta_VectorNorms, all_delta_VectorNorms_random]
delta_norm_means = [np.mean(group_values) for group_values in delta_norm_groups]
delta_norm_ci95 = [1.96 * np.std(group_values) / np.sqrt(len(group_values)) for group_values in delta_norm_groups]
bar_positions = [0, 1, 2]

plt.bar(bar_positions, delta_norm_means, yerr=delta_norm_ci95, color='#3AB9D1')
plt.xlabel('Group')
plt.xticks(bar_positions, ['Replicates', 'Real', 'Random'])
plt.ylabel('Delta Vector Norm [95% CI]')
plt.savefig('../results/IsabelCLOUPAC_Per_Image/Analyse_Perturbations_over_Concentrations/Barplot_DeltaVectorNorm.pdf')
plt.close()

### Alpha/Beta vector math results

In [141]:
# Alpha/beta and gamma contributions: pairwise Mann-Whitney U tests between the
# three groups, followed by the group means.
print('Alpha Beta:')
for first_group, second_group in (
        (all_AlphaBetas_replicates, all_AlphaBetas_random),  # Replicate/Random
        (all_AlphaBetas, all_AlphaBetas_replicates),         # Concentration/Replicate
        (all_AlphaBetas, all_AlphaBetas_random)):            # Concentration/Random
    print(mannwhitneyu(first_group, second_group))
for mean_format, group_values in (
        ('Mean replicate results: %.2f', all_AlphaBetas_replicates),
        ('Mean real results: %.2f', all_AlphaBetas),
        ('Mean random results: %.2f', all_AlphaBetas_random)):
    print(mean_format % np.mean(group_values))
print('--\n')

print('Gamma:')
for first_group, second_group in (
        (all_Gammas_random, all_Gammas_replicates),  # Random/Replicate
        (all_Gammas, all_Gammas_replicates),         # Concentration/Replicate
        (all_Gammas, all_Gammas_random)):            # Concentration/Random
    print(mannwhitneyu(first_group, second_group))
for mean_format, group_values in (
        ('Mean replicate results: %.2f', all_Gammas_replicates),
        ('Mean real results: %.2f', all_Gammas),
        ('Mean random results: %.2f', all_Gammas_random)):
    print(mean_format % np.mean(group_values))

Alpha Beta:
MannwhitneyuResult(statistic=13517.0, pvalue=6.013820294982251e-20)
MannwhitneyuResult(statistic=605.0, pvalue=0.22557410090260943)
MannwhitneyuResult(statistic=6040.0, pvalue=4.137355052515443e-08)
Mean replicate results: 0.67
Mean real results: 0.89
Mean random results: 0.21
--

Gamma:
MannwhitneyuResult(statistic=41888.0, pvalue=0.23548720834232006)
MannwhitneyuResult(statistic=567.0, pvalue=0.12595216206800786)
MannwhitneyuResult(statistic=14375.0, pvalue=0.08272333729523074)
Mean replicate results: 1.04
Mean real results: 1.33
Mean random results: 1.12


In [142]:
# Overlay plot:
#   background -> 2D KDE (density) of the RANDOM pairs, drawn in grey
#   foreground -> scatter of the REAL pairs (alpha/beta against gamma)
##
density_alphabeta_pdf = '../results/IsabelCLOUPAC_Per_Image/Analyse_Perturbations_over_Concentrations/Density_AlphaBeta.pdf'
kde_options = dict(shade=True, shade_lowest=False, gridsize=200, bw=0.25, color='grey')
zero_line_style = dict(color='grey', ls='--')

sns.kdeplot(all_AlphaBetas_random, all_Gammas_random, **kde_options)
plt.scatter(all_AlphaBetas, all_Gammas, alpha=0.8, color='#3AB9D1')
# dashed reference lines through the origin
plt.axvline(0, **zero_line_style)
plt.axhline(0, **zero_line_style)
plt.xlabel('Alpha/Beta')
plt.ylabel('Gamma')
# Axis limits are intentionally left automatic here (fixed limits disabled):
#plt.ylim([-1,1])
#plt.xlim([-2,3]) # CHECK THIS before
plt.savefig(density_alphabeta_pdf)
plt.close()

In [143]:
# Bar plot: mean alpha/beta contribution per group.
# Error bars are 1.96 * std / sqrt(n), i.e. a normal-approximation 95% CI
# half-width of the mean (np.std with its default settings).
##
alphabeta_groups = [all_AlphaBetas_replicates, all_AlphaBetas, all_AlphaBetas_random]
alphabeta_means = [np.mean(group_values) for group_values in alphabeta_groups]
alphabeta_ci95 = [1.96 * np.std(group_values) / np.sqrt(len(group_values)) for group_values in alphabeta_groups]
bar_positions = [0, 1, 2]

plt.bar(bar_positions, alphabeta_means, yerr=alphabeta_ci95, color='#3AB9D1')

plt.xlabel('Group')
plt.xticks(bar_positions, ['Replicates', 'Real', 'Random'])
plt.ylabel('Alpha/Beta [95% CI]')
plt.savefig('../results/IsabelCLOUPAC_Per_Image/Analyse_Perturbations_over_Concentrations/Barplot_AlphaBeta.pdf')
plt.close()

In [144]:
# Bar plot: mean gamma contribution per group.
# Error bars are 1.96 * std / sqrt(n), i.e. a normal-approximation 95% CI
# half-width of the mean (np.std with its default settings).
##
gamma_groups = [all_Gammas_replicates, all_Gammas, all_Gammas_random]
gamma_means = [np.mean(group_values) for group_values in gamma_groups]
gamma_ci95 = [1.96 * np.std(group_values) / np.sqrt(len(group_values)) for group_values in gamma_groups]
bar_positions = [0, 1, 2]

plt.bar(bar_positions, gamma_means, yerr=gamma_ci95, color='#3AB9D1')

plt.xlabel('Group')
plt.xticks(bar_positions, ['Replicates', 'Real', 'Random'])
plt.ylabel('Gamma [95% CI]')
plt.savefig('../results/IsabelCLOUPAC_Per_Image/Analyse_Perturbations_over_Concentrations/Barplot_Gamma.pdf')
plt.close()

In [145]:
# Scatter plot: alpha/beta against gamma (REAL pairs)
##
scatter_alphabeta_pdf = '../results/IsabelCLOUPAC_Per_Image/Analyse_Perturbations_over_Concentrations/Scatter_AlphaBeta_Gamma.pdf'
reference_line_style = dict(color='grey', ls='--')
# the horizontal reference is the mean gamma of the random pairs, not zero
random_gamma_mean = np.mean(all_Gammas_random)

plt.scatter(all_AlphaBetas, all_Gammas, alpha=0.4, color='#3AB9D1')
plt.xlabel('AlphaBeta')
plt.ylabel('Gamma')
plt.axvline(0, **reference_line_style)
plt.axhline(random_gamma_mean, **reference_line_style)
plt.savefig(scatter_alphabeta_pdf)
plt.close()

In [146]:
# Scatter plot: alpha/beta against gamma (RANDOM pairs), no reference lines
##
scatter_alphabeta_random_pdf = '../results/IsabelCLOUPAC_Per_Image/Analyse_Perturbations_over_Concentrations/Scatter_AlphaBeta_Gamma_random.pdf'

plt.scatter(all_AlphaBetas_random, all_Gammas_random, alpha=0.4, color='#3AB9D1')
plt.xlabel('AlphaBeta')
plt.ylabel('Gamma')
plt.savefig(scatter_alphabeta_random_pdf)
plt.close()

In [147]:
# Gamma increases with increasing vector norms.
# Normalize by subtracting, pair by pair, the delta vector norm from gamma so
# that only the part not attributable to the change in vector length remains.
##
gamma_normalized_replicates = [
    gamma - delta_norm
    for delta_norm, gamma in zip(all_delta_VectorNorms_replicates, all_Gammas_replicates)
]

gamma_normalized = [
    gamma - delta_norm
    for delta_norm, gamma in zip(all_delta_VectorNorms, all_Gammas)
]

gamma_normalized_random = [
    gamma - delta_norm
    for delta_norm, gamma in zip(all_delta_VectorNorms_random, all_Gammas_random)
]

# only real vs. random is tested here; the replicate list is used for plotting
print('Gamma_norm:')
print(mannwhitneyu(gamma_normalized, gamma_normalized_random))

Gamma_norm:
MannwhitneyuResult(statistic=12887.0, pvalue=0.018021219733995533)


In [148]:
# Bar plot: mean normalized gamma (gamma - delta vector norm) per group.
# Error bars are 1.96 * std / sqrt(n), i.e. a normal-approximation 95% CI
# half-width of the mean (np.std with its default settings).
##
gamma_norm_groups = [gamma_normalized_replicates, gamma_normalized, gamma_normalized_random]
gamma_norm_means = [np.mean(group_values) for group_values in gamma_norm_groups]
gamma_norm_ci95 = [1.96 * np.std(group_values) / np.sqrt(len(group_values)) for group_values in gamma_norm_groups]
bar_positions = [0, 1, 2]

plt.bar(bar_positions, gamma_norm_means, yerr=gamma_norm_ci95, color='#3AB9D1')

plt.xlabel('Group')
plt.xticks(bar_positions, ['Replicates', 'Real', 'Random'])
plt.ylabel('Gamma - Delta Vector')
plt.savefig('../results/IsabelCLOUPAC_Per_Image/Analyse_Perturbations_over_Concentrations/Barplot_Gamma_normalized.pdf')
plt.close()

In [164]:
# Bar plot: mean "gamma explained" value per group (axis label: Gamma/||Vec2||).
# The three input lists are not filled in this part of the notebook; they are
# expected to exist from an earlier cell.
# Error bars are 1.96 * std / sqrt(n), i.e. a normal-approximation 95% CI
# half-width of the mean (np.std with its default settings).
gamma_explained_groups = [all_Gammas_explained_replicates, all_gamma_explained, all_Gammas_random_explained]
gamma_explained_means = [np.mean(group_values) for group_values in gamma_explained_groups]
gamma_explained_ci95 = [1.96 * np.std(group_values) / np.sqrt(len(group_values)) for group_values in gamma_explained_groups]
bar_positions = [0, 1, 2]

plt.bar(bar_positions, gamma_explained_means, yerr=gamma_explained_ci95, color='#3AB9D1')

plt.xlabel('Group')
plt.xticks(bar_positions, ['Replicates', 'Real', 'Random'])
plt.ylabel('Gamma/||Vec2||')
plt.savefig('../results/IsabelCLOUPAC_Per_Image/Analyse_Perturbations_over_Concentrations/Barplot_GammaExplained.pdf')
plt.close()

In [None]:
## Real draws
####
# Fills two families of result lists from the perturbation vectors:
#   * replicate results:   the two replicates of one drug/concentration are
#                          compared with each other
#   * consecutive results: every valid concentration of a drug is compared with
#                          the next screened concentration of the same drug
# A vector counts as significant (valid) when its 'VectorNorm' is strictly
# greater than threshold_significant.
# Reads all_drugs, perturbations, features and threshold_significant, which are
# defined in earlier cells.


# results for consecutive concentrations
all_AlphaBetas = []  # alpha/beta
all_Gammas = []  # gamma
all_cosine_distances = []  # cosine similarity (1 - cosine distance, see below)
all_delta_VectorNorms = []  # delta vector norm (next concentration minus current one)


# results for the two replicates of one drug/concentration
all_AlphaBetas_replicates = []  # alpha/beta
all_Gammas_replicates = []  # gamma
all_cosine_distances_replicates = []  # cosine similarity (1 - cosine distance, see below)
all_delta_VectorNorms_replicates = []  # delta vector norm (replicate 1 minus replicate 2)


# number of valid (significant) concentrations for each drug
num_concentrations = []


# go through all drugs
for d in all_drugs:

    # select the rows of this drug once instead of re-filtering the full table
    # for every single lookup
    drug_rows = perturbations.loc[perturbations['Drug'] == d]

    # concentrations screened for this drug, sorted from smallest to largest
    # (sometimes the concentrations are not exactly 0.02 but 0.01999)
    used_concentrations = drug_rows['Concentrations'].unique()
    used_concentrations.sort()

    # valid vectors, i.e. vectors with a norm bigger than the significance
    # threshold, keyed by concentration
    valid_vectors = {}
    valid_vector_norms = {}
    for conc in used_concentrations:

        conc_rows = drug_rows.loc[drug_rows['Concentrations'] == conc]
        replicate_vectors = conc_rows[features].values
        replicate_norms = conc_rows['VectorNorm'].values

        # NOTE: only concentrations with exactly one or exactly two rows are
        # used; any other number of rows is skipped.
        if len(conc_rows) == 2:

            vector1 = replicate_vectors[0]
            vector2 = replicate_vectors[1]

            vectorNorm1 = replicate_norms[0]
            vectorNorm2 = replicate_norms[1]

            if vectorNorm1 > threshold_significant and vectorNorm2 > threshold_significant:

                # both replicates significant: compare them with each other
                cosine_similarity = 1 - spatial.distance.cosine(vector1, vector2)
                delta_VectorNorm = vectorNorm1 - vectorNorm2

                all_cosine_distances_replicates.append(cosine_similarity)
                all_delta_VectorNorms_replicates.append(delta_VectorNorm)

                # perform vector math; half vectors are used so the sum does not
                # reach feature values that might not be possible
                # (e.g. Eccentricity 0.6 + 0.6 > 1)
                a, b, g = calculate_vector_math_v2(vector1/2, vector1/2, vector2)

                all_AlphaBetas_replicates.append(float(a))
                all_Gammas_replicates.append(float(g))

                # the mean of both replicates represents this concentration
                valid_vectors[conc] = np.mean([vector1, vector2], axis=0)
                valid_vector_norms[conc] = np.mean([vectorNorm1, vectorNorm2])

            # Exactly one replicate significant: keep only that one.
            # '<=' (not '<') so that a replicate sitting exactly on the
            # threshold counts as non-significant instead of matching no branch,
            # which would silently drop the significant partner as well.
            elif vectorNorm1 > threshold_significant and vectorNorm2 <= threshold_significant:
                valid_vectors[conc] = vector1
                valid_vector_norms[conc] = vectorNorm1
            elif vectorNorm1 <= threshold_significant and vectorNorm2 > threshold_significant:
                valid_vectors[conc] = vector2
                valid_vector_norms[conc] = vectorNorm2

        elif len(conc_rows) == 1:
            vector1 = replicate_vectors[0]
            vectorNorm1 = replicate_norms[0]

            if vectorNorm1 > threshold_significant:
                valid_vectors[conc] = vector1
                valid_vector_norms[conc] = vectorNorm1

    # add the number of valid vectors to the num_concentrations result list
    num_concentrations.append(len(valid_vectors))

    # walk over neighbouring entries of the sorted screened concentrations;
    # a pair is only used when BOTH of its concentrations are valid
    for conc1, conc2 in zip(used_concentrations[:-1], used_concentrations[1:]):

        if conc1 not in valid_vectors or conc2 not in valid_vectors:
            continue

        # calculate cosine similarity and delta vector norm
        cosine_similarity = 1 - spatial.distance.cosine(valid_vectors[conc1], valid_vectors[conc2])
        delta_VectorNorm = valid_vector_norms[conc2] - valid_vector_norms[conc1]

        # add results to result lists
        all_cosine_distances.append(cosine_similarity)
        all_delta_VectorNorms.append(delta_VectorNorm)

        # perform vector math; half vectors are used so the sum does not reach
        # feature values that might not be possible (e.g. Eccentricity 0.6 + 0.6 > 1)
        a, b, g = calculate_vector_math_v2(valid_vectors[conc1]/2, valid_vectors[conc1]/2, valid_vectors[conc2])

        # add results to result lists
        all_AlphaBetas.append(float(a))
        all_Gammas.append(float(g))

# Show output
print('Finished real calculation')
print('Number of real entries: %d' % len(all_cosine_distances))

In [None]:
# Lookup: concentration label -> label of the next (consecutive) concentration
next_concentrations = {'0.02':'0.2', '0.2':'2.0','2.0':'20.0'}

# select drug/concentration of all significant perturbations
significant_mask = perturbations['VectorNorm'] > threshold_significant
significant_drug_concentrations = perturbations.loc[significant_mask][['Drug','Concentrations']]


# Random result lists
all_AlphaBetas_random = []
all_Gammas_random = []

all_cosine_distances_random = []
all_delta_VectorNorms_random = []


# go through all rows
for row_index, first_entry in significant_drug_concentrations.iterrows():
    # NOTE(review): this unconditional break leaves the loop on its very first
    # iteration, so nothing below is ever executed and all random result lists
    # stay empty. It looks like a deliberate way of disabling this cell --
    # confirm before removing it.
    break

    # select drug and concentration
    drug1 = first_entry[0]
    conc1_orig = first_entry[1]

    # sometimes concentrations are not exactly on the four points, so map the
    # measured value onto the label of the range it falls into
    conc1_value = float(conc1_orig)
    conc1 = '0.02'
    for lower_bound, range_label in ((7, '20.0'), (0.7, '2.0'), (0.07, '0.2')):
        if conc1_value > lower_bound:
            conc1 = range_label
            break

    # if already at max concentration there is no next one: go to next row
    if conc1 == '20.0':
        continue

    # find next concentration (consecutive)
    next_concentration = next_concentrations[conc1]

    # find all other valid drug/concentrations
    # (must not be the same drug, and must be the EXACT following concentration)
    # NOTE(review): the column is compared for equality against the label from
    # next_concentrations, so slightly-off concentrations (handled for conc1
    # above) would not match here -- verify against the data.
    is_next_concentration = significant_drug_concentrations['Concentrations'] == next_concentration
    is_other_drug = significant_drug_concentrations['Drug'] != drug1
    other_valid_drugs = significant_drug_concentrations.loc[is_next_concentration & is_other_drug]

    # go through all the rows of other significant drugs
    for row2_index, second_entry in other_valid_drugs.iterrows():

        # select other drug and concentration
        drug2 = second_entry[0]
        conc2 = second_entry[1]

        # rows belonging to each drug/concentration
        first_rows = perturbations.loc[(perturbations['Drug'] == drug1) & (perturbations['Concentrations'] == conc1_orig)]
        second_rows = perturbations.loc[(perturbations['Drug'] == drug2) & (perturbations['Concentrations'] == conc2)]

        # mean feature vectors
        vector1 = first_rows[features].mean().values
        vector2 = second_rows[features].mean().values

        # mean vector norms
        Vector_Length1 = first_rows['VectorNorm'].mean()
        Vector_Length2 = second_rows['VectorNorm'].mean()

        # calculate cosine similarity and delta vector norm
        # NOTE(review): the delta here is first minus second (lower minus next
        # concentration), whereas the real-draws cell computes next minus
        # current concentration -- the sign convention differs; confirm intent.
        cosine_similarity = 1 - spatial.distance.cosine(vector1, vector2)
        delta_VectorNorm = Vector_Length1 - Vector_Length2

        # add results to result lists
        all_cosine_distances_random.append(cosine_similarity)
        all_delta_VectorNorms_random.append(delta_VectorNorm)

        # perform vector math
        a, b, g = calculate_vector_math_v2(vector1/2, vector1/2, vector2)

        # add results to result lists
        all_AlphaBetas_random.append(float(a))
        all_Gammas_random.append(float(g))

# Show output
print('Finished Randomization')
print('Number of random entries: %d' % len(all_cosine_distances_random))
