In [1]:
import os
import glob
from pathlib import Path
import numpy as np
import bokeh.io
import bokeh.plotting
import bokeh.palettes
from bokeh.transform import jitter
import seaborn as sns
import matplotlib
from bokeh.models import HoverTool, Range1d
from scipy import stats
import pandas as pd
import math

from bokeh.layouts import row
bokeh.io.output_notebook()

In [2]:
def bootstrap_sampling(my_array, bootstrap_samples, bootstrap_replicates):
    bs_samples=np.zeros((bootstrap_replicates, bootstrap_samples))
    bs_rep=np.arange(bootstrap_replicates)
    for i in bs_rep:
        bs_samples[i, :]=np.random.choice(my_array, bootstrap_samples)
    return bs_samples

def bootstrap_stats(bs_samples):
    samples_shape=np.shape(bs_samples)
    bs_means=np.zeros((samples_shape[0], 1))
    bs_IC_means=np.zeros((1, 2))
    bs_medians=np.zeros((samples_shape[0], 1))
    bs_IC_medians=np.zeros((1, 2))
    for i in np.arange(samples_shape[0]):
        bs_means[i]=np.mean(bs_samples[i, :])
        bs_medians[i]=np.median(bs_samples[i, :])
    bs_IC_means[0, 0]=np.quantile(bs_means, .025)
    bs_IC_means[0, 1]=np.quantile(bs_means, .975)
    bs_IC_medians[0, 0]=np.quantile(bs_medians, .025)
    bs_IC_medians[0, 1]=np.quantile(bs_medians, .975)
    return np.mean(bs_means), np.median(bs_medians), bs_IC_means, bs_IC_medians

In [3]:
#set root folder
app_root_dir = os.path.join(Path.home(), "Desktop\git\PhD_codes\Mechanosensation\Python_code\Pole_analysis")
data_root_path = 'F:/Sauvegardes PhD/Covid-19/Pili_and_PaQa_counts_data/PaQa_data/fliC-'
#path='D:/Pili_and_PaQa_counts_data/fliC-'
#path='D:/Lorenzo/cpdA'

In [4]:
os.chdir(data_root_path)
extension = 'csv'
list_csv = []
for root, dirs, files in os.walk(data_root_path, topdown=False):
    for name in files:
        if extension in name:
            list_csv.append(os.path.join(root, name))
#os.chdir("C:/users/tala/git/PhD_codes/Mechanosensation/Python_code/Pole_analysis/")
os.chdir(app_root_dir)
new_dir = "Data_PaQa_fliC-_final\\"
if not os.path.exists(new_dir):
    os.mkdir(new_dir)
os.chdir(new_dir)
combined_csv = pd.concat([pd.read_csv(f) for f in list_csv ], sort=False)
raw_names=list(combined_csv.Label.unique())
#print(len(raw_names))
#print(raw_names)
Combine_Data=False
if not Combine_Data:
    combined_csv['Strain'] = 'nan'
    combined_csv['Growth'] = 'nan'
    for file in raw_names:
        #print(file)
        split_name=file.rsplit('_event',100)[0].rsplit('_',100)
        #print(split_name)
        if 'Agarose' in file or 'sol' in file or 'Agar' in file:
            #print('YES!!!')
            growth='Solid'
            if 'Agarose' in split_name:
                growthInd = split_name.index('Agarose')
            elif 'sol' in split_name:
                growthInd = split_name.index('sol')
            elif 'Agar' in split_name:
                growthInd = split_name.index('Agar')
        else:
            growth='Liquid'
            if 'liq' in split_name:
                growthInd = split_name.index('liq')
            else:
                growthInd = len(split_name)-3
        separator = '_'
        strain =  separator.join(split_name[0:growthInd])
        combined_csv.Strain.loc[(combined_csv['Label']==file)]=strain
        combined_csv.Growth.loc[(combined_csv['Label']==file)]=growth
combined_csv.to_csv( "Pili_PaQa_Data_fliC-_New.csv", index=False, encoding='utf-8-sig')
combined_csv.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Unnamed: 0,Unnamed: 1,Label,Area,Mean,Min,Max,X,Y,BiologicalReplicate,CellArea,...,MeanPoleBright,TotalFluorescencePoleBright,MinPoleBright,MaxPoleBright,StdPoleBright,Nb_Pili_PoleBright,Nb_Flagella_PoleBright,PolarRatio,Strain,Growth
0,1,fliC-_PaQa_Gasket_0_event11_tirf_RAW_Stack.tif,624,255,255,255,12.85256,36.78526,1,698,...,924.18248,126613.0,447,1372,255.4353,1,0,0.81639,fliC-,Liquid
1,2,fliC-_PaQa_Gasket_0_event11_tirf_RAW_Stack.tif,405,255,255,255,26.7963,14.51481,1,463,...,472.66071,52938.0,160,749,172.91593,0,0,0.82471,fliC-,Liquid
0,1,fliC-_PaQa_Gasket_0_event14_tirf_RAW_Stack.tif,700,255,255,255,42.52714,19.50143,1,785,...,489.30657,67035.0,190,696,145.71444,0,0,0.85555,fliC-,Liquid
0,1,fliC-_PaQa_Gasket_0_event15_tirf_RAW_Stack.tif,405,255,255,255,20.37901,26.53457,1,463,...,636.75,71316.0,251,922,204.80945,1,0,0.86161,fliC-,Liquid
0,1,fliC-_PaQa_Gasket_0_event17_tirf_RAW_Stack.tif,607,255,255,255,38.52636,14.36985,1,686,...,849.42336,116371.0,304,1410,303.13205,1,0,0.74817,fliC-,Liquid


In [5]:
new_dir = "Data_PaQa_fliC-_final"
os.chdir(os.path.join(app_root_dir, new_dir))
df_full = pd.read_csv("Pili_PaQa_Data_fliC-_New.csv", sep=',', na_values='*')
df_full['PercentTotalFluoDim']=df_full['TotalFluorescencePoleDim']/df_full['CellTotalFluorescence']
df_full['PercentTotalFluoBright']=df_full['TotalFluorescencePoleBright']/df_full['CellTotalFluorescence']
df_full['TotalPili']=df_full['Nb_Pili_PoleDim']+df_full['Nb_Pili_PoleBright']
df_full['TotalFlagella']=df_full['Nb_Flagella_PoleDim']+df_full['Nb_Flagella_PoleBright']
param1='CellTotalFluorescence'
param2='TotalPili'
poisson_lambda=df_full[param2].mean()

tot_pili_limit=7
#df=df_full.loc[(df_full[param2]<tot_pili_limit) & (df_full['Growth']=='Solid')]  
df=df_full.loc[(df_full[param2]<tot_pili_limit)]

Ncells_per_pili=np.empty((tot_pili_limit), dtype='int16')
N=len(df)
print('Ncells='+str(N), end=', ')
for i in range(tot_pili_limit):
    Ncells_per_pili[i]=len(df.loc[(df[param2]==i)])
    if (i<tot_pili_limit-1):
        print('N_'+str(i)+'pili='+str(Ncells_per_pili[i]), end=', ')
    else: print('N_'+str(i)+'pili='+str(Ncells_per_pili[i]))       
df.head()

Ncells=111, N_0pili=10, N_1pili=18, N_2pili=19, N_3pili=24, N_4pili=16, N_5pili=13, N_6pili=11


Unnamed: 0,Unnamed: 1,Label,Area,Mean,Min,Max,X,Y,BiologicalReplicate,CellArea,...,StdPoleBright,Nb_Pili_PoleBright,Nb_Flagella_PoleBright,PolarRatio,Strain,Growth,PercentTotalFluoDim,PercentTotalFluoBright,TotalPili,TotalFlagella
0,1,fliC-_PaQa_Gasket_0_event11_tirf_RAW_Stack.tif,624,255,255,255,12.85256,36.78526,1,698,...,255.4353,1,0,0.81639,fliC-,Liquid,0.163458,0.181909,3,0
1,2,fliC-_PaQa_Gasket_0_event11_tirf_RAW_Stack.tif,405,255,255,255,26.7963,14.51481,1,463,...,172.91593,0,0,0.82471,fliC-,Liquid,0.212794,0.223173,1,0
2,1,fliC-_PaQa_Gasket_0_event14_tirf_RAW_Stack.tif,700,255,255,255,42.52714,19.50143,1,785,...,145.71444,0,0,0.85555,fliC-,Liquid,0.148899,0.165582,1,0
3,1,fliC-_PaQa_Gasket_0_event15_tirf_RAW_Stack.tif,405,255,255,255,20.37901,26.53457,1,463,...,204.80945,1,0,0.86161,fliC-,Liquid,0.215917,0.230842,3,0
4,1,fliC-_PaQa_Gasket_0_event17_tirf_RAW_Stack.tif,607,255,255,255,38.52636,14.36985,1,686,...,303.13205,1,0,0.74817,fliC-,Liquid,0.159224,0.173028,5,0


In [6]:
nb_pili=np.zeros((tot_pili_limit, 1))
boot_mean=np.zeros((tot_pili_limit, 1))
boot_median=np.zeros((tot_pili_limit, 1))
boot_IC_mean=np.zeros((tot_pili_limit, 2))
boot_IC_median=np.zeros((tot_pili_limit, 2))
for n_pili in range(tot_pili_limit):
    cell_fluorescence_array=df.CellTotalFluorescence.loc[(df[param2]==n_pili)].values
    nb_pili[n_pili]=n_pili
    if (len(cell_fluorescence_array) > 0):
        bs_cell_fluorescence_array=bootstrap_sampling(cell_fluorescence_array, len(cell_fluorescence_array), 1000)
        [bs_means, bs_medians, IC_means, IC_medians]=bootstrap_stats(bs_cell_fluorescence_array)
        boot_mean[n_pili]=bs_means
        boot_IC_mean[n_pili,:]=IC_means
        boot_median[n_pili]=bs_medians
        boot_IC_median[n_pili,:]=IC_medians

names = [param2, 'bootMean','bootMedian']
data = np.concatenate((nb_pili, boot_mean, boot_median), axis=1)
df_boot_stats=pd.DataFrame(data=np.transpose(data), index=names).T
df_boot_stats['IC_mean']=list(boot_IC_mean)
df_boot_stats['IC_median']=list(boot_IC_median)
df_boot_stats.to_csv( "boot_Pili_PaQa_Data.csv", index=False, encoding='utf-8-sig')
df_boot_stats.head(100)

Unnamed: 0,TotalPili,bootMean,bootMedian,IC_mean,IC_median
0,0.0,313509.4,272299.5,"[221710.055, 422228.6649999999]","[196556.5, 395239.48749999976]"
1,1.0,645751.2,444462.0,"[437465.6736111111, 933463.1680555556]","[347440.0, 680282.0]"
2,2.0,613865.5,458015.0,"[423372.2026315789, 859071.9881578948]","[304316.0, 732782.0]"
3,3.0,554885.6,425175.0,"[417195.78125000006, 748939.6854166666]","[344902.0, 658045.0]"
4,4.0,766550.7,594554.5,"[531914.9890625, 1038385.2734374998]","[350062.0, 1135922.0]"
5,5.0,992830.8,852350.0,"[644659.2211538462, 1410448.5442307692]","[448312.0, 1122668.0]"
6,6.0,1109486.0,934219.0,"[852822.3295454546, 1511182.6477272725]","[777199.0, 1136354.0]"


In [12]:
param1='CellTotalFluorescence'
param2='TotalPili'
parameter1='bootMean'
parameter2='bootMedian'
IC1='IC_mean'
IC2='IC_median'
p3 = bokeh.plotting.figure(
    width=350, 
    height=300,  
    x_axis_type='linear',
    y_axis_type='linear',
    x_axis_label = '#Pili',
    y_axis_label = 'total fluorescence',
    title="Mean cell total fluorescence per pili nb (bootstrap mean, 95% IC)",
    x_range=Range1d(-0.25, tot_pili_limit-0.75),
    y_range=Range1d(0, 1600000)
)

In [13]:
for n_pili in list(df_boot_stats[param2]):
    b = [n_pili,n_pili]
    m=df_boot_stats.loc[(df_boot_stats[param2] == n_pili), [parameter1]].values[0][0]
    ic=df_boot_stats.loc[(df_boot_stats[param2] == n_pili), [IC1]].values[0][0]
    if (m-ic[0] < 0):
        a = [0, m+ic[1]]
    else: a = [m-ic[0], m+ic[1]]
    p3.line(
        x = b,
        y = ic,
        color = 'black',
        alpha=1,
        line_width=3
    )
    
p3.circle(
    source = df_boot_stats.loc[:, [param2, parameter1]],
    x = param2,
    y = parameter1,
    line_color = 'firebrick',
    fill_color = None,
    alpha=1,
    size=10,
    line_width=3
)

p3.output_backend = 'svg'
p3.xgrid.visible = False
p3.ygrid.visible = False
p3.xaxis.minor_tick_line_color = None
p3.yaxis.minor_tick_line_color = None

bokeh.io.show(p3)

In [18]:
print(parameter1+' vs '+param2+':')

[spearman_r, spearman_p]=stats.spearmanr(df_boot_stats[parameter1], df_boot_stats[param2])
print('Spearman correlation = '+str(spearman_r)+', p-value = '+ str(spearman_p))

[pearson_r, pearson_p]=stats.pearsonr(df_boot_stats[parameter1], df_boot_stats[param2])
print('Pearson correlation = '+str(pearson_r)+', p-value = '+str(pearson_p))

bootMean vs TotalPili:
Spearman correlation = 0.8571428571428573, p-value = 0.01369732661532562
Pearson correlation = 0.9244197000058165, p-value = 0.002895079335396471


In [14]:
p4 = bokeh.plotting.figure(
    width=350, 
    height=300,  
    x_axis_type='linear',
    y_axis_type='linear',
    x_axis_label = '#Pili',
    y_axis_label = 'total fluorescence',
    title="Mean cell total fluorescence per pili nb (bootstrap mean, 95% IC)",
    x_range=Range1d(-0.25, tot_pili_limit-0.75),
    y_range=Range1d(0, 3000000)
)


p4.circle(
    source=df,
    x=param2,
    y=param1, 
    line_color = 'blue',
    fill_color = None,
    alpha=0.7,
    #legend = labelsAll[i]
)

for n_pili in list(df_boot_stats[param2]):
    b = [n_pili,n_pili]
    m=df_boot_stats.loc[(df_boot_stats[param2] == n_pili), [parameter1]].values[0][0]
    ic=df_boot_stats.loc[(df_boot_stats[param2] == n_pili), [IC1]].values[0][0]
    if (m-ic[0] < 0):
        a = [0, m+ic[1]]
    else: a = [m-ic[0], m+ic[1]]
    p4.line(
        x = b,
        y = ic,
        color = 'black',
        alpha=1,
        line_width=3
    )
    
p4.circle(
    source = df_boot_stats.loc[:, [param2, parameter1]],
    x = param2,
    y = parameter1,
    line_color = 'firebrick',
    fill_color = None,
    alpha=1,
    size=10,
    line_width=3
)

p4.output_backend = 'svg'
p4.xgrid.visible = False
p4.ygrid.visible = False
p4.xaxis.minor_tick_line_color = None
p4.yaxis.minor_tick_line_color = None

bokeh.io.show(p4)

In [17]:
print(param1+' vs '+param2+':')

[spearman_r, spearman_p]=stats.spearmanr(df[param1], df[param2])
print('Spearman correlation = '+str(spearman_r)+', p-value = '+ str(spearman_p))

[pearson_r, pearson_p]=stats.pearsonr(df[param1], df[param2])
print('Pearson correlation = '+str(pearson_r)+', p-value = '+str(pearson_p))

CellTotalFluorescence vs TotalPili:
Spearman correlation = 0.4231092826168442, p-value = 3.7192156322166497e-06
Pearson correlation = 0.3418359180912121, p-value = 0.00024070374546690052
