In [14]:
import os
import glob
import pandas as pd
from pathlib import Path
import numpy as np
from scipy import stats
import math
import time

In [2]:
#Functions
def upgradeCSV(data_dir, app_root_dir, app_data_dir, save_folder, filename_distinct_substring):
    extension = 'csv'
    list_csv = []
    for root, dirs, files in os.walk(data_dir, topdown = False):
        for name in files:
            if extension in name:
                if filename_distinct_substring in name:
                    list_csv.append(os.path.join(root, name))
    os.chdir(app_root_dir)
    
    if not os.path.exists(app_data_dir):
        os.mkdir(app_data_dir)
    os.chdir(app_data_dir)
    new_files_dir = save_folder
    if not os.path.exists(new_files_dir):
        os.mkdir(new_files_dir)
    os.chdir(new_files_dir)
    for file in list_csv:
        df_temp = pd.read_csv(file, sep = ',', na_values = '*')
        file_temp=file.rsplit('\\',1)[1]
        if "BR" in file_temp:
            BioRep = file_temp.rsplit('BR',1)[1].rsplit('.',1)[0]
            if "=" in BioRep:
                BioRep = BioRep.rsplit('=',1)[1]
        else:
            BioRep = 1
        names = file_temp.rsplit('_',10)
        indices = [i for i, s in enumerate(names) if 'statistics' in s]
        names.pop(indices[0])
        df_temp['Strain'] = names[0]
        df_temp['Reporter'] = names[1]
        df_temp['TimePoint'] = names[2].rsplit('h',1)[0]
        df_temp['TeckRep'] = BioRep
        df_temp.to_csv(file.rsplit('\\',1)[1], index = None, header = True)
        
def msd(xdata, ydata):
    r=np.sqrt(xdata**2+ydata**2)
    diff=np.diff(r)
    diff_sq=diff**2
    MSD=np.mean(diff_sq)
    return MSD

def rmsd(xdata, ydata):
    xdata_x0=np.subtract(xdata, xdata[0])
    x=xdata_x0[1:]
    ydata_y0=np.subtract(ydata, ydata[0])
    y=ydata_y0[1:]
    RMSD=np.sqrt(np.mean(x**2+y**2))
    return RMSD

def bootstrap_sampling(my_array, bootstrap_samples, bootstrap_replicates):
    bs_samples=np.zeros((bootstrap_replicates, bootstrap_samples))
    bs_rep=np.arange(bootstrap_replicates)
    for i in bs_rep:
        bs_samples[i, :]=np.random.choice(my_array, bootstrap_samples)
    return bs_samples

def bootstrap_stats(bs_samples):
    samples_shape=np.shape(bs_samples)
    bs_means=np.zeros((samples_shape[0], 1))
    bs_IC_means=np.zeros((1, 2))
    bs_medians=np.zeros((samples_shape[0], 1))
    bs_IC_medians=np.zeros((1, 2))
    for i in np.arange(samples_shape[0]):
        bs_means[i]=np.mean(bs_samples[i, :])
        bs_medians[i]=np.median(bs_samples[i, :])
    bs_IC_means[0, 0]=np.quantile(bs_means, .025)
    bs_IC_means[0, 1]=np.quantile(bs_means, .975)
    bs_IC_medians[0, 0]=np.quantile(bs_medians, .025)
    bs_IC_medians[0, 1]=np.quantile(bs_medians, .975)
    return np.mean(bs_means), np.median(bs_medians), bs_IC_means, bs_IC_medians, stats.sem(bs_medians)

In [3]:
#set root folder
#data_dir = "D:\\Motility\\"
#data_dir = "D:\\Motility_increase\\"
#app_root_dir = "C:\\Users\\tala\\Desktop\\git\\PhD_codes\\Mechanosensation\\Python_code\\Motility_increase\\"
#app_root_dir = "C:\\Users\\tala\\Desktop\\Python-Code\\AnalysisMotility\\"
save_folder = "upgraded_csv_files_new\\"
app_data_dir = "newest_data\\"

data_dir = os.path.join(Path.home(),"Desktop\Data_to_analyze\motility")
app_root_dir = os.path.join(Path.home(), "Desktop\git\PhD_codes\Mechanosensation\Python_code\Motility_increase")

In [3]:
#Upgrade CSV files and concatenate them
upgradeCSV(data_dir, app_root_dir, app_data_dir, save_folder, "Track")
upgradeCSV(data_dir, app_root_dir, app_data_dir, save_folder, "Spot")
#upgradeCSV(data_dir, app_root_dir, save_folder, "fluo", 1)

os.chdir(os.path.join(app_root_dir, app_data_dir, save_folder))
extension = 'csv'
list_track_csv = [os.path.join(root,f) for root,dirs,files in os.walk(os.path.join(app_root_dir, app_data_dir, save_folder)) for f in files if 'Track' in f]
list_spot_csv = [os.path.join(root,f) for root,dirs,files in os.walk(os.path.join(app_root_dir, app_data_dir, save_folder)) for f in files if 'Spot' in f]
#list_fluo_csv = [i for i in glob.glob('*fluo*.{}'.format(extension))]

combined_track_csv = pd.concat([pd.read_csv(f) for f in list_track_csv ], sort=False)
combined_spot_csv = pd.concat([pd.read_csv(f) for f in list_spot_csv ], sort=False)
#combined_fluo_csv = pd.concat([pd.read_csv(f) for f in list_fluo_csv ], sort=False)
os.chdir(os.path.join(app_root_dir, app_data_dir))
combined_track_csv.to_csv( "newTrackData.csv", index=False, encoding='utf-8-sig')
combined_spot_csv.to_csv( "newSpotData.csv", index=False, encoding='utf-8-sig')
#combined_fluo_csv.to_csv( "newFluoData.csv", index=False, encoding='utf-8-sig')

38
38


In [4]:
os.chdir(os.path.join(app_root_dir, app_data_dir))

df_spots_long = combined_spot_csv.copy()
df_tracks_long = combined_track_csv.copy()
#df_fluo = combined_fluo_csv.copy()
Strains=df_tracks_long['Strain'].unique()
Nb_bioRep = df_tracks_long['TeckRep'].unique()
Nb_timePoints = df_tracks_long['TimePoint'].unique() 
track_duration=40
track_max_speed=4

df_tracks = df_tracks_long.loc[(df_tracks_long['TRACK_DURATION']>track_duration) & (df_tracks_long['TRACK_MAX_SPEED']<track_max_speed), :].copy()
df_spots = pd.concat(df_spots_long.loc[(df_spots_long['Strain']==s) & (df_spots_long['TimePoint']==t) & (df_spots_long['TeckRep']==b) & (df_spots_long['TRACK_ID']==i)] for s in Strains for b in Nb_bioRep for t in Nb_timePoints for i in df_tracks.loc[(df_tracks['Strain'] == s) & (df_tracks['TeckRep'] == b) & (df_tracks['TimePoint'] == t) , 'TRACK_ID'])
df_spots.to_csv("newSpotData_short.csv", index=False, encoding='utf-8-sig')
df_tracks['MSD']=0.0
df_tracks['RMSD']=0.0
df_tracks['PaqaTotalFluorescence']='nan'
df_tracks['rfpTotalFluorescence']='nan'
df_tracks['PaQa_RFP_ratio']='nan'
df_tracks.to_csv( "newTrackData_short.csv", index=False, encoding='utf-8-sig')
#df_fluo.head()

In [4]:
os.chdir(os.path.join(app_root_dir, app_data_dir))

df_tracks = pd.read_csv("newTrackData_short.csv", sep = ',', na_values = '*')
df_spots = pd.read_csv("newSpotData_short.csv", sep = ',', na_values = '*')
Strains=df_tracks['Strain'].unique()
Nb_bioRep = df_tracks['TeckRep'].unique()
Nb_timePoints = df_tracks['TimePoint'].unique() 
print(Strains)
print(Nb_bioRep)
print(Nb_timePoints)

['cpdA-fliC-' 'cyaB-fliC-' 'fliC-' 'pilG-fliC-' 'pilH-fliC-' 'pilTU-fliC-']
[1 3 2]
[0 1 2 3]


In [6]:
start=time.time()
n=0
boot_strain=list()
boot_tp=np.zeros((len(Strains)*len(Nb_timePoints)*len(Nb_bioRep), 1))
boot_br=np.zeros((len(Strains)*len(Nb_timePoints)*len(Nb_bioRep), 1))
print(np.shape(boot_tp))
boot_meanMSD=np.zeros((len(Strains)*len(Nb_timePoints)*len(Nb_bioRep), 1))
boot_medianMSD=np.zeros((len(Strains)*len(Nb_timePoints)*len(Nb_bioRep), 1))
boot_medianMSD_sem=np.zeros((len(Strains)*len(Nb_timePoints)*len(Nb_bioRep), 1))
boot_meanRMSD=np.zeros((len(Strains)*len(Nb_timePoints)*len(Nb_bioRep), 1))
boot_medianRMSD=np.zeros((len(Strains)*len(Nb_timePoints)*len(Nb_bioRep), 1))
boot_medianRMSD_sem=np.zeros((len(Strains)*len(Nb_timePoints)*len(Nb_bioRep), 1))
boot_IC_meanMSD=np.zeros((len(Strains)*len(Nb_timePoints)*len(Nb_bioRep), 2))
boot_IC_medianMSD=np.zeros((len(Strains)*len(Nb_timePoints)*len(Nb_bioRep), 2))
boot_IC_meanRMSD=np.zeros((len(Strains)*len(Nb_timePoints)*len(Nb_bioRep), 2))
boot_IC_medianRMSD=np.zeros((len(Strains)*len(Nb_timePoints)*len(Nb_bioRep), 2))
boot_meanSpeed=np.zeros((len(Strains)*len(Nb_timePoints)*len(Nb_bioRep), 1))
boot_medianSpeed=np.zeros((len(Strains)*len(Nb_timePoints)*len(Nb_bioRep), 1))
boot_medianSpeed_sem=np.zeros((len(Strains)*len(Nb_timePoints)*len(Nb_bioRep), 1))
boot_IC_meanSpeed=np.zeros((len(Strains)*len(Nb_timePoints)*len(Nb_bioRep), 2))
boot_IC_medianSpeed=np.zeros((len(Strains)*len(Nb_timePoints)*len(Nb_bioRep), 2))

for s, strain in enumerate(Strains):
    print(strain)
    for b, br in enumerate(Nb_bioRep):
        print('Biological replicate: '+ str(br))
        for t, tp in enumerate(Nb_timePoints):
            track_IDs=list(df_tracks.TRACK_ID.loc[(df_tracks['Strain']==strain) & (df_tracks['TimePoint']==tp) & (df_tracks['TeckRep']==br)].unique())
            track_Label=list(df_spots.TRACK_ID.loc[(df_spots['Strain']==strain) & (df_spots['TimePoint']==tp) & (df_spots['TeckRep']==br)].unique())
            boot_strain.append(strain)
            boot_tp[n]=tp
            boot_br[n]=br
            if track_IDs:
                print('In Time point '+ str(tp)+' there are '+str(len(track_IDs))+' tracks in TrackData.csv and '+str(len(track_Label))+' in SpotsData.csv')
                for track in track_IDs:
                    xdata=df_spots.POSITION_X.loc[(df_spots['Strain']==strain) & (df_spots['TimePoint']==tp) & (df_spots['TeckRep']==br) & (df_spots['TRACK_ID']==track)].values
                    ydata=df_spots.POSITION_Y.loc[(df_spots['Strain']==strain) & (df_spots['TimePoint']==tp) & (df_spots['TeckRep']==br) & (df_spots['TRACK_ID']==track)].values
                    #paqa=df_fluo.PaqaTotalFluorescence.loc[(df_fluo['Strain']==strain) & (df_fluo['TimePoint']==tp) & (df_fluo['Track_ID']==track)].values
                    #rfp=df_fluo.rfpTotalFluorescence.loc[(df_fluo['Strain']==strain) & (df_fluo['TimePoint']==tp) & (df_fluo['Track_ID']==track)].values
                    #ratio=df_fluo.PaQa_RFP_ratio.loc[(df_fluo['Strain']==strain) & (df_fluo['TimePoint']==tp) & (df_fluo['Track_ID']==track)].values
                    MSD=msd(xdata, ydata)
                    RMSD=rmsd(xdata, ydata)
                    df_tracks.loc[(df_tracks['Strain']==strain) & (df_tracks['TimePoint']==tp) & (df_tracks['TRACK_ID']==track) & (df_tracks['TeckRep']==br), 'MSD']=MSD
                    df_tracks.loc[(df_tracks['Strain']==strain) & (df_tracks['TimePoint']==tp) & (df_tracks['TRACK_ID']==track) & (df_tracks['TeckRep']==br), 'RMSD']=RMSD
                    #if bool(paqa):
                        #df_tracks.loc[(df_tracks['Strain']==strain) & (df_tracks['TimePoint']==tp) & (df_tracks['Label']=='Track_'+str(track)), 'PaqaTotalFluorescence']=paqa
                        #df_tracks.loc[(df_tracks['Strain']==strain) & (df_tracks['TimePoint']==tp) & (df_tracks['Label']=='Track_'+str(track)), 'rfpTotalFluorescence']=rfp
                        #df_tracks.loc[(df_tracks['Strain']==strain) & (df_tracks['TimePoint']==tp) & (df_tracks['Label']=='Track_'+str(track)), 'PaQa_RFP_ratio']=ratio
                tracks_data_MSD=df_tracks.MSD.loc[(df_tracks['Strain']==strain) & (df_tracks['TimePoint']==tp) & (df_tracks['TeckRep']==br)].values
                bs_tracks_data_MSD=bootstrap_sampling(tracks_data_MSD, len(tracks_data_MSD), 1000)
                [bs_means, bs_medians, IC_means, IC_medians, bs_medians_sem]=bootstrap_stats(bs_tracks_data_MSD)
                boot_meanMSD[n]=bs_means
                boot_IC_meanMSD[n,:]=IC_means
                boot_medianMSD[n]=bs_medians
                boot_medianMSD_sem[n]=bs_medians_sem
                boot_IC_medianMSD[n,:]=IC_medians
                tracks_data_RMSD=df_tracks.RMSD.loc[(df_tracks['Strain']==strain) & (df_tracks['TimePoint']==tp) & (df_tracks['TeckRep']==br)].values
                bs_tracks_data_RMSD=bootstrap_sampling(tracks_data_RMSD, len(tracks_data_RMSD), 1000)
                [bs_means, bs_medians, IC_means, IC_medians, bs_medians_sem]=bootstrap_stats(bs_tracks_data_RMSD)
                boot_meanRMSD[n]=bs_means
                boot_IC_meanRMSD[n,:]=IC_means
                boot_medianRMSD[n]=bs_medians
                boot_medianRMSD_sem[n]=bs_medians_sem
                boot_IC_medianRMSD[n,:]=IC_medians
                tracks_data_meanSpeed=df_tracks.TRACK_MEAN_SPEED.loc[(df_tracks['Strain']==strain) & (df_tracks['TimePoint']==tp) & (df_tracks['TeckRep']==br)].values
                bs_tracks_data_meanSpeed=bootstrap_sampling(tracks_data_meanSpeed, len(tracks_data_meanSpeed), 1000)
                [bs_means, bs_medians, IC_means, IC_medians, bs_medians_sem]=bootstrap_stats(bs_tracks_data_meanSpeed)
                boot_meanSpeed[n]=bs_means
                boot_IC_meanSpeed[n,:]=IC_means
                boot_medianSpeed[n]=bs_medians
                boot_medianSpeed_sem[n]=bs_medians_sem
                boot_IC_medianSpeed[n,:]=IC_medians
            else:
                boot_meanMSD[n]=np.nan
                boot_IC_meanMSD[n,:]=np.nan
                boot_medianMSD[n]=np.nan
                boot_medianMSD_sem[n]=np.nan
                boot_IC_medianMSD[n,:]=np.nan
                boot_meanRMSD[n]=np.nan
                boot_IC_meanRMSD[n,:]=np.nan
                boot_medianRMSD[n]=np.nan
                boot_medianRMSD_sem[n]=np.nan
                boot_IC_medianRMSD[n,:]=np.nan
                boot_meanSpeed[n]=np.nan
                boot_IC_meanSpeed[n,:]=np.nan
                boot_medianSpeed[n]=np.nan
                boot_medianSpeed_sem[n]=np.nan
                boot_IC_medianSpeed[n,:]=np.nan
            n=n+1
n=0
elapsed_time=(time.time()-start)/1000
print('Elsped time: ' + str(elapsed_time))
print('Done')

(72, 1)
cpdA-fliC-
Biological replicate: 1
In Time point 0 there are 309 tracks in TrackData.csv and 309 in SpotsData.csv
In Time point 1 there are 41 tracks in TrackData.csv and 41 in SpotsData.csv
In Time point 2 there are 116 tracks in TrackData.csv and 116 in SpotsData.csv
In Time point 3 there are 188 tracks in TrackData.csv and 188 in SpotsData.csv
Biological replicate: 3
In Time point 0 there are 308 tracks in TrackData.csv and 308 in SpotsData.csv
Biological replicate: 2
In Time point 0 there are 110 tracks in TrackData.csv and 110 in SpotsData.csv
cyaB-fliC-
Biological replicate: 1
In Time point 0 there are 208 tracks in TrackData.csv and 208 in SpotsData.csv
In Time point 1 there are 195 tracks in TrackData.csv and 195 in SpotsData.csv
In Time point 2 there are 189 tracks in TrackData.csv and 189 in SpotsData.csv
In Time point 3 there are 353 tracks in TrackData.csv and 353 in SpotsData.csv
Biological replicate: 3
In Time point 0 there are 207 tracks in TrackData.csv and 207 

In [7]:
names = ['TimePoint', 'meanMSD','medianMSD','medianMSD_sem', 'meanRMSD', 'medianRMSD', 'medianRMSD_sem', 'meanSpeed', 'medianSpeed', 'medianSpeed_sem', 'TeckRep']
data = np.concatenate((boot_tp, boot_meanMSD, boot_medianMSD, boot_medianMSD_sem, boot_meanRMSD, boot_medianRMSD, boot_medianRMSD_sem, boot_meanSpeed, boot_medianSpeed, boot_medianSpeed_sem, boot_br), axis=1)
df_boot_stats=pd.DataFrame(data=np.transpose(data), index=names).T
df_boot_stats['Strain']=boot_strain
df_boot_stats['IC_meanMSD']=list(boot_IC_meanMSD)
df_boot_stats['IC_medianMSD']=list(boot_IC_medianMSD)
df_boot_stats['IC_meanRMSD']=list(boot_IC_meanRMSD)
df_boot_stats['IC_medianRMSD']=list(boot_IC_medianRMSD)
df_boot_stats['IC_meanSpeed']=list(boot_IC_meanSpeed)
df_boot_stats['IC_medianSpeed']=list(boot_IC_medianSpeed)

df_tracks.to_csv( "TrackDataMSD.csv", index=False, encoding='utf-8-sig')
df_boot_stats.to_csv( "BootTrackData.csv", index=False, encoding='utf-8-sig')

df_boot_stats.head(100)

Unnamed: 0,TimePoint,meanMSD,medianMSD,medianMSD_sem,meanRMSD,medianRMSD,medianRMSD_sem,meanSpeed,medianSpeed,medianSpeed_sem,TeckRep,Strain,IC_meanMSD,IC_medianMSD,IC_meanRMSD,IC_medianRMSD,IC_meanSpeed,IC_medianSpeed
0,0.0,0.715259,0.677795,0.000538,12.179589,10.970938,0.010708,0.890914,0.906,0.000296,1.0,cpdA-fliC-,"[0.6859702270454511, 0.7452489904495498]","[0.6500281043717776, 0.7154316722002539]","[11.463934807765945, 12.936564020013863]","[10.42498174685319, 11.73108930945305]","[0.8701100362728155, 0.910776845156149]","[0.889, 0.927]"
1,1.0,0.657920,0.704575,0.001573,9.673686,8.295870,0.028950,0.803110,0.903,0.000948,1.0,cpdA-fliC-,"[0.543988830384986, 0.7696010111757754]","[0.6192910643537431, 0.7647707077731999]","[7.243551239185776, 12.35866143071531]","[7.201236445482817, 10.377286750336118]","[0.6934591463414633, 0.9021567073170732]","[0.878, 0.978]"
2,2.0,0.580419,0.638161,0.002048,8.063790,7.472005,0.034369,0.642196,0.900,0.001695,1.0,cpdA-fliC-,"[0.48207394620705124, 0.6842651107672195]","[0.507878666681852, 0.7378257161307643]","[6.737839345835681, 9.403327915929184]","[6.085768218404807, 9.933261403594452]","[0.5505433720293578, 0.726840416687974]","[0.8378500000000001, 0.9720249999999999]"
3,3.0,0.857204,0.842371,0.000861,11.631254,10.954370,0.023522,0.972256,1.021,0.000451,1.0,cpdA-fliC-,"[0.7990566991353754, 0.9206717481846912]","[0.771648266164312, 0.8814250943236394]","[10.763620820667247, 12.516478197227839]","[9.866105004622087, 12.482645292051656]","[0.9346003989361701, 1.0082457446808508]","[0.997, 1.048]"
4,0.0,0.290025,0.267520,0.000409,8.716548,7.972315,0.007678,0.567502,0.616,0.000481,3.0,cpdA-fliC-,"[0.26863644475609816, 0.31078750071375005]","[0.24278415190943026, 0.2883480096929979]","[8.154920939017119, 9.324252334576773]","[7.497416270543902, 8.370778156051983]","[0.542320616883117, 0.5914564123376622]","[0.5775, 0.6315]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,3.0,,,,,,,,,,3.0,pilTU-fliC-,"[nan, nan]","[nan, nan]","[nan, nan]","[nan, nan]","[nan, nan]","[nan, nan]"
68,0.0,0.003779,0.001549,0.000004,0.565660,0.378985,0.000664,0.049026,0.039,0.000056,2.0,pilTU-fliC-,"[0.003010121993518451, 0.004604530291118242]","[0.0013325268206914958, 0.0017960824533909037]","[0.5024683669089484, 0.634450233676268]","[0.34433997291799984, 0.4245452567159133]","[0.044960135135135144, 0.05293291505791506]","[0.036000000000000004, 0.043]"
69,1.0,,,,,,,,,,2.0,pilTU-fliC-,"[nan, nan]","[nan, nan]","[nan, nan]","[nan, nan]","[nan, nan]","[nan, nan]"
70,2.0,,,,,,,,,,2.0,pilTU-fliC-,"[nan, nan]","[nan, nan]","[nan, nan]","[nan, nan]","[nan, nan]","[nan, nan]"


In [6]:
%load_ext watermark


The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark


In [12]:
%watermark -v -p numpy,bokeh,jupyterlab,scipy,pandas

CPython 3.8.5
IPython 7.19.0

numpy 1.19.2
bokeh 2.2.3
jupyterlab 2.2.6
scipy 1.5.0
pandas 1.1.3
math unknown


In [18]:
test_1=np.zeros((2,4))

In [20]:
test_2=np.shape(test_1)
test_2[0]

2