# A notebook for collating the SC data

In [1]:
import btrack
import os
from macrohet import dataio, tools
import glob
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

### To do:

1. Check area Mtb measurements are in agreement
2. Homogenise method to measure area, dmtb, doubling time etc
3. Ensure continuity between minimum track length across expts

### Define scope of analysis

In [2]:
expts = ['PS0000', 'ND0002', 'ND0003']

In [5]:
error_IDs = []
# load tracks 
tracks_dict = dict()
for expt_ID in tqdm(expts):
    
    # extract metadata for each experiment
    base_dir = f'/mnt/SYNO/macrohet_syno/data/{expt_ID}/'
    metadata_fn = os.path.join(base_dir, 'acquisition/Images/Index.idx.xml')
    metadata = dataio.read_harmony_metadata(metadata_fn)  
    metadata_path = glob.glob(os.path.join(base_dir, 'acquisition/Assaylayout/*.xml'))[0]
    assay_layout = dataio.read_harmony_metadata(metadata_path, assay_layout=True,)

    
    ### iterate over all experimental conditions
    for (row, column), info in tqdm(assay_layout.iterrows(), 
                                    desc = 'Progress through positions',
                                    total = len(assay_layout)):
        try:
            # define paths dependent on model
            if expt_ID == 'PS0000':
                track_path = os.path.join(base_dir, f'labels/macrohet_seg_model/{row,column}.h5')
            else:
                track_path = os.path.join(base_dir, f'labels/cpv3/{row,column}.h5')

            ### load tracks
            with btrack.io.HDF5FileHandler(track_path, 
                                               'r', 
                                               obj_type='obj_type_1'
                                               ) as reader:
                tracks = reader.tracks
                    
            ### append tracks to dictionary
            tracks_dict[(expt_ID, row, column)] = tracks
        except:
            error_IDs.append((expt_ID, row, column))

  0%|          | 0/3 [00:00<?, ?it/s]

Reading metadata XML file...


0it [00:00, ?it/s]

Extracting metadata complete!
Reading metadata XML file...
Extracting metadata complete!


Progress through positions:   0%|          | 0/24 [00:00<?, ?it/s]

[INFO][2024/06/24 03:11:09 PM] Opening HDF file: /mnt/SYNO/macrohet_syno/data/PS0000/labels/macrohet_seg_model/(3, 4).h5...
[INFO][2024/06/24 03:11:09 PM] Loading tracks/obj_type_1
[INFO][2024/06/24 03:11:09 PM] Loading LBEP/obj_type_1
[INFO][2024/06/24 03:11:10 PM] Loading objects/obj_type_1 (39878, 5) (39878 filtered: None)
[INFO][2024/06/24 03:11:10 PM] Closing HDF file: /mnt/SYNO/macrohet_syno/data/PS0000/labels/macrohet_seg_model/(3, 4).h5
[INFO][2024/06/24 03:11:10 PM] Opening HDF file: /mnt/SYNO/macrohet_syno/data/PS0000/labels/macrohet_seg_model/(3, 5).h5...
[INFO][2024/06/24 03:11:10 PM] Loading tracks/obj_type_1
[INFO][2024/06/24 03:11:10 PM] Loading LBEP/obj_type_1
[INFO][2024/06/24 03:11:10 PM] Loading objects/obj_type_1 (41424, 5) (41424 filtered: None)
[INFO][2024/06/24 03:11:11 PM] Closing HDF file: /mnt/SYNO/macrohet_syno/data/PS0000/labels/macrohet_seg_model/(3, 5).h5
[INFO][2024/06/24 03:11:11 PM] Opening HDF file: /mnt/SYNO/macrohet_syno/data/PS0000/labels/macrohet_s

Reading metadata XML file...


0it [00:00, ?it/s]

Extracting metadata complete!
Reading metadata XML file...
Extracting metadata complete!


Progress through positions:   0%|          | 0/42 [00:00<?, ?it/s]

[INFO][2024/06/24 03:12:08 PM] Opening HDF file: /mnt/SYNO/macrohet_syno/data/ND0002/labels/cpv3/(3, 1).h5...
[INFO][2024/06/24 03:12:08 PM] Loading tracks/obj_type_1
[INFO][2024/06/24 03:12:08 PM] Loading LBEP/obj_type_1
[INFO][2024/06/24 03:12:08 PM] Loading objects/obj_type_1 (15525, 5) (15525 filtered: None)
[INFO][2024/06/24 03:12:08 PM] Closing HDF file: /mnt/SYNO/macrohet_syno/data/ND0002/labels/cpv3/(3, 1).h5
[INFO][2024/06/24 03:12:08 PM] Opening HDF file: /mnt/SYNO/macrohet_syno/data/ND0002/labels/cpv3/(3, 2).h5...
[INFO][2024/06/24 03:12:08 PM] Loading tracks/obj_type_1
[INFO][2024/06/24 03:12:08 PM] Loading LBEP/obj_type_1
[INFO][2024/06/24 03:12:08 PM] Loading objects/obj_type_1 (15189, 5) (15189 filtered: None)
[INFO][2024/06/24 03:12:08 PM] Closing HDF file: /mnt/SYNO/macrohet_syno/data/ND0002/labels/cpv3/(3, 2).h5
[INFO][2024/06/24 03:12:08 PM] Opening HDF file: /mnt/SYNO/macrohet_syno/data/ND0002/labels/cpv3/(3, 3).h5...
[INFO][2024/06/24 03:12:09 PM] Loading tracks/ob

Reading metadata XML file...


0it [00:00, ?it/s]

Extracting metadata complete!
Reading metadata XML file...
Extracting metadata complete!


Progress through positions:   0%|          | 0/42 [00:00<?, ?it/s]

[INFO][2024/06/24 03:13:07 PM] Opening HDF file: /mnt/SYNO/macrohet_syno/data/ND0003/labels/cpv3/(3, 1).h5...
[INFO][2024/06/24 03:13:07 PM] Loading tracks/obj_type_1
[INFO][2024/06/24 03:13:07 PM] Loading LBEP/obj_type_1
[INFO][2024/06/24 03:13:08 PM] Loading objects/obj_type_1 (46213, 5) (46213 filtered: None)
[INFO][2024/06/24 03:13:08 PM] Closing HDF file: /mnt/SYNO/macrohet_syno/data/ND0003/labels/cpv3/(3, 1).h5
[INFO][2024/06/24 03:13:08 PM] Opening HDF file: /mnt/SYNO/macrohet_syno/data/ND0003/labels/cpv3/(3, 2).h5...
[INFO][2024/06/24 03:13:08 PM] Loading tracks/obj_type_1
[INFO][2024/06/24 03:13:08 PM] Loading LBEP/obj_type_1
[INFO][2024/06/24 03:13:08 PM] Loading objects/obj_type_1 (47573, 5) (47573 filtered: None)
[INFO][2024/06/24 03:13:09 PM] Closing HDF file: /mnt/SYNO/macrohet_syno/data/ND0003/labels/cpv3/(3, 2).h5
[INFO][2024/06/24 03:13:09 PM] Opening HDF file: /mnt/SYNO/macrohet_syno/data/ND0003/labels/cpv3/(3, 3).h5...
[INFO][2024/06/24 03:13:09 PM] Loading tracks/ob

In [7]:
# Initialize the dictionary with the provided information for 'PS0000'
expt_param_dict = {
    'PS0000': {
        'min length': 35,
        'mtb channel': 1,
        'gfp channel': 0,
        'biological replicate': 1
    }
}

expt_param_dict['ND0002'] = {
    'min length': 70, 
    'mtb channel': 0,
    'gfp channel': 1,
    'biological replicate': 2
}

expt_param_dict['ND0003'] = {
    'min length': 70, 
    'mtb channel': 0,
    'gfp channel': 1,
    'biological replicate': 3
}
image_resolution = float(metadata['ImageResolutionX'].iloc[0])
meters_area_per_pixel = image_resolution**2
mum_sq_scale_factor = (1E-6)**2
pixel_to_mum_sq_scale_factor = meters_area_per_pixel/mum_sq_scale_factor

In [21]:
[t/2 for t in track['t']]

[0.0,
 0.5,
 1.0,
 1.5,
 2.0,
 2.5,
 3.0,
 3.5,
 4.0,
 4.5,
 5.0,
 5.5,
 6.0,
 6.5,
 7.0,
 7.5,
 8.0,
 8.5,
 9.0,
 9.5,
 10.0,
 10.5,
 11.0,
 11.5,
 12.0,
 12.5,
 13.0,
 13.5,
 14.0,
 14.5,
 15.0,
 15.5,
 16.0,
 16.5,
 17.0,
 17.5,
 18.0,
 18.5,
 19.0,
 19.5,
 20.0,
 20.5,
 21.0,
 21.5,
 22.0,
 22.5,
 23.0,
 23.5,
 24.0,
 24.5,
 25.0,
 25.5,
 26.0,
 26.5,
 27.0,
 27.5,
 28.0,
 28.5,
 29.0,
 29.5,
 30.0,
 30.5,
 31.0,
 31.5,
 32.0,
 32.5,
 33.0,
 33.5,
 34.0,
 34.5,
 35.0,
 35.5,
 36.0,
 36.5,
 37.0,
 37.5,
 38.0,
 38.5,
 39.0]

In [22]:
### list of track info dfs
dfs = list()
### empty dictionary for filtered tracks
filtered_tracks = dict()

for expt_ID in tqdm(expts):
    
    # extract metadata for each experiment
    base_dir = f'/mnt/SYNO/macrohet_syno/data/{expt_ID}/'
    metadata_fn = os.path.join(base_dir, 'acquisition/Images/Index.idx.xml')
    metadata = dataio.read_harmony_metadata(metadata_fn)  
    metadata_path = glob.glob(os.path.join(base_dir, 'acquisition/Assaylayout/*.xml'))[0]
    assay_layout = dataio.read_harmony_metadata(metadata_path, assay_layout=True,)

    expt_keys = [key for key in tracks_dict.keys() if expt_ID in key]

    ### iterate over all tracks
    for key in tqdm(expt_keys):
        ### extract tracks only with max length
        filtered_tracks[key] = [track for track in tracks_dict[key] if len(track) >= expt_param_dict[expt_ID]['min length']] 
        ### iterate over full length tracks
        for track in filtered_tracks[key]:
            ### get info for assay layout
            info = assay_layout.loc[(key[1], key[2])]
            ### compile single track dictionary of infO

            
            if expt_ID != 'PS0000':
                d = {'Time (hours)':[t/2 for t in track['t']],
                     'Mtb Area (µm)':track['Mtb area px'] * pixel_to_mum_sq_scale_factor,
                     'dMtb Area (µm)': [track['Mtb area px'][-1]*pixel_to_mum_sq_scale_factor - track['Mtb area px'][0]*pixel_to_mum_sq_scale_factor for i in range(len(track['t']))],
                     # 'Mtb Area (px)':track['Mtb area px'],
                     # 'dMtb Area (px)': [track['Mtb area px'][-1] - track['Mtb area px'][0] for i in range(len(track['t']))],
                     'Mphi Area (µm)':track['area'] * pixel_to_mum_sq_scale_factor, 
                     'dMphi Area (µm)': [track['area'][-1]*pixel_to_mum_sq_scale_factor - track['area'][0]*pixel_to_mum_sq_scale_factor for i in range(len(track['t']))],
                     # 'Mphi Area (px)':track['area'], 
                     # 'dMphi Area (px)': [track['area'][-1] - track['area'][0] for i in range(len(track['t']))],
                     'Infection Status':track['Infected'],
                     'Initial Infection Status':track['Infected'][0], 
                     'Final Infection Status':track['Infected'][-1], 
                     'x':track['x'],
                     'y':track['y'],
                     # 'x scaled':[track['x'][i]*5.04 for i, x in enumerate(track['x'])],
                     # 'y scaled':[track['y'][i]*5.04 for i, y in enumerate(track['y'])],
                     'GFP':track['mean_intensity'][:, expt_param_dict[expt_ID]['gfp channel']],
                     'RFP':track['mean_intensity'][:, expt_param_dict[expt_ID]['mtb channel']],
                     # 'dRFP': [track['mean_intensity'][-1, mtb_channel] - track['mean_intensity'][0, mtb_channel] for i in range(len(track['t']))],
                     # 'Intracellular thresholded Mtb content':track['mean_intensity'][:, 2],
                     'Eccentricity':np.sqrt(1-((track['minor_axis_length']**2)/(track['major_axis_length']**2))),
                     'MSD': [tools.euc_dist(track['x'][i-1], 
                                            track['y'][i-1], 
                                            track['x'][i], 
                                            track['y'][i]) 
                              if i != 0 else 0
                              for i in range(0, len(track))],
                     'Technical Replicate':[info['Replicate #'] for i in range(len(track['t']))], 
                     'Biological Replicate': [expt_param_dict[expt_ID]['biological replicate'] for i in range(len(track['t']))],
                     'Strain':[info['Strain'] for i in range(len(track['t']))], 
                     'Compound':[info['Compound'] for i in range(len(track['t']))], 
                     'Concentration':[info['ConcentrationEC'] for i in range(len(track['t']))], 
                     'Cell ID':[track.ID for i in range(len(track['t']))],
                     'Acquisition ID':[(key[1],key[2]) for i in range(len(track['t']))], 
                     'Experiment ID':[key[0] for i in range(len(track['t']))], 
                     'Unique ID': [f'{track.ID}.{key[1]}.{key[2]}' for i in range(len(track['t']))], 
                     'ID': [f'{track.ID}.{key[1]}.{key[2]}.{key[0]}' for i in range(len(track['t']))]}
            else:
                d = {'Time (hours)':track['t'], 
                     'Mtb Area (µm)':None,
                     'dMtb Area (µm)':None,
                     # 'Mtb Area (px)':track['Mtb area px'],
                     # 'dMtb Area (px)': [track['Mtb area px'][-1] - track['Mtb area px'][0] for i in range(len(track['t']))],
                     'Mphi Area (µm)':track['area'] * pixel_to_mum_sq_scale_factor, 
                     'dMphi Area (µm)': [track['area'][-1]*pixel_to_mum_sq_scale_factor - track['area'][0]*pixel_to_mum_sq_scale_factor for i in range(len(track['t']))],
                     # 'Mphi Area (px)':track['area'], 
                     # 'dMphi Area (px)': [track['area'][-1] - track['area'][0] for i in range(len(track['t']))],
                     'Infection Status':track['Infected'],
                     'Initial Infection Status':track['Infected'][0], 
                     'Final Infection Status':track['Infected'][-1], 
                     'x':track['x'],
                     'y':track['y'],
                     # 'x scaled':[track['x'][i]*5.04 for i, x in enumerate(track['x'])],
                     # 'y scaled':[track['y'][i]*5.04 for i, y in enumerate(track['y'])],
                     'GFP':track['mean_intensity'][:, expt_param_dict[expt_ID]['gfp channel']],
                     'RFP':track['mean_intensity'][:, expt_param_dict[expt_ID]['mtb channel']],
                     # 'dRFP': [track['mean_intensity'][-1, mtb_channel] - track['mean_intensity'][0, mtb_channel] for i in range(len(track['t']))],
                     # 'Intracellular thresholded Mtb content':track['mean_intensity'][:, 2],
                     'Eccentricity':np.sqrt(1-((track['minor_axis_length']**2)/(track['major_axis_length']**2))),
                     'MSD': [tools.euc_dist(track['x'][i-1], 
                                            track['y'][i-1], 
                                            track['x'][i], 
                                            track['y'][i]) 
                              if i != 0 else 0
                              for i in range(0, len(track))],
                     'Technical Replicate':[info['Replicate #'] for i in range(len(track['t']))], 
                     'Biological Replicate': [expt_param_dict[expt_ID]['biological replicate'] for i in range(len(track['t']))],
                     'Strain':[info['Strain'] for i in range(len(track['t']))], 
                     'Compound':[info['Compound'] for i in range(len(track['t']))], 
                     'Concentration':[info['ConcentrationEC'] for i in range(len(track['t']))], 
                     'Cell ID':[track.ID for i in range(len(track['t']))],
                     'Acquisition ID':[(key[1],key[2]) for i in range(len(track['t']))], 
                     'Experiment ID':[key[0] for i in range(len(track['t']))], 
                     'Unique ID': [f'{track.ID}.{key[1]}.{key[2]}' for i in range(len(track['t']))], 
                     'ID': [f'{track.ID}.{key[1]}.{key[2]}.{key[0]}' for i in range(len(track['t']))]}
            ### append df to list of dfs
            dfs.append(pd.DataFrame(d))
### concat single track dfs into big df
df = pd.concat(dfs, ignore_index=True)
df

  0%|          | 0/3 [00:00<?, ?it/s]

Reading metadata XML file...


0it [00:00, ?it/s]

Extracting metadata complete!
Reading metadata XML file...
Extracting metadata complete!


  0%|          | 0/24 [00:00<?, ?it/s]

Reading metadata XML file...


0it [00:00, ?it/s]

Extracting metadata complete!
Reading metadata XML file...
Extracting metadata complete!


  0%|          | 0/41 [00:00<?, ?it/s]

Reading metadata XML file...


0it [00:00, ?it/s]

Extracting metadata complete!
Reading metadata XML file...
Extracting metadata complete!


  0%|          | 0/42 [00:00<?, ?it/s]

  df = pd.concat(dfs, ignore_index=True)


Unnamed: 0,Time (hours),Mtb Area (µm),dMtb Area (µm),Mphi Area (µm),dMphi Area (µm),Infection Status,Initial Infection Status,Final Infection Status,x,y,...,Technical Replicate,Biological Replicate,Strain,Compound,Concentration,Cell ID,Acquisition ID,Experiment ID,Unique ID,ID
0,0.0,,,951.061153,-526.999084,0.0,0.0,1.0,708.125061,77.589088,...,1,1,RD1,CTRL,EC0,433,"(3, 4)",PS0000,433.3.4,433.3.4.PS0000
1,1.0,,,1088.973512,-526.999084,0.0,0.0,1.0,725.449097,73.245003,...,1,1,RD1,CTRL,EC0,433,"(3, 4)",PS0000,433.3.4,433.3.4.PS0000
2,2.0,,,971.800525,-526.999084,0.0,0.0,1.0,723.088928,76.431496,...,1,1,RD1,CTRL,EC0,433,"(3, 4)",PS0000,433.3.4,433.3.4.PS0000
3,3.0,,,874.115398,-526.999084,0.0,0.0,1.0,726.009766,79.395073,...,1,1,RD1,CTRL,EC0,433,"(3, 4)",PS0000,433.3.4,433.3.4.PS0000
4,4.0,,,729.543196,-526.999084,0.0,0.0,1.0,728.209656,82.197769,...,1,1,RD1,CTRL,EC0,433,"(3, 4)",PS0000,433.3.4,433.3.4.PS0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1797106,74.5,1.117423,1.095075,752.584461,55.088959,1.0,1.0,1.0,227.093262,130.358139,...,2,3,RD1,BDQ,EC99,1874,"(6, 12)",ND0003,1874.6.12,1874.6.12.ND0003
1797107,75.0,2.033710,1.095075,616.035357,55.088959,1.0,1.0,1.0,226.622925,130.532120,...,2,3,RD1,BDQ,EC99,1874,"(6, 12)",ND0003,1874.6.12,1874.6.12.ND0003
1797108,75.5,1.430302,1.095075,579.182743,55.088959,1.0,1.0,1.0,225.366608,129.932709,...,2,3,RD1,BDQ,EC99,1874,"(6, 12)",ND0003,1874.6.12,1874.6.12.ND0003
1797109,76.0,1.989013,1.095075,679.147414,55.088959,1.0,1.0,1.0,226.424683,128.489410,...,2,3,RD1,BDQ,EC99,1874,"(6, 12)",ND0003,1874.6.12,1874.6.12.ND0003


In [23]:
len((df['Unique ID'].unique()))

21131

In [24]:
df.to_pickle('temp.pkl')

# Separating dataframes so that I can integrate previous GT tracks from PS000

In [94]:
previous_df = pd.read_pickle('/mnt/SYNO/macrohet_syno/data/PS0000/results/preliminary_sc_measures/sc_dfs/sc_df_GT_70_area_false_outliers_removed_correct.pkl')

In [95]:
# for formatting purposes, this turns them into float
previous_df['Time (hours)'] = previous_df['Time (hours)']*1.0

In [96]:
previous_df['Mphi Area (µm)'] = previous_df['Mphi Area']*pixel_to_mum_sq_scale_factor

In [97]:
previous_df['dMphi Area (µm)'] = previous_df.groupby('Unique ID')['Mphi Area (µm)'].transform(lambda x: x.iloc[-1] - x.iloc[0])

In [98]:
previous_df.rename(columns={'Mtb Area':'Mtb Area (µm)'}, inplace=True)
previous_df.rename(columns={'dMtb':'dMtb Area (µm)'}, inplace=True)
previous_df.rename(columns={'Mphi Area':'Mphi Area (µm)'}, inplace=True)
previous_df.rename(columns={'Mphi GFP':'GFP'}, inplace=True)
previous_df.rename(columns={'Mean Mtb Intensity':'RFP'}, inplace=True)

In [99]:
previous_df['Experiment ID'] = 'PS0000'

In [100]:
base_dir = f'/mnt/SYNO/macrohet_syno/data/PS0000'
metadata_path = glob.glob(os.path.join(base_dir, 'acquisition/Assaylayout/*.xml'))[0]
assay_layout = dataio.read_harmony_metadata(metadata_path, assay_layout=True, replicate_number = True)
assay_layout

Reading metadata XML file...
Extracting metadata complete!


Unnamed: 0_level_0,Unnamed: 1_level_0,Strain,Compound,Concentration,ConcentrationEC,Replicate #
Row,Column,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,4,RD1,CTRL,0.0,EC0,1
3,5,WT,CTRL,0.0,EC0,1
3,6,WT,PZA,60.0,EC50,1
3,7,WT,RIF,0.1,EC50,1
3,8,WT,INH,0.04,EC50,1
3,9,WT,BDQ,0.02,EC50,1
4,4,RD1,CTRL,0.0,EC0,2
4,5,WT,CTRL,0.0,EC0,2
4,6,WT,PZA,60.0,EC50,2
4,7,WT,RIF,0.1,EC50,2


In [101]:
assay_layout['Acquisition ID'] = assay_layout.index.to_list()
assay_layout

Unnamed: 0_level_0,Unnamed: 1_level_0,Strain,Compound,Concentration,ConcentrationEC,Replicate #,Acquisition ID
Row,Column,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,4,RD1,CTRL,0.0,EC0,1,"(3, 4)"
3,5,WT,CTRL,0.0,EC0,1,"(3, 5)"
3,6,WT,PZA,60.0,EC50,1,"(3, 6)"
3,7,WT,RIF,0.1,EC50,1,"(3, 7)"
3,8,WT,INH,0.04,EC50,1,"(3, 8)"
3,9,WT,BDQ,0.02,EC50,1,"(3, 9)"
4,4,RD1,CTRL,0.0,EC0,2,"(4, 4)"
4,5,WT,CTRL,0.0,EC0,2,"(4, 5)"
4,6,WT,PZA,60.0,EC50,2,"(4, 6)"
4,7,WT,RIF,0.1,EC50,2,"(4, 7)"


In [102]:
acquisition_to_replicate = assay_layout.set_index('Acquisition ID')['Replicate #'].to_dict()

In [103]:
previous_df['Technical Replicate'] = previous_df['Acquisition ID'].map(acquisition_to_replicate)
previous_df['Biological Replicate'] = 1

In [104]:
previous_df['Technical Replicate'] = previous_df['Acquisition ID'].map(acquisition_to_replicate)

In [105]:
previous_df.keys(), df.keys()

(Index(['Time (hours)', 'Mtb Area (µm)', 'dMtb Area (µm)', 'Strain', 'Compound',
        'Concentration', 'Unique ID', 'Acquisition ID', 'Cell ID',
        'Mphi Area (µm)', 'GFP', 'x', 'y', 'Eccentricity', 'MSD',
        'Technical Replicate', 'Strain/Compound', 'dMtb/dt', '%dMtb/dt',
        '%dMtb', 'RFP', 'Thresholded Mean Mtb Intensity', 'Infection Status',
        'Initial Infection Status', 'Final Infection Status', 'dMtb/dt,T=15',
        'Max. dMtb/dt,T=15', 'Mphi Area (µm)', 'dMphi Area (µm)',
        'Experiment ID', 'Biological Replicate'],
       dtype='object'),
 Index(['Time (hours)', 'Mtb Area (µm)', 'dMtb Area (µm)', 'Mphi Area (µm)',
        'dMphi Area (µm)', 'Infection Status', 'Initial Infection Status',
        'Final Infection Status', 'x', 'y', 'GFP', 'RFP', 'Eccentricity', 'MSD',
        'Technical Replicate', 'Biological Replicate', 'Strain', 'Compound',
        'Concentration', 'Cell ID', 'Acquisition ID', 'Experiment ID',
        'Unique ID', 'ID', 'tempID']

In [106]:
previous_df['ID'] = previous_df.apply(lambda row: f"{row['Unique ID']}.{row['Experiment ID']}", axis=1)

In [107]:
# Assuming you have two DataFrames: previous_df and new_df

# Get the columns present in both DataFrames
common_columns = previous_df.columns.intersection(df.columns)

print(common_columns)
# Assuming you have two DataFrames: df and previous_df

# Get the columns in df that are not in previous_df
unique_columns = previous_df.columns.difference(df.columns)

print(unique_columns)


Index(['Time (hours)', 'Mtb Area (µm)', 'dMtb Area (µm)', 'Strain', 'Compound',
       'Concentration', 'Unique ID', 'Acquisition ID', 'Cell ID',
       'Mphi Area (µm)', 'GFP', 'x', 'y', 'Eccentricity', 'MSD',
       'Technical Replicate', 'RFP', 'Infection Status',
       'Initial Infection Status', 'Final Infection Status', 'dMphi Area (µm)',
       'Experiment ID', 'Biological Replicate', 'ID'],
      dtype='object')
Index(['%dMtb', '%dMtb/dt', 'Max. dMtb/dt,T=15', 'Strain/Compound',
       'Thresholded Mean Mtb Intensity', 'dMtb/dt', 'dMtb/dt,T=15'],
      dtype='object')


# Merge the data

In [108]:
previous_df['tempID'] = previous_df.apply(lambda row: f"{row['ID']}.{row['Time (hours)']}", axis=1)
df['tempID'] = df.apply(lambda row: f"{row['ID']}.{row['Time (hours)']}", axis=1)

In [109]:
previous_df[['Mtb Area (µm)', 'dMtb Area (µm)', 'tempID']]

Unnamed: 0,Mtb Area (µm),dMtb Area (µm),tempID
0,46.797680,136.772588,1.3.5.PS0000.0.0
1,48.719647,136.772588,1.3.5.PS0000.1.0
2,52.206007,136.772588,1.3.5.PS0000.2.0
3,50.552221,136.772588,1.3.5.PS0000.3.0
4,54.463202,136.772588,1.3.5.PS0000.4.0
...,...,...,...
296992,0.000000,-3.888632,996.4.5.PS0000.70.0
296993,0.000000,-3.888632,996.4.5.PS0000.71.0
296994,0.000000,-3.888632,996.4.5.PS0000.72.0
296995,0.000000,-3.888632,996.4.5.PS0000.73.0


In [110]:
correct_IDs = previous_df['tempID'].values
correct_IDs

array(['1.3.5.PS0000.0.0', '1.3.5.PS0000.1.0', '1.3.5.PS0000.2.0', ...,
       '996.4.5.PS0000.72.0', '996.4.5.PS0000.73.0',
       '996.4.5.PS0000.74.0'], dtype=object)

In [111]:
ps_df = df[df['Experiment ID'] == 'PS0000']

In [112]:
ps_df

Unnamed: 0,Time (hours),Mtb Area (µm),dMtb Area (µm),Mphi Area (µm),dMphi Area (µm),Infection Status,Initial Infection Status,Final Infection Status,x,y,...,Biological Replicate,Strain,Compound,Concentration,Cell ID,Acquisition ID,Experiment ID,Unique ID,ID,tempID
0,0.0,,,951.061153,-526.999084,0.0,0.0,1.0,708.125061,77.589088,...,1,RD1,CTRL,EC0,433,"(3, 4)",PS0000,433.3.4,433.3.4.PS0000,433.3.4.PS0000.0.0
1,1.0,,,1088.973512,-526.999084,0.0,0.0,1.0,725.449097,73.245003,...,1,RD1,CTRL,EC0,433,"(3, 4)",PS0000,433.3.4,433.3.4.PS0000,433.3.4.PS0000.1.0
2,2.0,,,971.800525,-526.999084,0.0,0.0,1.0,723.088928,76.431496,...,1,RD1,CTRL,EC0,433,"(3, 4)",PS0000,433.3.4,433.3.4.PS0000,433.3.4.PS0000.2.0
3,3.0,,,874.115398,-526.999084,0.0,0.0,1.0,726.009766,79.395073,...,1,RD1,CTRL,EC0,433,"(3, 4)",PS0000,433.3.4,433.3.4.PS0000,433.3.4.PS0000.3.0
4,4.0,,,729.543196,-526.999084,0.0,0.0,1.0,728.209656,82.197769,...,1,RD1,CTRL,EC0,433,"(3, 4)",PS0000,433.3.4,433.3.4.PS0000,433.3.4.PS0000.4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
773623,70.0,,,268.025106,76.811664,0.0,0.0,0.0,794.876343,643.271118,...,1,WT,BDQ,EC99,2443,"(6, 9)",PS0000,2443.6.9,2443.6.9.PS0000,2443.6.9.PS0000.70.0
773624,71.0,,,,76.811664,,0.0,0.0,794.739807,643.190918,...,1,WT,BDQ,EC99,2443,"(6, 9)",PS0000,2443.6.9,2443.6.9.PS0000,2443.6.9.PS0000.71.0
773625,72.0,,,292.608414,76.811664,0.0,0.0,0.0,794.308044,644.506714,...,1,WT,BDQ,EC99,2443,"(6, 9)",PS0000,2443.6.9,2443.6.9.PS0000,2443.6.9.PS0000.72.0
773626,73.0,,,,76.811664,,0.0,0.0,794.442871,644.764648,...,1,WT,BDQ,EC99,2443,"(6, 9)",PS0000,2443.6.9,2443.6.9.PS0000,2443.6.9.PS0000.73.0


In [113]:
correct_IDs

array(['1.3.5.PS0000.0.0', '1.3.5.PS0000.1.0', '1.3.5.PS0000.2.0', ...,
       '996.4.5.PS0000.72.0', '996.4.5.PS0000.73.0',
       '996.4.5.PS0000.74.0'], dtype=object)

In [114]:
ps_df['tempID']

0           433.3.4.PS0000.0.0
1           433.3.4.PS0000.1.0
2           433.3.4.PS0000.2.0
3           433.3.4.PS0000.3.0
4           433.3.4.PS0000.4.0
                  ...         
773623    2443.6.9.PS0000.70.0
773624    2443.6.9.PS0000.71.0
773625    2443.6.9.PS0000.72.0
773626    2443.6.9.PS0000.73.0
773627    2443.6.9.PS0000.74.0
Name: tempID, Length: 773628, dtype: object

In [115]:
ps_df[ps_df['tempID'].isin(correct_IDs)]

Unnamed: 0,Time (hours),Mtb Area (µm),dMtb Area (µm),Mphi Area (µm),dMphi Area (µm),Infection Status,Initial Infection Status,Final Infection Status,x,y,...,Biological Replicate,Strain,Compound,Concentration,Cell ID,Acquisition ID,Experiment ID,Unique ID,ID,tempID
152,0.0,,,459.082108,-92.746118,1.0,1.0,0.0,457.172943,8.791715,...,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000,426.3.4.PS0000.0.0
153,1.0,,,372.839393,-92.746118,0.0,1.0,0.0,459.138947,6.718642,...,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000,426.3.4.PS0000.1.0
154,2.0,,,423.168130,-92.746118,0.0,1.0,0.0,460.555237,10.785886,...,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000,426.3.4.PS0000.2.0
155,3.0,,,445.561289,-92.746118,1.0,1.0,0.0,455.878815,11.143067,...,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000,426.3.4.PS0000.3.0
156,4.0,,,487.040034,-92.746118,1.0,1.0,0.0,462.634186,14.050420,...,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000,426.3.4.PS0000.4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
767447,70.0,,,2279.341997,2028.994524,1.0,0.0,1.0,628.562500,1099.782104,...,1,WT,BDQ,EC99,935,"(6, 9)",PS0000,935.6.9,935.6.9.PS0000,935.6.9.PS0000.70.0
767448,71.0,,,2256.703005,2028.994524,1.0,0.0,1.0,628.463684,1102.144409,...,1,WT,BDQ,EC99,935,"(6, 9)",PS0000,935.6.9,935.6.9.PS0000,935.6.9.PS0000.71.0
767449,72.0,,,2422.506245,2028.994524,1.0,0.0,1.0,628.672058,1101.594727,...,1,WT,BDQ,EC99,935,"(6, 9)",PS0000,935.6.9,935.6.9.PS0000,935.6.9.PS0000.72.0
767450,73.0,,,2393.050972,2028.994524,1.0,0.0,1.0,630.372803,1102.817749,...,1,WT,BDQ,EC99,935,"(6, 9)",PS0000,935.6.9,935.6.9.PS0000,935.6.9.PS0000.73.0


In [116]:
ps_df = ps_df[ps_df['tempID'].isin(correct_IDs)]

In [117]:
ps_df = ps_df.merge(previous_df[['tempID', 'Mtb Area (µm)', 'dMtb Area (µm)']],
                    on='tempID', 
                    how='left',
                    suffixes=('', '_from_previous'))

In [118]:
ps_df['Mtb Area (µm)'] = ps_df['Mtb Area (µm)_from_previous'].fillna(ps_df['Mtb Area (µm)'])
ps_df['dMtb Area (µm)'] = ps_df['dMtb Area (µm)_from_previous'].fillna(ps_df['dMtb Area (µm)'])

In [119]:
ps_df.drop(['Mtb Area (µm)_from_previous', 'dMtb Area (µm)_from_previous', 'tempID'], axis=1, inplace=True)

In [120]:
ps_df

Unnamed: 0,Time (hours),Mtb Area (µm),dMtb Area (µm),Mphi Area (µm),dMphi Area (µm),Infection Status,Initial Infection Status,Final Infection Status,x,y,...,Technical Replicate,Biological Replicate,Strain,Compound,Concentration,Cell ID,Acquisition ID,Experiment ID,Unique ID,ID
0,0.0,0.424621,-0.424621,459.082108,-92.746118,1.0,1.0,0.0,457.172943,8.791715,...,1,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000
1,1.0,0.000000,-0.424621,372.839393,-92.746118,0.0,1.0,0.0,459.138947,6.718642,...,1,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000
2,2.0,0.000000,-0.424621,423.168130,-92.746118,0.0,1.0,0.0,460.555237,10.785886,...,1,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000
3,3.0,0.335227,-0.424621,445.561289,-92.746118,1.0,1.0,0.0,455.878815,11.143067,...,1,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000
4,4.0,0.000000,-0.424621,487.040034,-92.746118,1.0,1.0,0.0,462.634186,14.050420,...,1,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291521,70.0,5.028404,4.559086,2279.341997,2028.994524,1.0,0.0,1.0,628.562500,1099.782104,...,2,1,WT,BDQ,EC99,935,"(6, 9)",PS0000,935.6.9,935.6.9.PS0000
291522,71.0,5.564767,4.559086,2256.703005,2028.994524,1.0,0.0,1.0,628.463684,1102.144409,...,2,1,WT,BDQ,EC99,935,"(6, 9)",PS0000,935.6.9,935.6.9.PS0000
291523,72.0,5.274237,4.559086,2422.506245,2028.994524,1.0,0.0,1.0,628.672058,1101.594727,...,2,1,WT,BDQ,EC99,935,"(6, 9)",PS0000,935.6.9,935.6.9.PS0000
291524,73.0,5.140146,4.559086,2393.050972,2028.994524,1.0,0.0,1.0,630.372803,1102.817749,...,2,1,WT,BDQ,EC99,935,"(6, 9)",PS0000,935.6.9,935.6.9.PS0000


In [121]:
final_df = pd.concat([ps_df, df[df['Experiment ID'] != 'PS0000']], axis=0)

In [122]:
final_df

Unnamed: 0,Time (hours),Mtb Area (µm),dMtb Area (µm),Mphi Area (µm),dMphi Area (µm),Infection Status,Initial Infection Status,Final Infection Status,x,y,...,Biological Replicate,Strain,Compound,Concentration,Cell ID,Acquisition ID,Experiment ID,Unique ID,ID,tempID
0,0.0,0.424621,-0.424621,459.082108,-92.746118,1.0,1.0,0.0,457.172943,8.791715,...,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000,
1,1.0,0.000000,-0.424621,372.839393,-92.746118,0.0,1.0,0.0,459.138947,6.718642,...,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000,
2,2.0,0.000000,-0.424621,423.168130,-92.746118,0.0,1.0,0.0,460.555237,10.785886,...,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000,
3,3.0,0.335227,-0.424621,445.561289,-92.746118,1.0,1.0,0.0,455.878815,11.143067,...,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000,
4,4.0,0.000000,-0.424621,487.040034,-92.746118,1.0,1.0,0.0,462.634186,14.050420,...,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1797106,74.5,1.117423,1.095075,752.584461,55.088959,1.0,1.0,1.0,227.093262,130.358139,...,3,RD1,BDQ,EC99,1874,"(6, 12)",ND0003,1874.6.12,1874.6.12.ND0003,1874.6.12.ND0003.74.5
1797107,75.0,2.033710,1.095075,616.035357,55.088959,1.0,1.0,1.0,226.622925,130.532120,...,3,RD1,BDQ,EC99,1874,"(6, 12)",ND0003,1874.6.12,1874.6.12.ND0003,1874.6.12.ND0003.75.0
1797108,75.5,1.430302,1.095075,579.182743,55.088959,1.0,1.0,1.0,225.366608,129.932709,...,3,RD1,BDQ,EC99,1874,"(6, 12)",ND0003,1874.6.12,1874.6.12.ND0003,1874.6.12.ND0003.75.5
1797109,76.0,1.989013,1.095075,679.147414,55.088959,1.0,1.0,1.0,226.424683,128.489410,...,3,RD1,BDQ,EC99,1874,"(6, 12)",ND0003,1874.6.12,1874.6.12.ND0003,1874.6.12.ND0003.76.0


In [131]:
final_df[final_df['ID'] == '220.3.1.ND0003']['Mtb Area (µm)'])

989742    0.0
989743    0.0
989744    0.0
989745    0.0
989746    NaN
         ... 
989808    0.0
989809    0.0
989810    0.0
989811    0.0
989812    0.0
Name: Mtb Area (µm), Length: 71, dtype: float64

In [126]:
os.path.exists('/mnt/SYNO/macrohet_syno/results/sc_df.pkl')

True

In [124]:
final_df.to_pickle('/mnt/SYNO/macrohet_syno/results/sc_df.pkl')

In [128]:
final_df.to_csv('/mnt/SYNO/macrohet_syno/results/dfs/sc_df.csv')