# A notebook for collating the SC data

In [3]:
import btrack
import os
from macrohet import dataio, tools
import glob
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

### To do:

1. Check area Mtb measurements are in agreement
2. Homogenise method to measure area, dmtb, doubling time etc
3. Ensure continuity between minimum track length across expts

### Define scope of analysis

In [9]:
expts = ['PS0000', 'ND0002', 'ND0003']

In [10]:
error_IDs = []
# load tracks 
tracks_dict = dict()
for expt_ID in tqdm(expts):
    
    # extract metadata for each experiment
    base_dir = f'/mnt/SYNO/macrohet_syno/{expt_ID}/'
    metadata_fn = os.path.join(base_dir, 'acquisition/Images/Index.idx.xml')
    metadata = dataio.read_harmony_metadata(metadata_fn)  
    metadata_path = glob.glob(os.path.join(base_dir, 'acquisition/Assaylayout/*.xml'))[0]
    assay_layout = dataio.read_harmony_metadata(metadata_path, assay_layout=True,)

    
    ### iterate over all experimental conditions
    for (row, column), info in tqdm(assay_layout.iterrows(), 
                                    desc = 'Progress through positions',
                                    total = len(assay_layout)):
        try:
            # define paths dependent on model
            if expt_ID == 'PS0000':
                track_path = os.path.join(base_dir, f'labels/macrohet_seg_model/{row,column}.h5')
            else:
                track_path = os.path.join(base_dir, f'labels/cpv3/{row,column}.h5')

            ### load tracks
            with btrack.io.HDF5FileHandler(track_path, 
                                               'r', 
                                               obj_type='obj_type_1'
                                               ) as reader:
                tracks = reader.tracks
                    
            ### append tracks to dictionary
            tracks_dict[(expt_ID, row, column)] = tracks
        except:
            error_IDs.append((expt_ID, row, column))

  0%|          | 0/3 [00:00<?, ?it/s]

Reading metadata XML file...


0it [00:00, ?it/s]

Extracting metadata complete!
Reading metadata XML file...
Extracting metadata complete!


Progress through positions:   0%|          | 0/24 [00:00<?, ?it/s]

[INFO][2024/04/04 11:50:08 AM] Opening HDF file: /mnt/SYNO/macrohet_syno/PS0000/labels/macrohet_seg_model/(3, 4).h5...
[INFO][2024/04/04 11:50:08 AM] Loading tracks/obj_type_1
[INFO][2024/04/04 11:50:08 AM] Loading LBEP/obj_type_1
[INFO][2024/04/04 11:50:09 AM] Loading objects/obj_type_1 (39878, 5) (39878 filtered: None)
[INFO][2024/04/04 11:50:09 AM] Closing HDF file: /mnt/SYNO/macrohet_syno/PS0000/labels/macrohet_seg_model/(3, 4).h5
[INFO][2024/04/04 11:50:09 AM] Opening HDF file: /mnt/SYNO/macrohet_syno/PS0000/labels/macrohet_seg_model/(3, 5).h5...
[INFO][2024/04/04 11:50:09 AM] Loading tracks/obj_type_1
[INFO][2024/04/04 11:50:09 AM] Loading LBEP/obj_type_1
[INFO][2024/04/04 11:50:10 AM] Loading objects/obj_type_1 (41424, 5) (41424 filtered: None)
[INFO][2024/04/04 11:50:10 AM] Closing HDF file: /mnt/SYNO/macrohet_syno/PS0000/labels/macrohet_seg_model/(3, 5).h5
[INFO][2024/04/04 11:50:10 AM] Opening HDF file: /mnt/SYNO/macrohet_syno/PS0000/labels/macrohet_seg_model/(3, 6).h5...
[IN

Reading metadata XML file...


0it [00:00, ?it/s]

Extracting metadata complete!
Reading metadata XML file...
Extracting metadata complete!


Progress through positions:   0%|          | 0/42 [00:00<?, ?it/s]

[INFO][2024/04/04 11:51:10 AM] Opening HDF file: /mnt/SYNO/macrohet_syno/ND0002/labels/cpv3/(3, 1).h5...
[INFO][2024/04/04 11:51:10 AM] Loading tracks/obj_type_1
[INFO][2024/04/04 11:51:10 AM] Loading LBEP/obj_type_1
[INFO][2024/04/04 11:51:10 AM] Loading objects/obj_type_1 (15525, 5) (15525 filtered: None)
[INFO][2024/04/04 11:51:11 AM] Closing HDF file: /mnt/SYNO/macrohet_syno/ND0002/labels/cpv3/(3, 1).h5
[INFO][2024/04/04 11:51:11 AM] Opening HDF file: /mnt/SYNO/macrohet_syno/ND0002/labels/cpv3/(3, 2).h5...
[INFO][2024/04/04 11:51:11 AM] Loading tracks/obj_type_1
[INFO][2024/04/04 11:51:11 AM] Loading LBEP/obj_type_1
[INFO][2024/04/04 11:51:11 AM] Loading objects/obj_type_1 (15189, 5) (15189 filtered: None)
[INFO][2024/04/04 11:51:11 AM] Closing HDF file: /mnt/SYNO/macrohet_syno/ND0002/labels/cpv3/(3, 2).h5
[INFO][2024/04/04 11:51:11 AM] Opening HDF file: /mnt/SYNO/macrohet_syno/ND0002/labels/cpv3/(3, 3).h5...
[INFO][2024/04/04 11:51:11 AM] Loading tracks/obj_type_1
[INFO][2024/04/0

Reading metadata XML file...


0it [00:00, ?it/s]

Extracting metadata complete!
Reading metadata XML file...
Extracting metadata complete!


Progress through positions:   0%|          | 0/42 [00:00<?, ?it/s]

[INFO][2024/04/04 11:52:04 AM] Opening HDF file: /mnt/SYNO/macrohet_syno/ND0003/labels/cpv3/(3, 1).h5...
[INFO][2024/04/04 11:52:04 AM] Loading tracks/obj_type_1
[INFO][2024/04/04 11:52:04 AM] Loading LBEP/obj_type_1
[INFO][2024/04/04 11:52:04 AM] Loading objects/obj_type_1 (46213, 5) (46213 filtered: None)
[INFO][2024/04/04 11:52:07 AM] Closing HDF file: /mnt/SYNO/macrohet_syno/ND0003/labels/cpv3/(3, 1).h5
[INFO][2024/04/04 11:52:07 AM] Opening HDF file: /mnt/SYNO/macrohet_syno/ND0003/labels/cpv3/(3, 2).h5...
[INFO][2024/04/04 11:52:07 AM] Loading tracks/obj_type_1
[INFO][2024/04/04 11:52:07 AM] Loading LBEP/obj_type_1
[INFO][2024/04/04 11:52:07 AM] Loading objects/obj_type_1 (47573, 5) (47573 filtered: None)
[INFO][2024/04/04 11:52:08 AM] Closing HDF file: /mnt/SYNO/macrohet_syno/ND0003/labels/cpv3/(3, 2).h5
[INFO][2024/04/04 11:52:08 AM] Opening HDF file: /mnt/SYNO/macrohet_syno/ND0003/labels/cpv3/(3, 3).h5...
[INFO][2024/04/04 11:52:08 AM] Loading tracks/obj_type_1
[INFO][2024/04/0

In [15]:
# Initialize the dictionary with the provided information for 'PS0000'
expt_param_dict = {
    'PS0000': {
        'min length': 35,
        'mtb channel': 1,
        'gfp channel': 0,
        'biological replicate': 1
    }
}

expt_param_dict['ND0002'] = {
    'min length': 70, 
    'mtb channel': 0,
    'gfp channel': 1,
    'biological replicate': 2
}

expt_param_dict['ND0003'] = {
    'min length': 70, 
    'mtb channel': 0,
    'gfp channel': 1,
    'biological replicate': 3
}
image_resolution = float(metadata['ImageResolutionX'].iloc[0])
meters_area_per_pixel = image_resolution**2
mum_sq_scale_factor = (1E-6)**2
pixel_to_mum_sq_scale_factor = meters_area_per_pixel/mum_sq_scale_factor

In [19]:
tracks_dict[('PS0000', 3, 4)][9]

Unnamed: 0,ID,t,x,y,z,parent,root,state,generation,dummy,Infected,area,major_axis_length,orientation,mean_intensity,minor_axis_length
0,431,0,1170.983887,13.310156,0.0,431,431,5,0,False,0.0,16481.0,190.778381,0.871957,"(3,) array",111.131699
1,431,1,1171.636597,13.921395,0.0,431,431,5,0,False,0.0,23359.0,225.327744,1.081966,"(3,) array",133.332199
2,431,2,1175.257446,12.739542,0.0,431,431,5,0,False,0.0,19475.0,193.105621,1.085899,"(3,) array",129.758133
3,431,3,1174.886719,10.947847,0.0,431,431,5,0,False,0.0,16054.0,173.431442,1.250856,"(3,) array",119.268036
4,431,4,1181.055908,9.864869,0.0,431,431,5,0,False,0.0,12926.0,161.761551,1.217899,"(3,) array",102.628525
5,431,5,1183.272461,13.188028,0.0,431,431,5,0,False,0.0,20224.0,169.591217,-1.233302,"(3,) array",156.272781
6,431,6,1182.75647,14.250345,0.0,431,431,5,0,False,0.0,22585.0,176.537781,-0.989864,"(3,) array",168.108032
7,431,7,1184.591309,14.224337,0.0,431,431,5,0,False,0.0,19063.0,175.395676,-0.608063,"(3,) array",140.854935
8,431,8,1184.145142,15.66828,0.0,431,431,5,0,False,0.0,21243.0,188.66478,-0.497698,"(3,) array",147.107819
9,431,9,1182.591309,14.40191,0.0,431,431,5,0,False,0.0,19574.0,192.702652,-0.690033,"(3,) array",132.717529


In [23]:
### list of track info dfs
dfs = list()
### empty dictionary for filtered tracks
filtered_tracks = dict()

for expt_ID in tqdm(expts):
    
    # extract metadata for each experiment
    base_dir = f'/mnt/SYNO/macrohet_syno/{expt_ID}/'
    metadata_fn = os.path.join(base_dir, 'acquisition/Images/Index.idx.xml')
    metadata = dataio.read_harmony_metadata(metadata_fn)  
    metadata_path = glob.glob(os.path.join(base_dir, 'acquisition/Assaylayout/*.xml'))[0]
    assay_layout = dataio.read_harmony_metadata(metadata_path, assay_layout=True,)

    expt_keys = [key for key in tracks_dict.keys() if expt_ID in key]

    ### iterate over all tracks
    for key in tqdm(expt_keys):
        ### extract tracks only with max length
        filtered_tracks[key] = [track for track in tracks_dict[key] if len(track) >= expt_param_dict[expt_ID]['min length']] 
        ### iterate over full length tracks
        for track in filtered_tracks[key]:
            ### get info for assay layout
            info = assay_layout.loc[(key[1], key[2])]
            ### compile single track dictionary of infO

            
            if expt_ID != 'PS0000':
                d = {'Time (hours)':track['t']/2, 
                     'Mtb Area (µm)':track['Mtb area px'] * pixel_to_mum_sq_scale_factor,
                     'dMtb Area (µm)': [track['Mtb area px'][-1]*pixel_to_mum_sq_scale_factor - track['Mtb area px'][0]*pixel_to_mum_sq_scale_factor for i in range(len(track['t']))],
                     # 'Mtb Area (px)':track['Mtb area px'],
                     # 'dMtb Area (px)': [track['Mtb area px'][-1] - track['Mtb area px'][0] for i in range(len(track['t']))],
                     'Mphi Area (µm)':track['area'] * pixel_to_mum_sq_scale_factor, 
                     'dMphi Area (µm)': [track['area'][-1]*pixel_to_mum_sq_scale_factor - track['area'][0]*pixel_to_mum_sq_scale_factor for i in range(len(track['t']))],
                     # 'Mphi Area (px)':track['area'], 
                     # 'dMphi Area (px)': [track['area'][-1] - track['area'][0] for i in range(len(track['t']))],
                     'Infection Status':track['Infected'],
                     'Initial Infection Status':track['Infected'][0], 
                     'Final Infection Status':track['Infected'][-1], 
                     'x':track['x'],
                     'y':track['y'],
                     # 'x scaled':[track['x'][i]*5.04 for i, x in enumerate(track['x'])],
                     # 'y scaled':[track['y'][i]*5.04 for i, y in enumerate(track['y'])],
                     'GFP':track['mean_intensity'][:, expt_param_dict[expt_ID]['gfp channel']],
                     'RFP':track['mean_intensity'][:, expt_param_dict[expt_ID]['mtb channel']],
                     # 'dRFP': [track['mean_intensity'][-1, mtb_channel] - track['mean_intensity'][0, mtb_channel] for i in range(len(track['t']))],
                     # 'Intracellular thresholded Mtb content':track['mean_intensity'][:, 2],
                     'Eccentricity':np.sqrt(1-((track['minor_axis_length']**2)/(track['major_axis_length']**2))),
                     'MSD': [tools.euc_dist(track['x'][i-1], 
                                            track['y'][i-1], 
                                            track['x'][i], 
                                            track['y'][i]) 
                              if i != 0 else 0
                              for i in range(0, len(track))],
                     'Technical Replicate':[info['Replicate #'] for i in range(len(track['t']))], 
                     'Biological Replicate': [expt_param_dict[expt_ID]['biological replicate'] for i in range(len(track['t']))],
                     'Strain':[info['Strain'] for i in range(len(track['t']))], 
                     'Compound':[info['Compound'] for i in range(len(track['t']))], 
                     'Concentration':[info['ConcentrationEC'] for i in range(len(track['t']))], 
                     'Cell ID':[track.ID for i in range(len(track['t']))],
                     'Acquisition ID':[(key[1],key[2]) for i in range(len(track['t']))], 
                     'Experiment ID':[key[0] for i in range(len(track['t']))], 
                     'Unique ID': [f'{track.ID}.{key[1]}.{key[2]}' for i in range(len(track['t']))], 
                     'ID': [f'{track.ID}.{key[1]}.{key[2]}.{key[0]}' for i in range(len(track['t']))]}
            else:
                d = {'Time (hours)':track['t'], 
                     'Mtb Area (µm)':None,
                     'dMtb Area (µm)':None,
                     # 'Mtb Area (px)':track['Mtb area px'],
                     # 'dMtb Area (px)': [track['Mtb area px'][-1] - track['Mtb area px'][0] for i in range(len(track['t']))],
                     'Mphi Area (µm)':track['area'] * pixel_to_mum_sq_scale_factor, 
                     'dMphi Area (µm)': [track['area'][-1]*pixel_to_mum_sq_scale_factor - track['area'][0]*pixel_to_mum_sq_scale_factor for i in range(len(track['t']))],
                     # 'Mphi Area (px)':track['area'], 
                     # 'dMphi Area (px)': [track['area'][-1] - track['area'][0] for i in range(len(track['t']))],
                     'Infection Status':track['Infected'],
                     'Initial Infection Status':track['Infected'][0], 
                     'Final Infection Status':track['Infected'][-1], 
                     'x':track['x'],
                     'y':track['y'],
                     # 'x scaled':[track['x'][i]*5.04 for i, x in enumerate(track['x'])],
                     # 'y scaled':[track['y'][i]*5.04 for i, y in enumerate(track['y'])],
                     'GFP':track['mean_intensity'][:, expt_param_dict[expt_ID]['gfp channel']],
                     'RFP':track['mean_intensity'][:, expt_param_dict[expt_ID]['mtb channel']],
                     # 'dRFP': [track['mean_intensity'][-1, mtb_channel] - track['mean_intensity'][0, mtb_channel] for i in range(len(track['t']))],
                     # 'Intracellular thresholded Mtb content':track['mean_intensity'][:, 2],
                     'Eccentricity':np.sqrt(1-((track['minor_axis_length']**2)/(track['major_axis_length']**2))),
                     'MSD': [tools.euc_dist(track['x'][i-1], 
                                            track['y'][i-1], 
                                            track['x'][i], 
                                            track['y'][i]) 
                              if i != 0 else 0
                              for i in range(0, len(track))],
                     'Technical Replicate':[info['Replicate #'] for i in range(len(track['t']))], 
                     'Biological Replicate': [expt_param_dict[expt_ID]['biological replicate'] for i in range(len(track['t']))],
                     'Strain':[info['Strain'] for i in range(len(track['t']))], 
                     'Compound':[info['Compound'] for i in range(len(track['t']))], 
                     'Concentration':[info['ConcentrationEC'] for i in range(len(track['t']))], 
                     'Cell ID':[track.ID for i in range(len(track['t']))],
                     'Acquisition ID':[(key[1],key[2]) for i in range(len(track['t']))], 
                     'Experiment ID':[key[0] for i in range(len(track['t']))], 
                     'Unique ID': [f'{track.ID}.{key[1]}.{key[2]}' for i in range(len(track['t']))], 
                     'ID': [f'{track.ID}.{key[1]}.{key[2]}.{key[0]}' for i in range(len(track['t']))]}
            ### append df to list of dfs
            dfs.append(pd.DataFrame(d))
### concat single track dfs into big df
df = pd.concat(dfs, ignore_index=True)
df

  0%|          | 0/3 [00:00<?, ?it/s]

Reading metadata XML file...


0it [00:00, ?it/s]

Extracting metadata complete!
Reading metadata XML file...
Extracting metadata complete!


  0%|          | 0/24 [00:00<?, ?it/s]

Reading metadata XML file...


0it [00:00, ?it/s]

Extracting metadata complete!
Reading metadata XML file...
Extracting metadata complete!


  0%|          | 0/41 [00:00<?, ?it/s]

Reading metadata XML file...


0it [00:00, ?it/s]

Extracting metadata complete!
Reading metadata XML file...
Extracting metadata complete!


  0%|          | 0/37 [00:00<?, ?it/s]

  df = pd.concat(dfs, ignore_index=True)


Unnamed: 0,Time (hours),Mtb Area (µm),dMtb Area (µm),Mphi Area (µm),dMphi Area (µm),Infection status,Initial Infection Status,Final Infection Status,x,y,...,Technical Replicate,Biological Replicate,Strain,Compound,Concentration,Cell ID,Acquisition ID,Experiment ID,Unique ID,ID
0,0,,,951.061153,-526.999084,0.0,0.0,1.0,708.125061,77.589088,...,1,1,RD1,CTRL,EC0,433,"(3, 4)",PS0000,433.3.4,433.3.4.PS0000
1,1,,,1088.973512,-526.999084,0.0,0.0,1.0,725.449097,73.245003,...,1,1,RD1,CTRL,EC0,433,"(3, 4)",PS0000,433.3.4,433.3.4.PS0000
2,2,,,971.800525,-526.999084,0.0,0.0,1.0,723.088928,76.431496,...,1,1,RD1,CTRL,EC0,433,"(3, 4)",PS0000,433.3.4,433.3.4.PS0000
3,3,,,874.115398,-526.999084,0.0,0.0,1.0,726.009766,79.395073,...,1,1,RD1,CTRL,EC0,433,"(3, 4)",PS0000,433.3.4,433.3.4.PS0000
4,4,,,729.543196,-526.999084,0.0,0.0,1.0,728.209656,82.197769,...,1,1,RD1,CTRL,EC0,433,"(3, 4)",PS0000,433.3.4,433.3.4.PS0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1722130,149,1.117423,1.095075,752.584461,55.088959,1.0,1.0,1.0,227.093262,130.358139,...,2,3,RD1,BDQ,EC99,1874,"(6, 12)",ND0003,1874.6.12,1874.6.12.ND0003
1722131,150,2.033710,1.095075,616.035357,55.088959,1.0,1.0,1.0,226.622925,130.532120,...,2,3,RD1,BDQ,EC99,1874,"(6, 12)",ND0003,1874.6.12,1874.6.12.ND0003
1722132,151,1.430302,1.095075,579.182743,55.088959,1.0,1.0,1.0,225.366608,129.932709,...,2,3,RD1,BDQ,EC99,1874,"(6, 12)",ND0003,1874.6.12,1874.6.12.ND0003
1722133,152,1.989013,1.095075,679.147414,55.088959,1.0,1.0,1.0,226.424683,128.489410,...,2,3,RD1,BDQ,EC99,1874,"(6, 12)",ND0003,1874.6.12,1874.6.12.ND0003


In [25]:
len((df['Unique ID'].unique()))

20552

In [26]:
df.to_pickle('temp.pkl')

# Separating dataframes so that I can integrate previous GT tracks from PS000

In [27]:
previous_df = pd.read_pickle('/mnt/SYNO/macrohet_syno/PS0000/results/preliminary_sc_measures/sc_dfs/sc_df_GT_70_area_false_outliers_removed_correct.pkl')

In [33]:
previous_df['Mphi Area'] = previous_df['Mphi Area']*pixel_to_mum_sq_scale_factor

In [44]:
previous_df['dMphi Area (µm)'] = previous_df.groupby('Unique ID')['Mphi Area (µm)'].transform(lambda x: x.iloc[-1] - x.iloc[0])

In [47]:
previous_df.rename(columns={'Mtb Area':'Mtb Area (µm)'}, inplace=True)
previous_df.rename(columns={'dMtb':'dMtb Area (µm)'}, inplace=True)
previous_df.rename(columns={'Mphi Area':'Mphi Area (µm)'}, inplace=True)
previous_df.rename(columns={'Mphi GFP':'GFP'}, inplace=True)
previous_df.rename(columns={'Mean Mtb Intensity':'RFP'}, inplace=True)

In [54]:
previous_df['Experiment ID'] = 'PS0000'

In [59]:
base_dir = f'/mnt/SYNO/macrohet_syno/PS0000'
metadata_path = glob.glob(os.path.join(base_dir, 'acquisition/Assaylayout/*.xml'))[0]
assay_layout = dataio.read_harmony_metadata(metadata_path, assay_layout=True, replicate_number = True)
assay_layout

Reading metadata XML file...
Extracting metadata complete!


Unnamed: 0_level_0,Unnamed: 1_level_0,Strain,Compound,Concentration,ConcentrationEC,Replicate #
Row,Column,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,4,RD1,CTRL,0.0,EC0,1
3,5,WT,CTRL,0.0,EC0,1
3,6,WT,PZA,60.0,EC50,1
3,7,WT,RIF,0.1,EC50,1
3,8,WT,INH,0.04,EC50,1
3,9,WT,BDQ,0.02,EC50,1
4,4,RD1,CTRL,0.0,EC0,2
4,5,WT,CTRL,0.0,EC0,2
4,6,WT,PZA,60.0,EC50,2
4,7,WT,RIF,0.1,EC50,2


In [60]:
assay_layout['Acquisition ID'] = assay_layout.index.to_list()
assay_layout

Unnamed: 0_level_0,Unnamed: 1_level_0,Strain,Compound,Concentration,ConcentrationEC,Replicate #,Acquisition ID
Row,Column,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,4,RD1,CTRL,0.0,EC0,1,"(3, 4)"
3,5,WT,CTRL,0.0,EC0,1,"(3, 5)"
3,6,WT,PZA,60.0,EC50,1,"(3, 6)"
3,7,WT,RIF,0.1,EC50,1,"(3, 7)"
3,8,WT,INH,0.04,EC50,1,"(3, 8)"
3,9,WT,BDQ,0.02,EC50,1,"(3, 9)"
4,4,RD1,CTRL,0.0,EC0,2,"(4, 4)"
4,5,WT,CTRL,0.0,EC0,2,"(4, 5)"
4,6,WT,PZA,60.0,EC50,2,"(4, 6)"
4,7,WT,RIF,0.1,EC50,2,"(4, 7)"


In [61]:
acquisition_to_replicate = assay_layout.set_index('Acquisition ID')['Replicate #'].to_dict()

In [63]:
previous_df['Technical Replicate'] = previous_df['Acquisition ID'].map(acquisition_to_replicate)
previous_df['Biological Replicate'] = 1

In [64]:
previous_df['Technical Replicate'] = previous_df['Acquisition ID'].map(acquisition_to_replicate)

Unnamed: 0,Time (hours),Mtb Area (µm),dMtb Area (µm),Strain,Compound,Concentration,Unique ID,Acquisition ID,Cell ID,Mphi Area (µm),...,RFP,Thresholded Mean Mtb Intensity,Infection Status,Initial Infection Status,Final Infection Status,"dMtb/dt,T=15","Max. dMtb/dt,T=15",dMphi Area (µm),Experiment ID,Biological Replicate
0,0,46.797680,136.772588,WT,CTRL,EC0,1.3.5,"(3, 5)",1,660.776979,...,473.168976,167.325424,True,True,True,,6.314185,-68.386294,PS0000,1
1,1,48.719647,136.772588,WT,CTRL,EC0,1.3.5,"(3, 5)",1,585.105086,...,498.701813,202.661972,True,True,True,,6.314185,-68.386294,PS0000,1
2,2,52.206007,136.772588,WT,CTRL,EC0,1.3.5,"(3, 5)",1,582.020998,...,502.468353,210.382874,True,True,True,,6.314185,-68.386294,PS0000,1
3,3,50.552221,136.772588,WT,CTRL,EC0,1.3.5,"(3, 5)",1,572.232372,...,501.905182,212.070969,True,True,True,,6.314185,-68.386294,PS0000,1
4,4,54.463202,136.772588,WT,CTRL,EC0,1.3.5,"(3, 5)",1,590.669853,...,502.564423,204.565842,True,True,True,,6.314185,-68.386294,PS0000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296992,70,0.000000,-3.888632,WT,CTRL,EC0,996.4.5,"(4, 5)",996,727.062517,...,385.325806,1.178588,True,True,False,0.0,0.064066,-186.967234,PS0000,1
296993,71,0.000000,-3.888632,WT,CTRL,EC0,996.4.5,"(4, 5)",996,701.808755,...,384.643707,1.152374,True,True,False,0.0,0.064066,-186.967234,PS0000,1
296994,72,0.000000,-3.888632,WT,CTRL,EC0,996.4.5,"(4, 5)",996,742.840531,...,384.512726,1.103884,True,True,False,0.0,0.064066,-186.967234,PS0000,1
296995,73,0.000000,-3.888632,WT,CTRL,EC0,996.4.5,"(4, 5)",996,948.580473,...,383.335663,0.893816,True,True,False,0.0,0.064066,-186.967234,PS0000,1


In [66]:
previous_df['ID'] = previous_df.apply(lambda row: f"{row['Unique ID']}.{row['Experiment ID']}", axis=1)

In [67]:
previous_df.keys(), df.keys()

(Index(['Time (hours)', 'Mtb Area (µm)', 'dMtb Area (µm)', 'Strain', 'Compound',
        'Concentration', 'Unique ID', 'Acquisition ID', 'Cell ID',
        'Mphi Area (µm)', 'GFP', 'x', 'y', 'Eccentricity', 'MSD',
        'Technical Replicate', 'Strain/Compound', 'dMtb/dt', '%dMtb/dt',
        '%dMtb', 'RFP', 'Thresholded Mean Mtb Intensity', 'Infection Status',
        'Initial Infection Status', 'Final Infection Status', 'dMtb/dt,T=15',
        'Max. dMtb/dt,T=15', 'dMphi Area (µm)', 'Experiment ID',
        'Biological Replicate', 'ID'],
       dtype='object'),
 Index(['Time (hours)', 'Mtb Area (µm)', 'dMtb Area (µm)', 'Mphi Area (µm)',
        'dMphi Area (µm)', 'Infection Status', 'Initial Infection Status',
        'Final Infection Status', 'x', 'y', 'GFP', 'RFP', 'Eccentricity', 'MSD',
        'Technical Replicate', 'Biological Replicate', 'Strain', 'Compound',
        'Concentration', 'Cell ID', 'Acquisition ID', 'Experiment ID',
        'Unique ID', 'ID'],
       dtype='object

In [84]:
# Assuming you have two DataFrames: previous_df and new_df

# Get the columns present in both DataFrames
common_columns = previous_df.columns.intersection(df.columns)

print(common_columns)
# Assuming you have two DataFrames: df and previous_df

# Get the columns in df that are not in previous_df
unique_columns = previous_df.columns.difference(df.columns)

print(unique_columns)


Index(['Time (hours)', 'Mtb Area (µm)', 'dMtb Area (µm)', 'Strain', 'Compound',
       'Concentration', 'Unique ID', 'Acquisition ID', 'Cell ID',
       'Mphi Area (µm)', 'GFP', 'x', 'y', 'Eccentricity', 'MSD',
       'Technical Replicate', 'RFP', 'Infection Status',
       'Initial Infection Status', 'Final Infection Status', 'dMphi Area (µm)',
       'Experiment ID', 'Biological Replicate', 'ID', 'tempID'],
      dtype='object')
Index(['%dMtb', '%dMtb/dt', 'Max. dMtb/dt,T=15', 'Strain/Compound',
       'Thresholded Mean Mtb Intensity', 'dMtb/dt', 'dMtb/dt,T=15'],
      dtype='object')


### Merge the data

In [83]:
previous_df[previous_df['Unique ID'] == '1.3.5']

Unnamed: 0,Time (hours),Mtb Area (µm),dMtb Area (µm),Strain,Compound,Concentration,Unique ID,Acquisition ID,Cell ID,Mphi Area (µm),...,Infection Status,Initial Infection Status,Final Infection Status,"dMtb/dt,T=15","Max. dMtb/dt,T=15",dMphi Area (µm),Experiment ID,Biological Replicate,ID,tempID
0,0,46.797680,136.772588,WT,CTRL,EC0,1.3.5,"(3, 5)",1,660.776979,...,True,True,True,,6.314185,-68.386294,PS0000,1,1.3.5.PS0000,1.3.5.PS0000.0
1,1,48.719647,136.772588,WT,CTRL,EC0,1.3.5,"(3, 5)",1,585.105086,...,True,True,True,,6.314185,-68.386294,PS0000,1,1.3.5.PS0000,1.3.5.PS0000.1
2,2,52.206007,136.772588,WT,CTRL,EC0,1.3.5,"(3, 5)",1,582.020998,...,True,True,True,,6.314185,-68.386294,PS0000,1,1.3.5.PS0000,1.3.5.PS0000.2
3,3,50.552221,136.772588,WT,CTRL,EC0,1.3.5,"(3, 5)",1,572.232372,...,True,True,True,,6.314185,-68.386294,PS0000,1,1.3.5.PS0000,1.3.5.PS0000.3
4,4,54.463202,136.772588,WT,CTRL,EC0,1.3.5,"(3, 5)",1,590.669853,...,True,True,True,,6.314185,-68.386294,PS0000,1,1.3.5.PS0000,1.3.5.PS0000.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,70,176.239972,136.772588,WT,CTRL,EC0,1.3.5,"(3, 5)",1,561.885034,...,True,True,True,3.410375,6.314185,-68.386294,PS0000,1,1.3.5.PS0000,1.3.5.PS0000.70
69,71,163.747182,136.772588,WT,CTRL,EC0,1.3.5,"(3, 5)",1,561.996776,...,True,True,True,2.115654,6.314185,-68.386294,PS0000,1,1.3.5.PS0000,1.3.5.PS0000.71
70,72,160.417261,136.772588,WT,CTRL,EC0,1.3.5,"(3, 5)",1,548.252472,...,True,True,True,2.146942,6.314185,-68.386294,PS0000,1,1.3.5.PS0000,1.3.5.PS0000.72
71,73,172.552476,136.772588,WT,CTRL,EC0,1.3.5,"(3, 5)",1,542.553614,...,True,True,True,3.075148,6.314185,-68.386294,PS0000,1,1.3.5.PS0000,1.3.5.PS0000.73


In [73]:
df[df['Unique ID'] == '1.3.5']

Unnamed: 0,Time (hours),Mtb Area (µm),dMtb Area (µm),Mphi Area (µm),dMphi Area (µm),Infection Status,Initial Infection Status,Final Infection Status,x,y,...,Technical Replicate,Biological Replicate,Strain,Compound,Concentration,Cell ID,Acquisition ID,Experiment ID,Unique ID,ID
35173,0,,,660.776979,-68.386294,1.0,1.0,1.0,519.922607,876.779602,...,1,1,WT,CTRL,EC0,1,"(3, 5)",PS0000,1.3.5,1.3.5.PS0000
35174,1,,,585.105086,-68.386294,1.0,1.0,1.0,522.290833,876.766357,...,1,1,WT,CTRL,EC0,1,"(3, 5)",PS0000,1.3.5,1.3.5.PS0000
35175,2,,,582.020998,-68.386294,1.0,1.0,1.0,524.336243,874.563110,...,1,1,WT,CTRL,EC0,1,"(3, 5)",PS0000,1.3.5,1.3.5.PS0000
35176,3,,,572.232372,-68.386294,1.0,1.0,1.0,516.952454,876.656799,...,1,1,WT,CTRL,EC0,1,"(3, 5)",PS0000,1.3.5,1.3.5.PS0000
35177,4,,,590.669853,-68.386294,1.0,1.0,1.0,521.947449,880.909363,...,1,1,WT,CTRL,EC0,1,"(3, 5)",PS0000,1.3.5,1.3.5.PS0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35241,70,,,561.885034,-68.386294,1.0,1.0,1.0,526.011353,846.571777,...,1,1,WT,CTRL,EC0,1,"(3, 5)",PS0000,1.3.5,1.3.5.PS0000
35242,71,,,561.996776,-68.386294,1.0,1.0,1.0,524.708801,847.689758,...,1,1,WT,CTRL,EC0,1,"(3, 5)",PS0000,1.3.5,1.3.5.PS0000
35243,72,,,548.252472,-68.386294,1.0,1.0,1.0,524.800598,849.129272,...,1,1,WT,CTRL,EC0,1,"(3, 5)",PS0000,1.3.5,1.3.5.PS0000
35244,73,,,542.553614,-68.386294,1.0,1.0,1.0,524.987671,848.533813,...,1,1,WT,CTRL,EC0,1,"(3, 5)",PS0000,1.3.5,1.3.5.PS0000


In [81]:
previous_df['tempID'] = previous_df.apply(lambda row: f"{row['ID']}.{row['Time (hours)']}", axis=1)
df['tempID'] = df.apply(lambda row: f"{row['ID']}.{row['Time (hours)']}", axis=1)

In [87]:
previous_df[['Mtb Area (µm)', 'dMtb Area (µm)', 'tempID']]

Unnamed: 0,Mtb Area (µm),dMtb Area (µm),tempID
0,46.797680,136.772588,1.3.5.PS0000.0
1,48.719647,136.772588,1.3.5.PS0000.1
2,52.206007,136.772588,1.3.5.PS0000.2
3,50.552221,136.772588,1.3.5.PS0000.3
4,54.463202,136.772588,1.3.5.PS0000.4
...,...,...,...
296992,0.000000,-3.888632,996.4.5.PS0000.70
296993,0.000000,-3.888632,996.4.5.PS0000.71
296994,0.000000,-3.888632,996.4.5.PS0000.72
296995,0.000000,-3.888632,996.4.5.PS0000.73


In [103]:
correct_IDs = previous_df['tempID'].values

In [104]:
correct_IDs

array(['1.3.5.PS0000.0', '1.3.5.PS0000.1', '1.3.5.PS0000.2', ...,
       '996.4.5.PS0000.72', '996.4.5.PS0000.73', '996.4.5.PS0000.74'],
      dtype=object)

In [89]:
ps_df = df[df['Experiment ID'] == 'PS0000']

In [106]:
ps_df = ps_df[ps_df['tempID'].isin(correct_IDs)]

In [108]:
ps_df = ps_df.merge(previous_df[['tempID', 'Mtb Area (µm)', 'dMtb Area (µm)']],
                    on='tempID', 
                    how='left',
                    suffixes=('', '_from_previous'))

In [109]:
ps_df['Mtb Area (µm)'] = ps_df['Mtb Area (µm)_from_previous'].fillna(ps_df['Mtb Area (µm)'])
ps_df['dMtb Area (µm)'] = ps_df['dMtb Area (µm)_from_previous'].fillna(ps_df['dMtb Area (µm)'])

In [112]:
ps_df.drop(['Mtb Area (µm)_from_previous', 'dMtb Area (µm)_from_previous', 'tempID'], axis=1, inplace=True)

In [119]:
ps_df

Unnamed: 0,Time (hours),Mtb Area (µm),dMtb Area (µm),Mphi Area (µm),dMphi Area (µm),Infection Status,Initial Infection Status,Final Infection Status,x,y,...,Technical Replicate,Biological Replicate,Strain,Compound,Concentration,Cell ID,Acquisition ID,Experiment ID,Unique ID,ID
0,0,0.424621,-0.424621,459.082108,-92.746118,1.0,1.0,0.0,457.172943,8.791715,...,1,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000
1,1,0.000000,-0.424621,372.839393,-92.746118,0.0,1.0,0.0,459.138947,6.718642,...,1,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000
2,2,0.000000,-0.424621,423.168130,-92.746118,0.0,1.0,0.0,460.555237,10.785886,...,1,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000
3,3,0.335227,-0.424621,445.561289,-92.746118,1.0,1.0,0.0,455.878815,11.143067,...,1,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000
4,4,0.000000,-0.424621,487.040034,-92.746118,1.0,1.0,0.0,462.634186,14.050420,...,1,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291521,70,5.028404,4.559086,2279.341997,2028.994524,1.0,0.0,1.0,628.562500,1099.782104,...,2,1,WT,BDQ,EC99,935,"(6, 9)",PS0000,935.6.9,935.6.9.PS0000
291522,71,5.564767,4.559086,2256.703005,2028.994524,1.0,0.0,1.0,628.463684,1102.144409,...,2,1,WT,BDQ,EC99,935,"(6, 9)",PS0000,935.6.9,935.6.9.PS0000
291523,72,5.274237,4.559086,2422.506245,2028.994524,1.0,0.0,1.0,628.672058,1101.594727,...,2,1,WT,BDQ,EC99,935,"(6, 9)",PS0000,935.6.9,935.6.9.PS0000
291524,73,5.140146,4.559086,2393.050972,2028.994524,1.0,0.0,1.0,630.372803,1102.817749,...,2,1,WT,BDQ,EC99,935,"(6, 9)",PS0000,935.6.9,935.6.9.PS0000


In [120]:
final_df = pd.concat([ps_df, df[df['Experiment ID'] != 'PS0000']], axis=0)

In [123]:
final_df

Unnamed: 0,Time (hours),Mtb Area (µm),dMtb Area (µm),Mphi Area (µm),dMphi Area (µm),Infection Status,Initial Infection Status,Final Infection Status,x,y,...,Technical Replicate,Biological Replicate,Strain,Compound,Concentration,Cell ID,Acquisition ID,Experiment ID,Unique ID,ID
0,0,0.424621,-0.424621,459.082108,-92.746118,1.0,1.0,0.0,457.172943,8.791715,...,1,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000
1,1,0.000000,-0.424621,372.839393,-92.746118,0.0,1.0,0.0,459.138947,6.718642,...,1,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000
2,2,0.000000,-0.424621,423.168130,-92.746118,0.0,1.0,0.0,460.555237,10.785886,...,1,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000
3,3,0.335227,-0.424621,445.561289,-92.746118,1.0,1.0,0.0,455.878815,11.143067,...,1,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000
4,4,0.000000,-0.424621,487.040034,-92.746118,1.0,1.0,0.0,462.634186,14.050420,...,1,1,RD1,CTRL,EC0,426,"(3, 4)",PS0000,426.3.4,426.3.4.PS0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1722130,149,1.117423,1.095075,752.584461,55.088959,1.0,1.0,1.0,227.093262,130.358139,...,2,3,RD1,BDQ,EC99,1874,"(6, 12)",ND0003,1874.6.12,1874.6.12.ND0003
1722131,150,2.033710,1.095075,616.035357,55.088959,1.0,1.0,1.0,226.622925,130.532120,...,2,3,RD1,BDQ,EC99,1874,"(6, 12)",ND0003,1874.6.12,1874.6.12.ND0003
1722132,151,1.430302,1.095075,579.182743,55.088959,1.0,1.0,1.0,225.366608,129.932709,...,2,3,RD1,BDQ,EC99,1874,"(6, 12)",ND0003,1874.6.12,1874.6.12.ND0003
1722133,152,1.989013,1.095075,679.147414,55.088959,1.0,1.0,1.0,226.424683,128.489410,...,2,3,RD1,BDQ,EC99,1874,"(6, 12)",ND0003,1874.6.12,1874.6.12.ND0003


In [124]:
final_df.to_pickle('sc_df.pkl')