# Data Processing Protocol

This is the second step in the data processing pipeline. After having extracted all the CTL positions with the `image-analysis` code, we now extract the relevant information from the tracks of the CTLs in the wells to look at the accumulation.

This notebook merges the CTL positional data with the fragmentation data and generates the pandas dataframes upon which the analysis rests.

First load the Excel sheet where we manually entered the final state of the spheroid (fragmented/not fragmented) for every spheroid.

In [None]:
# `pandas` is imported here as well as in the utility cell at the bottom of the
# notebook, so that this cell also runs on a freshly restarted kernel.
import pandas

# Excel workbook holding the manually entered final state of every spheroid.
FNAME = 'MIOCS_record_fragmentation.xlsx'

# Annotation table, later handed to `prepare_CTL_data`.
death_excel = pandas.read_excel(FNAME)

Then enter the path to the folder containing the CSV generated by the `image-analysis` script. The CSV has to be named `ot1_frame.csv`.

Before executing the cell below, first run the cell in the **Utility functions** section at the bottom of the notebook: it imports the required libraries and defines `prepare_CTL_data` and its helpers.

In [None]:
# Well numbers (column 'm') to discard before the analysis; empty keeps every well.
binExperiments = []

# Folder containing `ot1_frame.csv`, and folder receiving the two output CSVs.
PATH_TO_CSV = './expected_segmentation_output'
SAVE_PATH = './expected_postprocessing_output'

# Experiment metadata: copied into the data and used in the output file names.
experiment_date = 20201113
matrigel_concentration = 2.2

# Factor multiplied into the 'x' and 'y' columns.
ratio = 0.33

# Stored in the 'frames/min' column; its inverse appears in the file names.
time_ratio = 0.5

# Only frames strictly below this index enter the end-point comparison table.
max_time = 14*30

# When True, the 'Death' column is filled from the Excel annotations.
do_death = True

prepare_CTL_data(
    PATH=PATH_TO_CSV,
    SAVEPATH=SAVE_PATH,
    binExperiments=binExperiments,
    Date=experiment_date,
    matrigel=matrigel_concentration,
    ratio=ratio,
    freq=time_ratio,
    max_frame=max_time,
    a=death_excel,
    do_Death=do_death,
)

## Utility functions

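Execute the cell below to import the libraries and define the helper functions used by the CTL position post-processing protocol. It has to be run before the `prepare_CTL_data` call above.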

In [None]:
import pandas
import numpy as np
import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import trackpy

def _binExperiments(dataFrame, expToBin):
    
    """
    Remove wells with faulty segmentation.

    Parameters:
     - dataFrame: pandas.DataFrame with a well-number column 'm'.
     - expToBin: iterable of well numbers to discard; every entry is
       converted with int() before the comparison.

    Returns:
     - pandas.DataFrame: a new, independent frame without the rows of the
       discarded wells. The input frame is never modified, and the caller
       can add columns to the result without a SettingWithCopyWarning.

    """
    
    wells_to_drop = [int(exp) for exp in expToBin]
    
    # One vectorised pass instead of one full-frame filter per discarded well.
    keep = ~dataFrame['m'].isin(wells_to_drop)
        
    return dataFrame[keep].copy()

def fill_missing_position(dataframe):
    
    """
    For each time point where the spheroid is missing (every row of the frame
    has 'spheroid_radius' equal to 0), copy the radius and centre from the
    closest frame in which the spheroid was detected. (Spheroids don't
    disappear...)

    Only frames that were detected in the input act as sources, so the result
    does not depend on the order in which the missing frames are visited.

    Parameters:
     - dataframe: pandas.DataFrame with the columns 'frame',
       'spheroid_radius', 'spheroid_center_x' and 'spheroid_center_y'.
       It is modified in place.

    Returns:
     - pandas.DataFrame: the same object, returned unchanged when no frame
       at all has a positive radius.
    """
    
    spheroid_columns = ['spheroid_radius', 'spheroid_center_x', 'spheroid_center_y']
    
    # Frame number of every row with a detected spheroid, taken once up front.
    detected_frames = dataframe.loc[dataframe['spheroid_radius'] > 0, 'frame']
    
    if detected_frames.empty:
        # Nothing to copy from.
        return dataframe
    
    for frame in dataframe['frame'].unique():
        
        in_frame = dataframe['frame'] == frame
        
        # `.all()` copes with frames holding several distinct radii, where
        # comparing the array of unique values to [0] raised a ValueError.
        if not (dataframe.loc[in_frame, 'spheroid_radius'] == 0).all():
            continue
                        
        # Index label of the row whose frame is closest to the missing one.
        idx = (detected_frames - frame).abs().idxmin()
        
        for column in spheroid_columns:
            dataframe.loc[in_frame, column] = dataframe.loc[idx, column]
    
    return dataframe

def fill_spheroid_positions(data_frame, max_frame = 400):
    
    """
    Fix spheroid position (spheroids don't drift over time course of experiment).

    For every well ('ID'), the largest 'spheroid_radius' found among the
    frames strictly below `max_frame` is written to all rows of the well,
    together with the spheroid centre of the row holding that maximum.

    Parameters:
     - data_frame: pandas.DataFrame with the columns 'ID', 'frame',
       'spheroid_radius', 'spheroid_center_x' and 'spheroid_center_y'.
       It is modified in place.
     - max_frame: only frames with an index below this value are searched
       for the reference radius (default 400).

    Returns:
     - pandas.DataFrame: the same object. Wells without any usable radius
       before `max_frame` are left untouched.
    """
    
    spheroid_columns = ['spheroid_radius', 'spheroid_center_x', 'spheroid_center_y']
    
    for ID in tqdm(data_frame['ID'].unique()):
                        
        in_well = data_frame['ID'] == ID
        early = data_frame.loc[in_well & (data_frame['frame'] < max_frame), spheroid_columns]
        
        # Also true for an empty selection: no reference frame is available,
        # and idxmax() would raise.
        if early['spheroid_radius'].isna().all():
            continue
        
        index_max = early['spheroid_radius'].idxmax()
        
        # Read all reference values before any of them is overwritten.
        reference = {
            'spheroid_radius': early['spheroid_radius'].max(),
            'spheroid_center_x': early.loc[index_max, 'spheroid_center_x'],
            'spheroid_center_y': early.loc[index_max, 'spheroid_center_y'],
        }
        
        # Write straight into the frame with .loc instead of assigning to a
        # filtered slice and copying it back through a boolean mask.
        for column, value in reference.items():
            data_frame.loc[in_well, column] = value
        
    return data_frame

def get_state(well_frame: pandas.DataFrame, radius: int):

    """

    Flag every CTL as in contact / not in contact with the spheroid.

    The spheroid is approximated by its equivalent disk: a cell is in
    contact when its distance to the spheroid centre is strictly smaller
    than the spheroid radius plus `radius`.

    The boolean result is written to the "state" column of `well_frame`
    itself.

    Returns:
     - pandas.DataFrame (the input frame, with the added column)

    """

    offset_x = well_frame["x pos"] - well_frame["spheroid_center_x"]
    offset_y = well_frame["y pos"] - well_frame["spheroid_center_y"]
    contact_range = radius + well_frame["spheroid_radius"]

    # Squared quantities are compared, so no square root is needed.
    well_frame["state"] = (offset_x ** 2 + offset_y ** 2) < contact_range ** 2

    return well_frame

# Legacy module-level annotation table. `fill_death` and `prepare_CTL_data`
# both receive their table through a parameter that is also called `a`, so
# this global is not needed by them. The workbook lives at an absolute path on
# one machine only: load it when it is there, and fall back to None elsewhere
# instead of aborting the whole cell with a FileNotFoundError.
_LEGACY_TRACKING_XLSX = r'/Users/gustaveronteix/Documents/Research/Pasteur/SpheroidPositionAnalysis/CD8Tracking_matrigel.xlsx'

a = (pandas.read_excel(_LEGACY_TRACKING_XLSX, engine="openpyxl", sheet_name=1)
     if os.path.exists(_LEGACY_TRACKING_XLSX) else None)

def fill_death(dataframe, a, row_var = 'm'):
    
    """
    Fill the 'Death' column of `dataframe` from the manual annotation table.

    Parameters:
     - dataframe: pandas.DataFrame with a 'Date' column and a well-number
       column named `row_var`. Only its first 'Date' value is used to select
       the annotation rows. It is modified in place.
     - a: pandas.DataFrame with the columns 'Date', 'Well number' and
       'spheroid death in end ?\\nyes/no'.
     - row_var: name of the well-number column in `dataframe`.

    Returns:
     - pandas.DataFrame: the same object, with 'Death' set to 1 when the
       annotation reads "yes" (any letter case, surrounding blanks ignored),
       to 0 for any other answer, and to NaN when the well has no annotation
       row or an empty answer.
    """
    
    answer_column = 'spheroid death in end ?\nyes/no'
    
    if dataframe.empty:
        return dataframe
    
    date = dataframe['Date'].unique()[0]
    
    aloc = a[a['Date'] == date]
        
    for m in dataframe[row_var].unique():
        
        loc = aloc[aloc['Well number'] == m]
        
        if loc.empty or pandas.isna(loc[answer_column].iloc[0]):
            # A missing annotation must not be read as "not fragmented".
            death = float('nan')
        else:
            answer = str(loc[answer_column].iloc[0]).strip().lower()
            death = int(answer == 'yes')
                        
        dataframe.loc[dataframe[row_var] == m, 'Death'] = death
    
    return dataframe

def _build_analysis_frame(tracks):
    
    """
    Count, for every well and every frame, the CTLs present and in contact.

    Returns one row per (well, frame). The columns and their float dtype match
    what the previous row-by-row construction produced, so the CSV written
    from this table keeps the same layout.
    """
    
    columns = ['matrigel', 'experiment', 'frame', 'N contact', 'N',
               'sph Radius', 'ID', 'Death', 'Contact 1', 'Contact 2', 'Contact 3']
    records = []
    
    for exp in tqdm(tracks['m'].unique()):
        
        expFrame = tracks[tracks['m'] == exp]
        
        # frame x particle table holding 1 where the particle is in contact.
        arr = expFrame.pivot(index = 'frame', columns = 'particle', values = 'state')
        arr = arr.fillna(0).astype(int)
        
        # For each particle, the frame with the largest frame-to-frame
        # increase of its contact state; sorted distinct values over the well.
        values = sorted((arr - arr.shift(1).fillna(0)).idxmax().unique())
        
        # Entries 1 to 3 are reported and entry 0 is skipped. Always emitting
        # the three keys guarantees that the columns exist downstream.
        contacts = {f'Contact {rank}': (values[rank] if len(values) > rank else np.nan)
                    for rank in (1, 2, 3)}
        
        # sort=False keeps the frames in their order of first appearance.
        for time, cellFrame in expFrame.groupby('frame', sort = False):
            
            record = {'matrigel': cellFrame['matrigel'].iloc[0],
                      'experiment': exp,
                      'frame': time,
                      'N contact': int((cellFrame['state'] == 1).sum()),
                      'N': len(cellFrame),
                      'sph Radius': cellFrame['spheroid_radius'].iloc[0],
                      'ID': str(cellFrame['Date'].iloc[0]) + ' : ' + str(exp),
                      'Death': 0}
            record.update(contacts)
            records.append(record)
    
    # A single construction instead of growing the frame one cell at a time.
    numeric_columns = [column for column in columns if column != 'ID']
    
    return pandas.DataFrame(records, columns = columns).astype(
        {column: float for column in numeric_columns})


def _build_comparison_frame(AnalysisFrame, final_frame):
    
    """
    Summarise every well ('ID') of the per-frame table into one end-point row.
    """
    
    columns = ['N_auto', 'N_final', 'N_tot', 'N_tot_frame', 'N_auto_frame',
               'sph area', 'Contact_1_idx', 'Contact 1', 'Contact 2', 'Contact 3',
               'Death', 'ID', 'Date', 'm']
    records = []
    
    for ID in AnalysisFrame['ID'].unique():
        
        loc = AnalysisFrame[AnalysisFrame['ID'] == ID]
        
        idx_Ntot = loc['N'].idxmax()
        idx_Nauto = loc['N contact'].idxmax()
        
        # First row with at least one contact, or the last row of the well
        # when no contact was ever observed.
        in_contact = loc[loc['N contact'] > 0]
        if in_contact.empty:
            idx_contact_1 = max(loc.index)
        else:
            idx_contact_1 = min(in_contact.index)
        
        # Second distinct area when there are several, otherwise the only one.
        # NOTE(review): presumably the first value is skipped because it can
        # come from a frame without a detected spheroid -- confirm.
        areas = (np.pi*loc['sph Radius']**2).unique()
        sphArea = areas[1] if len(areas) > 1 else areas[0]
        
        # The ID was built as '<Date> : <well number>'.
        id_parts = ID.split(' : ')
        
        records.append({'N_auto': loc['N contact'].max(),
                        'N_final': loc.loc[loc['frame'] == final_frame, 'N contact'].max(),
                        'N_tot': loc['N'].max(),
                        'N_tot_frame': loc.loc[idx_Ntot, 'frame'],
                        'N_auto_frame': loc.loc[idx_Nauto, 'frame'],
                        'sph area': sphArea,
                        'Contact_1_idx': loc.loc[idx_contact_1, 'frame'],
                        'Contact 1': loc['Contact 1'].iloc[0],
                        'Contact 2': loc['Contact 2'].iloc[0],
                        'Contact 3': loc['Contact 3'].iloc[0],
                        'Death': loc['Death'].iloc[0],
                        'ID': ID,
                        'Date': int(id_parts[0]),
                        'm': int(id_parts[1])})
    
    return pandas.DataFrame(records, columns = columns)


def prepare_CTL_data(PATH:str,
                     SAVEPATH:str,
                     binExperiments:list,
                     Date:int,
                     matrigel:float,
                     ratio:float,
                     freq:float,
                     max_frame:int,
                     a:pandas.DataFrame,
                     do_Death:bool,
                     final_frame:int = 370):
    
    """
    Post-process the CTL positions of one experiment and write two CSV tables.

    Steps: load `ot1_frame.csv`, discard the binned wells, attach the
    experiment metadata, fix the spheroid position per well, flag the cells in
    contact, link the detections into tracks when the file has no 'particle'
    column, then build and save
     - `deadFrame_<int(1/freq)>min_<Date>.csv`: one row per well and frame;
     - `endPointFrame_<int(1/freq)>min_<Date>.csv`: one row per well, computed
       from the frames strictly below `max_frame`.

    Parameters:
     - PATH: folder containing `ot1_frame.csv`.
     - SAVEPATH: folder receiving the two CSV files (created when missing).
     - binExperiments: well numbers ('m') to discard.
     - Date: experiment date, stored in the 'Date' column and in the well ID.
     - matrigel: value stored in the 'matrigel' column.
     - ratio: factor multiplied into the 'x' and 'y' columns; the untouched
       coordinates are kept in 'x pos' and 'y pos'.
     - freq: value stored in the 'frames/min' column.
     - max_frame: frames at or above this index are excluded from the
       end-point table.
     - a: annotation table handed to `fill_death`.
     - do_Death: when True, fill the 'Death' column of the end-point table.
     - final_frame: frame at which 'N_final' is read (default 370).

    Returns:
     - True once both files are written.
    """
    
    # Save names
    save_analysis = os.path.join(SAVEPATH, f'deadFrame_{int(1/freq)}min_{Date}.csv')
    save_comp = os.path.join(SAVEPATH, f'endPointFrame_{int(1/freq)}min_{Date}.csv')

    # Load spheroid data frame
    original_frame = pandas.read_csv(os.path.join(PATH, 'ot1_frame.csv'))
    
    # Work on an independent copy: the columns added below must not be
    # assigned on a slice of the loaded frame.
    Sph_file = _binExperiments(original_frame, binExperiments).copy()
    
    Sph_file['Date'] = Date
    Sph_file['matrigel'] = matrigel
    Sph_file['ratio'] = ratio
    Sph_file['frames/min'] = freq
    Sph_file['x pos'] = Sph_file['x']
    Sph_file['y pos'] = Sph_file['y']
    Sph_file['x'] = Sph_file['x']*Sph_file['ratio']
    Sph_file['y'] = Sph_file['y']*Sph_file['ratio']
    Sph_file['ID'] = Sph_file['Date'].astype(str) + ' : ' + Sph_file['m'].astype(str)

    # Prepare spheroid positions
    Sph_file = fill_spheroid_positions(Sph_file)
    Sph_file = get_state(Sph_file, 15)
    
    # Link the detections into tracks, one well at a time, unless the
    # segmentation output already carries track identifiers.
    if 'particle' not in Sph_file.columns:
        
        linked = [trackpy.link(Sph_file.loc[Sph_file['m'] == m],
                               search_range = 15,
                               memory = 2)
                  for m in Sph_file['m'].unique()]
        
        # DataFrame.append was removed in pandas 2.0; concat works everywhere.
        if linked:
            Sph_file = pandas.concat(linked)
        
    # Make analysis frame
    AnalysisFrame = _build_analysis_frame(Sph_file)
    
    if SAVEPATH:
        os.makedirs(SAVEPATH, exist_ok = True)
    
    AnalysisFrame.to_csv(save_analysis)

    #### COMPARISON FRAME ####
    AnalysisFrame = AnalysisFrame[AnalysisFrame['frame'] < max_frame]

    comparisonFrame = _build_comparison_frame(AnalysisFrame, final_frame)
    
    if do_Death and not comparisonFrame.empty:
        comparisonFrame = fill_death(comparisonFrame, a, row_var = 'm')

    comparisonFrame.to_csv(save_comp)
                     
    return True
