## Feature processing for timeseries analysis

- This notebook is focused on analyzing the mean behavior of the library, over time

- It outputs a timeseries dataframe and a steady state dataframe

- Most steady state analysis is in another notebook, however

In [1]:
import trenchripper.trenchripper as tr

import numpy as np
import pandas as pd
import scipy as sp
import sklearn as skl
import dask.dataframe as dd

import scipy.stats
import dask
import warnings
import random

from dask.distributed import wait
from statsmodels.nonparametric import kernel_regression

from matplotlib import pyplot as plt

# One seed value drives Python's RNG, NumPy's global RNG and, through
# ``dask_sample_seed``, the default sampling / GMM seed of ``apply_transform``.
dask_sample_seed = 42
random.seed(dask_sample_seed)
np.random.seed(dask_sample_seed)

# Directory handed to the Dask controller as its working directory.
dask_wd = "/home/de64/scratch/de64/dask"
# Show each distinct UserWarning only once.
warnings.filterwarnings("once", category=UserWarning)

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
def get_l_norm(x,right_tail_only=False):
    """Estimate the Yeo-Johnson lambda for a sample.

    Parameters
    ----------
    x : numpy.ndarray
        Sample values.
    right_tail_only : bool, default False
        When True, the lambda is fitted only on the values strictly greater
        than the NaN-ignoring median of ``x``; otherwise on all of ``x``.

    Returns
    -------
    The value returned by ``scipy.stats.yeojohnson_normmax`` for the
    selected values.
    """
    if not right_tail_only:
        return sp.stats.yeojohnson_normmax(x)
    upper_tail = x[x > np.nanmedian(x)]
    return sp.stats.yeojohnson_normmax(upper_tail)

def apply_transform(final_output_df_filtered,yeo_subsample = 1,dask_sample_seed=dask_sample_seed,early_time_cutoff=28800,steady_state_time_cutoff=64800,clipping_range=(-4,4),\
                    params_to_transform = ['Division: major_axis_length'], transform_list=["YJ-Right"], time_label="time (s)",trenchid_label="Multi-Experiment Phenotype Trenchid",\
                   gmm_scaling=True):
    """Add transformed and z-scored versions of selected feature columns.

    For every ``param`` in ``params_to_transform`` two columns are added to
    ``final_output_df_filtered`` (the passed dataframe object is modified):
    ``"<param>: Transformed"`` and ``"<param>: Transformed: z score"``.

    The z score is ``1.35 * (value - median_init) / sigma_final`` clipped to
    ``clipping_range``, where ``median_init`` is the median of the transformed
    values with ``time_label < early_time_cutoff`` and ``sigma_final`` is the
    standard deviation of a (optionally GMM-weighted) resample of the values
    with ``time_label >= steady_state_time_cutoff``.

    Parameters
    ----------
    final_output_df_filtered : dask dataframe
        Input table; must contain ``time_label`` and every parameter column.
    yeo_subsample : float
        Fraction of rows sampled to estimate the transform and scaling.
    dask_sample_seed : int
        Seed for the row subsample and for the Gaussian mixture fit.
    early_time_cutoff, steady_state_time_cutoff : number
        Thresholds on ``time_label`` selecting the initial and final windows.
    clipping_range : tuple
        ``(low, high)`` bounds applied to the z scores.
    params_to_transform : list of str
        Columns to transform.
    transform_list : list of str
        One of ``"YJ"``, ``"YJ-Right"`` or ``"None"`` per parameter, paired
        by position with ``params_to_transform``.
    time_label, trenchid_label : str
        Column names for time and for the index set on the returned frame.
    gmm_scaling : bool
        When True, a two-component Gaussian mixture is fitted to the final
        window and the sample is re-drawn with weights equal to the posterior
        probability of the higher-variance component.

    Returns
    -------
    dask dataframe
        The input with the new columns, re-indexed on ``trenchid_label``.

    Raises
    ------
    ValueError
        If ``transform_list`` is shorter than ``params_to_transform`` or names
        an unsupported transform for one of the parameters.
    """
    supported_transforms = ("YJ", "YJ-Right", "None")
    n_params = len(params_to_transform)
    if len(transform_list) < n_params:
        raise ValueError(
            f"transform_list has {len(transform_list)} entries but "
            f"{n_params} parameters were requested"
        )
    # Fail before any Dask work starts instead of hitting an undefined
    # variable halfway through the loop.
    for transform in transform_list[:n_params]:
        if transform not in supported_transforms:
            raise ValueError(
                f"Unsupported transform {transform!r}; expected one of {supported_transforms}"
            )

    if gmm_scaling:
        # A bare ``import sklearn`` is not guaranteed to expose the ``mixture``
        # submodule on every scikit-learn release, so import it explicitly.
        from sklearn.mixture import GaussianMixture

    def _non_nan_values(column):
        # Materialize a Dask column as a float array without NaNs.
        values = np.array(column.astype(float).compute().tolist())
        return values[~np.isnan(values)]

    subsample_df = final_output_df_filtered.sample(frac=yeo_subsample, random_state = dask_sample_seed).persist()
    subsample_df_final = subsample_df[subsample_df[time_label]>=steady_state_time_cutoff]
    subsample_df_init = subsample_df[subsample_df[time_label]<early_time_cutoff]

    for param, transform in zip(params_to_transform, transform_list):
        print(param)
        param_vals_final = _non_nan_values(subsample_df_final[param])
        param_vals_init = _non_nan_values(subsample_df_init[param])

        ### extract high variance GMM component
        X = param_vals_final[:,np.newaxis]
        if gmm_scaling:
            gmm = GaussianMixture(n_components=2, random_state=dask_sample_seed)
            gmm.fit(X)

            covars = gmm.covariances_[:,0,0]
            max_var_component = np.argmax(covars)

            probs = gmm.predict_proba(X)
            weights = probs[:, max_var_component]
            ### generate a weighted sample from this component
            # Draws from NumPy's global RNG, so the result depends on the
            # global seed state at call time.
            weighted_sample = np.random.choice(X.flatten(), size=len(X), p=weights/weights.sum())
        else:
            weighted_sample = X.flatten()

        transformed_label = param + ": Transformed"
        if transform == "None":
            final_output_df_filtered[transformed_label] = final_output_df_filtered[param]
            transformed_weighted_sample = weighted_sample
            transformed_initial_values = param_vals_init
        else:
            l_norm = get_l_norm(weighted_sample,right_tail_only=(transform == "YJ-Right"))
            # Bind lambda via a default argument so the function shipped to the
            # workers cannot pick up a later loop iteration's value.
            final_output_df_filtered[transformed_label] = final_output_df_filtered[param].apply(
                lambda x, lmbda=l_norm: sp.stats.yeojohnson(x,lmbda = lmbda),
                meta=(transformed_label,float)).persist()
            transformed_weighted_sample = sp.stats.yeojohnson(weighted_sample,lmbda=l_norm)
            transformed_initial_values = sp.stats.yeojohnson(param_vals_init,lmbda=l_norm)

        weighted_sigma = np.std(transformed_weighted_sample)
        median_val = np.median(transformed_initial_values)
        feature_scores = (1.35*((final_output_df_filtered[transformed_label] - median_val)/weighted_sigma))
        feature_scores = np.clip(feature_scores,clipping_range[0],clipping_range[1])
        final_output_df_filtered[transformed_label + ": z score"] = feature_scores.persist()

    final_output_df_filtered = final_output_df_filtered.reset_index().set_index(trenchid_label,drop=True,sorted=True)
    return final_output_df_filtered

def timeseries_kernel_reg(df, y_label, min_tpt, max_tpt, kernel_bins, kernel_bandwidth, nan_filter, time_label="Final time (s)", trenchid_label="Multi-Experiment Phenotype Trenchid"):
    """Kernel-regress ``y_label`` against ``time_label`` for every trench.

    Each ``trenchid_label`` group is fitted with a local-constant
    (``reg_type="lc"``) Gaussian kernel regression and evaluated on
    ``kernel_bins`` evenly spaced points between ``min_tpt`` and ``max_tpt``.

    Parameters
    ----------
    df : dask dataframe
        Must provide ``time_label`` and ``y_label`` and be groupable by
        ``trenchid_label``.
    y_label : str
        Column to regress.
    min_tpt, max_tpt : number
        First and last evaluation point.
    kernel_bins : int
        Number of evaluation points.
    kernel_bandwidth : number
        Kernel bandwidth, in the units of ``time_label``.
    nan_filter : bool
        When True, observations whose ``y`` value is NaN are removed before
        fitting.
    time_label, trenchid_label : str
        Column names for the regressor and the grouping key.

    Returns
    -------
    The lazy result of the grouped ``apply``: one list of ``kernel_bins``
    values per group. A group that has no observations, or whose fit raises,
    yields a list of NaN.
    """
    def kernel_reg(x_arr,y_arr,nan_filter,start=min_tpt,end=max_tpt,kernel_bins=kernel_bins,kernel_bandwidth=kernel_bandwidth):
        intervals = np.linspace(start, end, num=kernel_bins, dtype=float)
        # ``np.nan`` rather than the ``np.NaN`` alias, which NumPy 2.0 removed.
        nan_trace = [np.nan] * len(intervals)
        try:
            if nan_filter:
                nan_mask = ~np.isnan(y_arr)
                y_arr = y_arr[nan_mask]
                x_arr = x_arr[nan_mask]
            if len(x_arr) == 0:
                return intervals, nan_trace
            w = kernel_regression.KernelReg(y_arr,x_arr,"c",reg_type="lc",bw=np.array([kernel_bandwidth]),ckertype="gaussian").fit(intervals)[0]
            return intervals, w
        except Exception:
            # Best effort: a failed fit becomes an all-NaN trace. Catching
            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
            # SystemExit propagate.
            return intervals, nan_trace
    kernel_result = df.groupby(trenchid_label).apply(lambda x: list(kernel_reg(x[time_label].values,x[y_label].values,nan_filter)[1]),meta=(y_label, object),)
    return kernel_result


def get_all_kernel_regs(df, y_label_list, nan_filter_list=[], min_tpt = 0, max_tpt = 36000, kernel_bins=20, kernel_bandwidth = 7200, time_label="Cell Cycle", trenchid_label="Multi-Experiment Phenotype Trenchid"):
    """Run ``timeseries_kernel_reg`` for several columns and combine them.

    Parameters
    ----------
    df : dask dataframe
        Source table containing ``time_label`` and every entry of
        ``y_label_list``.
    y_label_list : list of str
        Columns to regress against ``time_label``.
    nan_filter_list : list of str
        Columns for which NaN observations are dropped before fitting.
    min_tpt, max_tpt, kernel_bins, kernel_bandwidth, time_label, trenchid_label
        Passed through unchanged to ``timeseries_kernel_reg``.

    Returns
    -------
    dask dataframe
        One column per input column, named ``"Kernel Trace: <y_label>"``.
    """
    traces = []
    for y_label in y_label_list:
        drop_nan_observations = y_label in nan_filter_list
        trace = timeseries_kernel_reg(
            df[[y_label,time_label]],
            y_label,
            min_tpt,
            max_tpt,
            kernel_bins,
            kernel_bandwidth,
            drop_nan_observations,
            time_label=time_label,
            trenchid_label=trenchid_label,
        )
        trace.name = "Kernel Trace: " + y_label
        traces.append(trace)
    return dd.concat(traces, axis=1)

def explode_kernel_trace(series_x):
    """Expand a Series of stringified sequences into one row per element.

    Every entry of ``series_x`` is evaluated with ``eval`` and the resulting
    sequences are exploded; the index label of an entry is repeated for each
    of its elements.

    NOTE(review): ``eval`` executes whatever the strings contain, so this must
    only be applied to traces produced by this pipeline.

    Returns
    -------
    pandas.DataFrame
        Single-column frame holding the exploded values.
    """
    def _parse(cell):
        return eval(cell)

    parsed = series_x.apply(_parse)
    exploded = parsed.explode()
    return exploded.to_frame()

def agg_sem(x,vector):
    """Element-wise standard error of the mean over stringified vectors.

    Parameters
    ----------
    x : pandas.DataFrame
        Group whose ``vector`` column holds strings that ``eval`` to
        equal-length numeric sequences.
    vector : str
        Name of that column.

    Returns
    -------
    list
        SEM per vector position; positions where the SEM is NaN are encoded
        as ``-1``.
    """
    parsed_rows = [eval(entry) for entry in x[vector].tolist()]
    stacked = np.array(parsed_rows)
    sem_values = sp.stats.sem(stacked, axis=0, nan_policy='propagate')
    # -1 is the sentinel for an undefined SEM.
    undefined = np.isnan(sem_values)
    sem_values[undefined] = -1
    return sem_values.tolist()

def unpack_sem(x):
    """Decode an SEM vector, turning the ``-1`` sentinel back into NaN.

    Parameters
    ----------
    x : str or sequence
        Either the string form of a numeric sequence (parsed with ``eval``)
        or an already-parsed numeric sequence.

    Returns
    -------
    numpy.ndarray
        Float array with every ``-1`` replaced by NaN.
    """
    # NOTE(review): eval() executes the string; only feed it values written by
    # this pipeline.
    values = eval(x) if isinstance(x, str) else x
    # Force a float dtype: an all-integer vector would otherwise produce an
    # integer array that cannot hold NaN.
    sem_arr = np.array(values, dtype=float)
    # ``np.nan`` rather than the ``np.NaN`` alias, which NumPy 2.0 removed.
    sem_arr[sem_arr==-1] = np.nan
    return sem_arr

## need a jackknife mean function
def agg_sem_jackknife(x):
    """Element-wise standard error of the mean over a Series of vectors.

    Parameters
    ----------
    x : pandas.Series
        Each entry is an equal-length numeric sequence.

    Returns
    -------
    list
        SEM per vector position; positions where the SEM is NaN are encoded
        as ``-1``.
    """
    stacked = np.array(x.values.tolist())
    sem_values = sp.stats.sem(stacked, axis=0, nan_policy='propagate')
    # -1 is the sentinel for an undefined SEM.
    undefined = np.isnan(sem_values)
    sem_values[undefined] = -1
    return sem_values.tolist()

def unpack_sem_jackknife(x):
    """Decode a jackknife SEM vector, turning the ``-1`` sentinel into NaN.

    Parameters
    ----------
    x : str or sequence
        Either the string form of a numeric sequence (parsed with ``eval``)
        or an already-parsed numeric sequence.

    Returns
    -------
    numpy.ndarray
        Float array with every ``-1`` replaced by NaN.
    """
    # NOTE(review): eval() executes the string; only feed it values written by
    # this pipeline.
    values = eval(x) if isinstance(x, str) else x
    # Force a float dtype so an all-integer vector can still hold NaN.
    sem_arr = np.array(values, dtype=float)
    # ``np.nan`` rather than the ``np.NaN`` alias, which NumPy 2.0 removed.
    sem_arr[sem_arr==-1] = np.nan
    return sem_arr

def get_jackknife_mean(jackknife_df_temp_path,jackknife_i,jackknife_ttl):
    """Compute per-sgRNA mean and SEM feature vectors for one jackknife fold.

    The pickled dataframe is loaded, the index labels found at positions
    ``jackknife_i, jackknife_i + jackknife_ttl, ...`` are dropped, and the
    remaining ``"Feature Vector"`` entries are aggregated per
    ``"oDEPool7_id"``.

    Parameters
    ----------
    jackknife_df_temp_path : path-like
        Pickle file holding the dataframe. ``pd.read_pickle`` can execute
        arbitrary code, so the file must be trusted.
    jackknife_i : int
        Fold number; also the offset of the first dropped position.
    jackknife_ttl : int
        Stride between dropped positions.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``"oDEPool7_id"`` with columns ``"Feature Vector"``,
        ``"SEM: Feature Vector"`` and ``"Jackknife"``.
    """
    full_df = pd.read_pickle(jackknife_df_temp_path)
    held_out_labels = full_df.index[jackknife_i::jackknife_ttl]
    retained_df = full_df.drop(held_out_labels)
    vectors_by_sgrna = retained_df.groupby("oDEPool7_id")["Feature Vector"]

    def _mean_vector(vectors):
        return np.mean(np.array(vectors.values.tolist()),axis=0).tolist()

    single_jackknife = vectors_by_sgrna.apply(_mean_vector).to_frame()
    single_jackknife["SEM: Feature Vector"] = vectors_by_sgrna.apply(lambda x: agg_sem_jackknife(x))
    single_jackknife["Jackknife"] = jackknife_i
    return single_jackknife

def get_mean_var_and_cv(df,final_columns,columns_to_apply_aggregation,sgrna_key='oDEPool7_id'):
    """Summarize selected columns per sgRNA.

    Parameters
    ----------
    df : dask dataframe
        Table containing ``sgrna_key``, ``final_columns`` and
        ``columns_to_apply_aggregation``.
    final_columns : list of str
        Columns carried over from the first row of each group.
    columns_to_apply_aggregation : list of str
        Columns that are summarized.
    sgrna_key : str
        Grouping column.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``sgrna_key``: ``final_columns`` followed by
        ``"Mean: "``, ``"Variance: "``, ``"CV: "`` and ``"SEM: "`` columns for
        every aggregated column.
    """
    sgrna_sorted = df.reset_index(drop=True).set_index(sgrna_key).persist()
    sgrna_groupby = sgrna_sorted.groupby(sgrna_key)

    def _grouped_stat(stat_fn):
        # Apply ``stat_fn`` to the aggregated columns of each group and
        # materialize the result as a pandas frame.
        return sgrna_groupby[columns_to_apply_aggregation].apply(stat_fn).compute()

    mean_df = _grouped_stat(lambda x: np.mean(x,axis=0))
    std_df = _grouped_stat(lambda x: np.std(x,axis=0,ddof=1))
    cv_df = std_df/mean_df
    # NOTE(review): np.std above is called with ddof=1 while np.var is called
    # without an explicit ddof - confirm the mismatch is intended.
    var_df = _grouped_stat(lambda x: np.var(x,axis=0))
    count_df = _grouped_stat(lambda x: x.count())
    sem_df = std_df/np.sqrt(count_df)

    output_df = sgrna_groupby.first().compute()[final_columns]
    labelled_stats = (
        ("Mean: ", mean_df),
        ("Variance: ", var_df),
        ("CV: ", cv_df),
        ("SEM: ", sem_df),
    )
    for prefix, stat_df in labelled_stats:
        output_df = output_df.join(stat_df.add_prefix(prefix))

    return output_df

### Initial Data Processing (Mean Timeseries)

#### Start Dask

In [3]:
# Alternative cluster request (n_workers=300, n_workers_min=200), kept
# commented out for reference:
# dask_controller = tr.trcluster.dask_controller(
#     walltime="04:00:00",
#     local=False,
#     n_workers=300,
#     n_workers_min=200,
#     memory="16GB",
#     working_directory=dask_wd,
# )
# dask_controller.startdask()

# Active configuration: a non-local cluster with n_workers and n_workers_min
# both set to 100, 16GB per worker and a 4 hour walltime, using ``dask_wd`` as
# the working directory.
# NOTE(review): the exact meaning of these options is defined by trenchripper's
# dask_controller - confirm there.
dask_controller = tr.trcluster.dask_controller(
    walltime="04:00:00",
    local=False,
    n_workers=100,
    n_workers_min=100,
    memory="16GB",
    working_directory=dask_wd,
)
dask_controller.startdask()

230m
04:00:00


In [4]:
# Show the dashboard of the Dask controller created in the previous cell
# (trenchripper helper).
dask_controller.displaydashboard()

### Kernel regression on each trench

In [10]:
# Target partition size; reused later when the merged timeseries table is
# repartitioned.
partition_size = "500MB"

# Load the three merged lineage tables (cell-cycle, per-observation and growth
# measurements) lazily from parquet, computing index divisions on read.
cell_cycle_df = dd.read_parquet("/home/de64/scratch/de64/sync_folder/2021-11-12_lDE20_Validation_1/2024-01-12_Lineage_Cell_Cycle_Merged", engine="pyarrow",calculate_divisions=True)
timepoints_df = dd.read_parquet("/home/de64/scratch/de64/sync_folder/2021-11-12_lDE20_Validation_1/2024-01-12_Lineage_Observations_Merged", engine="pyarrow",calculate_divisions=True)
growth_df = dd.read_parquet("/home/de64/scratch/de64/sync_folder/2021-11-12_lDE20_Validation_1/2024-01-12_Lineage_Growth_Observations_Merged", engine="pyarrow",calculate_divisions=True)

# Rebalance each table to partitions of roughly ``partition_size``.
cell_cycle_df = cell_cycle_df.repartition(partition_size = partition_size)
timepoints_df = timepoints_df.repartition(partition_size = partition_size)
growth_df = growth_df.repartition(partition_size = partition_size)

In [11]:
# Features traced per source table; every entry becomes one
# "Kernel Trace: <name>" column.
cell_cycle_params = [
    'Birth: Length', 'Division: Length', 'Delta: Length',
    'Birth: Width', 'Division: Width', 'Delta: Width',
    'Birth: Volume', 'Division: Volume', 'Delta: Volume',
    'Final timepoints', 'Delta Timepoints', 'Final time (s)', 'Delta time (s)',
    'Septum Displacement', 'Septum Displacement Length Normalized',
]
timepoints_params = ['area', 'Length', 'Width', 'Volume', 'mCherry mean_intensity']
growth_params = ['Instantaneous Growth Rate: Volume', 'Instantaneous Growth Rate: Length']
all_traced_params = [
    "Kernel Trace: " + item
    for item in cell_cycle_params + timepoints_params + growth_params
]

bandwidth_hours = 2
kernel_bins = 20

# Settings shared by the three regressions below; only the source table, the
# feature list and the time column differ between them.
shared_kernel_kwargs = dict(
    min_tpt = 0,
    max_tpt = 36000,
    kernel_bins = kernel_bins,
    kernel_bandwidth = bandwidth_hours*3600,
    trenchid_label = "Multi-Experiment Phenotype Trenchid",
)

cell_cycle_kernel_df = get_all_kernel_regs(
    cell_cycle_df, cell_cycle_params, time_label="Mid-Cycle time (s)", **shared_kernel_kwargs
).persist()
wait(cell_cycle_kernel_df);

timepoints_kernel_df = get_all_kernel_regs(
    timepoints_df, timepoints_params, time_label="Observation time (s)", **shared_kernel_kwargs
).persist()
wait(timepoints_kernel_df);

growth_kernel_df = get_all_kernel_regs(
    growth_df, growth_params, time_label="Measurement time (s)", **shared_kernel_kwargs
).persist()
wait(growth_kernel_df);

In [12]:
## nan filter
cell_cycle_kernel_nan_filter = cell_cycle_kernel_df.apply(lambda x: \
                            ~np.any(["nan" in item for item in x.tolist()]), axis=1, meta=bool).persist()
cell_cycle_kernel_df_filtered = cell_cycle_kernel_df[cell_cycle_kernel_nan_filter]
filtered_trenchid_indices = cell_cycle_kernel_df_filtered.index.unique().compute()
timepoints_kernel_df_filtered = timepoints_kernel_df.loc[filtered_trenchid_indices]
growth_kernel_df_filtered = growth_kernel_df.loc[filtered_trenchid_indices]



#### Merge Timeseries dfs

In [13]:
trenchid_label="Multi-Experiment Phenotype Trenchid"

# First row of each trench from every source table; joined to the traces below.
cell_cycle_single_trenchid = cell_cycle_df.groupby(trenchid_label).first()
timepoints_single_trenchid = timepoints_df.groupby(trenchid_label).first()
growth_out_single_trenchid = growth_df.groupby(trenchid_label).first()

# Attach those per-trench rows to the NaN-filtered kernel traces (index join).
cell_cycle_out = cell_cycle_kernel_df_filtered.join(cell_cycle_single_trenchid)
timepoints_out = timepoints_kernel_df_filtered.join(timepoints_single_trenchid)
growth_out = growth_kernel_df_filtered.join(growth_out_single_trenchid)

# Cell-cycle table: keep the listed metadata columns plus its kernel traces.
# ``all_traced_params`` must hold the "Kernel Trace: "-prefixed names here.
traced_col_names = [col_name for col_name in cell_cycle_out.columns.tolist() if col_name in all_traced_params]
cell_cycle_out = cell_cycle_out[['Global CellID', 'File Parquet Index', 'fov', 'row', 'trench',
       'initial timepoints', 'File Index', 'File Trench Index', 'CellID',
       'Trench Score', 'Mother CellID', 'Daughter CellID 1',
       'Daughter CellID 2', 'Sister CellID', 'Centroid X', 'Centroid Y',
       'Kymograph File Parquet Index', 'Kymograph FOV Parquet Index',
       'FOV Parquet Index', 'Experiment #', 'trenchid'] + traced_col_names]

# Timepoints and growth tables: keep only their kernel trace columns.
traced_col_names = [col_name for col_name in timepoints_out.columns.tolist() if col_name in all_traced_params]
timepoints_out = timepoints_out[traced_col_names]

traced_col_names = [col_name for col_name in growth_out.columns.tolist() if col_name in all_traced_params]
growth_out = growth_out[traced_col_names]

# Column-wise concatenation into one table per trench, then repartition.
timeseries_merged_dd = dd.concat([cell_cycle_out,timepoints_out,growth_out], axis=1)
timeseries_merged_dd = timeseries_merged_dd.repartition(partition_size=partition_size).persist()
wait(timeseries_merged_dd);

#### Export Kernel Regression Dataframe

In [17]:
# Explode every "Kernel Trace" column from one stringified list per trench
# into one row per kernel timepoint.
kernel_trace_columns = [column_name for column_name in timeseries_merged_dd.columns if "Kernel Trace" in column_name]
explode_series_list = []
for column_name in kernel_trace_columns:
    # Empty frame describing the output schema (int trench id index, float
    # value column), passed to Dask as ``meta``.
    empty_df_ex = pd.DataFrame(data=[],columns=["Multi-Experiment Phenotype Trenchid",column_name]).astype({"Multi-Experiment Phenotype Trenchid":int,\
                                                column_name:float}).set_index("Multi-Experiment Phenotype Trenchid")
    working_exploded_column = timeseries_merged_dd[column_name].map_partitions(explode_kernel_trace,meta=empty_df_ex)
    explode_series_list.append(working_exploded_column)
kernel_output_df = dd.concat(explode_series_list, axis=1).persist()
wait(kernel_output_df);

# Build a 0..timeseries_len-1 counter repeated once per trench and attach it
# as "Kernel Timepoints".
# NOTE(review): this assumes every trench contributes exactly
# ``timeseries_len`` consecutive rows and that tr.add_list_to_column assigns
# values in row order - confirm against trenchripper.
n_indices = len(timeseries_merged_dd)
first_kernel_trace_column = [column_name for column_name in timeseries_merged_dd.columns if "Kernel Trace" in column_name][0]
# The trace length is read from the first row of the first partition; eval()
# parses the stringified list.
timeseries_len = len(eval(timeseries_merged_dd[first_kernel_trace_column].get_partition(0).compute().iloc[0]))
timepoint_index = np.tile(range(timeseries_len),n_indices)
kernel_output_df = tr.add_list_to_column(kernel_output_df,list(timepoint_index),"Kernel Timepoints").persist()
wait(kernel_output_df);

# Composite integer index: the trench id zero-padded to 10 digits followed by
# the kernel timepoint zero-padded to 3 digits.
kernel_output_df = kernel_output_df.reset_index()
kernel_output_df["Multi-Experiment Phenotype Trenchid-Kernel Timepoint Index"] =  kernel_output_df.apply(lambda x: int(f'{x["Multi-Experiment Phenotype Trenchid"]:010n}{x["Kernel Timepoints"]:03n}'), axis=1, meta=(None,'int64')).persist()
# ``sorted=True`` tells Dask the new index is already ordered.
kernel_output_df = kernel_output_df.set_index("Multi-Experiment Phenotype Trenchid-Kernel Timepoint Index",sorted=True)
wait(kernel_output_df);

# Write the exploded table to parquet, then cancel the persisted collection on
# the cluster.
kernel_output_df.to_parquet("/home/de64/scratch/de64/sync_folder/2021-11-12_lDE20_Validation_1/2024-01-12_Kernel_Regression_df",\
                            engine="pyarrow",schema='infer',overwrite=True)
dask_controller.daskclient.cancel(kernel_output_df)

In [39]:
# FOV index -> strain: each strain occupies 16 consecutive FOVs.
strain_dict = {"ftsN":list(range(16)),\
              "glyQ":list(range(16,32)),\
              "rplA":list(range(32,48)),\
              "dnaA":list(range(48,64)),\
              "mrdA":list(range(64,80)),\
              "EV":list(range(80,96))}
inv_strain_dict = {item:key for key,val in strain_dict.items() for item in val}
# Strain -> integer id stored in the "oDEPool7_id" column.
gene_dict = {"ftsN":9586,\
             "glyQ":7113,\
             "rplA":4754,\
             "dnaA":8869,\
             "mrdA":9865,\
             "EV":29672}

# Materialize the merged table as a pandas frame and annotate each trench.
timeseries_merged_df = timeseries_merged_dd.compute()
timeseries_merged_df["Gene"] = timeseries_merged_df["fov"].apply(lambda x: inv_strain_dict[x])
timeseries_merged_df["oDEPool7_id"] = timeseries_merged_df["Gene"].apply(lambda x: gene_dict[x])

# Number of distinct index labels per sgRNA. The frame is pandas at this
# point, so the Dask-only ``meta=`` argument and ``.compute()`` are not used:
# pandas would forward ``meta`` to the lambda and raise.
n_obs = timeseries_merged_df.groupby("oDEPool7_id",sort=False).apply(lambda x: len(x.index.unique()))
n_obs = n_obs.rename("N Observations").to_frame().sort_index()
# pandas rejects ``on=`` combined with ``right_index=True``; join the left
# column against the right index instead.
timeseries_merged_df = timeseries_merged_df.merge(n_obs,left_on="oDEPool7_id",right_index=True,how='inner')

#### Aggregate over timeseries

In [70]:
def agg_sem(x,vector):
    """Element-wise standard error of the mean over stringified vectors.

    Parameters
    ----------
    x : pandas.DataFrame
        Group whose ``vector`` column holds strings that ``eval`` to
        equal-length numeric sequences.
    vector : str
        Name of that column.

    Returns
    -------
    list
        SEM per vector position; positions where the SEM is NaN are encoded
        as ``-1``.
    """
    parsed_rows = [eval(entry) for entry in x[vector].tolist()]
    stacked = np.array(parsed_rows)
    sem_values = sp.stats.sem(stacked, axis=0, nan_policy='propagate')
    # -1 is the sentinel for an undefined SEM.
    undefined = np.isnan(sem_values)
    sem_values[undefined] = -1
    return sem_values.tolist()

def unpack_sem(x):
    """Decode an SEM vector, turning the ``-1`` sentinel back into NaN.

    Parameters
    ----------
    x : sequence or str
        A numeric sequence, or the string form of one (parsed with ``eval``).

    Returns
    -------
    numpy.ndarray
        Float array with every ``-1`` replaced by NaN.
    """
    # NOTE(review): eval() executes the string; only feed it values written by
    # this pipeline.
    values = eval(x) if isinstance(x, str) else x
    # Force a float dtype so an all-integer vector can still hold NaN.
    sem_arr = np.array(values, dtype=float)
    # ``np.nan`` rather than the ``np.NaN`` alias, which NumPy 2.0 removed.
    sem_arr[sem_arr==-1] = np.nan
    return sem_arr

In [72]:
## params
vectors_to_aggregate = all_traced_params

timeseries_merged_df_sgRNA_sorted = timeseries_merged_df.set_index("oDEPool7_id").sort_index()
timeseries_merged_df_sgRNA_sorted_groupby = timeseries_merged_df_sgRNA_sorted.groupby(["oDEPool7_id"])
    
sgRNA_df_merged = timeseries_merged_df_sgRNA_sorted_groupby.first()

for vector in vectors_to_aggregate:
    print(vector)
    mean_vectors = timeseries_merged_df_sgRNA_sorted_groupby.apply(lambda x: \
            np.mean(np.array([eval(item) for item in x[vector].tolist()]), axis=0))
    sgRNA_df_merged[vector] = mean_vectors
    
    sem_vectors = timeseries_merged_df_sgRNA_sorted_groupby.apply(lambda x: \
            agg_sem(x,vector))
    sem_vectors = sem_vectors.apply(lambda x: unpack_sem(x))
    sgRNA_df_merged["SEM: " + vector] = sem_vectors

sgRNA_df_merged.to_pickle("/home/de64/scratch/de64/sync_folder/2021-11-12_lDE20_Validation_1/2024-01-12_sgRNA_Timeseries_df.pkl")

Kernel Trace: Birth: Length
Kernel Trace: Division: Length
Kernel Trace: Delta: Length
Kernel Trace: Birth: Width
Kernel Trace: Division: Width
Kernel Trace: Delta: Width
Kernel Trace: Birth: Volume
Kernel Trace: Division: Volume
Kernel Trace: Delta: Volume
Kernel Trace: Final timepoints
Kernel Trace: Delta Timepoints
Kernel Trace: Final time (s)
Kernel Trace: Delta time (s)
Kernel Trace: Septum Displacement
Kernel Trace: Septum Displacement Length Normalized
Kernel Trace: area
Kernel Trace: Length
Kernel Trace: Width
Kernel Trace: Volume
Kernel Trace: mCherry mean_intensity
Kernel Trace: Instantaneous Growth Rate: Volume
Kernel Trace: Instantaneous Growth Rate: Length


#### Get Steady State Measurements

In [33]:
# FOV index -> strain: each strain occupies 16 consecutive FOVs.
strain_dict = {"ftsN":list(range(16)),\
              "glyQ":list(range(16,32)),\
              "rplA":list(range(32,48)),\
              "dnaA":list(range(48,64)),\
              "mrdA":list(range(64,80)),\
              "EV":list(range(80,96))}
inv_strain_dict = {item:key for key,val in strain_dict.items() for item in val}
# Strain -> integer id stored in the "oDEPool7_id" column.
gene_dict = {"ftsN":9586,\
             "glyQ":7113,\
             "rplA":4754,\
             "dnaA":8869,\
             "mrdA":9865,\
             "EV":29672}

# Reload the three merged lineage tables; no repartitioning is done here.
cell_cycle_df = dd.read_parquet("/home/de64/scratch/de64/sync_folder/2021-11-12_lDE20_Validation_1/2024-01-12_Lineage_Cell_Cycle_Merged", engine="pyarrow",calculate_divisions=True)
timepoints_df = dd.read_parquet("/home/de64/scratch/de64/sync_folder/2021-11-12_lDE20_Validation_1/2024-01-12_Lineage_Observations_Merged", engine="pyarrow",calculate_divisions=True)
growth_df = dd.read_parquet("/home/de64/scratch/de64/sync_folder/2021-11-12_lDE20_Validation_1/2024-01-12_Lineage_Growth_Observations_Merged", engine="pyarrow",calculate_divisions=True)

# NOTE(review): ``all_traced_params`` is rebound here WITHOUT the
# "Kernel Trace: " prefix. The timeseries aggregation cell reads this name and
# expects the prefixed form, so running that cell after this one would look up
# the wrong columns.
cell_cycle_params = ['Birth: Length', 'Division: Length',
   'Delta: Length', 'Birth: Width',
   'Division: Width', 'Delta: Width',
   'Birth: Volume', 'Division: Volume', 'Delta: Volume',
   'Final timepoints', 'Delta Timepoints', 'Final time (s)',
   'Delta time (s)','Septum Displacement','Septum Displacement Length Normalized']
timepoints_params = ['area','Length', 'Width', 'Volume','mCherry mean_intensity']
growth_params = ['Instantaneous Growth Rate: Volume', 'Instantaneous Growth Rate: Length']
all_traced_params = cell_cycle_params + timepoints_params + growth_params

# Annotate the cell-cycle table with strain and sgRNA id derived from the FOV,
# then take one value per trench as pandas Series.
cell_cycle_df["Gene"] = cell_cycle_df["fov"].apply(lambda x: inv_strain_dict[x], meta=str)
cell_cycle_df["oDEPool7_id"] = cell_cycle_df["Gene"].apply(lambda x: gene_dict[x], meta=int)
gene_series = cell_cycle_df.groupby("Multi-Experiment Phenotype Trenchid")["Gene"].first().compute()
sgrnaid_series = cell_cycle_df.groupby("Multi-Experiment Phenotype Trenchid")["oDEPool7_id"].first().compute()

# Propagate the annotation to the other two tables by index join.
# NOTE(review): presumably both tables are indexed by the trench id - confirm.
timepoints_df = timepoints_df.join(gene_series)
timepoints_df = timepoints_df.join(sgrnaid_series)

growth_df = growth_df.join(gene_series)
growth_df = growth_df.join(sgrnaid_series)



#### Use Steady State Time to Filter

In [34]:
# Time thresholds in seconds.
steady_state_time = 8*3600 #8 hours
induction_time = 2*3600 # 2 hours

trenchid_label="Multi-Experiment Phenotype Trenchid"

# "_ss" keeps rows at or after ``steady_state_time``; "_pre" keeps rows at or
# before ``induction_time``. Cell cycles are selected by their initial time
# for "_ss" and by their final time for "_pre".
cell_cycle_df_ss = cell_cycle_df[cell_cycle_df["Initial time (s)"]>=steady_state_time]
cell_cycle_df_pre = cell_cycle_df[cell_cycle_df["Final time (s)"]<=induction_time]

timepoints_df_ss = timepoints_df[timepoints_df["Observation time (s)"]>=steady_state_time]
timepoints_df_pre = timepoints_df[timepoints_df["Observation time (s)"]<=induction_time]

growth_df_ss = growth_df[growth_df["Measurement time (s)"]>=steady_state_time]
growth_df_pre = growth_df[growth_df["Measurement time (s)"]<=induction_time]

#### Add sgRNA info for real data and save as checkpoint

In [35]:
# For each of the three tables: count distinct index labels per sgRNA in the
# steady-state subset, attach the count as "N Observations", and write the
# steady-state and pre-induction subsets to parquet.
# NOTE(review): the count comes from the steady-state subset only but is also
# merged onto the pre-induction subset with how='inner', so pre-induction rows
# of sgRNAs absent from steady state are dropped - confirm this is intended.
# NOTE(review): merging on a column may not preserve the existing index -
# verify the trench id is still available in the written tables.
n_obs = cell_cycle_df_ss.groupby("oDEPool7_id",sort=False).apply(lambda x: len(x.index.unique()), meta=int).compute()
n_obs = pd.DataFrame(n_obs).rename({0:"N Observations"}, axis=1).sort_index()
cell_cycle_df_ss = cell_cycle_df_ss.merge(n_obs,on="oDEPool7_id",how='inner')
cell_cycle_df_pre = cell_cycle_df_pre.merge(n_obs,on="oDEPool7_id",how='inner')
cell_cycle_df_ss.to_parquet("/home/de64/scratch/de64/sync_folder/2021-11-12_lDE20_Validation_1/2024-01-13_Lineage_Cell_Cycle_Timeseries_Steady_State/", engine="pyarrow", overwrite=True)
cell_cycle_df_pre.to_parquet("/home/de64/scratch/de64/sync_folder/2021-11-12_lDE20_Validation_1/2024-01-13_Lineage_Cell_Cycle_Timeseries_Preinduction/", engine="pyarrow", overwrite=True)
print("Cell Cycle Done.")

# Same procedure for the per-observation table.
n_obs = timepoints_df_ss.groupby("oDEPool7_id",sort=False).apply(lambda x: len(x.index.unique()), meta=int).compute()
n_obs = pd.DataFrame(n_obs).rename({0:"N Observations"}, axis=1).sort_index()
timepoints_df_ss = timepoints_df_ss.merge(n_obs,on="oDEPool7_id",how='inner')
timepoints_df_pre = timepoints_df_pre.merge(n_obs,on="oDEPool7_id",how='inner')
timepoints_df_ss.to_parquet("/home/de64/scratch/de64/sync_folder/2021-11-12_lDE20_Validation_1/2024-01-13_Lineage_Timepoints_Timeseries_Steady_State/", engine="pyarrow", overwrite=True)
timepoints_df_pre.to_parquet("/home/de64/scratch/de64/sync_folder/2021-11-12_lDE20_Validation_1/2024-01-13_Lineage_Timepoints_Timeseries_Preinduction/", engine="pyarrow", overwrite=True)
print("Timepoints Done.")

# Same procedure for the growth table.
n_obs = growth_df_ss.groupby("oDEPool7_id",sort=False).apply(lambda x: len(x.index.unique()), meta=int).compute()
n_obs = pd.DataFrame(n_obs).rename({0:"N Observations"}, axis=1).sort_index()
growth_df_ss = growth_df_ss.merge(n_obs,on="oDEPool7_id",how='inner')
growth_df_pre = growth_df_pre.merge(n_obs,on="oDEPool7_id",how='inner')
growth_df_ss.to_parquet("/home/de64/scratch/de64/sync_folder/2021-11-12_lDE20_Validation_1/2024-01-13_Lineage_Growth_Timeseries_Steady_State/", engine="pyarrow", overwrite=True)
growth_df_pre.to_parquet("/home/de64/scratch/de64/sync_folder/2021-11-12_lDE20_Validation_1/2024-01-13_Lineage_Growth_Timeseries_Preinduction/", engine="pyarrow", overwrite=True)
print("Growth Done.")
# trenchripper helper; presumably frees memory held by the workers - confirm.
dask_controller.reset_worker_memory()



Cell Cycle Done.




Timepoints Done.




Growth Done.
Done.


In [38]:
# Shut the Dask controller down once all outputs have been written; cells that
# need the cluster cannot be run after this without starting it again.
dask_controller.shutdown()

Done.


2024-01-13 16:33:54,400 - distributed.deploy.adaptive_core - INFO - Adaptive stop
