Sanity checks to ensure that the Python-based prediction and projection functions are consistent with the corresponding Hooke (R) functions

In [None]:
from src.seq.hooke_latent_projections.project_ccs_data import *
import pandas as pd
import numpy as np
import os

In [None]:
# Root of the morphseq data folder. Set the MORPHSEQ_ROOT environment variable to point
# the notebook at another machine's copy; when it is unset (or empty) the original
# local default below is used, so existing behaviour is unchanged.
# root = "/media/nick/hdd02/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/"
root = (
    os.environ.get("MORPHSEQ_ROOT")
    or "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/"
)
# Name of the Hooke model sub-folder whose exported files this notebook loads.
model_name = "bead_expt_linear"

### Just pasting this chunk directly from the main wrapper function

In [None]:
# Input directories: exported Hooke model folders, the CCS count/metadata tables, and
# the folder for the specific model named by `model_name`. The trailing "" component
# makes os.path.join end each path with a separator, so file names are appended to
# these paths below with plain string concatenation.
hooke_data_path = os.path.join(root, "seq_data/emb_projections/hooke_model_files", "")
ccs_data_path = os.path.join(root, "seq_data/emb_projections/ccs_data_cell_type_broad", "")
model_path = os.path.join(hooke_data_path, model_name, "")

# make save dir
# (created if missing; nothing else in this cell writes to it)
out_dir = os.path.join(root, "seq_data", "emb_projections", "latent_projections", model_name, "")
os.makedirs(out_dir, exist_ok=True)

# Load Hooke's long-format abundance estimates and reshape them to wide format:
# every column that precedes "cell_group" in the CSV is treated as a covariate and
# becomes part of the row key; each cell group becomes one log-abundance column.
hooke_counts_long = pd.read_csv(model_path + "abundance_estimates.csv", index_col=0)
cols = hooke_counts_long.columns.tolist()
cell_ind = cols.index("cell_group")
cov_cols = cols[:cell_ind]

hooke_counts_df = hooke_counts_long.pivot(index=cov_cols, columns=["cell_group"], values=["log_abund"])

# the pivot yields ("log_abund", <cell group>) column tuples; flatten each to one string
hooke_counts_df.columns = [
    "_".join(str(level) for level in col).strip("_") for col in hooke_counts_df.columns
]
# bring the covariates back as ordinary columns, then drop the "log_abund_" prefix
hooke_counts_df = hooke_counts_df.reset_index()
new_cols = [col.replace("log_abund_", "") for col in hooke_counts_df.columns]
hooke_counts_df.columns = new_cols

# covariates first (original order), then cell-group columns sorted case-insensitively
sort_cols = new_cols[:cell_ind] + sorted(new_cols[cell_ind:], key=str.lower)
hooke_counts_df = hooke_counts_df[sort_cols]

# make stripped-down metadata df: the covariate columns only, plus a constant
# placeholder column. NOTE: meta_df is reassigned from the CCS metadata tables later
# in this cell, before this version is read anywhere in the cell.
meta_df = hooke_counts_df[cov_cols].assign(dummy_response=0)

# load hooke predictions (for comparison purposes)
# latent_df = pd.read_csv(model_path + "latents.csv", index_col=0)
# Lookup table that is handed to construct_X (as `spline_lookup_df`) further down.
spline_lookup_df = pd.read_csv(model_path + "time_splines.csv")

# load hooke model files
# COV.csv is inverted later in this cell to give PHI.
cov_array = pd.read_csv(model_path + "COV.csv", index_col=0)
# B.csv is transposed on load, so the CSV's row labels become the columns that are
# cleaned up below and later used as the covariate column list.
beta_array = pd.read_csv(model_path + "B.csv", index_col=0).T

# latent_df.head()
# Normalise the coefficient column names: "(Intercept)" -> "Intercept", and every
# " = c" substring -> "=".
beta_array = beta_array.rename(columns={"(Intercept)":"Intercept"})
cols_from = beta_array.columns
cols_from_clean = [col.replace(" = c", "=") for col in cols_from]
beta_array.columns = cols_from_clean
# NOTE: this is not the last expression of the cell, so the preview is not displayed.
beta_array.head()

# model formula
# Parse the saved model formula and collect the names of the covariates that appear
# on its right-hand side.
with open(model_path + "model_string.txt", "r") as file:
    formula_str = file.read().strip()
model_desc = patsy.ModelDesc.from_formula(formula_str)
# Extract covariate names from the right-hand side terms.
# factor.name() returns the factor's code string directly. Stripping "EvalFactor('"
# and "')" out of the repr instead is fragile: the repr switches to double quotes when
# the code contains a single quote, and any "')" inside the code would be removed too.
cov_factors = [factor.name() for term in model_desc.rhs_termlist for factor in term.factors]
# drop spline terms ("ns(...)") and keep each remaining covariate once, sorted
cov_factors = np.unique([cov for cov in cov_factors if "ns(" not in cov]).tolist()

# load in full counts table and metadata used for model inference
# (the counts CSV is transposed on load, so its columns become the rows here)
mdl_counts_df = pd.read_csv(os.path.join(model_path, "mdl_counts_table.csv"), index_col=0).T
mdl_meta_df = pd.read_csv(os.path.join(model_path, "mdl_embryo_metadata.csv"), index_col=0)

####################
# load in ccs table

# get list of all ccs tables: every "<name>_counts_table.csv" in the CCS folder is
# expected to have a matching "<name>_metadata.csv" next to it
count_suffix = "_counts_table.csv"
meta_suffix = "_metadata.csv"

ccs_path_list = sorted(glob.glob(os.path.join(ccs_data_path, "*" + count_suffix)))
ccs_name_list = [os.path.basename(p).replace(count_suffix, "") for p in ccs_path_list]

# compile master count and metadata tables
mdl_cell_types = mdl_counts_df.columns
ccs_df_list = []
meta_df_list = []
for ccs_name in tqdm(ccs_name_list):
    # counts: transpose on load, then align the columns to the model's cell types
    # (cell types absent from this table are filled with 0, extra ones are dropped)
    ccs_temp = pd.read_csv(os.path.join(ccs_data_path, ccs_name + count_suffix), index_col=0).T
    ccs_temp = ccs_temp.reindex(columns=mdl_cell_types, fill_value=0)
    # metadata table that accompanies these counts
    meta_temp = pd.read_csv(os.path.join(ccs_data_path, ccs_name + meta_suffix), index_col=0)

    ccs_df_list.append(ccs_temp)
    meta_df_list.append(meta_temp)

# concatenate
# The count table is de-duplicated on its row labels only. A value-based
# drop_duplicates() is deliberately NOT applied to it: that compares row contents and
# ignores the index, so two different embryos with identical count vectors would
# collapse into one and ccs_df would no longer line up with meta_df, which the steps
# below rely on (they index both frames with the same positional boolean mask).
ccs_df = pd.concat(ccs_df_list, axis=0)
meta_df = pd.concat(meta_df_list, axis=0).drop_duplicates()

ccs_df = ccs_df.loc[~ccs_df.index.duplicated(keep='first')]
meta_df = meta_df.loc[~meta_df.index.duplicated(keep='first')].set_index("sample")

# Fail early, with a readable message, if the two tables have gone out of step.
if ccs_df.shape[0] != meta_df.shape[0]:
    raise ValueError(
        f"count table has {ccs_df.shape[0]} rows but metadata has {meta_df.shape[0]} rows; "
        "expected exactly one metadata row per count row"
    )

# Collapse the control-like perturbation labels listed in conv_list into a single
# "ctrl" label; every other perturbation label is carried over unchanged and the
# original "perturbation" column is left as-is.
conv_list = np.asarray(["ctrl-uninj", "reference", "novehicle"])
meta_df["pert_collapsed"] = meta_df["perturbation"].mask(
    np.isin(meta_df["perturbation"], conv_list), "ctrl"
)

# keep only embryos from experiments that were included in model inference
exp_vec = mdl_meta_df.loc[:, "expt"].unique()
exp_filter = np.isin(meta_df["expt"], exp_vec)
meta_df = meta_df.loc[exp_filter, :]
# NOTE: the mask was computed from meta_df but is applied to ccs_df by position, so
# this relies on the two frames holding the same rows in the same order.
ccs_df = ccs_df.loc[exp_filter, :]

# augment ccs table to incorporate missing cell types
# (already done per table when the CCS files are loaded, hence commented out here)
# mdl_cell_types = mdl_counts_df.columns
# ccs_df = ccs_df.reindex(columns=mdl_cell_types, fill_value=0)

# check which ones were included in inference
# (True where a ccs_df row label also appears in mdl_counts_df's index; the result is
# attached to meta_df by position)
mdl_flags = np.isin(np.asarray(ccs_df.index), np.asarray(mdl_counts_df.index))
meta_df["inference_flag"] = mdl_flags

# flag experiments that were not included in inference
# NOTE(review): meta_df was already restricted above to experiments present in
# mdl_meta_df["expt"], so this flag looks like it will be False for every remaining
# row - confirm whether the filter or the flag is the intended behaviour.
mdl_experiments = np.unique(mdl_meta_df["expt"])
oos_vec = ~np.isin(meta_df["expt"], mdl_experiments)
meta_df["oos_expt_flag"] = oos_vec

####
# model parameters
####

# inverse cov matrix
# (matrix inverse of the table read from COV.csv; cov_array is passed as a DataFrame)
PHI = np.linalg.inv(cov_array)
# COV = cov_array.to_numpy()

# regression vars
# (independent copy of the cleaned coefficient table)
THETA = beta_array.copy()

# zi0_vec = [0] * COV.shape[0]

# covariates
# (column labels of beta_array; passed to construct_X as `cov_col_list` further down)
cov_col_list = beta_array.columns.tolist()

### Get mean predictions for log(A) and compare to original Hooke predictions

In [None]:
# One seed row per unique (timepoint, expt) pair from the Hooke abundance table: the
# first row found for each pair is kept and the index is renumbered from 0.
# NOTE(review): only these two columns define uniqueness; if the model formula has
# further covariates, other combinations are not checked - confirm this is intended.
x_seed = hooke_counts_df.drop_duplicates(subset=["timepoint", "expt"]).reset_index(drop=True)
x_seed.head()

In [None]:
# construct covariate matrix: call construct_X once per seed row and stack the results
# in seed order
X_list = []
for e in tqdm(range(len(x_seed))):
    # covariate values and timepoint taken from this seed row
    cov_dict = {cov: x_seed.loc[e, cov] for cov in cov_factors}
    stage = x_seed.loc[e, "timepoint"]

    # construct initial covariate vec
    X0 = construct_X(stage, cov_dict, cov_col_list=cov_col_list, spline_lookup_df=spline_lookup_df)
    X_list.append(X0)

# stack the per-row results and renumber the rows from 0
# (assumes construct_X returns one row per call, so rows line up with x_seed - TODO confirm)
X = pd.concat(X_list, axis=0, ignore_index=True)
X.head()

In [None]:
# generate abundance predictions and compare to Hooke output
# Matrix product of the covariate matrix with the transposed coefficient table: one
# output row per row of X, one output column per row of THETA.
python_pd_df = X.dot(THETA.T)
cell_cols = list(python_pd_df.columns)
python_pd_df.head()

In [None]:
# Flatten both tables with the same column order so that element i of one array is
# compared with element i of the other.
py_log_counts = python_pd_df.loc[:, cell_cols].to_numpy().ravel()
# Hooke's own log-abundance estimates for the seed rows. These must be read from
# x_seed, not from python_pd_df: otherwise the scatter compares the Python predictions
# with themselves and lies on the diagonal no matter what the model does.
# (assumes python_pd_df has one row per x_seed row, in the same order - TODO confirm)
hooke_log_counts = x_seed.loc[:, cell_cols].to_numpy().ravel()

# Points on the y = x diagonal mean the Python and Hooke predictions agree.
fig = px.scatter(x=py_log_counts, y=hooke_log_counts)
fig.update_layout(xaxis=dict(title="predicted log cell abundances (python)"), 
                  yaxis=dict(title="predicted log cell abundances (Hooke/R)"), )
fig.show()

A related qu

In [None]:
# Inspect the flattened values that were plotted on the y-axis of the scatter above.
hooke_log_counts

In [None]:
}