In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import anndata as ad

In [2]:
# Load the NormanWeissman2019 filtered AnnData file from the local data folder.
adata_norm = sc.read_h5ad("data/h5ad/NormanWeissman2019_filtered.h5ad")
# Bare expression so the notebook renders the AnnData summary (shape, obs/var columns).
adata_norm

AnnData object with n_obs × n_vars = 111445 × 33694
    obs: 'guide_id', 'read_count', 'UMI_count', 'coverage', 'gemgroup', 'good_coverage', 'number_of_cells', 'tissue_type', 'cell_line', 'cancer', 'disease', 'perturbation_type', 'celltype', 'organism', 'perturbation', 'nperts', 'ngenes', 'ncounts', 'percent_mito', 'percent_ribo'
    var: 'ensemble_id', 'ncounts', 'ncells'

In [3]:
# Both calls modify adata_norm in place (their return values are not captured),
# so re-running this cell would normalise / log-transform the data a second time.
# NOTE(review): no target_sum is passed, so scanpy's default scaling target is
# used — confirm that this is the intended normalisation.
sc.pp.normalize_total(adata_norm)
sc.pp.log1p(adata_norm)

In [4]:
# Convert the expression matrix to a DataFrame (rows = cells, columns = genes).
# NOTE(review): this materialises every gene column before the gene filter is
# applied in a later cell; subsetting genes first would presumably use far less
# memory — confirm before changing.
df_norm = adata_norm.to_df()
df_norm.head()

Unnamed: 0,RP11-34P13.3,FAM138A,OR4F5,RP11-34P13.7,RP11-34P13.8,RP11-34P13.14,RP11-34P13.9,FO538757.3,FO538757.2,AP006222.2,...,AC007325.2,BX072566.1,AL354822.1,AC023491.2,AC004556.1,AC233755.2,AC233755.1,AC240274.1,AC213203.1,FAM231B
TTGAACGAGACTCGGA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.651143,0.0,...,0.0,0.0,0.651143,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CGTTGGGGTGTTTGTG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.444695,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GAACCTAAGTGTTAGA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CCTTCCCTCCGTCATC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.892491,0.0,0.0,0.0,0.0,0.0
TCAATCTGTCTTTCAT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.402884,0.0,0.689272,0.0,0.0,0.0,0.0,0.0


In [14]:
# Read the gene -> id lookup table and keep only the expression columns
# for the genes it lists (column order follows the table).
gene2id_df = pd.read_csv("result/gene2id.csv")
gene_list = gene2id_df["gene"].tolist()
df_norm_filt1 = df_norm.loc[:, gene_list]
df_norm_filt1.shape

(111445, 2346)

In [6]:
# Lookup from gene symbol to its integer id, built from the two table columns.
gene2id_dict = dict(zip(gene2id_df["gene"], gene2id_df["id"]))

In [28]:
# Load the predicted viability values from a text file.
# These are paired with cells purely by position when df_perturb is built,
# so this assumes the file's row order matches adata_norm.obs — TODO confirm.
viab_norm = np.loadtxt("result/norman19_predict_viability.txt")

In [30]:
# Per-cell metadata table: guide id (only the part before the first ';'),
# perturbation label and predicted viability, indexed like df_norm.
obs_meta = adata_norm.obs
df_perturb = pd.DataFrame(
    {
        "guide_id": [g.split(";")[0] for g in obs_meta["guide_id"].to_list()],
        "perturbation": obs_meta["perturbation"].to_list(),
        "viability": viab_norm,
    },
    index=df_norm.index,
)
df_perturb.head()

Unnamed: 0,guide_id,perturbation,viability
TTGAACGAGACTCGGA,ARID1A_NegCtrl0,ARID1A,-0.20836
CGTTGGGGTGTTTGTG,BCORL1_NegCtrl0,BCORL1,-0.29483
GAACCTAAGTGTTAGA,FOSB_NegCtrl0,FOSB,0.135545
CCTTCCCTCCGTCATC,SET_KLF1,SET_KLF1,-0.317036
TCAATCTGTCTTTCAT,OSR2_NegCtrl0,OSR2,-0.131222


In [31]:
# Keep rows whose guide_id differs from the doubled negative-control label.
# NOTE(review): the cell that builds df_perturb already truncates guide_id at
# the first ';', so no value can contain ';' and this comparison never matches
# — the filter keeps every row. Confirm whether control cells were meant to be
# removed here (later cells export the control rows, so changing this would
# alter the saved outputs).
df_perturb = df_perturb.loc[df_perturb["guide_id"] != "NegCtrl1_NegCtrl0;NegCtrl1_NegCtrl0"]

In [32]:
# Sentinel codes written into perturb_idx (same values the later cells filter on).
NO_PERT = -1       # unused slot: control cell, or second slot of a single perturbation
UNKNOWN_GENE = -2  # label names a gene absent from gene2id_dict; such rows are dropped later


def _encode_perturbation(label, gene2id):
    """Translate one perturbation label into a (pert_1, pert_2) pair of codes.

    Parameters
    ----------
    label : str
        Perturbation label; gene names are separated by '_'.
    gene2id : dict
        Mapping from gene name to integer id.

    Returns
    -------
    tuple
        (id, NO_PERT) for a known single gene, (NO_PERT, NO_PERT) for the
        literal label "control", (id_1, id_2) when the first two names are
        both known, and (UNKNOWN_GENE, UNKNOWN_GENE) otherwise.
    """
    parts = label.split("_")
    if len(parts) == 1:
        name = parts[0]
        if name in gene2id:
            return gene2id[name], NO_PERT
        if name == "control":
            return NO_PERT, NO_PERT
        return UNKNOWN_GENE, UNKNOWN_GENE
    # Only the first two names are considered, as in the original loop.
    first, second = parts[0], parts[1]
    if first in gene2id and second in gene2id:
        return gene2id[first], gene2id[second]
    return UNKNOWN_GENE, UNKNOWN_GENE


# Membership is tested against the dict (hash lookup) rather than scanning
# gene_list for every row; both hold the same gene names, so results are equal.
# reshape(-1, 2) keeps the (n_rows, 2) shape even when df_perturb is empty.
perturb_idx = np.array(
    [_encode_perturbation(p, gene2id_dict) for p in df_perturb["perturbation"]],
    dtype=int,
).reshape(-1, 2)

In [33]:
# perturb_idx was computed row-for-row from df_perturb; if this cell is re-run
# after the filter below has already shrunk df_perturb, the lengths no longer
# agree — fail with an explicit message instead of a generic pandas error.
if len(perturb_idx) != len(df_perturb):
    raise ValueError(
        "perturb_idx and df_perturb have different lengths; "
        "re-run the cells that build them before this one"
    )

# assign() returns a new frame, so no column is written into the filtered
# slice produced by the earlier .loc[...] cell (avoids SettingWithCopyWarning).
df_perturb = df_perturb.assign(pert_1=perturb_idx[:, 0], pert_2=perturb_idx[:, 1])

# -2 marks perturbations whose gene(s) were not found in the gene table; drop them.
df_perturb = df_perturb.loc[df_perturb["pert_1"] != -2]
df_perturb.head()

Unnamed: 0,guide_id,perturbation,viability,pert_1,pert_2
TTGAACGAGACTCGGA,ARID1A_NegCtrl0,ARID1A,-0.20836,1129,-1
CGTTGGGGTGTTTGTG,BCORL1_NegCtrl0,BCORL1,-0.29483,2246,-1
GAACCTAAGTGTTAGA,FOSB_NegCtrl0,FOSB,0.135545,1689,-1
CCTTCCCTCCGTCATC,SET_KLF1,SET_KLF1,-0.317036,1656,1010
TCAATCTGTCTTTCAT,OSR2_NegCtrl0,OSR2,-0.131222,207,-1


In [34]:
# Report the table shape after rows with unmapped perturbations were dropped.
print(df_perturb.shape)
# prev: (111445, 4) — author's record of an earlier shape.
# NOTE(review): unclear which step produced that value; confirm or remove.

(105821, 5)


In [40]:
# Restrict the expression table to the cells that survived filtering, then
# append its gene columns to the metadata table (index-on-index left join).
df_norm_filt2 = df_norm_filt1.loc[df_perturb.index]
df_merged = df_perturb.join(df_norm_filt2, how="left")
print(df_merged.shape)

(105821, 2351)


In [42]:
# Persist the merged table (metadata + expression) for the dataset-splitting cells.
# No index=False is passed, so the row index is presumably written as the first
# CSV column — the later read/slice steps appear to rely on that; verify.
df_merged.to_csv("./result/norm_input.csv")
df_merged.head()

Unnamed: 0,guide_id,perturbation,viability,pert_1,pert_2,GRIN3A,FREM3,DUSP5,CYBA,BACE2,...,WNT9A,RAB13,TARM1,RSAD2,TRPM5,BNIP3L,NCKAP5,CCR1,CMTM5,CCDC96
TTGAACGAGACTCGGA,ARID1A_NegCtrl0,ARID1A,-0.20836,1129,-1,0.0,0.0,0.0,0.651143,0.0,...,0.0,0.0,0.0,0.0,0.0,1.042206,0.0,0.0,0.0,0.0
CGTTGGGGTGTTTGTG,BCORL1_NegCtrl0,BCORL1,-0.29483,2246,-1,0.0,0.0,0.0,1.444695,0.0,...,0.0,0.0,0.0,0.0,0.0,0.963281,0.0,0.0,0.0,0.0
GAACCTAAGTGTTAGA,FOSB_NegCtrl0,FOSB,0.135545,1689,-1,0.0,0.0,0.0,1.258268,0.0,...,0.0,0.0,0.0,0.0,0.0,1.798179,0.0,0.0,0.0,0.0
CCTTCCCTCCGTCATC,SET_KLF1,SET_KLF1,-0.317036,1656,1010,0.0,0.0,0.0,0.307706,0.0,...,0.0,0.0,0.0,0.0,0.0,1.52671,0.0,0.0,0.0,0.0
TCAATCTGTCTTTCAT,OSR2_NegCtrl0,OSR2,-0.131222,207,-1,0.0,0.0,0.0,1.6984,0.0,...,0.0,0.402884,0.0,0.0,0.0,0.689272,0.0,0.0,0.0,0.0


Split the dataset into double-perturbation, single-perturbation, and control subsets

In [2]:
# Reload the merged table written by the preprocessing cells.
# No index_col is given, so the saved row index comes back as an ordinary first
# column; the later `[:, 1:]` slices drop that column before casting to float.
df_merged = pd.read_csv("./result/norm_input.csv")

In [3]:
# Remove the two text metadata columns; they are not part of the exported arrays.
df_merged = df_merged.drop(columns=["guide_id", "perturbation"])

In [9]:
# Rows with a second perturbation id (pert_2 is not the -1 placeholder).
is_double = df_merged["pert_2"] != -1
df_double_pert = df_merged.loc[is_double]
len(df_double_pert)

38891

In [10]:
# Skip the first column, convert the remaining columns to a float32 array
# and write it to disk.
input_np_double = df_double_pert.iloc[:, 1:].to_numpy(dtype=np.float32)
np.save("result/norm_input_double.npy", input_np_double)

In [18]:
# First three columns of the input array (viability, pert_1, pert_2 per the
# displayed output); preview the first five rows.
np_double_viab = input_np_double[:, :3]
np_double_viab[:5]

array([[-3.1703648e-01,  1.6560000e+03,  1.0100000e+03],
       [-1.5707095e-01,  1.0100000e+03,  2.0520000e+03],
       [-7.4525185e-02,  9.3700000e+02,  1.6730000e+03],
       [-4.8650980e-02,  1.9420000e+03,  1.6560000e+03],
       [-2.8390670e-01,  1.9360000e+03,  9.9200000e+02]], dtype=float32)

In [19]:
# Save the three-column slice of input_np_double (taken in the preceding cell)
# as the label array for the double-perturbation subset.
np.save("result/norm_label_double.npy", np_double_viab)

In [4]:
# Rows without a second perturbation, split by whether pert_1 holds a gene id
# (single perturbation) or the -1 placeholder (control).
no_second_pert = df_merged["pert_2"] == -1
df_single_ctrl = df_merged.loc[no_second_pert]
has_first_pert = df_single_ctrl["pert_1"] != -1
df_single_pert = df_single_ctrl[has_first_pert]
df_ctrl = df_single_ctrl[~has_first_pert]
print(len(df_single_pert), len(df_ctrl))

55075 11855


In [6]:
# Skip the first column, convert the remaining columns to a float32 array
# and write it to disk.
input_np_single = df_single_pert.iloc[:, 1:].to_numpy(dtype=np.float32)
np.save("result/norm_input_single.npy", input_np_single)

In [7]:
# Label array for the single-perturbation subset: the first three columns
# of the input array, saved to its own file.
np_single_viab = input_np_single[:, :3]
np.save("result/norm_label_single.npy", np_single_viab)

In [8]:
# Skip the first column, convert the remaining columns to a float32 array
# and write it to disk.
input_np_ctrl = df_ctrl.iloc[:, 1:].to_numpy(dtype=np.float32)
np.save("result/norm_input_ctrl.npy", input_np_ctrl)