In [2]:
import os, sys
import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad

In [3]:
# The data folder is resolved relative to the parent of the directory the
# notebook is launched from.
working_dir = os.getcwd()
main_dir = os.path.dirname(working_dir)
data_dir = f"{main_dir}/data/HoxB8"

# Display the files available in the HoxB8 data folder.
os.listdir(data_dir)

['GSE146128_TFnet_exon_counts.csv',
 'GSE146128_exp1_exon_counts.csv',
 'GSE146128_Tfnet_interactions_exon_counts.csv',
 'GSE146128_TFnet_exon_counts.h5ad',
 'GSE146128_Tfnet_interactions_exon_counts.h5ad']

In [9]:
# Load the TFnet exon-count table and use the CSV's unnamed first column
# (the gene identifiers) as the row index.
TFnet_csv_path = os.path.join(data_dir, 'GSE146128_TFnet_exon_counts.csv')
TFnet_df = pd.read_csv(TFnet_csv_path, low_memory=False).set_index("Unnamed: 0")

In [18]:
# Matrix dimensions as (rows, columns); rows are genes and columns are samples.
print(TFnet_df.shape)
# Preview three random rows; the fixed seed keeps the displayed rows identical
# across "Restart & Run All".
TFnet_df.sample(3, random_state=0)

(54331, 1148)


Unnamed: 0_level_0,RBG18345_Plate5_A1_Gata3_sg1,RBG18346_Plate5_B1_Gata3_sg1,RBG18347_Plate5_C1_Gata3_sg1,RBG18348_Plate5_D1_Gata3_sg1,RBG18349_Plate5_E1_Gata3_sg1,RBG18350_Plate5_F1_Gata3_sg1,RBG18351_Plate5_G1_Gata3_sg1,RBG18352_Plate5_H1_Gata3_sg1,RBG18353_Plate5_A2_Gata3_sg2,RBG18354_Plate5_B2_Gata3_sg2,...,RBG24127_Plate18_G11_Cbfb_sg2,RBG24128_Plate18_H11_Cbfb_sg2,RBG24129_Plate18_A12_Cbfb_sg3,RBG24130_Plate18_B12_Cbfb_sg3,RBG24131_Plate18_C12_Cbfb_sg3,RBG24132_Plate18_D12_Cbfb_sg3,RBG24133_Plate18_E12_Cbfb_sg3,RBG24134_Plate18_F12_Cbfb_sg3,RBG24135_Plate18_G12_Cbfb_sg3,RBG24136_Plate18_H12_Cbfb_sg3
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000101261,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000112354,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000028937,357,312,288,262,390,313,328,302,513,272,...,525,620,571,463,428,400,356,717,524,531


In [64]:
sample_names = list(TFnet_df.columns)

# Every sample name must split on "_" into the same five fields
# (Sample, Plate, Well, TF, Guide). Names for emptyV / R26 / noB get a
# placeholder guide token "sg0" appended so they line up with the rest
# (presumably these are control conditions without a guide - TODO confirm).
# The replacements are applied in order, and order matters: the second entry
# undoes the first for "emptyV_b2a" names, so "b2a" ends up in the Guide slot.
placeholder_fixes = [
    ("emptyV", "emptyV_sg0"),
    ("emptyV_sg0_b2a", "emptyV_b2a"),
    ("R26", "R26_sg0"),
    ("noB", "noB_sg0"),
]

renames = sample_names
for old, new in placeholder_fixes:
    renames = [name.replace(old, new) for name in renames]

# Fail early with the offending names instead of an opaque numpy shape error
# (ragged rows) or a column-count mismatch when the design table is built.
fields = [name.split("_") for name in renames]
malformed = [
    original
    for original, parts in zip(sample_names, fields)
    if len(parts) != 5
]
if malformed:
    raise ValueError(
        f"{len(malformed)} sample name(s) do not split into 5 fields "
        f"after normalisation, e.g. {malformed[:5]}"
    )

designs = np.stack(fields)

In [78]:
# Per-sample design table: one row per sample, indexed by the sample ID.
design_df = pd.DataFrame(
    designs, columns=['Sample', 'Plate', 'Well', 'TF', 'Guide']
).set_index("Sample")

# Per-gene table: the count-matrix row labels, kept both as a 'Gene' column
# and as the index.
var = pd.DataFrame(TFnet_df.index.values, columns=['Gene'])
var.index = TFnet_df.index

# The CSV is genes x samples; AnnData is built with samples as observations,
# so the matrix is transposed.
X = TFnet_df.values.T

adata = ad.AnnData(X=X, obs=design_df, var=var)
# Store an independent copy in the layer: assigning `X` itself would put the
# same ndarray object in both places, so an in-place edit of one (e.g. a later
# normalisation of adata.X) could silently change the "raw counts" layer too.
# NOTE(review): this layer is keyed 'Counts' while the other two datasets in
# this notebook use 'Count' - confirm which key downstream code expects.
adata.layers['Counts'] = X.copy()
adata.write_h5ad(f"{data_dir}/GSE146128_TFnet_exon_counts.h5ad")

## GSE146128_Tfnet_interactions_exon_counts.csv

In [4]:
# Load the TF-interaction exon counts; the first CSV column becomes the row index.
TFinter_csv_path = os.path.join(data_dir, 'GSE146128_Tfnet_interactions_exon_counts.csv')
TFinter_df = pd.read_csv(
    TFinter_csv_path,
    index_col=0,
    low_memory=False,
)

In [5]:
# Report the table dimensions as (rows, columns).
print(TFinter_df.shape)

# Column labels are the sample names; they are parsed into design fields below.
designs = [column for column in TFinter_df.columns]

(54331, 264)


In [18]:
def process_name(name):
    """Split an underscore-separated sample name into its design fields.

    Three layouts are accepted, distinguished by the number of fields:

    * 4 fields: ``sample_plate_well_perturb1``
    * 5 fields: ``sample_plate_well_perturb1_guide``
    * 6 fields: ``sample_plate_well_perturb1_perturb2_guide``

    Parameters
    ----------
    name : str
        Sample name to parse.

    Returns
    -------
    tuple
        ``(sample, plate, well, perturb1, perturb2, guide)``; ``perturb2``
        and ``guide`` are ``None`` when the name does not contain them.

    Raises
    ------
    ValueError
        If the name does not split into 4, 5 or 6 fields. (Previously such a
        name fell through every branch and failed with ``UnboundLocalError``
        at the ``return`` statement.)
    """
    items = name.split("_")
    perturb2 = None
    guide = None

    if len(items) == 4:
        sample, plate, well, perturb1 = items
    elif len(items) == 5:
        sample, plate, well, perturb1, guide = items
    elif len(items) == 6:
        sample, plate, well, perturb1, perturb2, guide = items
    else:
        raise ValueError(
            f"Cannot parse sample name {name!r}: expected 4, 5 or 6 "
            f"underscore-separated fields, got {len(items)}"
        )

    return sample, plate, well, perturb1, perturb2, guide


In [21]:
# Parse every sample name into its six design fields and stack them into a
# 2-D array with one row per sample.
parsed_names = [process_name(sample_name) for sample_name in designs]
design_M = np.stack(parsed_names)

# Per-sample annotation table, indexed by the sample ID.
obs = pd.DataFrame(
    design_M,
    columns=['Sample', 'Plate', 'Well', 'Perturb1', 'Perturb2', 'Guide'],
).set_index("Sample")

In [22]:
# Per-gene table: no columns, only an index named "Gene" holding the row
# labels of the count table.
gene_ids = TFinter_df.index.values
var = pd.DataFrame(gene_ids, columns=['Gene']).set_index("Gene")

In [24]:
# The CSV is genes x samples; transpose so that samples become observations.
interaction_counts = TFinter_df.values.T

adata2 = ad.AnnData(X=interaction_counts, obs=obs, var=var)
# Keep the raw counts in a layer as an explicit copy. `TFinter_df.values.T`
# can be a view of the DataFrame's own buffer, so without the copy the layer
# and X may share memory and an in-place change to one would alter the other.
adata2.layers['Count'] = interaction_counts.copy()
adata2.write_h5ad(f"{data_dir}/GSE146128_Tfnet_interactions_exon_counts.h5ad")

In [26]:
# Display the AnnData summary (dimensions, obs columns, layers) as a sanity check.
adata2

AnnData object with n_obs × n_vars = 264 × 54331
    obs: 'Plate', 'Well', 'Perturb1', 'Perturb2', 'Guide'
    layers: 'Count'

## GSE146128_exp1_exon_counts.csv

In [5]:
# Load the exp1 exon counts; the first CSV column becomes the row index.
exp1_df = pd.read_csv(
    f"{data_dir}/GSE146128_exp1_exon_counts.csv",
    low_memory=False,
    index_col=0,
)

# Column labels are the sample names; they are split into design fields below.
designs = [column for column in exp1_df.columns]

In [6]:
# Sanity check: distinct numbers of "_" separators across the sample names.
# A single value means every name splits into the same number of fields.
underscore_counts = [sample_name.count("_") for sample_name in designs]
np.unique(underscore_counts)

array([5])

In [8]:
# Split each sample name on "_" and stack the pieces into a 2-D array
# with one row per sample.
split_names = [sample_name.split("_") for sample_name in designs]
design_M = np.stack(split_names)

In [9]:
# Per-sample annotation table for exp1, indexed by the sample ID.
exp1_fields = ['Sample', 'Plate', 'Experiment', 'Well', 'Perturb', 'Time']
obs = pd.DataFrame(design_M, columns=exp1_fields).set_index('Sample')

In [11]:
# Per-gene table: no columns, only an index named "Gene" holding the row
# labels of the count table.
gene_ids = exp1_df.index.values
var = pd.DataFrame(gene_ids, columns=['Gene']).set_index("Gene")

In [13]:
# The CSV is genes x samples; transpose so that samples become observations.
exp1_counts = exp1_df.values.T

adata3 = ad.AnnData(X=exp1_counts, obs=obs, var=var)
# Keep the raw counts in a layer as an explicit copy. `exp1_df.values.T` can
# be a view of the DataFrame's own buffer, so without the copy the layer and
# X may share memory and an in-place change to one would alter the other.
adata3.layers['Count'] = exp1_counts.copy()

adata3.write_h5ad(f"{data_dir}/GSE146128_exp1_exon_counts.h5ad")