In [1]:
import scanpy
import hdf5plugin
import pandas
import numpy
import anndata
import scipy
import os
import scvelo
import seaborn

In [2]:
# ---------------------------------------------------------------------------
# Per-sample configuration.
#
# base_names, seurat_batches, groups, days and samples are PARALLEL lists:
# entry i of each list describes the same loom file. Keep them the same
# length and in the same order.
# ---------------------------------------------------------------------------

# Loom file stems (one "<base_name>.loom" per entry); also stored as the
# "batch" label of every cell loaded from that file.
base_names = [
    "Pool3_MEF",
    "Pool1_mESC",
    "Pool3_mESC",
    "Pool3_Repro_Day2_BFP",
    "Pool3_Repro_Day2_Hic2",
    "Pool1_Repro_Day4_BFP",
    "Pool1_Repro_Day4_Hic2",
    "Pool1_Repro_Day6_BFP",
    "Pool1_Repro_Day6_Hic2",
    "Pool1_Repro_Day9_BFP",
    "Pool1_Repro_Day9_Hic2",
    "Pool3_Repro_Day12_BFP",
    "Pool3_Repro_Day12_Hic2",
]

# Value of the Seurat object's "sample" column that corresponds to each loom file.
# NOTE(review): "Pool3_mESC" is paired with 'Pool2_mESC' here — confirm this
# naming difference between the loom files and the Seurat object is intended.
seurat_batches = [
    'MEF',
    'Pool1_mESC',
    'Pool2_mESC',
    'Repro_Day2_BFP',
    'Repro_Day2_Hic2',
    'Repro_Day4_BFP',
    'Repro_Day4_Hic2',
    'Repro_Day6_BFP',
    'Repro_Day6_Hic2',
    'Repro_Day9_BFP',
    'Repro_Day9_Hic2',
    'Repro_Day12_BFP',
    'Repro_Day12_Hic2',
]

# Experimental group of each loom file (stored as obs["group"]).
groups = [
    "MEF",
    "mESC",
    "mESC",
    "control",
    "Hic2",
    "control",
    "Hic2",
    "control",
    "Hic2",
    "control",
    "Hic2",
    "control",
    "Hic2",
]

# Day associated with each loom file (stored as obs["day"]).
days = [
    0,
    0,
    0,
    2,
    2,
    4,
    4,
    6,
    6,
    9,
    9,
    12,
    12,
]

# Display label of each loom file (stored as obs["sample"]).
# Both mESC pools share the single label "mESC".
samples = [
    "MEF",
    "mESC",
    "mESC",
    "Day 2 Control",
    "Day 2 Hic2 OX",
    "Day 4 Control",
    "Day 4 Hic2 OX",
    "Day 6 Control",
    "Day 6 Hic2 OX",
    "Day 9 Control",
    "Day 9 Hic2 OX",
    "Day 12 Control",
    "Day 12 Hic2 OX"
]

# Category order applied to obs["sample"]; one entry per distinct label in `samples`.
sample_order = [
    "MEF",
    "Day 2 Control",
    "Day 4 Control",
    "Day 6 Control",
    "Day 9 Control",
    "Day 12 Control",
    "Day 2 Hic2 OX",
    "Day 4 Hic2 OX",
    "Day 6 Hic2 OX",
    "Day 9 Hic2 OX",
    "Day 12 Hic2 OX",
    "mESC",
]

# One color per entry of `sample_order`, in the same order.
sample_colors = [
    '#cbc9ca',
    "#ffd1be",
    '#f3ad91',
    '#eb684b',
    '#da2e24',
    '#a81918',
    "#ced8f0",
    '#b6d8e5',
    '#71add9',
    '#3180bd',
    '#164f9b',
    '#0a652c'
]

# Fail fast on configuration mistakes: the lists above are consumed by
# position, so a missing or extra entry would otherwise only surface after
# the (slow) loom loading, or silently mislabel samples.
assert all(
    len(per_file_values) == len(base_names)
    for per_file_values in (seurat_batches, groups, days, samples)
), "base_names, seurat_batches, groups, days and samples must have the same length"
assert sorted(sample_order) == sorted(set(samples)), (
    "sample_order must list every distinct label in samples exactly once"
)
assert len(sample_colors) == len(sample_order), (
    "sample_colors needs exactly one color per entry of sample_order"
)

In [3]:
# Read Seurat object to add UMAP and FDL
adata_seurat = scanpy.read_h5ad("/scratch/lema/m26_losu/seurat_object.h5ad")

# Force-directed layout coordinates; the first CSV column is read as the row index.
fdl = pandas.read_csv("/home/avesta/m26_losu/Hic2/fdl.csv", index_col=0)

if fdl.shape[0] != adata_seurat.n_obs:
    raise ValueError(
        f"fdl.csv has {fdl.shape[0]} rows but the Seurat object has "
        f"{adata_seurat.n_obs} cells; cannot attach FDL coordinates."
    )

# obsm is assigned by row position, so a CSV whose rows are in a different
# order than the cells would silently give cells the wrong coordinates.
# When the CSV index carries exactly the cell names, align on them; otherwise
# keep the positional assignment (e.g. the index is just a row counter).
if fdl.index.is_unique and set(fdl.index) == set(adata_seurat.obs_names):
    fdl = fdl.loc[adata_seurat.obs_names]

adata_seurat.obsm['X_fdl'] = numpy.array(fdl)

In [4]:
# Combine loom files in adata
#
# For every loom file: read it, attach the per-sample metadata, derive a
# barcode column, keep only the cells that are also in the matching Seurat
# sample, and copy that sample's UMAP / FDL coordinates over by barcode.

adata_files = []
directory_path = "/scratch/lema/m26_losu/loom/"

# The configuration lists are parallel (one entry per loom file), so iterate
# them together; strict=True raises if they ever get out of sync.
for base_name, seurat_batch, sample, group, day in zip(
    base_names, seurat_batches, samples, groups, days, strict=True
):
    # directory_path already ends with "/", so join instead of formatting in
    # another separator.
    loom_path = os.path.join(directory_path, f"{base_name}.loom")
    print(f"Loading file: {base_name}")

    # Read loom file
    adata_loom = anndata.io.read_loom(loom_path)
    adata_loom.obs["batch"] = base_name
    adata_loom.obs["sample"] = sample
    adata_loom.obs["group"] = group
    adata_loom.obs["day"] = day
    adata_loom.var_names_make_unique()

    # Add barcode variable: the part of the cell name after the first ':',
    # with its last character removed.
    # NOTE(review): presumably cell names look like "<sample>:<barcode>x" — confirm.
    adata_loom.obs["barcode"] = (
        adata_loom.obs.index.to_series()
        .str.split(':', n=1, expand=True)
        .iloc[:, 1]
        .str[:-1]
    )
    adata_loom.strings_to_categoricals()

    # Cells of the matching sample in the Seurat object
    sample_seurat = adata_seurat[adata_seurat.obs["sample"] == seurat_batch]
    barcodes_seurat = set(sample_seurat.obs["barcode"])

    # Filter out barcodes not in Seurat object. Copy so that the obsm
    # assignments below write to a real AnnData rather than to a view.
    adata_loom = adata_loom[adata_loom.obs["barcode"].isin(barcodes_seurat)].copy()

    # Add UMAP and FDL from Seurat object, looked up by barcode so the rows
    # follow the order of the cells in adata_loom.
    for embedding_key in ("X_umap", "X_fdl"):
        embedding_by_barcode = pandas.DataFrame(
            sample_seurat.obsm[embedding_key],
            index=sample_seurat.obs["barcode"],
        )
        adata_loom.obsm[embedding_key] = embedding_by_barcode.loc[
            adata_loom.obs["barcode"]
        ].values

    adata_files.append(adata_loom)

# Concatenate Anndata objects

adata = anndata.concat(adata_files)

# Assign colors to samples: put the categories in sample_order so that
# sample_colors lines up with them position by position.
adata.obs["sample"] = adata.obs["sample"].cat.reorder_categories(sample_order)
adata.uns["sample_colors"] = sample_colors

Loading file: Pool3_MEF
Loading file: Pool1_mESC
Loading file: Pool3_mESC
Loading file: Pool3_Repro_Day2_BFP
Loading file: Pool3_Repro_Day2_Hic2
Loading file: Pool1_Repro_Day4_BFP
Loading file: Pool1_Repro_Day4_Hic2
Loading file: Pool1_Repro_Day6_BFP
Loading file: Pool1_Repro_Day6_Hic2
Loading file: Pool1_Repro_Day9_BFP
Loading file: Pool1_Repro_Day9_Hic2
Loading file: Pool3_Repro_Day12_BFP
Loading file: Pool3_Repro_Day12_Hic2


In [5]:
# Write the combined AnnData to disk as h5ad, compressed with the zstd
# filter provided by hdf5plugin.
# NOTE(review): reading this file back presumably requires hdf5plugin to be
# imported first — confirm against the anndata write_h5ad documentation.
output_path = "/scratch/lema/m26_losu/splice_counts_loom.h5ad"
zstd_filter = hdf5plugin.FILTERS["zstd"]
adata.write_h5ad(output_path, compression=zstd_filter)

In [6]:
# Show a summary of the combined object (dimensions plus the obs / uns / obsm / layers keys).
print(adata)

AnnData object with n_obs × n_vars = 13315 × 32285
    obs: 'batch', 'sample', 'group', 'day', 'barcode'
    uns: 'sample_colors'
    obsm: 'X_umap', 'X_fdl'
    layers: 'matrix', 'ambiguous', 'spliced', 'unspliced'
