## 1. load packages

In [1]:
import os
import numpy as np
import pandas as pd
import scvelo as scv

## 2. params

In [2]:
# Project root. Defaults to the original workstation path, but can be overridden
# through the CASH_PROJECT_ROOT environment variable so the notebook is not tied
# to one machine's absolute path.
project_root = os.environ.get("CASH_PROJECT_ROOT", "D:/Projects/CASH-Human/")
os.chdir(project_root)
# Relative paths in this notebook are resolved against the working directory set above.
dir_for_loom = "./data/loom/merge.loom"

## 3. load loom data

In [3]:
# Read the merged loom file (caching disabled) and de-duplicate variable names.
ldata = scv.read(dir_for_loom, cache=False)
ldata.var_names_make_unique()
# Show the object summary followed by its per-cell annotation table.
print(ldata, ldata.obs, sep="\n")

KeyboardInterrupt: 

## 4. add meta

In [None]:
# The obs index is split on ':'; the first piece is stored as the patient label
# and the second as the raw barcode.
id_parts = ldata.obs.index.str.split(":")
ldata.obs["patient"] = id_parts.str[0]
# Trailing 'x' characters are stripped from the barcode piece.
ldata.obs["barcode"] = id_parts.str[1].str.rstrip("x")
# First character of the patient label -> group, second character -> patient_id.
ldata.obs["group"] = ldata.obs["patient"].str[0]
ldata.obs["patient_id"] = ldata.obs["patient"].str[1]
# Final barcode format: "<barcode>-<patient_id>".
ldata.obs["barcode"] = ldata.obs["barcode"] + "-" + ldata.obs["patient_id"]

print(ldata.obs)
# Total vs. distinct barcode counts, printed so they can be compared by eye.
barcodes = ldata.obs["barcode"]
print(len(barcodes))
print(len(barcodes.unique()))

## 5. Subset CASH loom data

In [None]:
# Two-step filter: keep the cells whose group is "N", then drop patient_id "5".
is_group_n = ldata.obs["group"] == "N"
ldata_N = ldata[is_group_n]
not_patient_5 = ldata_N.obs["patient_id"] != "5"
ldata_sub = ldata_N[not_patient_5]

# Summaries of both subsets, then the remaining patients and the cell count.
print("ldata_N:", ldata_N)
print("ldata_sub:", ldata_sub)
print(ldata_sub.obs.head())
print("ldata_sub_patient:", pd.unique(ldata_sub.obs["patient"]))
print("ldata_sub_barcode:", ldata_sub.n_obs)

### 5-1. meta

In [None]:
# `ldata_sub` was produced by slicing, so it is a view of the parent object.
# Take an explicit copy before editing `.obs` instead of relying on the implicit
# copy anndata performs when a view is modified.
ldata_sub = ldata_sub.copy()

# Relabel the group and the patient names. Bracket assignment is used because
# attribute-style assignment only works for columns that already exist and do
# not clash with DataFrame attributes.
ldata_sub.obs["group"] = ldata_sub.obs["group"].replace("N", "CASH")
# NOTE(review): the numbering skips "Patient3" - presumably the slot of the
# sample with patient_id "5" that was excluded in the subset step; confirm.
ldata_sub.obs["patient"] = ldata_sub.obs["patient"].replace({
    'N3': 'Patient1',
    'N4': 'Patient2',
    'N6': 'Patient4',
    'N7': 'Patient5'
})
# The new patient_id is the last character of the relabelled patient name.
ldata_sub.obs["patient_id"] = ldata_sub.obs["patient"].str[-1]
# Swap the old "-<id>" suffix of each barcode for the new patient_id. Splitting
# once from the right touches only the trailing suffix and is vectorized,
# replacing the previous row-wise apply.
ldata_sub.obs["barcode"] = (
    ldata_sub.obs["barcode"].str.rsplit("-", n=1).str[0]
    + "-"
    + ldata_sub.obs["patient_id"]
)
print("ldata_sub:", ldata_sub)
print(ldata_sub.obs.head())
## save
ldata_sub.write('./data/loom/ldata_CASH.h5ad', compression = 'gzip')

## 6. subset ldata per cell

### 6-1 params

In [4]:
# Path of the CASH AnnData file read in the "load data" cell below; the save step
# of section 5-1 writes to this same relative path.
dir_for_ldata_CASH = "./data/loom/ldata_CASH.h5ad"
# Directory whose entries are listed and read as metadata CSV files by the subset loop.
dir_for_obj_meta = "./data/obj_meta/"

### 6-3 load data

In [5]:
# Load the CASH AnnData file (caching disabled) and de-duplicate variable names.
# Note that this rebinds the name `ldata`.
ldata = scv.read(dir_for_ldata_CASH, cache=False)
ldata.var_names_make_unique()

# Number of distinct barcodes, reported after the object summary and obs preview.
n_unique_barcodes = len(ldata.obs["barcode"].unique())
print("ldata:", ldata)
print(ldata.obs.head())
print("ldata:", n_unique_barcodes)

ldata: AnnData object with n_obs × n_vars = 26440 × 37487
    obs: 'patient', 'barcode', 'group', 'patient_id'
    var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand'
    layers: 'ambiguous', 'matrix', 'spliced', 'unspliced'
                       patient             barcode group patient_id
CellID                                                             
N3:AAAGTAGCACCAACCGx  Patient1  AAAGTAGCACCAACCG-1  CASH          1
N3:AAATGCCCATTAACCGx  Patient1  AAATGCCCATTAACCG-1  CASH          1
N3:AAAGTAGCATGTTCCCx  Patient1  AAAGTAGCATGTTCCC-1  CASH          1
N3:AAAGTAGTCAACTCTTx  Patient1  AAAGTAGTCAACTCTT-1  CASH          1
N3:AACCATGCAGTCACTAx  Patient1  AACCATGCAGTCACTA-1  CASH          1
ldata: 26440


### 6-4 subset

In [6]:
# Output directory for the per-cell-type AnnData files.
dir_for_ldata_sub = "./data/loom/ldata_sub"
if os.path.exists(dir_for_ldata_sub):
    print("directory has been created")
else:
    os.makedirs(dir_for_ldata_sub)
    print(f"Created directory: {dir_for_ldata_sub}")

# Only CSV entries are metadata tables. os.listdir returns every directory entry
# in arbitrary order, so filter by extension and sort for a reproducible run.
meta_files = sorted(f for f in os.listdir(dir_for_obj_meta) if f.endswith(".csv"))

for i_file in meta_files:
    print(i_file)
    # The text before the first "_" in the file name is used as the cell label
    # (e.g. "B_obj_meta.csv" -> "B").
    cell = i_file.split("_")[0]
    print(cell)

    ## load meta
    meta = pd.read_csv(os.path.join(dir_for_obj_meta, i_file))
    print(f"meta_{cell}:\n", len(meta))
    print(f"meta_{cell}:\n", meta.head(1))

    ## subset
    # Keep the cells whose barcode is listed in the metadata. `.copy()` turns the
    # slice into an independent object, so the writes to `.obs` / `.obsm` below
    # do not modify a view (which would trigger an implicit copy and a warning).
    ldata_sub = ldata[np.isin(ldata.obs["barcode"], meta["barcode"])].copy()
    print(f"ldata_sub_{cell}: {ldata_sub}")
    print(f"ldata_sub_{cell}:\n {ldata_sub.obs.head(1)}")

    ## add meta
    ### sub_celltype
    # Left-merge on barcode so the metadata rows follow the cell order of
    # `ldata_sub`. validate="many_to_one" raises if the metadata contains a
    # barcode more than once, which would otherwise add rows and break the
    # positional `.values` assignments below.
    meta_matched = ldata_sub.obs[["barcode"]].merge(
        meta, on="barcode", how="left", validate="many_to_one"
    )
    print(f"meta_{cell}:\n", meta_matched.head(1))
    ldata_sub.obs["sub_celltype"] = meta_matched["sub_celltype"].values
    print(f"ldata_sub_{cell}:\n {ldata_sub.obs.head(1)}")

    ### UMAP coord
    # Store the UMAP1/UMAP2 columns of the metadata as the "X_umap" embedding.
    ldata_sub.obsm["X_umap"] = meta_matched[["UMAP1", "UMAP2"]].values

    print(ldata_sub)
    print(ldata_sub.obs.head())
    print(ldata_sub.obsm)
    ## save one gzip-compressed file per cell label
    ldata_sub.write(os.path.join(dir_for_ldata_sub, f"{cell}_ldata.h5ad"), compression= "gzip")

directory has been created
B_obj_meta.csv
B
meta_B:
 673
meta_B:
               barcode     orig.ident group   patient  sub_celltype     UMAP1  \
0  AAAGTAGCAGCTCGCA-1  CASH_Patient1  CASH  Patient1  c36_Bn_TCL1A -1.474113   

      UMAP2  
0 -1.295199  
ldata_sub_B: View of AnnData object with n_obs × n_vars = 673 × 37487
    obs: 'patient', 'barcode', 'group', 'patient_id'
    var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand'
    layers: 'ambiguous', 'matrix', 'spliced', 'unspliced'
ldata_sub_B:
                        patient             barcode group patient_id
CellID                                                             
N3:AAAGTAGCAGCTCGCAx  Patient1  AAAGTAGCAGCTCGCA-1  CASH          1
meta_B:
               barcode     orig.ident group   patient  sub_celltype     UMAP1  \
0  AAAGTAGCAGCTCGCA-1  CASH_Patient1  CASH  Patient1  c36_Bn_TCL1A -1.474113   

      UMAP2  
0 -1.295199  
ldata_sub_B:
                        patient             barcode group patient_id  \
Ce