In [1]:
import argparse
import sys
from os import path, listdir

import pandas as pd
import scipy.sparse as sp
import anndata


from types import SimpleNamespace as simplenamespace

# Fixed stand-in for command-line arguments: input directory, output file,
# and whether an existing output file may be overwritten.
args = simplenamespace(
    in_dir="../data/raw/cao_2019/",
    out_file="../data/h5ad/cao_2019.h5ad",
    force=False,
)

# Stop early when the output file is already present and `force` is not set.
if not args.force and path.exists(args.out_file):
    print(f"`{args.out_file}` exists. Skipping...")
    sys.exit(0)


def _input_path(file_name):
    """Return the location of `file_name` inside the configured input directory."""
    return path.join(args.in_dir, file_name)


# Read the count matrix, then transpose it; the object summary is printed
# both before and after the transpose.
adata = anndata.read_mtx(_input_path("GSE119945_gene_count.txt.gz"))
print(adata)
adata = adata.T
print(adata)

# Per-cell annotation table, indexed by its "sample" column, becomes `obs`.
cell_annotations = pd.read_csv(_input_path("GSE119945_cell_annotate.csv.gz"))
print(cell_annotations)
obs_table = cell_annotations.set_index("sample")
adata.obs = obs_table

# Per-gene annotation table, indexed by its "gene_short_name" column, becomes `var`.
gene_annotations = pd.read_csv(_input_path("GSE119945_gene_annotate.csv.gz"))
print(gene_annotations)
var_table = gene_annotations.set_index("gene_short_name")
adata.var = var_table

# Expose the main cluster assignment under the column name "cluster_id".
column_renames = {"Main_Cluster": "cluster_id"}
adata.obs = adata.obs.rename(columns=column_renames)

AnnData object with n_obs × n_vars = 26183 × 2058652
AnnData object with n_obs × n_vars = 2058652 × 26183
                                   sample  id sex   day  Total_mRNAs  \
0         sci3-me-001.GTCGGAGTTTGAGGTAGAA  38   F  13.5        335.0   
1        sci3-me-001.ATTAGTCTGTGTATAATACG   9   M  11.5       1989.0   
2         sci3-me-001.GAGGAACTTAATACCATCC  26   F  10.5        393.0   
3         sci3-me-001.TTCGCGGATACTCTCTCAA  16   F  13.5        227.0   
4        sci3-me-001.ACTGGTTATTTGCGCCATCT  33   M  12.5        241.0   
...                                   ...  ..  ..   ...          ...   
2058647   sci3-me-760.TCAGGAGATCGTAATGCAG  10   F  11.5       2730.0   
2058648  sci3-me-760.ATTCGCAATTGCCGCAACGA  21   F   9.5       1495.0   
2058649   sci3-me-760.CTAGTACGTCGTAGTTACC   4   M  10.5       2588.0   
2058650  sci3-me-760.AAACTCCAATCGCCGCCTCC  15   M  13.5       2704.0   
2058651   sci3-me-760.GTCGTAACTCGGAGATCCG  12   M  12.5       1630.0   

            tsne_1     tsne_2

In [3]:
# Add metadata
# Dataset-level metadata stored in the unstructured annotation slot,
# written one key at a time in the order listed.
dataset_metadata = {
    "name": "cao_2019",
    "year": 2019,
    "organism": "mouse",
    "tissue": "embryo",
}
for uns_key, uns_value in dataset_metadata.items():
    adata.uns[uns_key] = uns_value

In [15]:
# Display the dtype of every column in the cell annotation table (`adata.obs`).
adata.obs.dtypes

id                              int64
sex                          category
day                           float64
Total_mRNAs                   float64
tsne_1                        float64
tsne_2                        float64
cluster_id                    float64
sub_tsne_1                    float64
sub_tsne_2                    float64
Sub_Cluster                  category
db_score                      float64
detected_doublet               object
potential_doublet_cluster      object
dtype: object

In [17]:
# Preview the first rows of the cell annotation table (`adata.obs`).
adata.obs.head()

Unnamed: 0_level_0,id,sex,day,Total_mRNAs,tsne_1,tsne_2,cluster_id,sub_tsne_1,sub_tsne_2,Sub_Cluster,db_score,detected_doublet,potential_doublet_cluster
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
sci3-me-001.GTCGGAGTTTGAGGTAGAA,38,F,13.5,335.0,2.502835,2.589219,5.0,0.0,0.0,unknown,0.013367,False,
sci3-me-001.ATTAGTCTGTGTATAATACG,9,M,11.5,1989.0,19.352971,-5.523709,20.0,4.01933,18.306513,1,0.00416,False,False
sci3-me-001.GAGGAACTTAATACCATCC,26,F,10.5,393.0,-15.372493,-5.7596,21.0,0.0,0.0,unknown,0.002524,False,
sci3-me-001.TTCGCGGATACTCTCTCAA,16,F,13.5,227.0,-0.750546,1.439024,17.0,0.0,0.0,unknown,0.006938,False,
sci3-me-001.ACTGGTTATTTGCGCCATCT,33,M,12.5,241.0,2.402283,12.564681,6.0,0.0,0.0,unknown,0.002918,False,


In [25]:
# Convert the two doublet-flag columns to plain strings.
# NOTE(review): presumably needed so these columns can be serialized to
# h5ad — confirm. Missing entries end up as the literal string "nan".
for doublet_column in ("potential_doublet_cluster", "detected_doublet"):
    adata.obs[doublet_column] = adata.obs[doublet_column].astype(str)

In [26]:
# Write `adata` to the output path configured in `args.out_file`.
adata.write_h5ad(args.out_file)