# Imports & file locations

In [1]:
%load_ext autoreload
%autoreload 2

import glob
import itertools
import os

import numpy as np
import scanpy 
import scanpy as sc
import scipy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
from sklearn.metrics import normalized_mutual_info_score

import anndata

# Joblib for parallelizing
from joblib import Parallel, delayed 

# Let pandas show up to 100 rows/columns before eliding wide or long frames.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

# Raise scanpy's logging verbosity for this notebook.
scanpy.settings.verbosity = 3
#sc.set_figure_params(dpi=200, dpi_save=300)
# Record the package versions used for this run. This already reports the
# anndata version, so no separate `anndata.__version__` expression is needed
# (a bare expression in the middle of a cell is never displayed anyway).
scanpy.logging.print_versions()
scanpy.set_figure_params(frameon=False, color_map='magma_r')

# Output locations: a Google Drive copy and a local copy. `outdirs` is the
# tuple that later cells loop over when writing to both.
outdir_gdrive = '/home/olga/googledrive/TabulaMicrocebus/data/cross-species'
outdir_local = '/home/olga/data_lg/data_sm_copy/tabula-microcebus/data-objects/cross-species'
outdirs = outdir_local, outdir_gdrive
# !ls -lha $outdir_local

scanpy==1.4.6.dev9+ge632939 anndata==0.6.22.post1 umap==0.3.10 numpy==1.17.3 scipy==1.3.2 pandas==0.25.3 scikit-learn==0.22 statsmodels==0.10.2 python-igraph==0.7.1 louvain==0.6.1


# Load data

In [2]:
%%time
# Read the cross-species mini test object (30 cells, per the file name)
# from the local output directory.
h5ad = outdir_local + '/mini_adata_30cells_xspecies_at2_macrophage.h5ad'
adata = scanpy.read_h5ad(h5ad)
adata

CPU times: user 117 ms, sys: 17.6 ms, total: 134 ms
Wall time: 149 ms


AnnData object with n_obs × n_vars = 30 × 11761 
    obs: 'age', 'batch', 'cell', 'cell_barcode', 'cell_ontology_class', 'cell_ontology_id', 'channel', 'channel_no_10x', 'channel_tissue', 'compartment', 'compartment_id', 'free_annotation', 'individual', 'location', 'magnetic.selection', 'method', 'mouse.id', 'nGene', 'nUMI', 'n_barcodes', 'n_counts', 'n_genes', 'orig.ident', 'original_channel', 'patient', 'percent.ribo', 'possibly_contaminated_barcode', 'preparation.site', 'region', 'sample', 'sequencing_run', 'sex', 'species', 'species_batch', 'species_latin', 'subtissue', 'tissue', 'tissue_free_annotation', 'compartment_updated_from_human', 'compartment_updated_from_human__with_species', 'compartment_updated_from_lemur', 'compartment_updated_from_lemur__with_species', 'narrow_group', 'broad_group', 'compartment_group', 'log_counts', 'sqrt_counts', 'channel_original', 'peptides_aligned', 'peptides_unaligned'
    var: 'mouse_lemur__gene_name', 'mouse_lemur_to_human__homology_type', 'hu

In [29]:
# Peek at the first five rows of the peptide FASTA basename columns as a raw
# array, so the full file names are printed rather than an abbreviated table.
adata.obs[['peptides_aligned', 'peptides_unaligned']].head().values

array([['HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__TGCTACCCATTAGCCA__coding_reads_peptides.fasta',
        'HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__TGCTACCCATTAGCCA__coding_reads_peptides.fasta'],
       ['HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__ACACCGGAGTTACGGG__coding_reads_peptides.fasta',
        'HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__ACACCGGAGTTACGGG__coding_reads_peptides.fasta'],
       ['HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__CCATGTCAGCTACCTA__coding_reads_peptides.fasta',
        'HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__CCATGTCAGCTACCTA__coding_reads_peptides.fasta'],
       ['HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__CAGCCGAAGGAGCGTT__coding_reads_peptides.fasta',
        'HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__CAGCCGAAGGAGCGTT__coding_reads_peptides.fasta'],
       ['HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__TATGCCCAGGATGGAA__coding_reads_peptides.fasta',
        'HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__TATGCCCAGGATGGAA__coding_reads_pep

In [36]:
# Save the per-cell peptide FASTA basenames (aligned and unaligned) as a CSV
# in every output directory. The column selection is the same for each
# directory, so it is taken once before the loop.
peptide_fasta_basenames = adata.obs[['peptides_aligned', 'peptides_unaligned']]
for d in outdirs:
    basenames_csv = f"{d}/mini_adata_30cells_xspecies_at2_macrophage__peptide_fasta_basenames.csv"
    peptide_fasta_basenames.to_csv(basenames_csv)

In [3]:
# Preview the first five rows of the per-cell annotation table.
adata.obs.head()

Unnamed: 0,age,batch,cell,cell_barcode,cell_ontology_class,cell_ontology_id,channel,channel_no_10x,channel_tissue,compartment,compartment_id,free_annotation,individual,location,magnetic.selection,method,mouse.id,nGene,nUMI,n_barcodes,n_counts,n_genes,orig.ident,original_channel,patient,percent.ribo,possibly_contaminated_barcode,preparation.site,region,sample,sequencing_run,sex,species,species_batch,species_latin,subtissue,tissue,tissue_free_annotation,compartment_updated_from_human,compartment_updated_from_human__with_species,compartment_updated_from_lemur,compartment_updated_from_lemur__with_species,narrow_group,broad_group,compartment_group,log_counts,sqrt_counts,channel_original,peptides_aligned,peptides_unaligned
P1_2_TGCTACCCATTAGCCA-human,,,,TGCTACCCATTAGCCA,type II pneumocyte,,P1_2,,,epithelial,,Alveolar Epithelial Type 2,,distal,epithelial,,,4931.0,32187.0,,3299,3299,Human Lung 10x - Patient 1,,1.0,0.121198,,biohub,normal,distal 1a,171205_A00111_0088_BH55NYDMXX,,Human,human,Homo sapiens,,Lung,,epithelial,epithelial (Human),epithelial,epithelial (Human),Alveolar Epithelial Type 2,Alveolar Epithelial Type 2,epithelial,9.668588,125.749748,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__TGC...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__T...
P1_2_ACACCGGAGTTACGGG-human,,,,ACACCGGAGTTACGGG,type II pneumocyte,,P1_2,,,epithelial,,Alveolar Epithelial Type 2,,distal,epithelial,,,4510.0,28732.0,,3030,3030,Human Lung 10x - Patient 1,,1.0,0.125331,,biohub,normal,distal 1a,171205_A00111_0088_BH55NYDMXX,,Human,human,Homo sapiens,,Lung,,epithelial,epithelial (Human),epithelial,epithelial (Human),Alveolar Epithelial Type 2,Alveolar Epithelial Type 2,epithelial,9.614071,122.368294,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__ACA...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__A...
P1_2_CCATGTCAGCTACCTA-human,,,,CCATGTCAGCTACCTA,type II pneumocyte,,P1_2,,,epithelial,,Alveolar Epithelial Type 2,,distal,epithelial,,,4490.0,31702.0,,3014,3014,Human Lung 10x - Patient 1,,1.0,0.156457,,biohub,normal,distal 1a,171205_A00111_0088_BH55NYDMXX,,Human,human,Homo sapiens,,Lung,,epithelial,epithelial (Human),epithelial,epithelial (Human),Alveolar Epithelial Type 2,Alveolar Epithelial Type 2,epithelial,9.865422,138.755173,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__CCA...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__C...
P1_2_CAGCCGAAGGAGCGTT-human,,,,CAGCCGAAGGAGCGTT,type II pneumocyte,,P1_2,,,epithelial,,Alveolar Epithelial Type 2,,distal,epithelial,,,4333.0,28696.0,,2965,2965,Human Lung 10x - Patient 1,,1.0,0.198146,,biohub,normal,distal 1a,171205_A00111_0088_BH55NYDMXX,,Human,human,Homo sapiens,,Lung,,epithelial,epithelial (Human),epithelial,epithelial (Human),Alveolar Epithelial Type 2,Alveolar Epithelial Type 2,epithelial,9.720886,129.081375,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__CAG...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__C...
P1_2_TATGCCCAGGATGGAA-human,,,,TATGCCCAGGATGGAA,type II pneumocyte,,P1_2,,,epithelial,,Alveolar Epithelial Type 2,,distal,epithelial,,,4147.0,19244.0,,2765,2765,Human Lung 10x - Patient 1,,1.0,0.151736,,biohub,normal,distal 1a,171205_A00111_0088_BH55NYDMXX,,Human,human,Homo sapiens,,Lung,,epithelial,epithelial (Human),epithelial,epithelial (Human),Alveolar Epithelial Type 2,Alveolar Epithelial Type 2,epithelial,9.359708,107.754349,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__TAT...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__T...


In [4]:
# Show the barcode stored for each cell alongside its full cell id (the index).
adata.obs['cell_barcode']

P1_2_TGCTACCCATTAGCCA-human                      TGCTACCCATTAGCCA
P1_2_ACACCGGAGTTACGGG-human                      ACACCGGAGTTACGGG
P1_2_CCATGTCAGCTACCTA-human                      CCATGTCAGCTACCTA
P1_2_CAGCCGAAGGAGCGTT-human                      CAGCCGAAGGAGCGTT
P1_2_TATGCCCAGGATGGAA-human                      TATGCCCAGGATGGAA
Antoine_Lung_EPCAM_10X_CCCAATCGTTGTCGCG-lemur    CCCAATCGTTGTCGCG
Antoine_Lung_EPCAM_10X_TAAGTGCAGCCCGAAA-lemur    TAAGTGCAGCCCGAAA
Antoine_Lung_EPCAM_10X_TCTCATACATTCCTGC-lemur    TCTCATACATTCCTGC
Antoine_Lung_EPCAM_10X_AGATCTGTCCGCGTTT-lemur    AGATCTGTCCGCGTTT
Antoine_Lung_EPCAM_10X_TTAACTCGTAAGGGAA-lemur    TTAACTCGTAAGGGAA
ACCTTTAAGGGCTCTC-1-53-0-0-mouse                  ACCTTTAAGGGCTCTC
CACACAATCCTCAACC-1-53-0-0-mouse                  CACACAATCCTCAACC
CGGTTAACACCTATCC-1-53-0-0-mouse                  CGGTTAACACCTATCC
GCACTCTCAACGATCT-1-53-0-0-mouse                  GCACTCTCAACGATCT
CTACATTCACCGAATT-1-53-0-0-mouse                  CTACATTCACCGAATT
P1_2_CACAC

In [5]:
# Show the channel recorded for each cell; these are the values that
# `channel_mapper` translates in the following cell.
adata.obs.channel

P1_2_TGCTACCCATTAGCCA-human                                        P1_2
P1_2_ACACCGGAGTTACGGG-human                                        P1_2
P1_2_CCATGTCAGCTACCTA-human                                        P1_2
P1_2_CAGCCGAAGGAGCGTT-human                                        P1_2
P1_2_TATGCCCAGGATGGAA-human                                        P1_2
Antoine_Lung_EPCAM_10X_CCCAATCGTTGTCGCG-lemur    Antoine_Lung_EPCAM_10X
Antoine_Lung_EPCAM_10X_TAAGTGCAGCCCGAAA-lemur    Antoine_Lung_EPCAM_10X
Antoine_Lung_EPCAM_10X_TCTCATACATTCCTGC-lemur    Antoine_Lung_EPCAM_10X
Antoine_Lung_EPCAM_10X_AGATCTGTCCGCGTTT-lemur    Antoine_Lung_EPCAM_10X
Antoine_Lung_EPCAM_10X_TTAACTCGTAAGGGAA-lemur    Antoine_Lung_EPCAM_10X
ACCTTTAAGGGCTCTC-1-53-0-0-mouse                      MACA_21m_F_LUNG_55
CACACAATCCTCAACC-1-53-0-0-mouse                      MACA_21m_F_LUNG_55
CGGTTAACACCTATCC-1-53-0-0-mouse                      MACA_21m_F_LUNG_55
GCACTCTCAACGATCT-1-53-0-0-mouse                      MACA_21m_F_

In [6]:
# Translate each channel name stored in `adata.obs.channel` into the channel
# name used for the peptide FASTA files (left: original, right: updated).
# Channels missing from this table map to NaN.
channel_mapper = dict([
    ('P1_2', 'HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31'),
    ('Antoine_Lung_EPCAM_10X', 'ANTOINE_LUNG_EPCAM_POS'),
    ('MACA_21m_F_LUNG_55', 'MACA_21m_F_LUNG_55'),
])

adata.obs['channel_updated'] = adata.obs['channel'].map(channel_mapper)
adata.obs['channel_updated']

P1_2_TGCTACCCATTAGCCA-human                      HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31
P1_2_ACACCGGAGTTACGGG-human                      HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31
P1_2_CCATGTCAGCTACCTA-human                      HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31
P1_2_CAGCCGAAGGAGCGTT-human                      HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31
P1_2_TATGCCCAGGATGGAA-human                      HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31
Antoine_Lung_EPCAM_10X_CCCAATCGTTGTCGCG-lemur              ANTOINE_LUNG_EPCAM_POS
Antoine_Lung_EPCAM_10X_TAAGTGCAGCCCGAAA-lemur              ANTOINE_LUNG_EPCAM_POS
Antoine_Lung_EPCAM_10X_TCTCATACATTCCTGC-lemur              ANTOINE_LUNG_EPCAM_POS
Antoine_Lung_EPCAM_10X_AGATCTGTCCGCGTTT-lemur              ANTOINE_LUNG_EPCAM_POS
Antoine_Lung_EPCAM_10X_TTAACTCGTAAGGGAA-lemur              ANTOINE_LUNG_EPCAM_POS
ACCTTTAAGGGCTCTC-1-53-0-0-mouse                                MACA_21m_F_LUNG_55
CACACAATCCTCAACC-1-53-0-0-mouse                                MACA_21m_F_LUNG_55
CGGTTAACACCTATCC

In [7]:
# Re-check the annotation table; `channel_updated` now appears as the last column.
adata.obs.head()

Unnamed: 0,age,batch,cell,cell_barcode,cell_ontology_class,cell_ontology_id,channel,channel_no_10x,channel_tissue,compartment,compartment_id,free_annotation,individual,location,magnetic.selection,method,mouse.id,nGene,nUMI,n_barcodes,n_counts,n_genes,orig.ident,original_channel,patient,percent.ribo,possibly_contaminated_barcode,preparation.site,region,sample,sequencing_run,sex,species,species_batch,species_latin,subtissue,tissue,tissue_free_annotation,compartment_updated_from_human,compartment_updated_from_human__with_species,compartment_updated_from_lemur,compartment_updated_from_lemur__with_species,narrow_group,broad_group,compartment_group,log_counts,sqrt_counts,channel_original,peptides_aligned,peptides_unaligned,channel_updated
P1_2_TGCTACCCATTAGCCA-human,,,,TGCTACCCATTAGCCA,type II pneumocyte,,P1_2,,,epithelial,,Alveolar Epithelial Type 2,,distal,epithelial,,,4931.0,32187.0,,3299,3299,Human Lung 10x - Patient 1,,1.0,0.121198,,biohub,normal,distal 1a,171205_A00111_0088_BH55NYDMXX,,Human,human,Homo sapiens,,Lung,,epithelial,epithelial (Human),epithelial,epithelial (Human),Alveolar Epithelial Type 2,Alveolar Epithelial Type 2,epithelial,9.668588,125.749748,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__TGC...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__T...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31
P1_2_ACACCGGAGTTACGGG-human,,,,ACACCGGAGTTACGGG,type II pneumocyte,,P1_2,,,epithelial,,Alveolar Epithelial Type 2,,distal,epithelial,,,4510.0,28732.0,,3030,3030,Human Lung 10x - Patient 1,,1.0,0.125331,,biohub,normal,distal 1a,171205_A00111_0088_BH55NYDMXX,,Human,human,Homo sapiens,,Lung,,epithelial,epithelial (Human),epithelial,epithelial (Human),Alveolar Epithelial Type 2,Alveolar Epithelial Type 2,epithelial,9.614071,122.368294,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__ACA...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__A...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31
P1_2_CCATGTCAGCTACCTA-human,,,,CCATGTCAGCTACCTA,type II pneumocyte,,P1_2,,,epithelial,,Alveolar Epithelial Type 2,,distal,epithelial,,,4490.0,31702.0,,3014,3014,Human Lung 10x - Patient 1,,1.0,0.156457,,biohub,normal,distal 1a,171205_A00111_0088_BH55NYDMXX,,Human,human,Homo sapiens,,Lung,,epithelial,epithelial (Human),epithelial,epithelial (Human),Alveolar Epithelial Type 2,Alveolar Epithelial Type 2,epithelial,9.865422,138.755173,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__CCA...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__C...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31
P1_2_CAGCCGAAGGAGCGTT-human,,,,CAGCCGAAGGAGCGTT,type II pneumocyte,,P1_2,,,epithelial,,Alveolar Epithelial Type 2,,distal,epithelial,,,4333.0,28696.0,,2965,2965,Human Lung 10x - Patient 1,,1.0,0.198146,,biohub,normal,distal 1a,171205_A00111_0088_BH55NYDMXX,,Human,human,Homo sapiens,,Lung,,epithelial,epithelial (Human),epithelial,epithelial (Human),Alveolar Epithelial Type 2,Alveolar Epithelial Type 2,epithelial,9.720886,129.081375,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__CAG...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__C...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31
P1_2_TATGCCCAGGATGGAA-human,,,,TATGCCCAGGATGGAA,type II pneumocyte,,P1_2,,,epithelial,,Alveolar Epithelial Type 2,,distal,epithelial,,,4147.0,19244.0,,2765,2765,Human Lung 10x - Patient 1,,1.0,0.151736,,biohub,normal,distal 1a,171205_A00111_0088_BH55NYDMXX,,Human,human,Homo sapiens,,Lung,,epithelial,epithelial (Human),epithelial,epithelial (Human),Alveolar Epithelial Type 2,Alveolar Epithelial Type 2,epithelial,9.359708,107.754349,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__TAT...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__T...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31


In [8]:
def _strip_species_suffix(cell_id, species_batch):
    """Return ``cell_id`` without its trailing ``-{species_batch}`` suffix.

    Only a suffix at the very end of the id is removed, so an id that happens
    to contain ``-{species_batch}`` earlier in the string is not truncated.
    Ids that do not end with the suffix are returned unchanged.
    """
    suffix = f'-{species_batch}'
    if cell_id.endswith(suffix):
        return cell_id[:-len(suffix)]
    return cell_id


# Recover the original per-dataset cell ids by dropping the species suffix
# from the obs index. Iterating over the index and the one column needed
# avoids building a full row Series per cell as `apply(axis=1)` does.
adata.obs['cell_id_no_species'] = [
    _strip_species_suffix(cell_id, species_batch)
    for cell_id, species_batch in zip(adata.obs.index, adata.obs['species_batch'])
]
adata.obs['cell_id_no_species']

P1_2_TGCTACCCATTAGCCA-human                                        P1_2_TGCTACCCATTAGCCA
P1_2_ACACCGGAGTTACGGG-human                                        P1_2_ACACCGGAGTTACGGG
P1_2_CCATGTCAGCTACCTA-human                                        P1_2_CCATGTCAGCTACCTA
P1_2_CAGCCGAAGGAGCGTT-human                                        P1_2_CAGCCGAAGGAGCGTT
P1_2_TATGCCCAGGATGGAA-human                                        P1_2_TATGCCCAGGATGGAA
Antoine_Lung_EPCAM_10X_CCCAATCGTTGTCGCG-lemur    Antoine_Lung_EPCAM_10X_CCCAATCGTTGTCGCG
Antoine_Lung_EPCAM_10X_TAAGTGCAGCCCGAAA-lemur    Antoine_Lung_EPCAM_10X_TAAGTGCAGCCCGAAA
Antoine_Lung_EPCAM_10X_TCTCATACATTCCTGC-lemur    Antoine_Lung_EPCAM_10X_TCTCATACATTCCTGC
Antoine_Lung_EPCAM_10X_AGATCTGTCCGCGTTT-lemur    Antoine_Lung_EPCAM_10X_AGATCTGTCCGCGTTT
Antoine_Lung_EPCAM_10X_TTAACTCGTAAGGGAA-lemur    Antoine_Lung_EPCAM_10X_TTAACTCGTAAGGGAA
ACCTTTAAGGGCTCTC-1-53-0-0-mouse                                ACCTTTAAGGGCTCTC-1-53-0-0
CACACAATCCTCAACC-1-53

# Read original h5ads

In [9]:
# Original (species-suffix-free) ids of the human cells in the mini test set.
is_human = adata.obs['species_batch'] == "human"
human_cell_ids = adata.obs.loc[is_human, 'cell_id_no_species']
human_cell_ids

P1_2_TGCTACCCATTAGCCA-human    P1_2_TGCTACCCATTAGCCA
P1_2_ACACCGGAGTTACGGG-human    P1_2_ACACCGGAGTTACGGG
P1_2_CCATGTCAGCTACCTA-human    P1_2_CCATGTCAGCTACCTA
P1_2_CAGCCGAAGGAGCGTT-human    P1_2_CAGCCGAAGGAGCGTT
P1_2_TATGCCCAGGATGGAA-human    P1_2_TATGCCCAGGATGGAA
P1_2_CACACAAGTCCTAGCG-human    P1_2_CACACAAGTCCTAGCG
P1_2_TTGCGTCTCCAAGTAC-human    P1_2_TTGCGTCTCCAAGTAC
P1_2_ACAGCCGAGCTAGTTC-human    P1_2_ACAGCCGAGCTAGTTC
P1_2_AGGTCCGCAGGACGTA-human    P1_2_AGGTCCGCAGGACGTA
P1_2_ACGCCAGGTGTCTGAT-human    P1_2_ACGCCAGGTGTCTGAT
Name: cell_id_no_species, dtype: object

## Read human h5ad

In [10]:
# Load the full human lung droplet object that the mini test cells were drawn from.
human_folder = '/home/olga/googledrive/TabulaMicrocebus/data/human-lung-cell-atlas--from-kyle'
h5ad = human_folder + "/droplet_normal_lung_blood_P1-3__no_duplicate_barcodes.h5ad"
human = scanpy.read_h5ad(h5ad)
human

AnnData object with n_obs × n_vars = 61830 × 26485 
    obs: 'nGene', 'nUMI', 'orig.ident', 'channel', 'tissue', 'region', 'percent.ribo', 'free_annotation', 'patient', 'sample', 'location', 'magnetic.selection', 'preparation.site', 'compartment', 'species', 'species_latin', 'cell_ontology_class', 'sequencing_run', 'cell_barcode'

In [11]:
# Subset the full human object to the mini test cells, selecting rows by
# their species-suffix-free ids.
human_minitest = human[human_cell_ids.values]

In [12]:
# Preview the original human annotations for the selected cells.
human_minitest.obs.head()

Unnamed: 0_level_0,nGene,nUMI,orig.ident,channel,tissue,region,percent.ribo,free_annotation,patient,sample,location,magnetic.selection,preparation.site,compartment,species,species_latin,cell_ontology_class,sequencing_run,cell_barcode
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
P1_2_TGCTACCCATTAGCCA,4931,32187,Human Lung 10x - Patient 1,P1_2,lung,normal,0.121198,Alveolar Epithelial Type 2,1,distal 1a,distal,epithelial,biohub,epithelial,Human,Homo sapiens,type II pneumocyte,171205_A00111_0088_BH55NYDMXX,TGCTACCCATTAGCCA
P1_2_ACACCGGAGTTACGGG,4510,28732,Human Lung 10x - Patient 1,P1_2,lung,normal,0.125331,Alveolar Epithelial Type 2,1,distal 1a,distal,epithelial,biohub,epithelial,Human,Homo sapiens,type II pneumocyte,171205_A00111_0088_BH55NYDMXX,ACACCGGAGTTACGGG
P1_2_CCATGTCAGCTACCTA,4490,31702,Human Lung 10x - Patient 1,P1_2,lung,normal,0.156457,Alveolar Epithelial Type 2,1,distal 1a,distal,epithelial,biohub,epithelial,Human,Homo sapiens,type II pneumocyte,171205_A00111_0088_BH55NYDMXX,CCATGTCAGCTACCTA
P1_2_CAGCCGAAGGAGCGTT,4333,28696,Human Lung 10x - Patient 1,P1_2,lung,normal,0.198146,Alveolar Epithelial Type 2,1,distal 1a,distal,epithelial,biohub,epithelial,Human,Homo sapiens,type II pneumocyte,171205_A00111_0088_BH55NYDMXX,CAGCCGAAGGAGCGTT
P1_2_TATGCCCAGGATGGAA,4147,19244,Human Lung 10x - Patient 1,P1_2,lung,normal,0.151736,Alveolar Epithelial Type 2,1,distal 1a,distal,epithelial,biohub,epithelial,Human,Homo sapiens,type II pneumocyte,171205_A00111_0088_BH55NYDMXX,TATGCCCAGGATGGAA


In [13]:
# Per-cell `nUMI` values as stored in the original annotations; the next cell
# recomputes row sums of the count matrix for comparison.
human_minitest.obs['nUMI']

index
P1_2_TGCTACCCATTAGCCA    32187
P1_2_ACACCGGAGTTACGGG    28732
P1_2_CCATGTCAGCTACCTA    31702
P1_2_CAGCCGAAGGAGCGTT    28696
P1_2_TATGCCCAGGATGGAA    19244
P1_2_CACACAAGTCCTAGCG    61243
P1_2_TTGCGTCTCCAAGTAC    55094
P1_2_ACAGCCGAGCTAGTTC    44566
P1_2_AGGTCCGCAGGACGTA    43260
P1_2_ACGCCAGGTGTCTGAT    45539
Name: nUMI, dtype: int64

In [14]:
# Total counts per cell, summed across all genes of the expression matrix,
# to compare by eye against the stored `nUMI` column.
human_minitest_df = human_minitest.to_df()
human_minitest_df.sum(axis='columns').astype(int)

index
P1_2_TGCTACCCATTAGCCA    32187
P1_2_ACACCGGAGTTACGGG    28732
P1_2_CCATGTCAGCTACCTA    31702
P1_2_CAGCCGAAGGAGCGTT    28696
P1_2_TATGCCCAGGATGGAA    19244
P1_2_CACACAAGTCCTAGCG    61243
P1_2_TTGCGTCTCCAAGTAC    55094
P1_2_ACAGCCGAGCTAGTTC    44566
P1_2_AGGTCCGCAGGACGTA    43260
P1_2_ACGCCAGGTGTCTGAT    45539
dtype: int64

## Read mouse h5ad

In [15]:
# Load the full mouse droplet object that the mini test cells were drawn from.
h5ad = (
    '/home/olga/data_lg/data_sm_copy/czb-tabula-muris-senis/Data-objects/'
    'tabula-muris-senis-droplet-official-raw-obj--no-duplicate-barcodes-per-seq-run.h5ad'
)
mouse = scanpy.read_h5ad(h5ad)
mouse

AnnData object with n_obs × n_vars = 238915 × 20138 
    obs: 'age', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_genes', 'sex', 'subtissue', 'tissue', 'tissue_free_annotation', 'species', 'species_latin', 'channel', 'channel_tissue', 'sequencing_run', 'cell_barcode', 'n_barcodes'
    var: 'n_cells'

In [16]:
# Original (species-suffix-free) ids of the mouse cells in the mini test set.
is_mouse = adata.obs['species_batch'] == "mouse"
mouse_cell_ids = adata.obs.loc[is_mouse, 'cell_id_no_species']
mouse_cell_ids

ACCTTTAAGGGCTCTC-1-53-0-0-mouse    ACCTTTAAGGGCTCTC-1-53-0-0
CACACAATCCTCAACC-1-53-0-0-mouse    CACACAATCCTCAACC-1-53-0-0
CGGTTAACACCTATCC-1-53-0-0-mouse    CGGTTAACACCTATCC-1-53-0-0
GCACTCTCAACGATCT-1-53-0-0-mouse    GCACTCTCAACGATCT-1-53-0-0
CTACATTCACCGAATT-1-53-0-0-mouse    CTACATTCACCGAATT-1-53-0-0
GCACTCTTCGGTCCGA-1-53-0-0-mouse    GCACTCTTCGGTCCGA-1-53-0-0
CAAGAAAGTTGACGTT-1-53-0-0-mouse    CAAGAAAGTTGACGTT-1-53-0-0
GACCAATTCACAGTAC-1-53-0-0-mouse    GACCAATTCACAGTAC-1-53-0-0
GGGTTGCTCCGCAGTG-1-53-0-0-mouse    GGGTTGCTCCGCAGTG-1-53-0-0
GCGCCAAAGTAAGTAC-1-53-0-0-mouse    GCGCCAAAGTAAGTAC-1-53-0-0
Name: cell_id_no_species, dtype: object

In [17]:
# Subset the full mouse object to the mini test cells, indexing with the
# plain array of ids in the same way as the human subset.
mouse_minitest = mouse[mouse_cell_ids.values]
mouse_minitest.obs.head()

Unnamed: 0_level_0,age,cell,cell_ontology_class,cell_ontology_id,free_annotation,method,mouse.id,n_genes,sex,subtissue,tissue,tissue_free_annotation,species,species_latin,channel,channel_tissue,sequencing_run,cell_barcode,n_barcodes
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
ACCTTTAAGGGCTCTC-1-53-0-0,21m,MACA_21m_F_LUNG_55_ACCTTTAAGGGCTCTC,type II pneumocyte,,Alveolar Epithelial Type 2,droplet,21-F-55,3218.0,female,,Lung,Lung,Mouse,Mus musculus,MACA_21m_F_LUNG_55,MACA_21m_F_LUNG_55__Lung,171024_A00111_0078_AH3YHCDMXX,ACCTTTAAGGGCTCTC,1
CACACAATCCTCAACC-1-53-0-0,21m,MACA_21m_F_LUNG_55_CACACAATCCTCAACC,type II pneumocyte,,Alveolar Epithelial Type 2,droplet,21-F-55,3016.0,female,,Lung,Lung,Mouse,Mus musculus,MACA_21m_F_LUNG_55,MACA_21m_F_LUNG_55__Lung,171024_A00111_0078_AH3YHCDMXX,CACACAATCCTCAACC,1
CGGTTAACACCTATCC-1-53-0-0,21m,MACA_21m_F_LUNG_55_CGGTTAACACCTATCC,type II pneumocyte,,Alveolar Epithelial Type 2,droplet,21-F-55,2826.0,female,,Lung,Lung,Mouse,Mus musculus,MACA_21m_F_LUNG_55,MACA_21m_F_LUNG_55__Lung,171024_A00111_0078_AH3YHCDMXX,CGGTTAACACCTATCC,1
GCACTCTCAACGATCT-1-53-0-0,21m,MACA_21m_F_LUNG_55_GCACTCTCAACGATCT,type II pneumocyte,,Alveolar Epithelial Type 2,droplet,21-F-55,2703.0,female,,Lung,Lung,Mouse,Mus musculus,MACA_21m_F_LUNG_55,MACA_21m_F_LUNG_55__Lung,171024_A00111_0078_AH3YHCDMXX,GCACTCTCAACGATCT,1
CTACATTCACCGAATT-1-53-0-0,21m,MACA_21m_F_LUNG_55_CTACATTCACCGAATT,type II pneumocyte,,Alveolar Epithelial Type 2,droplet,21-F-55,2414.0,female,,Lung,Lung,Mouse,Mus musculus,MACA_21m_F_LUNG_55,MACA_21m_F_LUNG_55__Lung,171024_A00111_0078_AH3YHCDMXX,CTACATTCACCGAATT,1


In [18]:
# Expression matrix of the mouse subset as a DataFrame (cells as rows, genes
# as columns, per the displayed output).
mouse_minitest_df = mouse_minitest.to_df()
mouse_minitest_df

index,Xkr4,Rp1,Sox17,Mrpl15,Lypla1,Tcea1,Rgs20,Atp6v1h,Oprk1,Npbwr1,Rb1cc1,Fam150a,St18,Pcmtd1,Sntg1,Rrs1,Adhfe1,2610203C22Rik,3110035E14Rik,Mybl1,Vcpip1,1700034P13Rik,Sgk3,6030422M02Rik,Snhg6,Ppp1r42,Cops5,Cspp1,Arfgef1,Cpa6,Prex2,A830018L16Rik,4932411L15,Sulf1,Slco5a1,Prdm14,Ncoa2,Tram1,Lactb2,Xkr9,Gm5523,Eya1,Msc,Trpa1,Kcnb2,Terf1,Gm106,4930444P10Rik,Rpl7,Rdh10,...,Bmx,Pir,Figf,Piga,Asb11,Asb9,Mospd2,Fancb,Glra2,Gemin8,Gpm6b,Ofd1,Trappc2,Rab9,Tceanc,Egfl6,Gm1720,Gm8817,Tmsb4x,Tlr8,Tlr7,Prps2,Frmpd4,Msl3,Arhgap6,Amelx,Hccs,Mid1,4933400A11Rik,G530011O06Rik,Asmt,Vamp7,Spry3,Tmlhe,Zf12,Zfy1,Ube1y1,Kdm5d,Eif2s3y,Uty,Ddx3y,Usp9y,Rbmy1a1,LOC100039753,Rbm31y,LOC434960,LOC380994,LOC100041346,Sly,Erdr1
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
ACCTTTAAGGGCTCTC-1-53-0-0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,7.0,0.0,26.0,0.0,0.0,202.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
CACACAATCCTCAACC-1-53-0-0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,4.0,0.0,0.0,167.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
CGGTTAACACCTATCC-1-53-0-0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,146.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
GCACTCTCAACGATCT-1-53-0-0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,130.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
CTACATTCACCGAATT-1-53-0-0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0,3.0,0.0,0.0,179.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GCACTCTTCGGTCCGA-1-53-0-0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,242.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CAAGAAAGTTGACGTT-1-53-0-0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,212.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GACCAATTCACAGTAC-1-53-0-0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,412.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
GGGTTGCTCCGCAGTG-1-53-0-0,0.0,0.0,0.0,1.0,4.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,251.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
GCGCCAAAGTAAGTAC-1-53-0-0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,183.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Total counts per mouse cell, summed across all genes of the expression matrix.
mouse_n_umi = mouse_minitest_df.sum(axis='columns').astype(int)
mouse_n_umi

index
ACCTTTAAGGGCTCTC-1-53-0-0    17230
CACACAATCCTCAACC-1-53-0-0     9884
CGGTTAACACCTATCC-1-53-0-0    10837
GCACTCTCAACGATCT-1-53-0-0     9560
CTACATTCACCGAATT-1-53-0-0     8719
GCACTCTTCGGTCCGA-1-53-0-0    11296
CAAGAAAGTTGACGTT-1-53-0-0     8138
GACCAATTCACAGTAC-1-53-0-0     8763
GGGTTGCTCCGCAGTG-1-53-0-0     8352
GCGCCAAAGTAAGTAC-1-53-0-0     6732
dtype: int64

# Read kmermaid translate outdir

In [20]:
# Output directory of the kmermaid run for the AT2-vs-macrophage mini test.
# NOTE(review): the `ls` output recorded for this cell reports "No such file or
# directory" for this path — confirm the mount point before relying on it.
# The variable name is spelled `kmermaid_outidr` (sic); it is kept as-is
# because the following cell references it by that name.
kmermaid_outidr = '/mnt/data_lg/data_sm_copy/olga/tabula-microcebus/analyses/kmermaid/mini-test-at2-vs-macrophages/remove_ribo_ksize_mammals'
! ls -lha $kmermaid_outidr

ls: cannot access '/mnt/data_lg/data_sm_copy/olga/tabula-microcebus/analyses/kmermaid/mini-test-at2-vs-macrophages/remove_ribo_ksize_mammals': No such file or directory


In [21]:
# The `translate` subfolder of the kmermaid output directory.
translate_dir = kmermaid_outidr + "/translate"
translate_dir

'/mnt/data_lg/data_sm_copy/olga/tabula-microcebus/analyses/kmermaid/mini-test-at2-vs-macrophages/remove_ribo_ksize_mammals/translate'

In [22]:
! ls -lha $translate_dir | head

ls: cannot access '/mnt/data_lg/data_sm_copy/olga/tabula-microcebus/analyses/kmermaid/mini-test-at2-vs-macrophages/remove_ribo_ksize_mammals/translate': No such file or directory


In [23]:
# Alignment states for which a per-cell FASTA id is built.
is_aligneds = 'aligned', 'unaligned'

seriess = []

for is_aligned in is_aligneds:
    # Build '{channel_updated}__{is_aligned}__{cell_barcode}' for every cell
    # with vectorised string concatenation of just the two columns needed.
    # This avoids a row-wise `apply` that expands every obs column into
    # keyword arguments (and would fail if a column were named `is_aligned`).
    series = (
        adata.obs['channel_updated'].astype(str)
        + f'__{is_aligned}__'
        + adata.obs['cell_barcode'].astype(str)
    )
    adata.obs[f'fasta_id__{is_aligned}'] = series
    seriess.append(series)
# Stack the aligned ids followed by the unaligned ids; each cell id therefore
# appears once per alignment state in the resulting index.
cells = pd.concat(seriess)
cells

P1_2_TGCTACCCATTAGCCA-human                      HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__TGC...
P1_2_ACACCGGAGTTACGGG-human                      HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__ACA...
P1_2_CCATGTCAGCTACCTA-human                      HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__CCA...
P1_2_CAGCCGAAGGAGCGTT-human                      HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__CAG...
P1_2_TATGCCCAGGATGGAA-human                      HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__TAT...
Antoine_Lung_EPCAM_10X_CCCAATCGTTGTCGCG-lemur    ANTOINE_LUNG_EPCAM_POS__aligned__CCCAATCGTTGTCGCG
Antoine_Lung_EPCAM_10X_TAAGTGCAGCCCGAAA-lemur    ANTOINE_LUNG_EPCAM_POS__aligned__TAAGTGCAGCCCGAAA
Antoine_Lung_EPCAM_10X_TCTCATACATTCCTGC-lemur    ANTOINE_LUNG_EPCAM_POS__aligned__TCTCATACATTCCTGC
Antoine_Lung_EPCAM_10X_AGATCTGTCCGCGTTT-lemur    ANTOINE_LUNG_EPCAM_POS__aligned__AGATCTGTCCGCGTTT
Antoine_Lung_EPCAM_10X_TTAACTCGTAAGGGAA-lemur    ANTOINE_LUNG_EPCAM_POS__aligned__TTAACTCGTAAGGGAA
ACCTTTAAGG

In [24]:
# Show the obs metadata rows whose species is 'Human'.
adata.obs[adata.obs['species'] == 'Human']

Unnamed: 0,age,batch,cell,cell_barcode,cell_ontology_class,cell_ontology_id,channel,channel_no_10x,channel_tissue,compartment,compartment_id,free_annotation,individual,location,magnetic.selection,method,mouse.id,nGene,nUMI,n_barcodes,n_counts,n_genes,orig.ident,original_channel,patient,percent.ribo,possibly_contaminated_barcode,preparation.site,region,sample,sequencing_run,sex,species,species_batch,species_latin,subtissue,tissue,tissue_free_annotation,compartment_updated_from_human,compartment_updated_from_human__with_species,compartment_updated_from_lemur,compartment_updated_from_lemur__with_species,narrow_group,broad_group,compartment_group,log_counts,sqrt_counts,channel_original,peptides_aligned,peptides_unaligned,channel_updated,cell_id_no_species,fasta_id__aligned,fasta_id__unaligned
P1_2_TGCTACCCATTAGCCA-human,,,,TGCTACCCATTAGCCA,type II pneumocyte,,P1_2,,,epithelial,,Alveolar Epithelial Type 2,,distal,epithelial,,,4931.0,32187.0,,3299,3299,Human Lung 10x - Patient 1,,1.0,0.121198,,biohub,normal,distal 1a,171205_A00111_0088_BH55NYDMXX,,Human,human,Homo sapiens,,Lung,,epithelial,epithelial (Human),epithelial,epithelial (Human),Alveolar Epithelial Type 2,Alveolar Epithelial Type 2,epithelial,9.668588,125.749748,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__TGC...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__T...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,P1_2_TGCTACCCATTAGCCA,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__TGC...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__T...
P1_2_ACACCGGAGTTACGGG-human,,,,ACACCGGAGTTACGGG,type II pneumocyte,,P1_2,,,epithelial,,Alveolar Epithelial Type 2,,distal,epithelial,,,4510.0,28732.0,,3030,3030,Human Lung 10x - Patient 1,,1.0,0.125331,,biohub,normal,distal 1a,171205_A00111_0088_BH55NYDMXX,,Human,human,Homo sapiens,,Lung,,epithelial,epithelial (Human),epithelial,epithelial (Human),Alveolar Epithelial Type 2,Alveolar Epithelial Type 2,epithelial,9.614071,122.368294,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__ACA...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__A...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,P1_2_ACACCGGAGTTACGGG,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__ACA...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__A...
P1_2_CCATGTCAGCTACCTA-human,,,,CCATGTCAGCTACCTA,type II pneumocyte,,P1_2,,,epithelial,,Alveolar Epithelial Type 2,,distal,epithelial,,,4490.0,31702.0,,3014,3014,Human Lung 10x - Patient 1,,1.0,0.156457,,biohub,normal,distal 1a,171205_A00111_0088_BH55NYDMXX,,Human,human,Homo sapiens,,Lung,,epithelial,epithelial (Human),epithelial,epithelial (Human),Alveolar Epithelial Type 2,Alveolar Epithelial Type 2,epithelial,9.865422,138.755173,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__CCA...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__C...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,P1_2_CCATGTCAGCTACCTA,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__CCA...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__C...
P1_2_CAGCCGAAGGAGCGTT-human,,,,CAGCCGAAGGAGCGTT,type II pneumocyte,,P1_2,,,epithelial,,Alveolar Epithelial Type 2,,distal,epithelial,,,4333.0,28696.0,,2965,2965,Human Lung 10x - Patient 1,,1.0,0.198146,,biohub,normal,distal 1a,171205_A00111_0088_BH55NYDMXX,,Human,human,Homo sapiens,,Lung,,epithelial,epithelial (Human),epithelial,epithelial (Human),Alveolar Epithelial Type 2,Alveolar Epithelial Type 2,epithelial,9.720886,129.081375,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__CAG...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__C...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,P1_2_CAGCCGAAGGAGCGTT,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__CAG...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__C...
P1_2_TATGCCCAGGATGGAA-human,,,,TATGCCCAGGATGGAA,type II pneumocyte,,P1_2,,,epithelial,,Alveolar Epithelial Type 2,,distal,epithelial,,,4147.0,19244.0,,2765,2765,Human Lung 10x - Patient 1,,1.0,0.151736,,biohub,normal,distal 1a,171205_A00111_0088_BH55NYDMXX,,Human,human,Homo sapiens,,Lung,,epithelial,epithelial (Human),epithelial,epithelial (Human),Alveolar Epithelial Type 2,Alveolar Epithelial Type 2,epithelial,9.359708,107.754349,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__TAT...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__T...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,P1_2_TATGCCCAGGATGGAA,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__TAT...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__T...
P1_2_CACACAAGTCCTAGCG-human,,,,CACACAAGTCCTAGCG,macrophage,,P1_2,,,immune,,Macrophage,,distal,epithelial,,,5850.0,61243.0,,3905,3905,Human Lung 10x - Patient 1,,1.0,0.11198,,biohub,normal,distal 1a,171205_A00111_0088_BH55NYDMXX,,Human,human,Homo sapiens,,Lung,,myeloid,myeloid (Human),myeloid,myeloid (Human),Macrophage,Macrophage,myeloid,10.345412,176.391617,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__CAC...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__C...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,P1_2_CACACAAGTCCTAGCG,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__CAC...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__C...
P1_2_TTGCGTCTCCAAGTAC-human,,,,TTGCGTCTCCAAGTAC,macrophage,,P1_2,,,immune,,Macrophage,,distal,epithelial,,,5499.0,55094.0,,3645,3645,Human Lung 10x - Patient 1,,1.0,0.131321,,biohub,normal,distal 1a,171205_A00111_0088_BH55NYDMXX,,Human,human,Homo sapiens,,Lung,,myeloid,myeloid (Human),myeloid,myeloid (Human),Macrophage,Macrophage,myeloid,10.215703,165.31485,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__TTG...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__T...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,P1_2_TTGCGTCTCCAAGTAC,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__TTG...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__T...
P1_2_ACAGCCGAGCTAGTTC-human,,,,ACAGCCGAGCTAGTTC,macrophage,,P1_2,,,immune,,Macrophage,,distal,epithelial,,,5117.0,44566.0,,3416,3416,Human Lung 10x - Patient 1,,1.0,0.107952,,biohub,normal,distal 1a,171205_A00111_0088_BH55NYDMXX,,Human,human,Homo sapiens,,Lung,,myeloid,myeloid (Human),myeloid,myeloid (Human),Macrophage,Macrophage,myeloid,10.049361,152.121658,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__ACA...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__A...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,P1_2_ACAGCCGAGCTAGTTC,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__ACA...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__A...
P1_2_AGGTCCGCAGGACGTA-human,,,,AGGTCCGCAGGACGTA,macrophage,,P1_2,,,immune,,Macrophage,,distal,epithelial,,,5123.0,43260.0,,3400,3400,Human Lung 10x - Patient 1,,1.0,0.143967,,biohub,normal,distal 1a,171205_A00111_0088_BH55NYDMXX,,Human,human,Homo sapiens,,Lung,,myeloid,myeloid (Human),myeloid,myeloid (Human),Macrophage,Macrophage,myeloid,9.898576,141.074448,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__AGG...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__A...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,P1_2_AGGTCCGCAGGACGTA,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__AGG...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__A...
P1_2_ACGCCAGGTGTCTGAT-human,,,,ACGCCAGGTGTCTGAT,macrophage,,P1_2,,,immune,,Macrophage,,distal,epithelial,,,5131.0,45539.0,,3393,3393,Human Lung 10x - Patient 1,,1.0,0.110652,,biohub,normal,distal 1a,171205_A00111_0088_BH55NYDMXX,,Human,human,Homo sapiens,,Lung,,myeloid,myeloid (Human),myeloid,myeloid (Human),Macrophage,Macrophage,myeloid,10.062412,153.117599,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__ACG...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__A...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31,P1_2_ACGCCAGGTGTCTGAT,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__aligned__ACG...,HUMAN_HCA_LUNG_LUNG_CD45_OR_CD31__unaligned__A...


In [25]:
# Expression matrix as a DataFrame (rows are cells, columns are genes).
adata_df = adata.to_df()
adata_df.head()

Unnamed: 0,SAMD11,NOC2L,KLHL17,ISG15,AGRN,C1orf159,TTLL10,TNFRSF18,TNFRSF4,SDF4,UBE2J2,ACAP3,PUSL1,TAS1R3,DVL1,MXRA8,AURKAIP1,CCNL2,MRPL20,TMEM88B,VWA1,ATAD3A,SSU72,MIB2,MMP23B,SLC35E2B,NADK,GNB1,TMEM52,GABRD,PRKCZ,SKI,MORN1,RER1,PEX10,PLCH2,PANK4,HES5,PRDM16,ARHGEF16,TPRG1L,TP73,LRRC47,C1orf174,NPHP4,KCNAB2,CHD5,RNF207,ICMT,GPR153,...,SLITRK4,SLITRK2,FMR1,AFF2,MAMLD1,MTM1,MTMR1,CD99L2,VMA21,PRRG3,FATE1,GABRE,GABRA3,GABRQ,NSDHL,HAUS7,BGN,ATP2B3,DUSP9,BCAP31,ABCD1,SRPK3,IDH3G,SSR4,PDZD4,L1CAM,AVPR2,ARHGAP4,NAA10,HCFC1,IRAK1,MECP2,TKTL1,FLNA,EMD,DNASE1L1,TAZ,GDI1,FAM50A,PLXNA3,SLC10A3,FAM3A,IKBKG,GAB3,MPP1,MTCP1,VBP1,RAB39B,VAMP7,ZFY
P1_2_TGCTACCCATTAGCCA-human,0.0,1.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,6.0,0.0,13.0,0.0,2.0,0.0,6.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,1.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,7.0,1.0,0.0,2.0,1.0,3.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
P1_2_ACACCGGAGTTACGGG-human,0.0,1.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,8.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,1.0,7.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,4.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
P1_2_CCATGTCAGCTACCTA-human,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,13.0,0.0,5.0,0.0,2.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,1.0,25.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
P1_2_CAGCCGAAGGAGCGTT-human,0.0,1.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,4.0,2.0,0.0,1.0,0.0,0.0,0.0,6.0,0.0,6.0,0.0,4.0,0.0,7.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,0.0,7.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0,5.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,12.0,1.0,0.0,0.0,1.0,3.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P1_2_TATGCCCAGGATGGAA-human,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,8.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [26]:
# Total counts per cell: sum across all gene columns of each row, cast to int.
n_counts = adata_df.sum(axis='columns').astype(int)
n_counts

P1_2_TGCTACCCATTAGCCA-human                      15813
P1_2_ACACCGGAGTTACGGG-human                      14974
P1_2_CCATGTCAGCTACCTA-human                      19253
P1_2_CAGCCGAAGGAGCGTT-human                      16662
P1_2_TATGCCCAGGATGGAA-human                      11611
Antoine_Lung_EPCAM_10X_CCCAATCGTTGTCGCG-lemur    13136
Antoine_Lung_EPCAM_10X_TAAGTGCAGCCCGAAA-lemur    14273
Antoine_Lung_EPCAM_10X_TCTCATACATTCCTGC-lemur     9619
Antoine_Lung_EPCAM_10X_AGATCTGTCCGCGTTT-lemur    17958
Antoine_Lung_EPCAM_10X_TTAACTCGTAAGGGAA-lemur    10556
ACCTTTAAGGGCTCTC-1-53-0-0-mouse                  11404
CACACAATCCTCAACC-1-53-0-0-mouse                   5665
CGGTTAACACCTATCC-1-53-0-0-mouse                   6113
GCACTCTCAACGATCT-1-53-0-0-mouse                   5326
CTACATTCACCGAATT-1-53-0-0-mouse                   4815
P1_2_CACACAAGTCCTAGCG-human                      31114
P1_2_TTGCGTCTCCAAGTAC-human                      27329
P1_2_ACAGCCGAGCTAGTTC-human                      23141
P1_2_AGGTC

In [None]:
# NOTE(review): per the scanpy docs, inplace=False leaves `adata` unfiltered
# and returns the per-cell keep mask and gene counts instead — confirm that
# inspection only (not actual filtering) is intended here.
sc.pp.filter_cells(adata, min_genes=1, inplace=False)

In [None]:
# Destination directory for the soft links to the 30-cell subset's files.
outdir = '/mnt/data_lg/data_sm_copy/olga/tabula-microcebus/analyses/kmermaid/mini-test-at2-vs-macrophages/30cell-subset-softlinks/'
# Create it (and any missing parents) in Python so that a failure raises
# here instead of being printed by the shell and silently ignored.
os.makedirs(outdir, exist_ok=True)

In [None]:
# Soft-link every file in translate_dir whose name starts with one of the
# fasta ids in `cells` into outdir, reporting how many files matched each id.
for cell in cells.values:
    filenames = glob.glob(f'{translate_dir}/{cell}*')
    print(f'{cell}: {len(filenames)}')
    for filename in filenames:
        link_path = os.path.join(outdir, os.path.basename(filename))
        # lexists (not exists) so that a dangling link left by an earlier run
        # is also detected; skipping makes the cell safe to re-run.
        if not os.path.lexists(link_path):
            os.symlink(filename, link_path)

In [None]:
# Count the entries now present in outdir.
ls -1 $outdir | wc -l

In [41]:
# 4 ** 21 in scientific notation — presumably the number of possible
# 21-mers over a 4-letter (nucleotide) alphabet; confirm.
f'{4 ** 21:e}'

'4.398047e+12'

In [42]:
# 20 ** 7 in scientific notation — presumably the number of possible
# 7-mers over a 20-letter (amino acid) alphabet; confirm.
f'{20 ** 7:e}'

'1.280000e+09'