In [1]:
# Standard library imports
import os
import sys
import pickle

# Data manipulation imports
import pandas as pd
import scanpy as sc

sys.path.insert(0, "/home/michal.kubacki/Githubs/Re-MEND/code/External_Datasets/GeneSet_Derivation/Herring_scenic/helpers")
import config
from config import *

In [2]:
#################################################################
reference = "hg19"

# neurons_set = "all_excitatory"
# neurons_set = "all_inhibitory"
neurons_set = "all_excitatory_all_ages"
# neurons_set = "all_inhibitory_all_ages"

cells_dict = {
    "all_inhibitory"            :   ['SST', 'VIP', 'MGE_dev'],
    "all_inhibitory_all_ages"   :   ['VIP', 'SST', 'PV', 'MGE_dev'],
    "all_excitatory"            :   ['L5-6_TLE4', 'L2-3_CUX2', 'L4_RORB', 'L5-6_THEMIS', 'PN_dev'],
    "all_excitatory_all_ages"   :   ['L5-6_TLE4', 'L2-3_CUX2', 'L4_RORB', 'L5-6_THEMIS', 'PN_dev']
}

ages_dict = {
    "all_inhibitory"            :   ['1m','3m','6m','10m','1y','2y','4y','ga22','ga24'],
    "all_inhibitory_all_ages"   :   ['1m','3m','6m','10m','1y','2y','4y','6y','10y','16y','20y','40y','ga22','ga24'],
    "all_excitatory"            :   ['1m','3m','6m','10m','1y','2y','4y','ga22','ga24'],
    "all_excitatory_all_ages"   :   ['1m','3m','6m','10m','1y','2y','4y','6y','10y','16y','20y','40y','ga22','ga24']
}

out_dir, in_dir, root_dir, tmp_dir, data_folder = set_output_folders(reference, neurons_set)

sel_celltypes  = cells_dict[neurons_set]
sel_ages = ages_dict[neurons_set]

#################################################################

root_dir: /group/testa/michal.kubacki/herring
out_dir: /group/testa/michal.kubacki/herring/output_hg19_all_excitatory
in_dir: /group/testa/michal.kubacki/herring/data
tmp_dir: /group/testa/michal.kubacki/herring/tmp


In [3]:
cell_data = pd.read_csv(os.path.join(out_dir, 'cells_data.csv'), index_col = 0)

In [4]:
adata = sc.read_h5ad(os.path.join(out_dir,  "adata.h5ad"))

In [5]:
adata.obs.sample_id.head()

AAACCTGAGAGTCGGT-1-1m    AAACCTGAGAGTCGGT-1-1m
AAACCTGAGCCGCCTA-1-1m    AAACCTGAGCCGCCTA-1-1m
AAACCTGAGTGAACAT-1-1m    AAACCTGAGTGAACAT-1-1m
AAACCTGCAGGCGATA-1-1m    AAACCTGCAGGCGATA-1-1m
AAACCTGCAGTGGGAT-1-1m    AAACCTGCAGTGGGAT-1-1m
                                 ...          
TTTCACAAGTTGGAAT-1-4y    TTTCACAAGTTGGAAT-1-4y
TTTCACATCGATAACC-1-4y    TTTCACATCGATAACC-1-4y
TTTGACTTCAGACTGT-1-4y    TTTGACTTCAGACTGT-1-4y
TTTGGTTGTAGGTTTC-1-4y    TTTGGTTGTAGGTTTC-1-4y
TTTGTTGAGCATTGAA-1-4y    TTTGTTGAGCATTGAA-1-4y
Name: sample_id, Length: 45539, dtype: category
Categories (45524, object): ['AAACCCAAGAGGCCAT-1-3m', 'AAACCCAAGAGTCTTC-1-ga22', 'AAACCCAAGATACAGT-1-ga24', 'AAACCCAAGATGTTCC-1-2y', ..., 'TTTGTTGTCGTGAGAG-1-ga24', 'TTTGTTGTCTACCCAC-1-1m', 'TTTGTTGTCTCCTACG-1-ga22', 'TTTGTTGTCTGGAGAG-1-10m']

In [6]:
cell_data.head()

Unnamed: 0,TSSEnrichment,ReadsInTSS,ReadsInPromoter,ReadsInBlacklist,PromoterRatio,PassQC,NucleosomeRatio,nMultiFrags,nMonoFrags,nFrags,...,predictedCell,predictedGroup,predictedScore,FRIP,ReadsInPeaks,age,chem,major_clust,age_mapped,old_index
TTGCGGGCATTGCGAT-1-3m,3.187,1534.0,9410.0,981.0,0.052331,1.0,0.939366,8204.0,46360.0,89909.0,...,GCTACCTCAGCTTCCT-RL2100_86d_v3,L2/3_CUX2_dev-1,0.335881,0.420796,74502.0,86d,v3,L2-3_CUX2,3m,TTGCGGGCATTGCGAT-1
ACATGCATCAATTCCT-1-ga24,2.553,1012.0,7014.0,700.0,0.047747,1.0,0.786993,5692.0,41102.0,73449.0,...,TCCTTCTTCCTAAACG-RL2121_ga34_v3,L5/6_TLE4_SCUBE1,0.541803,0.384517,56111.0,ga34,v3,L5-6_TLE4,ga24,ACATGCATCAATTCCT-1
CCGTGAGCAGGTAGCA-1-6m,2.914,1167.0,7173.0,1268.0,0.050797,1.0,0.954166,6839.0,36130.0,70604.0,...,CCTTTGGGTGTATTCG-RL2108_179d_v3,L5/6_TLE4_SORCS1,0.439298,0.398352,54136.0,179d,v3,L5-6_TLE4,6m,CCGTGAGCAGGTAGCA-1
TTACTCACAACTCCCT-1-ga24,2.684,1095.0,7214.0,1044.0,0.051844,1.0,0.771999,7073.0,39263.0,69574.0,...,TATACCTAGGGCAACT-RL2121_ga34_v3,L5/6_TLE4_SORCS1,0.372978,0.361398,49385.0,ga34,v3,L5-6_TLE4,ga24,TTACTCACAACTCCCT-1
CCACAGGAGACACGGT-1-ga24,3.119,1208.0,7018.0,737.0,0.052616,1.0,1.795448,8900.0,23857.0,66691.0,...,TATACCTAGGGCAACT-RL2121_ga34_v3,L5/6_TLE4_SORCS1,0.326004,0.442014,57643.0,ga34,v3,L5-6_TLE4,ga24,CCACAGGAGACACGGT-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GCTCGAGGTTCCAATG-1-ga24,2.178,22.0,69.0,6.0,0.048319,1.0,1.615385,66.0,273.0,714.0,...,CTTTCGGAGTAAACAC-RL2107_ga24_v3,L2/3_CUX2_dev-fetal,0.574307,0.401288,561.0,ga24,v3,L2-3_CUX2,ga24,GCTCGAGGTTCCAATG-1
TTACTCAAGTCCGATT-1-1m,2.277,23.0,90.0,6.0,0.083333,1.0,1.389381,49.0,226.0,540.0,...,CGTAAGTGTGACGCCT-RL1777_2d_v3,L5/6_TLE4_SORCS1,0.364590,0.443182,468.0,2d,v3,L5-6_TLE4,1m,TTACTCAAGTCCGATT-1
GCCCGAAGTCGACTGC-1-4y,2.871,29.0,88.0,8.0,0.096491,1.0,2.234043,56.0,141.0,456.0,...,AAGGAATGTTCCTAGA-RL2109_4yr_v3,L4_RORB_MET,0.199647,0.468750,420.0,4yr,v3,L4_RORB,4y,GCCCGAAGTCGACTGC-1
GCGTAGCGTGCTGTGC-1-ga24,2.772,28.0,84.0,4.0,0.103960,1.0,1.948905,49.0,137.0,404.0,...,CTTTCAAAGACGACGT-RL2107_ga24_v3,L5/6_THEMIS_dev-1,0.572854,0.447103,355.0,ga24,v3,L5-6_THEMIS,ga24,GCGTAGCGTGCTGTGC-1


In [7]:
cell_data.index.head()

Index(['TTGCGGGCATTGCGAT-1-3m', 'ACATGCATCAATTCCT-1-ga24',
       'CCGTGAGCAGGTAGCA-1-6m', 'TTACTCACAACTCCCT-1-ga24',
       'CCACAGGAGACACGGT-1-ga24', 'GCTGTTCTCCTTCGAC-1-ga22',
       'AGCCTCTGTTTGTCTT-1-ga22', 'GCTTTCGTCACTCCCA-1-ga24',
       'TCCGACTCAAGCAATA-1-10m', 'CTAGGATCATGGAGGT-1-1m',
       ...
       'TTCATCAAGTGCCCTG-1-4y', 'TCACCTGTCAAACCAC-1-ga24',
       'TAACTTCTCGTGAACT-1-ga22', 'GAGGATGCAGTATCTG-1-1m',
       'CCTCCCTAGTTACCAC-1-ga24', 'GCTCGAGGTTCCAATG-1-ga24',
       'TTACTCAAGTCCGATT-1-1m', 'GCCCGAAGTCGACTGC-1-4y',
       'GCGTAGCGTGCTGTGC-1-ga24', 'AACGGGAAGCGTATCT-1-ga24'],
      dtype='object', length=22558)

In [8]:
adata.obs.index.head()

Index(['AAACCTGAGAGTCGGT-1-1m', 'AAACCTGAGCCGCCTA-1-1m',
       'AAACCTGAGTGAACAT-1-1m', 'AAACCTGCAGGCGATA-1-1m',
       'AAACCTGCAGTGGGAT-1-1m', 'AAACCTGGTCATGCCG-1-1m',
       'AAACCTGGTCGGCATC-1-1m', 'AAACCTGTCATAGCAC-1-1m',
       'AAACCTGTCCTGCTTG-1-1m', 'AAACGGGAGAGGACGG-1-1m',
       ...
       'TTGGGTAGTTGCAACT-1-4y', 'TTGTGGAGTGGCTTAT-1-4y',
       'TTGTTGTCAGTATGAA-1-4y', 'TTTACGTAGAAATTGC-1-4y',
       'TTTATGCGTTCAGGTT-1-4y', 'TTTCACAAGTTGGAAT-1-4y',
       'TTTCACATCGATAACC-1-4y', 'TTTGACTTCAGACTGT-1-4y',
       'TTTGGTTGTAGGTTTC-1-4y', 'TTTGTTGAGCATTGAA-1-4y'],
      dtype='object', length=45539)

In [9]:
cell_data.index.intersection(adata.obs.index)

Index(['GCTTTCGTCATCGCTC-1-ga22', 'TGATTTCGTTTGATCG-1-ga24',
       'TGAGCCGTCAAAGTAG-1-1m', 'TCAGTTTAGCTCCATA-1-1m',
       'TAAGTGCGTGATAAGT-1-1m', 'AAGATAGGTGTTTCTT-1-ga24',
       'ATCCCTGCAATGAAAC-1-ga24'],
      dtype='object')

In [10]:
cell_data.index.intersection(adata.obs.sample_id)

Index(['GCTTTCGTCATCGCTC-1-ga22', 'TGATTTCGTTTGATCG-1-ga24',
       'TGAGCCGTCAAAGTAG-1-1m', 'TCAGTTTAGCTCCATA-1-1m',
       'TAAGTGCGTGATAAGT-1-1m', 'AAGATAGGTGTTTCTT-1-ga24',
       'ATCCCTGCAATGAAAC-1-ga24'],
      dtype='object')