1. Adding metadata to a cisTopic object

# Environment

In [1]:
# Standard library imports
import os
import gc
import pickle
import importlib

# Data manipulation imports
import pandas as pd
import scanpy as sc
import scrublet as scr
import polars as pl

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns

# pycisTopic imports
import pycisTopic
from pycisTopic.lda_models import run_cgs_models_mallet, evaluate_models
from pycisTopic.pseudobulk_peak_calling import export_pseudobulk, peak_calling
from pycisTopic.iterative_peak_calling import get_consensus_peaks
from pycisTopic.plotting.qc_plot import plot_sample_stats, plot_barcode_stats
from pycisTopic.topic_qc import compute_topic_metrics, plot_topic_qc, topic_annotation
from pycisTopic.utils import fig2img
from pycisTopic.topic_binarization import binarize_topics
from pycisTopic.cistopic_class import create_cistopic_object_from_fragments, merge
from pycisTopic.qc import get_barcodes_passing_qc_for_sample
from pycisTopic.clust_vis import (
    find_clusters,
    run_umap,
    run_tsne,
    plot_metadata,
    plot_topic,
    cell_topic_heatmap
)

# Re-execute the already-imported pycisTopic module (e.g. to pick up source
# edits without restarting the kernel) and star-import its public names.
importlib.reload(pycisTopic)
from pycisTopic import *
# NOTE: a bare expression that is not the last statement of a cell is not
# displayed, so this line only evaluates the attribute.
pycisTopic.__version__

# `sys` was used below without ever being imported, which raises NameError on
# a fresh kernel unless one of the star imports happens to expose it.
import sys

# Put the helper directory first on the module search path so that the
# project-local `config` module can be imported.
# NOTE(review): hard-coded absolute, user-specific path -- consider making it
# configurable so the notebook runs on other machines.
sys.path.insert(0, "/home/michal.kubacki/Githubs/Re-MEND/code/External_Datasets/GeneSet_Derivation/Herring_scenic/helpers")
import config
importlib.reload(config)
from config import *
# Presumably the number of CPU cores for downstream parallel steps; it is not
# used in the cells visible here -- TODO confirm.
n_cpu = 32

In [2]:
# ---------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------
reference = "hg19"

# Active neuron set. Other keys defined in the dictionaries below:
#   "all_excitatory", "all_inhibitory", "all_inhibitory_all_ages"
neurons_set = "all_excitatory_all_ages"

# Cell-type labels selected for each neuron set.
cells_dict = {
    "all_inhibitory": ["SST", "VIP", "MGE_dev"],
    "all_inhibitory_all_ages": ["VIP", "SST", "PV", "MGE_dev"],
    "all_excitatory": [
        "L5-6_TLE4", "L2-3_CUX2", "L4_RORB", "L5-6_THEMIS", "PN_dev",
    ],
    "all_excitatory_all_ages": [
        "L5-6_TLE4", "L2-3_CUX2", "L4_RORB", "L5-6_THEMIS", "PN_dev",
    ],
}

# Age labels selected for each neuron set; the "*_all_ages" sets extend the
# shorter lists with 6y, 10y, 16y, 20y and 40y.
ages_dict = {
    "all_inhibitory": [
        "1m", "3m", "6m", "10m", "1y", "2y", "4y", "ga22", "ga24",
    ],
    "all_inhibitory_all_ages": [
        "1m", "3m", "6m", "10m", "1y", "2y", "4y",
        "6y", "10y", "16y", "20y", "40y", "ga22", "ga24",
    ],
    "all_excitatory": [
        "1m", "3m", "6m", "10m", "1y", "2y", "4y", "ga22", "ga24",
    ],
    "all_excitatory_all_ages": [
        "1m", "3m", "6m", "10m", "1y", "2y", "4y",
        "6y", "10y", "16y", "20y", "40y", "ga22", "ga24",
    ],
}

# Resolve the working directories for this reference / neuron-set combination.
# `set_output_folders` is not defined in this notebook; presumably it comes
# from the star-imported `config` helpers -- TODO confirm.
(
    out_dir,
    in_dir,
    root_dir,
    tmp_dir,
    data_folder,
) = set_output_folders(reference, neurons_set)

# Selections that apply to the active neuron set.
sel_celltypes = cells_dict[neurons_set]
sel_ages = ages_dict[neurons_set]
# ---------------------------------------------------------------

root_dir: /group/testa/michal.kubacki/herring
out_dir: /group/testa/michal.kubacki/herring/output_hg19_all_excitatory
in_dir: /group/testa/michal.kubacki/herring/data
tmp_dir: /group/testa/michal.kubacki/herring/tmp


In [3]:
# Look up the fragment files restricted to the selected ages. `select_files`
# is not defined in this notebook (presumably a star-imported `config` helper).
fragments_dict = select_files(reference, selected_fragments=sel_ages)

All fragments: {'ga22': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138510_RL2366_ga22_snATAC_fragments.tsv.gz', '1y': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138526_RL2209_1y_snATAC_fragments.tsv.gz', '14y': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138542_RL2372_14y_snATAC_fragments.tsv.gz', 'ga24': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138512_RL2207_ga24_snATAC_fragments.tsv.gz', '2y': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138529_RL1784_2y_snATAC_fragments.tsv.gz', '16y': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138544_RL1785_16y_snATAC_fragments.tsv.gz', '1m': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138515_RL2367_1m_snATAC_fragments.tsv.gz', '4y': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138532_RL2210_4y_snATAC_fragments.tsv.gz', '20y': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138548_RL2085_20y_snATAC_fragment

# Adding metadata to a cisTopic object

In [4]:
# Location of the pickled cisTopic object inside the output directory
# (presumably written by an earlier step of the pipeline).
file_path = os.path.join(out_dir, "cistopic_obj_merged.pkl")

# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open(file_path, mode="rb") as pkl_handle:
    cistopic_obj = pickle.load(pkl_handle)

In [5]:
# Remove everything from the first "___" to the end of each index label of
# the cisTopic cell metadata table.
raw_index = cistopic_obj.cell_data.index
cistopic_obj.cell_data.index = raw_index.str.replace(r'___.*', '', regex=True)

In [6]:
# Preview the first five rows of the cisTopic cell metadata.
cistopic_obj.cell_data.head(n=5)

Unnamed: 0,cisTopic_nr_frag,cisTopic_log_nr_frag,cisTopic_nr_acc,cisTopic_log_nr_acc,sample_id,barcode_rank,total_fragments_count,log10_total_fragments_count,unique_fragments_count,log10_unique_fragments_count,...,log10_unique_fragments_in_peaks_count,fraction_of_fragments_in_peaks,duplication_count,duplication_ratio,nucleosome_signal,tss_enrichment,pdf_values_for_tss_enrichment,pdf_values_for_fraction_of_fragments_in_peaks,pdf_values_for_duplication_ratio,barcode
ATTTGTCCATAGATAG-1-1m,3215,3.507181,3074,3.487704,1m,5810,8186,3.913125,6659,3.823474,...,3.496238,0.470641,1527,0.186538,0.717724,1.851727,0.070243,0.114913,0.792464,ATTTGTCCATAGATAG-1
TAGCACACATTTCTTG-1-1m,4273,3.630733,4036,3.605951,1m,2186,19034,4.279553,16218,4.210024,...,3.621072,0.257615,2816,0.147946,0.572316,1.928305,0.068753,0.380106,1.360023,TAGCACACATTTCTTG-1
GTTATGGCAAACCCTA-1-1m,7632,3.882638,7021,3.846399,1m,2723,17875,4.25227,14394,4.158212,...,3.872972,0.51848,3481,0.194741,0.583719,1.862601,0.073943,0.055935,0.626418,GTTATGGCAAACCCTA-1
CTCTCGAAGTTATGAG-1-1m,3158,3.499412,2948,3.469527,1m,4643,12184,4.085826,9464,3.976121,...,3.490099,0.3265,2720,0.223244,0.515011,1.960423,0.054337,0.465188,0.201394,CTCTCGAAGTTATGAG-1
GCTCACTTCCAAACCA-1-1m,10071,4.003073,9020,3.955207,1m,965,28998,4.462383,22432,4.350887,...,3.994361,0.439996,6566,0.226429,0.529242,2.08432,0.017637,0.169987,0.15142,GCTCACTTCCAAACCA-1


In [7]:
# Read the per-cell annotation table from the output directory; the first CSV
# column is used as the index.
cells_data_path = os.path.join(out_dir, 'cells_data.csv')
cell_data = pd.read_csv(cells_data_path, index_col=0)
# Preview the first five rows.
cell_data.head(n=5)

Unnamed: 0,TSSEnrichment,ReadsInTSS,ReadsInPromoter,ReadsInBlacklist,PromoterRatio,PassQC,NucleosomeRatio,nMultiFrags,nMonoFrags,nFrags,...,predictedCell,predictedGroup,predictedScore,FRIP,ReadsInPeaks,age,chem,major_clust,age_mapped,old_index
TTGCGGGCATTGCGAT-1-3m,3.187,1534.0,9410.0,981.0,0.052331,1.0,0.939366,8204.0,46360.0,89909.0,...,GCTACCTCAGCTTCCT-RL2100_86d_v3,L2/3_CUX2_dev-1,0.335881,0.420796,74502.0,86d,v3,L2-3_CUX2,3m,TTGCGGGCATTGCGAT-1
ACATGCATCAATTCCT-1-ga24,2.553,1012.0,7014.0,700.0,0.047747,1.0,0.786993,5692.0,41102.0,73449.0,...,TCCTTCTTCCTAAACG-RL2121_ga34_v3,L5/6_TLE4_SCUBE1,0.541803,0.384517,56111.0,ga34,v3,L5-6_TLE4,ga24,ACATGCATCAATTCCT-1
CCGTGAGCAGGTAGCA-1-6m,2.914,1167.0,7173.0,1268.0,0.050797,1.0,0.954166,6839.0,36130.0,70604.0,...,CCTTTGGGTGTATTCG-RL2108_179d_v3,L5/6_TLE4_SORCS1,0.439298,0.398352,54136.0,179d,v3,L5-6_TLE4,6m,CCGTGAGCAGGTAGCA-1
TTACTCACAACTCCCT-1-ga24,2.684,1095.0,7214.0,1044.0,0.051844,1.0,0.771999,7073.0,39263.0,69574.0,...,TATACCTAGGGCAACT-RL2121_ga34_v3,L5/6_TLE4_SORCS1,0.372978,0.361398,49385.0,ga34,v3,L5-6_TLE4,ga24,TTACTCACAACTCCCT-1
CCACAGGAGACACGGT-1-ga24,3.119,1208.0,7018.0,737.0,0.052616,1.0,1.795448,8900.0,23857.0,66691.0,...,TATACCTAGGGCAACT-RL2121_ga34_v3,L5/6_TLE4_SORCS1,0.326004,0.442014,57643.0,ga34,v3,L5-6_TLE4,ga24,CCACAGGAGACACGGT-1


In [8]:
# Print the table shape, keep only the first row for every repeated index
# label, then print the shape again to show how many rows were dropped.
print(cell_data.shape)
is_repeat = cell_data.index.duplicated(keep="first")
cell_data = cell_data[~is_repeat]
print(cell_data.shape)

(22558, 25)
(22474, 25)


In [9]:
# Report whether the index of each metadata table is free of duplicates:
# first the cisTopic object's table, then the annotation table.
for frame in (cistopic_obj.cell_data, cell_data):
    print(frame.index.is_unique)

True
True


In [10]:
# Attach the annotation table to the cisTopic object. `split_pattern='-'` is
# passed through to pycisTopic's `add_cell_data` unchanged.
cistopic_obj.add_cell_data(cell_data, split_pattern='-')

# Persist the annotated object. The original passed a bare `open(...)` to
# `pickle.dump`, which never closed the handle explicitly; the context manager
# guarantees the file is flushed and closed even if pickling fails.
with open(os.path.join(out_dir, "cistopic_obj.pkl"), "wb") as out_file:
    pickle.dump(cistopic_obj, out_file)


