In [None]:
import os
import pycisTopic
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd

load_dotenv()


In [None]:
# Output locations for this preprocessing run.
# `os.environ[...]` is used (rather than `os.getenv(...)`) so that a missing
# variable fails with a KeyError naming the variable instead of an opaque
# TypeError raised by `Path(None)`.
out_dir = Path(os.environ["OUTPUT_PATH"]) / "garcia_ATAC/atac_preprocessing_allcelltypes"
os.makedirs(out_dir, exist_ok=True)
path_to_regions = os.path.join(out_dir, "consensus_peak_calling/consensus_regions.bed")
path_to_blacklist = str(Path(os.environ["RESOURCES_PATH"]) / "scenicplus/hg38-blacklist.v2.bed")
pycistopic_qc_output_dir = os.path.join(out_dir, "qc")

# Input fragments: one `<sample_id>_atac_fragments.tsv.gz` file per sample.
DATA_PATH = Path(os.environ["DATA_PATH"]) / "garcia_ATAC"
SAMPLE_IDS = [
    "FCA_GND10287600",
    "FCA_GND10287601",
    "FCA_GND10287602",
    "FCA_GND10287603",
    "FCA_GND10287604",
    "HCA_F_GON10535495",
    "HCA_F_GON10713284",
    "HCA_F_GON10713285",
    "HCA_F_GON10713286",
]
# Maps sample id -> path (as str) of that sample's fragments file.
fragments_dict = {
    sample_id: str(DATA_PATH / f"{sample_id}_atac_fragments.tsv.gz")
    for sample_id in SAMPLE_IDS
}

In [None]:
# Load the per-cell annotation table, drop the last character of every sample
# label, and keep only the rows whose sample has a fragments file.
# NOTE(review): the trailing character is presumably a suffix that is absent
# from the keys of fragments_dict -- confirm against the CSV.
#cell_data = pd.read_csv(DATA_PATH / "celltype_predictions/germcell_types.csv", index_col=0)
cell_data = (
    pd.read_csv(DATA_PATH / "celltype_predictions/all_celltypes.csv", index_col=0)
    .assign(sample=lambda annotations: annotations["sample"].str[:-1])
    .loc[lambda annotations: annotations["sample"].isin(fragments_dict.keys())]
)


In [None]:
# (rows, columns) of the annotation table after restricting it to the samples in fragments_dict.
cell_data.shape

In [None]:
# Merge cell barcode with sample id: each index entry becomes "<index>___<sample>".
# NOTE(review): this rewrites the index in place, so running the cell a second
# time appends the "___<sample>" suffix again; re-run the loading cell first.
cell_data.index = cell_data.index + "___" + cell_data["sample"]

In [None]:
# Number of rows per cell-type label, shown next to the total number of rows.
cell_data.celltype.value_counts(), cell_data.shape[0]

In [None]:
# hg38 chromosome sizes downloaded from UCSC, arranged as
# Chromosome / Start / End with every Start set to 0.
chromsizes = pd.read_table(
    "http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes",
    header=None,
    names=["Chromosome", "End"],
).assign(Start=0)[["Chromosome", "Start", "End"]]
chromsizes.head()

In [None]:
from pycisTopic.pseudobulk_peak_calling import export_pseudobulk

# Output folders for the per-cell-type pseudobulk BED and bigWig files.
consensus_dir = out_dir / "consensus_peak_calling"
pseudobulk_bed_dir = consensus_dir / "pseudobulk_bed_files"
pseudobulk_bw_dir = consensus_dir / "pseudobulk_bw_files"
for directory in (consensus_dir, pseudobulk_bed_dir, pseudobulk_bw_dir):
    os.makedirs(directory, exist_ok=True)

# Pseudobulks are grouped by the "celltype" column; "sample" links each cell
# to its entry in fragments_dict.
bw_paths, bed_paths = export_pseudobulk(
    input_data=cell_data,
    variable="celltype",
    sample_id_col="sample",
    chromsizes=chromsizes,
    bed_path=pseudobulk_bed_dir,
    bigwig_path=pseudobulk_bw_dir,
    path_to_fragments=fragments_dict,
    n_cpu=3,
    normalize_bigwig=True,
    temp_dir="/tmp",
)

In [None]:
# Persist the bigWig path table as "<key>\t<path>" lines so it can be reloaded later.
bw_paths_tsv = os.path.join(out_dir, "consensus_peak_calling/bw_paths.tsv")
with open(bw_paths_tsv, "wt") as handle:
    handle.writelines(f"{group}\t{bw_paths[group]}\n" for group in bw_paths)

In [None]:
# Persist the BED path table as "<key>\t<path>" lines so it can be reloaded later.
bed_paths_tsv = os.path.join(out_dir, "consensus_peak_calling/bed_paths.tsv")
with open(bed_paths_tsv, "wt") as handle:
    handle.writelines(f"{group}\t{bed_paths[group]}\n" for group in bed_paths)

# Infer consensus peaks


In [None]:
# Reload the bigWig path table written earlier (one "<key>\t<path>" per line).
bw_paths = {}
with open(os.path.join(out_dir, "consensus_peak_calling/bw_paths.tsv")) as handle:
    for record in handle:
        group, bw_file = record.strip().split("\t")
        bw_paths[group] = bw_file

In [None]:
# Reload the BED path table written earlier (one "<key>\t<path>" per line).
bed_paths = {}
with open(os.path.join(out_dir, "consensus_peak_calling/bed_paths.tsv")) as handle:
    for record in handle:
        group, bed_file = record.strip().split("\t")
        bed_paths[group] = bed_file

In [None]:
from pycisTopic.pseudobulk_peak_calling import peak_calling

# Peak calling on every pseudobulk BED file using the `macs2` executable;
# results are written under consensus_peak_calling/MACS.
macs_path = "macs2"
macs_outdir = os.path.join(out_dir, "consensus_peak_calling/MACS")
os.makedirs(macs_outdir, exist_ok=True)

narrow_peak_dict = peak_calling(
    macs_path=macs_path,
    bed_paths=bed_paths,
    outdir=macs_outdir,
    genome_size='hs',
    n_cpu=20,
    input_format='BEDPE',
    shift=73,
    ext_size=146,
    keep_dup='all',
    q_value=0.05,
    _temp_dir='/tmp',
)

Consensus peaks

In [None]:
from pycisTopic.iterative_peak_calling import get_consensus_peaks
# Half-width handed to get_consensus_peaks.
# NOTE(review): presumably base pairs on each side of the peak summit -- confirm in the pycisTopic docs.
peak_half_width = 250
# Combine the per-pseudobulk narrow peaks into one consensus set, passing the
# chromosome sizes and the blacklist BED path.
consensus_peaks = get_consensus_peaks(
    narrow_peaks_dict = narrow_peak_dict,
    peak_half_width = peak_half_width,
    chromsizes = chromsizes,
    path_to_blacklist = path_to_blacklist)

In [None]:
# Convert the consensus peaks to a DataFrame and write them as a headerless,
# tab-separated BED file.
# The target is `path_to_regions` (defined with the other paths at the top of
# the notebook) so that the file written here is exactly the file the later
# cisTopic-object creation step reads, instead of re-spelling the path.
df = consensus_peaks.as_df()
df.to_csv(
    path_to_regions,
    sep='\t',
    index=False,
    header=False
)

In [None]:
#!pycistopic tss gene_annotation_list | grep Human

In [None]:
# Create the qc/ folder and ask the pycistopic CLI to write TSS annotations to qc/tss.bed.
!mkdir -p {out_dir}/qc
!pycistopic tss get_tss \
    --output {out_dir}/qc/tss.bed \
    --name "hsapiens_gene_ensembl" \
    --to-chrom-source ucsc \
    --ucsc hg38

In [None]:
# Preview the first lines of the TSS BED file as aligned columns.
!head {out_dir}/qc/tss.bed | column -t

In [None]:
import shlex

regions_bed_filename = os.path.join(out_dir, "consensus_peak_calling/consensus_regions.bed")
tss_bed_filename = os.path.join(out_dir, "qc", "tss.bed")

# Written to the current working directory; the commands are executed outside
# the notebook (see the instruction below this cell).
pycistopic_qc_commands_filename = "pycistopic_qc_commands.txt"

# Create text file with one `pycistopic qc` command line per sample.
# Every path is passed through shlex.quote so that a path containing spaces or
# shell metacharacters still forms a single argument when the line is run by a
# shell; paths made only of safe characters are written unchanged.
with open(pycistopic_qc_commands_filename, "w") as fh:
    for sample, fragment_filename in fragments_dict.items():
        sample_output_prefix = f"{os.path.join(out_dir, 'qc')}/{sample}"
        print(
            "pycistopic qc",
            f"--fragments {shlex.quote(str(fragment_filename))}",
            f"--regions {shlex.quote(str(regions_bed_filename))}",
            f"--tss {shlex.quote(str(tss_bed_filename))}",
            f"--output {shlex.quote(sample_output_prefix)}",
            sep=" ",
            file=fh,
        )

Run from the command line before continuing (the cells below read the QC output): `cat pycistopic_qc_commands.txt | parallel -j 4 {}`

In [None]:
from pycisTopic.plotting.qc_plot import plot_sample_stats, plot_barcode_stats
import matplotlib.pyplot as plt

In [None]:
# One sample-level QC figure per sample, read from the qc/ output folder.
qc_dir = os.path.join(out_dir, "qc")
for sample_id in fragments_dict:
    fig = plot_sample_stats(sample_id=sample_id, pycistopic_qc_output_dir=qc_dir)

In [None]:
from pycisTopic.qc import get_barcodes_passing_qc_for_sample

# Per sample: the barcodes that pass QC and the thresholds that were applied.
sample_id_to_barcodes_passing_filters = {}
sample_id_to_thresholds = {}
for sample_id in fragments_dict:
    passing_barcodes, applied_thresholds = get_barcodes_passing_qc_for_sample(
        sample_id=sample_id,
        pycistopic_qc_output_dir=os.path.join(out_dir, "qc"),
        unique_fragments_threshold=None,  # use automatic thresholding
        tss_enrichment_threshold=None,  # use automatic thresholding
        frip_threshold=0,
        use_automatic_thresholds=True,
    )
    sample_id_to_barcodes_passing_filters[sample_id] = passing_barcodes
    sample_id_to_thresholds[sample_id] = applied_thresholds

In [None]:
# Barcode-level QC figure per sample, marking the barcodes kept by the filter
# and forwarding that sample's thresholds as keyword arguments.
for sample_id in fragments_dict:
    kept_barcodes = sample_id_to_barcodes_passing_filters[sample_id]
    threshold_kwargs = sample_id_to_thresholds[sample_id]
    fig = plot_barcode_stats(
        sample_id=sample_id,
        pycistopic_qc_output_dir=os.path.join(out_dir, "qc"),
        bc_passing_filters=kept_barcodes,
        detailed_title=False,
        **threshold_kwargs,
    )

In [None]:
from pycisTopic.cistopic_class import create_cistopic_object_from_fragments
import polars as pl

# Build one cisTopic object per sample, restricted to the barcodes that passed QC.
cistopic_obj_list = []
for sample_id in fragments_dict:
    kept_barcodes = sample_id_to_barcodes_passing_filters[sample_id]
    stats_parquet = os.path.join(
        pycistopic_qc_output_dir, f'{sample_id}.fragments_stats_per_cb.parquet'
    )
    # Per-barcode QC statistics, indexed by the "CB" column and limited to the kept barcodes.
    sample_metrics = (
        pl.read_parquet(stats_parquet)
        .to_pandas()
        .set_index("CB")
        .loc[kept_barcodes]
    )
    cistopic_obj = create_cistopic_object_from_fragments(
        path_to_fragments=fragments_dict[sample_id],
        path_to_regions=path_to_regions,
        path_to_blacklist=path_to_blacklist,
        metrics=sample_metrics,
        valid_bc=kept_barcodes,
        n_cpu=20,
        project=sample_id,
    )
    cistopic_obj_list.append(cistopic_obj)

In [None]:
from pycisTopic.cistopic_class import merge
# Combine the per-sample objects into one; note that this rebinds `cistopic_obj`,
# which the loop above last set to the final sample's object.
cistopic_obj = merge(cistopic_obj_list)
print(cistopic_obj)

In [None]:
import pickle

# Checkpoint the merged object. The context manager guarantees the file is
# flushed and closed even if pickling raises part-way through.
with open(os.path.join(out_dir, "cistopic_obj.pkl"), "wb") as fh:
    pickle.dump(cistopic_obj, fh)

In [None]:
# Attach the cell-type annotation table (its index was rewritten to "<index>___<sample>" above).
cistopic_obj.add_cell_data(cell_data)

In [None]:
# Distinct sample labels present in the object's cell metadata after adding the annotations.
cistopic_obj.cell_data['sample'].unique()

In [None]:
# First rows of the object's cell metadata.
cistopic_obj.cell_data.head()

In [None]:
# Overwrite the checkpoint now that cell annotations are attached. The context
# manager guarantees the file is flushed and closed even if pickling raises.
with open(os.path.join(out_dir, "cistopic_obj.pkl"), "wb") as fh:
    pickle.dump(cistopic_obj, fh)

In [None]:
import pickle

# Reload the checkpoint; the context manager closes the file handle once the
# object has been read. Only unpickle files produced by this notebook.
with open(os.path.join(out_dir, "cistopic_obj.pkl"), "rb") as fh:
    cistopic_obj = pickle.load(fh)

In [None]:
import scrublet as scr

# Doublet detection on the transposed fragment matrix.
# NOTE(review): presumably the transpose gives cells x regions -- confirm.
scrub = scr.Scrublet(cistopic_obj.fragment_matrix.T, expected_doublet_rate=0.1)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
scrub.plot_histogram();
# Replace the automatic call with a manual score threshold, then re-plot.
scrub.call_doublets(threshold=0.22)
scrub.plot_histogram();
# Build the table column-wise so each column keeps its own dtype (scores stay
# float, calls stay bool). Stacking both arrays as rows and transposing would
# leave both columns as generic `object` dtype.
scrublet = pd.DataFrame(
    {
        'Doublet_scores_fragments': scrub.doublet_scores_obs_,
        'Predicted_doublets_fragments': scrub.predicted_doublets_,
    },
    index=cistopic_obj.cell_names,
)


In [None]:
# Attach the doublet scores/calls to the cell metadata, then count the cells called as doublets.
cistopic_obj.add_cell_data(scrublet)
sum(cistopic_obj.cell_data.Predicted_doublets_fragments == True)

In [None]:
# Overwrite the checkpoint now that doublet calls are attached. The context
# manager guarantees the file is flushed and closed even if pickling raises.
with open(os.path.join(out_dir, "cistopic_obj.pkl"), "wb") as fh:
    pickle.dump(cistopic_obj, fh)

In [None]:
# Remove doublets: collect the names of cells whose doublet call is False.
singlets = cistopic_obj.cell_data[cistopic_obj.cell_data.Predicted_doublets_fragments == False].index.tolist()
# Subset cisTopic object into a separate copy; `cistopic_obj` itself is left unchanged.
cistopic_obj_noDBL = cistopic_obj.subset(singlets, copy=True)
print(cistopic_obj_noDBL)

In [None]:
# Save the doublet-filtered object. This must pickle `cistopic_obj_noDBL`:
# dumping `cistopic_obj` here would store the unfiltered object under the
# "noDBL" file name. The context manager closes the file deterministically.
with open(os.path.join(out_dir, "cistopic_obj_noDBL.pkl"), "wb") as fh:
    pickle.dump(cistopic_obj_noDBL, fh)

In [None]:
#!wget https://github.com/mimno/Mallet/releases/download/v202108/Mallet-202108-bin.tar.gz
#!tar -xf Mallet-202108-bin.tar.gz

In [None]:
# Object summary before cells without a cell-type label are dropped.
print(cistopic_obj)

In [None]:
# Keep only the cells whose `celltype` annotation is not missing.
# NOTE(review): this subsets `cistopic_obj`, not the doublet-filtered
# `cistopic_obj_noDBL`, so predicted doublets remain in everything downstream --
# confirm that this is intended.
has_celltype = cistopic_obj.cell_data[~cistopic_obj.cell_data.celltype.isna()].index.tolist()
cistopic_obj = cistopic_obj.subset(cells=has_celltype, copy = True)

In [None]:
# Object summary after dropping cells without a cell-type label.
print(cistopic_obj)

In [None]:
# MALLET_MEMORY is set before the pycisTopic import below; keep this order.
# NOTE(review): presumably this caps the memory available to Mallet -- confirm.
os.environ['MALLET_MEMORY'] = '50G'
from pycisTopic.lda_models import run_cgs_models_mallet
# Configure path Mallet (binary expected under RESOURCES_PATH)
mallet_path = str(Path(os.getenv("RESOURCES_PATH")) / "scenicplus/Mallet-202108/bin/mallet")
# Run one model per entry of n_topics, with a fixed random_state; outputs are
# pointed at <out_dir>/mallet_models.
# NOTE(review): confirm whether save_path must already exist.
models=run_cgs_models_mallet(
    cistopic_obj,
    n_topics=[2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    n_cpu=30,
    n_iter=500,
    random_state=555,
    alpha=50,
    alpha_by_topic=True,
    eta=0.1,
    eta_by_topic=False,
    tmp_path="/tmp",
    save_path=os.path.join(out_dir, "mallet_models"),
    mallet_path=mallet_path,
)

In [None]:
# Checkpoint the fitted topic models. The context manager guarantees the file
# is flushed and closed even if pickling raises part-way through.
with open(os.path.join(out_dir, "models.pkl"), "wb") as fh:
    pickle.dump(models, fh)

In [None]:
from pycisTopic.lda_models import evaluate_models
# Evaluate the fitted models and return the one selected with select_model = 40
# (40 is one of the n_topics values requested above).
model = evaluate_models(
    models,
    select_model = 40,
    return_model = True
)

In [None]:
# Attach the selected model to the cisTopic object.
cistopic_obj.add_LDA_model(model)

In [None]:
# Overwrite the checkpoint with the cell-type-filtered object that now carries
# the selected topic model. The context manager guarantees the file is flushed
# and closed even if pickling raises part-way through.
with open(os.path.join(out_dir, "cistopic_obj.pkl"), "wb") as fh:
    pickle.dump(cistopic_obj, fh)

In [None]:
from pycisTopic.clust_vis import (
    find_clusters,
    run_umap,
    run_tsne,
    plot_metadata,
    plot_topic,
    cell_topic_heatmap
)

In [None]:
# Cluster cells with k = 10 at three resolutions; later cells read the results
# from the metadata columns 'pycisTopic_leiden_10_<res>'.
find_clusters(
    cistopic_obj,
    target  = 'cell',
    k = 10,
    res = [0.6, 1.2, 3],
    prefix = 'pycisTopic_',
    scale = True)

In [None]:
# Compute the cell-level UMAP embedding (referenced later as reduction_name='UMAP').
run_umap(
    cistopic_obj,
    target  = 'cell', scale=True)

In [None]:
# Compute the cell-level t-SNE embedding.
run_tsne(
    cistopic_obj,
    target  = 'cell', scale=True)

In [None]:
# UMAP coloured by cell type and by the raw cluster labels at each resolution.
plot_metadata(
    cistopic_obj,
    reduction_name='UMAP',
    variables=['celltype', 'pycisTopic_leiden_10_0.6', 'pycisTopic_leiden_10_1.2', 'pycisTopic_leiden_10_3'],
    target='cell', num_columns=4,
    text_size=10,
    dot_size=5)

In [None]:
# For every clustering resolution, label each cluster with the most frequent
# cell type among its annotated cells, written as "<celltype>(<cluster>)".
annot_dict = {}
for resolution in [0.6, 1.2, 3]:
    cluster_column = f"pycisTopic_leiden_10_{resolution}"
    cluster_to_label = annot_dict[cluster_column] = {}
    cluster_of_cell = cistopic_obj.cell_data[cluster_column]
    for cluster in set(cluster_of_cell):
        # Cell types of the annotated cells in this cluster (missing values dropped).
        annotated_types = cistopic_obj.cell_data.loc[cluster_of_cell == cluster, "celltype"].dropna()
        counts = annotated_types.value_counts()

        if len(counts) == 0:
            # No annotated cells at all: fall back to the cluster id only.
            cluster_to_label[cluster] = f"Unknown({cluster})"
        else:
            # value_counts sorts by frequency, so the first entry is the majority type.
            cluster_to_label[cluster] = f"{counts.index[0]}({cluster})"

In [None]:
# Inspect the cluster -> label mapping for every resolution.
annot_dict

In [None]:
# Replace the raw cluster ids in the metadata with the "<celltype>(<cluster>)" labels.
# NOTE(review): this overwrites the columns in place, so running the cell twice
# fails with a KeyError because the new labels are not keys of annot_dict.
for resolution in [0.6, 1.2, 3]:
    cluster_column = f'pycisTopic_leiden_10_{resolution}'
    cluster_to_label = annot_dict[cluster_column]
    raw_clusters = cistopic_obj.cell_data[cluster_column].tolist()
    cistopic_obj.cell_data[cluster_column] = [
        cluster_to_label[cluster] for cluster in raw_clusters
    ]

In [None]:
# Same UMAP panels as before, now showing the relabelled cluster columns.
plot_metadata(
    cistopic_obj,
    reduction_name='UMAP',
    variables=['celltype', 'pycisTopic_leiden_10_0.6', 'pycisTopic_leiden_10_1.2', 'pycisTopic_leiden_10_3'],
    target='cell', num_columns=4,
    text_size=10,
    dot_size=5)

In [None]:
# UMAP coloured by per-cell QC metrics (the doublet score is left out of the list).
plot_metadata(
    cistopic_obj,
    reduction_name='UMAP',
    variables=['log10_unique_fragments_count', 'tss_enrichment',  'fraction_of_fragments_in_peaks'], #'Doublet_scores_fragments',
    target='cell', num_columns=4,
    text_size=10,
    dot_size=5)

In [None]:
# Topic panels on the cell-level UMAP, five panels per row.
plot_topic(
    cistopic_obj,
    reduction_name = 'UMAP',
    target = 'cell',
    num_columns=5
)

In [None]:
# Cell-topic heatmap annotated by cell type; the legend_* values position the legend.
cell_topic_heatmap(
    cistopic_obj,
    variables = ['celltype'],
    scale = False,
    legend_loc_x = 1.0,
    legend_loc_y = -1.2,
    legend_dist_y = -1,
    figsize = (10, 10)
)

In [None]:
from pycisTopic.topic_binarization import binarize_topics


In [None]:
# Binarize topics with method='ntop' and ntop = 3_000.
# NOTE(review): presumably this keeps the 3,000 top regions per topic -- confirm.
region_bin_topics_top_3k = binarize_topics(
    cistopic_obj, method='ntop', ntop = 3_000,
    plot=True, num_columns=5
)

In [None]:
# Binarize topics with the 'otsu' method.
region_bin_topics_otsu = binarize_topics(
    cistopic_obj, method='otsu',
    plot=True, num_columns=5
)

In [None]:
# Binarize topics with target='cell' using the 'li' method; the result is passed
# to topic_annotation later in the notebook.
binarized_cell_topic = binarize_topics(
    cistopic_obj,
    target='cell',
    method='li',
    plot=True,
    num_columns=5, nbins=100)

In [None]:
from pycisTopic.topic_qc import compute_topic_metrics, plot_topic_qc, topic_annotation
import matplotlib.pyplot as plt
from pycisTopic.utils import fig2img

In [None]:
# Per-topic QC metrics; the plots below read columns such as 'Coherence' and 'Gini_index' from it.
topic_qc_metrics = compute_topic_metrics(cistopic_obj)

In [None]:
# Topic-QC scatter plots, keyed by panel name; each entry gives (var_x, var_y).
# Every panel is coloured by 'Gini_index' and returned as a figure without being shown.
topic_qc_panels = {
    'CoherenceVSAssignments': ('Coherence', 'Log10_Assignments'),
    'AssignmentsVSCells_in_bin': ('Log10_Assignments', 'Cells_in_binarized_topic'),
    'CoherenceVSCells_in_bin': ('Coherence', 'Cells_in_binarized_topic'),
    'CoherenceVSRegions_in_bin': ('Coherence', 'Regions_in_binarized_topic'),
    'CoherenceVSMarginal_dist': ('Coherence', 'Marginal_topic_dist'),
    'CoherenceVSGini_index': ('Coherence', 'Gini_index'),
}
fig_dict = {}
for panel_name, (x_metric, y_metric) in topic_qc_panels.items():
    fig_dict[panel_name] = plot_topic_qc(
        topic_qc_metrics,
        var_x=x_metric,
        var_y=y_metric,
        var_color='Gini_index',
        plot=False,
        return_fig=True,
    )


In [None]:
# Show all topic-QC panels together on a 2 x 3 grid.
fig = plt.figure(figsize=(40, 43))
for position, panel in enumerate(fig_dict.values(), start=1):
    plt.subplot(2, 3, position)
    # fig2img (pycisTopic.utils) turns the figure into an image so the panels can be tiled.
    plt.imshow(fig2img(panel))
    plt.axis('off')
plt.subplots_adjust(wspace=0, hspace=-0.70)
plt.show()

In [None]:
# Annotate topics by cell type using the binarized cell-topic assignments.
# NOTE(review): general_topic_thr presumably sets when a topic counts as "general" -- confirm.
topic_annot = topic_annotation(
    cistopic_obj,
    annot_var='celltype',
    binarized_cell_topic=binarized_cell_topic,
    general_topic_thr = 0.2
)

In [None]:
# Inspect the topic annotation result.
topic_annot


In [None]:
from pycisTopic.diff_features import (
    impute_accessibility,
    normalize_scores,
    find_highly_variable_features,
    find_diff_features
)
import numpy as np

In [None]:
# Impute accessibility with no cell or region restriction (both selections are None).
imputed_acc_obj = impute_accessibility(
    cistopic_obj,
    selected_cells=None,
    selected_regions=None,
    scale_factor=10**6
)

In [None]:
# Checkpoint the imputed accessibility object. The context manager guarantees
# the file is flushed and closed even if pickling raises part-way through.
with open(os.path.join(out_dir, "imputed_acc_obj.pkl"), "wb") as fh:
    pickle.dump(imputed_acc_obj, fh)

In [None]:
import pickle

# Reload the imputed accessibility checkpoint; the context manager closes the
# file handle once the object has been read. Only unpickle files produced by
# this notebook.
with open(os.path.join(out_dir, "imputed_acc_obj.pkl"), "rb") as fh:
    imputed_acc_obj = pickle.load(fh)

In [None]:
# Normalize the imputed accessibility scores with scale_factor 10**4.
normalized_imputed_acc_obj = normalize_scores(imputed_acc_obj, scale_factor=10**4)


In [None]:
# Checkpoint the normalized imputed accessibility object. The context manager
# guarantees the file is flushed and closed even if pickling raises.
with open(os.path.join(out_dir, "normalized_imputed_acc_obj.pkl"), "wb") as fh:
    pickle.dump(normalized_imputed_acc_obj, fh)

In [None]:
# Select highly variable regions from the normalized scores using the
# dispersion/mean bounds below; n_top_features=None means no fixed count is requested.
variable_regions = find_highly_variable_features(
    normalized_imputed_acc_obj,
    min_disp = 0.05,
    min_mean = 0.0125,
    max_mean = 3,
    max_disp = np.inf,
    n_bins=20,
    n_top_features=None,
    plot=True
)

In [None]:
# Number of highly variable regions selected.
len(variable_regions)

In [None]:
# Differential features per 'celltype' group, restricted to the highly variable
# regions, with adjusted p-value < 0.05 and log2 fold change of at least log2(1.5).
# NOTE(review): contrasts=None presumably means each group against the rest -- confirm.
markers_dict= find_diff_features(
    cistopic_obj,
    imputed_acc_obj,
    variable='celltype',
    var_features=variable_regions,
    contrasts=None,
    adjpval_thr=0.05,
    log2fc_thr=np.log2(1.5),
    n_cpu=5,
    _temp_dir='/tmp')


In [None]:
from pycisTopic.clust_vis import plot_imputed_features

In [None]:
# Displays the whole result; the per-cell-type counts are printed in a later cell.
markers_dict

In [None]:
# Plot the imputed accessibility of the first region listed for 'oogonia_STRA8' on the UMAP.
# NOTE(review): raises KeyError if that cell type is absent from markers_dict and
# IndexError if it has no regions.
plot_imputed_features(
    cistopic_obj,
    reduction_name='UMAP',
    imputed_data=imputed_acc_obj,
    features=[markers_dict[x].index.tolist()[0] for x in ['oogonia_STRA8']],
    scale=False,
    num_columns=4
)


In [None]:
# Report how many differential regions were found for each cell type.
dar_report = ["Number of DARs found:", "---------------------"]
dar_report.extend(
    f"  {cell_type}: {len(markers_dict[cell_type])}" for cell_type in markers_dict
)
print("\n".join(dar_report))

In [None]:
# Create the region_sets/ folder and one sub-folder per exported region collection.
region_sets_dir = os.path.join(out_dir, "region_sets")
os.makedirs(region_sets_dir, exist_ok=True)
for region_set_name in ("Topics_otsu", "Topics_top_3k", "DARs_cell_type"):
    os.makedirs(os.path.join(region_sets_dir, region_set_name), exist_ok=True)


In [None]:
from pycisTopic.utils import region_names_to_coordinates

In [None]:
# Write one coordinate-sorted, headerless BED file per otsu-binarized topic.
topics_otsu_dir = os.path.join(out_dir, "region_sets", "Topics_otsu")
for topic in region_bin_topics_otsu:
    topic_coordinates = region_names_to_coordinates(region_bin_topics_otsu[topic].index)
    sorted_coordinates = topic_coordinates.sort_values(["Chromosome", "Start", "End"])
    sorted_coordinates.to_csv(
        os.path.join(topics_otsu_dir, f"{topic}.bed"),
        sep="\t",
        header=False,
        index=False,
    )


In [None]:
# Write one coordinate-sorted, headerless BED file per ntop-binarized topic.
topics_top_3k_dir = os.path.join(out_dir, "region_sets", "Topics_top_3k")
for topic in region_bin_topics_top_3k:
    topic_coordinates = region_names_to_coordinates(region_bin_topics_top_3k[topic].index)
    sorted_coordinates = topic_coordinates.sort_values(["Chromosome", "Start", "End"])
    sorted_coordinates.to_csv(
        os.path.join(topics_top_3k_dir, f"{topic}.bed"),
        sep="\t",
        header=False,
        index=False,
    )

In [None]:
# Write one coordinate-sorted, headerless BED file of differential regions per cell type.
dars_dir = os.path.join(out_dir, "region_sets", "DARs_cell_type")
for cell_type in markers_dict:
    dar_coordinates = region_names_to_coordinates(markers_dict[cell_type].index)
    sorted_coordinates = dar_coordinates.sort_values(["Chromosome", "Start", "End"])
    sorted_coordinates.to_csv(
        os.path.join(dars_dir, f"{cell_type}.bed"),
        sep="\t",
        header=False,
        index=False,
    )