In [1]:
import os
import sys
import tempfile
import shutil
from ast import literal_eval
from tqdm import tqdm

import anndata
from ete3 import Tree
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.cluster import hierarchy

sys.path.insert(0, '/lab/solexa_weissman/kmin/git/KPTracer-release/cassiopeia-kp')
sys.path.append('/lab/solexa_weissman/kmin/kp_infercnv/NT')
from utilities import plot_tree_itol
from utilities import clonal_expansions

In [2]:
# iTOL upload settings shared by the plotting cells below.
# The API key is taken from the ITOL_API_KEY environment variable so the secret
# never has to be typed into (or saved with) the notebook. When the variable is
# unset this falls back to the empty string the notebook used before.
apiKey = os.environ.get('ITOL_API_KEY', '')
projectName = 'KP_Trees_Joseph' ## Specify the project name
# Directory (relative to the notebook) that receives the exported PDFs.
plot_dir = 'plots'

In [3]:
# Tumor identifier used by the cells below: it names the inferCNV output
# directory, selects the row of the tumor model table, and prefixes the PDFs.
# NOTE(review): the saved output of the per-partition cell prints '3724_NT_T1',
# so that cell was last executed with a different value than the one set here.
target = '3730_NT_T2'

In [4]:
# Gene coordinate table, indexed by gene name, with the chromosome and the
# start/end position of every gene.
df_genes = pd.read_csv(
    '../mm10_gene_ordering_reordered.tsv',
    sep='\t',
    names=['gene', 'chr', 'start', 'end'],
    index_col=0,
)

# A chromosome is taken to span from 0 to the end of its last gene.
chromosome_sizes = dict(df_genes.groupby('chr')['end'].max())

# Matplotlib can't plot very large images correctly, so we have to bin
# https://github.com/matplotlib/matplotlib/issues/19276
# Lookup of chromosome -> {gene: (start - 1, end)}.
gene_positions = {
    chrom: {
        gene: (gene_start - 1, gene_end)
        for gene, gene_start, gene_end in zip(
            df_chrom.index, df_chrom['start'], df_chrom['end']
        )
    }
    for chrom, df_chrom in df_genes.groupby('chr')
}

In [5]:
# Load the annotated expression object used for the Leiden cluster colours.
# `anndata.read` is a deprecated alias; `read_h5ad` is the supported reader for
# .h5ad files and returns the same AnnData object.
adata = anndata.read_h5ad('/lab/solexa_weissman/mgjones/projects/kptc/RNA/NT/adata_processed.filtered.subcluster.nt.h5ad')

In [7]:
# Build one iTOL colour strip per hierarchical-clustering level of the CNV
# matrix and upload them all, together with the Cassiopeia tree, as one figure.
threshold = '0.2'
out_dir = f'{target}_grouped{"_0.2" if threshold == "0.2" else ""}'

# Load the inferCNV HMM prediction matrix; transposed so each row is a cell
df_counts = pd.read_csv(f'{out_dir}/infercnv.18_HMM_pred.Bayes_Net.Pnorm_{threshold}.observations.txt', sep=' ').T

# Load Cassiopeia tree
df_model = pd.read_csv('/lab/solexa_weissman/mgjones/projects/kptc/trees/tumor_model.txt', sep='\t', index_col=0)
tree_cass = Tree(f'/lab/solexa_weissman/mgjones/projects/kptc/trees/{target}/{df_model.loc[target]["Newick"]}', format=1)

# Cells present in both the CNV matrix and the tree
cells = set(df_counts.index).intersection(
    leaf.name for leaf in tree_cass.iter_leaves()
)

# Subset CNV counts to only cells in the tree. The explicit copy makes this an
# independent frame, so assigning the `labels` column below does not write into
# a slice of the frame read from disk (avoids SettingWithCopyWarning).
df_counts = df_counts[df_counts.index.isin(cells)].copy()

# Calculate linkage for hierarchical clustering. This runs before any `labels`
# column is added, so only the CNV values enter the distance computation.
linkage = hierarchy.linkage(df_counts.values, method='ward', metric='euclidean')

# Scratch directory for the annotation files. It is created only after all
# inputs loaded successfully and is removed in the `finally` clause, so a
# failure while writing or uploading no longer leaves it behind.
temp_dir = tempfile.mkdtemp()
try:
    # Leiden colour strip. The file is written but not uploaded: the
    # `files.append(path)` that would register it is left disabled.
    # NOTE(review): the palette is built from `leiden_sub` but looked up with
    # each cell's `leiden` value -- confirm which clustering is intended.
    files = []
    color_map = {str(i): adata.uns['leiden_sub_colors'][i] for i in range(len(adata.obs['leiden_sub'].unique()))}
    path = os.path.join(temp_dir, 'leiden.txt')
    header = f'DATASET_COLORSTRIP\nSEPARATOR TAB\nCOLOR\t#000000\nDATASET_LABEL\tleiden\nSTRIP_WIDTH\t80\nMARGIN\t20\nSHOW_INTERNAL\t0\nDATA\n\n'
    with open(path, 'w') as f:
        f.write(header)
        for cell in tree_cass.get_leaf_names():
            if cell in adata.obs.index:
                f.write(f'{cell}\t{color_map[adata.obs.loc[cell]["leiden"]]}\n')
            else:
                # Leaves without expression data are drawn white
                f.write(f'{cell}\t#FFFFFF\n')
    # files.append(path)

    # Cluster label (1-based, as returned by fcluster) -> 'rgb(r,g,b)' string.
    # NOTE(review): `cmap(i / 6)` samples tab10 by float position rather than
    # by index, so it appears that labels above 7 share one colour, and labels
    # above `cmap.N` have no entry and are written blank -- confirm intent.
    cmap = mpl.cm.tab10
    color_map = {i+1: f'rgb({",".join(str(int(c * 255)) for c in cmap(i / 6)[:-1])})' for i in range(cmap.N)}

    # Generate clustering at each branching point
    for t in sorted(np.unique(linkage[:,2])):
        labels = hierarchy.fcluster(linkage, criterion='distance', t=t)
        df_counts['labels'] = labels

        # Skip if there is only one label
        n_clusters = len(np.unique(labels))
        if n_clusters < 2:
            continue

        name = f'cluster_{n_clusters}'
        header = f'DATASET_COLORSTRIP\nSEPARATOR TAB\nCOLOR\t#000000\nDATASET_LABEL\t{name}\nSTRIP_WIDTH\t50\nMARGIN\t20\nSHOW_INTERNAL\t0\nDATA\n\n'
        path = os.path.join(temp_dir, f'{name}.txt')
        with open(path, 'w') as f:
            f.write(header)
            # One "<cell>\t<colour>" line per cell
            f.write(df_counts['labels'].map(color_map).to_csv(sep='\t', header=None))
        files.append(path)

    # Make sure the output directory exists before the PDF is written
    os.makedirs(plot_dir, exist_ok=True)
    plot_tree_itol.upload_to_itol(
        tree_cass,
        apiKey,
        projectName,
        target,
        files=files,
        outfp=os.path.join(plot_dir, f'{target}_colorstrip.pdf'),
        rect=False,
        line_width=8,
    )
finally:
    shutil.rmtree(temp_dir, ignore_errors=True)

iTOL output: SUCCESS: 1841168150151639432358

Tree Web Page URL: http://itol.embl.de/external.cgi?tree=1841168150151639432358&restore_saved=1


In [8]:
from IPython.display import IFrame

# Embed the PDF exported by the colour-strip cell; the IFrame is the cell's
# last expression so the notebook renders it.
colorstrip_pdf = os.path.join(plot_dir, f'{target}_colorstrip.pdf')
IFrame(colorstrip_pdf, width=500, height=500)

In [10]:
# Per partition: rebuild a cell x gene CNV state matrix from the inferCNV
# region predictions, cluster it hierarchically, and upload one iTOL figure
# (tree + a single cluster colour strip) per clustering level.
# NOTE(review): the paths below use `mattjones` while the earlier cells use
# `mgjones` -- confirm both locations hold the same tumor model and trees.
df_model = pd.read_csv('/lab/solexa_weissman/mattjones/projects/kptc/trees/tumor_model.txt', sep='\t', index_col=0)
print(target)
threshold = '0.2'
out_dir = f'{target}_grouped{"_0.2" if threshold == "0.2" else ""}'

# Load Cassiopeia tree
tree_cass = Tree(f'/lab/solexa_weissman/mattjones/projects/kptc/trees/{target}/{df_model.loc[target]["Newick"]}', format=1)
leaves = tree_cass.get_leaf_names()

# Load CNV genes and regions. The Pnorm value in the file name now follows
# `threshold` (identical to the previous hard-coded 0.2 for the current value).
df_regions = pd.read_csv(f'{out_dir}/HMM_CNV_predictions.HMMi6.rand_trees.hmm_mode-subclusters.Pnorm_{threshold}.pred_cnv_regions.dat', sep='\t')
df_cells = pd.read_csv(f'{out_dir}/17_HMM_predHMMi6.rand_trees.hmm_mode-subclusters.cell_groupings', sep='\t')
# Attach each region to the cells of its group, ignoring groups named 'Normal'
df_merged = pd.merge(df_regions, df_cells[~df_cells['cell_group_name'].str.contains('Normal')], on='cell_group_name')

# cell -> {gene: state} for every gene fully contained in a predicted region
gene_states = {}
for cell, chrom, start, end, state in df_merged[['cell', 'chr', 'start', 'end', 'state']].values:
    for gene, (gene_start, gene_end) in gene_positions[chrom].items():
        # gene_positions stores (start - 1, end), hence the matching shift
        if start - 1 <= gene_start and end >= gene_end:
            gene_states.setdefault(cell, {})[gene] = state
# Tree leaves with no predicted region still need a row; the placeholder column
# creates it and is dropped again below.
for cell in leaves:
    if cell not in gene_states:
        gene_states[cell] = {'placeholder': 3}
# Genes outside every region of a cell are filled with state 3
df_counts = pd.DataFrame.from_dict(gene_states, orient='index').fillna(3).astype(int)
if 'placeholder' in df_counts.columns:
    df_counts = df_counts.drop(columns='placeholder')

# Subset CNV counts to only cells in the tree. The explicit copy makes this an
# independent frame, so assigning the `labels` column below does not write into
# a slice (avoids SettingWithCopyWarning).
df_counts = df_counts[df_counts.index.isin(leaves)].copy()

# Calculate linkage for hierarchical clustering. This runs before any `labels`
# column is added, so only the CNV states enter the distance computation.
linkage = hierarchy.linkage(df_counts.values, method='ward', metric='euclidean')

# Scratch directory for the annotation files; removed in the `finally` clause
# so a failed write or upload no longer leaves it behind.
temp_dir = tempfile.mkdtemp()
try:
    # Leiden colour strip. It does not depend on the clustering level, so it is
    # written once here instead of once per loop iteration. The file is not
    # uploaded: the `files.append(path)` that would register it is disabled.
    # NOTE(review): the palette is built from `leiden_sub` but looked up with
    # each cell's `leiden` value -- confirm which clustering is intended.
    color_map = {str(i): adata.uns['leiden_sub_colors'][i] for i in range(len(adata.obs['leiden_sub'].unique()))}
    path = os.path.join(temp_dir, 'leiden.txt')
    header = f'DATASET_COLORSTRIP\nSEPARATOR TAB\nCOLOR\t#000000\nDATASET_LABEL\tleiden\nSTRIP_WIDTH\t80\nMARGIN\t20\nSHOW_INTERNAL\t0\nDATA\n\n'
    with open(path, 'w') as f:
        f.write(header)
        for cell in leaves:
            if cell in adata.obs.index:
                f.write(f'{cell}\t{color_map[adata.obs.loc[cell]["leiden"]]}\n')
            else:
                # Leaves without expression data are drawn white
                f.write(f'{cell}\t#FFFFFF\n')
    # files.append(path)

    # Cluster label (1-based, as returned by fcluster) -> 'rgb(r,g,b)' string,
    # also independent of the clustering level.
    # NOTE(review): `cmap(i / 6)` samples tab10 by float position rather than
    # by index, so it appears that labels above 7 share one colour, and labels
    # above `cmap.N` have no entry and are written blank -- confirm intent.
    cmap = mpl.cm.tab10
    color_map = {i+1: f'rgb({",".join(str(int(c * 255)) for c in cmap(i / 6)[:-1])})' for i in range(cmap.N)}

    # Make sure the output directory exists before any PDF is written
    os.makedirs(plot_dir, exist_ok=True)

    # Generate clustering at each branching point
    for t in sorted(np.unique(linkage[:,2])):
        labels = hierarchy.fcluster(linkage, criterion='distance', t=t)
        df_counts['labels'] = labels

        # Skip if there is only one label
        n_clusters = len(np.unique(labels))
        if n_clusters < 2:
            continue

        # Each clustering level gets its own upload with a single strip
        files = []
        name = f'cluster_{n_clusters}'
        header = f'DATASET_COLORSTRIP\nSEPARATOR TAB\nCOLOR\t#000000\nDATASET_LABEL\t{name}\nSTRIP_WIDTH\t80\nMARGIN\t20\nSHOW_INTERNAL\t0\nDATA\n\n'
        path = os.path.join(temp_dir, f'{name}.txt')
        with open(path, 'w') as f:
            f.write(header)
            # One "<cell>\t<colour>" line per cell
            f.write(df_counts['labels'].map(color_map).to_csv(sep='\t', header=None))
        files.append(path)

        plot_tree_itol.upload_to_itol(
            tree_cass,
            apiKey,
            projectName,
            target,
            files=files,
            outfp=os.path.join(plot_dir, f'{target}_colorstrip_{n_clusters}.pdf'),
            rect=False,
            line_width=8,
        )
finally:
    shutil.rmtree(temp_dir, ignore_errors=True)

3724_NT_T1
iTOL output: SUCCESS: 1841168281141611852113

Tree Web Page URL: http://itol.embl.de/external.cgi?tree=1841168281141611852113&restore_saved=1
iTOL output: SUCCESS: 1841168283631611852175

Tree Web Page URL: http://itol.embl.de/external.cgi?tree=1841168283631611852175&restore_saved=1
iTOL output: SUCCESS: 1841168287261611852237

Tree Web Page URL: http://itol.embl.de/external.cgi?tree=1841168287261611852237&restore_saved=1
iTOL output: SUCCESS: 1841168290191611852303

Tree Web Page URL: http://itol.embl.de/external.cgi?tree=1841168290191611852303&restore_saved=1
iTOL output: SUCCESS: 1841168292261611852365

Tree Web Page URL: http://itol.embl.de/external.cgi?tree=1841168292261611852365&restore_saved=1
iTOL output: SUCCESS: 1841168295331611852428

Tree Web Page URL: http://itol.embl.de/external.cgi?tree=1841168295331611852428&restore_saved=1
iTOL output: SUCCESS: 1841168297331611852490

Tree Web Page URL: http://itol.embl.de/external.cgi?tree=1841168297331611852490&restore_sav