In [1]:
import dendropy, os, subprocess, shutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Per-strain table indexed by strain ID; its 'subtree' and 'batch' columns are used below.
subtree_inputs = pd.read_csv('240816to21_results.passfilter.subtree.csv', index_col=0)

# Sorted unique subtree labels, with the 'NONE' placeholder dropped.
subtree_list = np.unique(subtree_inputs['subtree'])
subtree_list = subtree_list[subtree_list != 'NONE']

In [14]:
# number of strains assigned to any subtree (label other than 'NONE')
(subtree_inputs.subtree != 'NONE').sum()

55259

In [3]:
# Draw 20 strains (without replacement) from every subtree to form a shared background set.
np.random.seed(42)  # fixed seed so the draw is repeatable
background_strains = []
for subtree_name in subtree_list:
    # strain IDs belonging to this subtree
    members = subtree_inputs.index[subtree_inputs.subtree == subtree_name]
    strains = np.random.choice(members, size=20, replace=False)
    # one row per sampled strain, tagged with the subtree it came from
    background_strains.append(
        pd.DataFrame({'strain': strains, 'subtree': subtree_name}))
# reset_index() keeps the per-subtree row number as an extra 'index' column
background_strains = pd.concat(background_strains, axis=0).reset_index()

## Subtree generation

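Generate one SNP FASTA per subtree (subtree strains + background strains + canettii outgroup), submitted as SLURM jobs.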

In [4]:
# Submit one SLURM job per subtree that writes a SNP FASTA for the subtree's strains,
# every background strain, and the canettii outgroup.
work_dir = 'scratch/supertree_construction/'
os.makedirs(work_dir, exist_ok=True)

for subtree_name in subtree_list:
    subtree_strains = subtree_inputs.loc[subtree_inputs.subtree == subtree_name].index
    # collect paths (+ canettii outgroup)
    subtree_data = [[
        'canettii',
        '240805_qingyun_sra/SRR10522783/results/SRR10522783.breseq.vcf',
        '240805_qingyun_sra/SRR10522783/results/SRR10522783.miss.breseq.zarr']]
    # combine subtree strains with background strains (np.unique de-duplicates the overlap)
    subtree_input = np.unique(np.concatenate([subtree_strains, background_strains.strain]))
    for index in subtree_input:
        # per-strain inputs live under <batch>/<strain>/results/
        row_data = [index,
             f'{subtree_inputs.loc[index, "batch"]}/{index}/results/{index}.breseq.vcf',
             f'{subtree_inputs.loc[index, "batch"]}/{index}/results/{index}.miss.breseq.zarr']
        subtree_data.append(row_data)
    subtree_df = pd.DataFrame(
        data=subtree_data,
        columns=['label', 'vcf_path', 'miss_path'])
    # write df to working directory, but remove anything that's already there
    # (this deletes any previous FASTA / tree outputs for the subtree)
    if os.path.exists(f'{work_dir}/{subtree_name}'):
        shutil.rmtree(f'{work_dir}/{subtree_name}')
    os.makedirs(f'{work_dir}/{subtree_name}', exist_ok=True)
    subtree_df.to_csv(f'{work_dir}/{subtree_name}/subtree_inputs.csv')
    # run fasta generation command for subtree
    # NOTE(review): Python consumes the \" escapes below, so the shell sees plain double
    # quotes that close and reopen the --wrap string; $PATH and ${PYTHONPATH} are expanded
    # by the submitting shell, not inside the job.
    cmd = f'\
        sbatch -c 40 -p sapphire -t 2:00:00 --mem=80G -o {work_dir}/{subtree_name}/fasta-%j.out --wrap="\
        module load Mambaforge/22.11.1-fasrc01 && conda activate mtb_isolates && \
        export PATH=\"/n/boslfs02/LABS/sfortune_lab/Lab/culviner/bin/mtbvartools/scripts:/n/home12/pculviner/.conda/envs/mtb_isolates/bin/:$PATH\" && \
        export PYTHONPATH=\"${{PYTHONPATH}}:/n/boslfs02/LABS/sfortune_lab/Lab/culviner/bin/mtbvartools\" && \
        write_snp_fastas.py \
        --input-csv {work_dir}/{subtree_name}/subtree_inputs.csv \
        --input-fasta /n/boslfs02/LABS/sfortune_lab/Lab/culviner/notebooks/240807_isolate_tree_expansion/pipeline_inputs/Mtb_h37rv.fasta \
        --inputs-dir /n/boslfs02/LABS/sfortune_lab/Lab/culviner/notebooks/240807_isolate_tree_expansion/scratch/ \
        --out-dir {work_dir}/{subtree_name}/ \
        --output {subtree_name} \
        --mask rlc_plus_lowmap_marin.bed \
        --miss-threshold 0.5 \
        --local-threads 40"'
    # check=True: raise if sbatch rejects the submission instead of silently skipping a subtree
    subprocess.run(cmd, shell=True, check=True)

Submitted batch job 46565495
Submitted batch job 46565496
Submitted batch job 46565497
Submitted batch job 46565498
Submitted batch job 46565509
Submitted batch job 46565510
Submitted batch job 46565511
Submitted batch job 46565512
Submitted batch job 46565513
Submitted batch job 46565514
Submitted batch job 46565515
Submitted batch job 46565516
Submitted batch job 46565518
Submitted batch job 46565519


Construct trees with FastTreeMP (`-rawdist`), one SLURM job per subtree.

In [5]:
# collect fasta generation stats: one transposed results table per subtree, stacked row-wise
fasta_stats = pd.concat(
    [pd.read_csv(f'{work_dir}/{subtree_name}/{subtree_name}.results.csv', index_col=0).T
     for subtree_name in subtree_list],
    axis=0)
# estimate memory use in whole GB: 5 + 21 * samples * output sites / 1e9 * 1.25, rounded up
fasta_stats['gb_memory'] = np.ceil(
    5 + 21 * fasta_stats.n_samples * fasta_stats.passing_output_sites / 1e+9 * 1.25).astype(int)
fasta_stats

Unnamed: 0,n_samples,genome_len,miss_threshold,pass_miss_threshold_sites,variant_sites,minimum_variant_strains_to_consider,considered_variant_sites,bed_mask_sites,passing_invariant_sites,passing_variant_sites,passing_output_sites,gb_memory
L1.1,2608.0,4411532.0,0.5,4318060.0,240888.0,1.0,240888.0,276750.0,3894806.0,227578.0,227578.0,21
L1.2,2034.0,4411532.0,0.5,4336432.0,163309.0,1.0,163309.0,276750.0,3976709.0,153275.0,153275.0,14
L2.1,390.0,4411532.0,0.5,4318472.0,64422.0,1.0,64422.0,276750.0,4069288.0,59716.0,59716.0,6
L2.2,16402.0,4411532.0,0.5,4313316.0,258399.0,1.0,258399.0,276750.0,3877983.0,243980.0,243980.0,111
L3,6313.0,4411532.0,0.5,4336143.0,306790.0,1.0,306790.0,276750.0,3841065.0,289219.0,289219.0,53
L4.1,9360.0,4411532.0,0.5,4323490.0,276656.0,1.0,276656.0,276750.0,3859384.0,260003.0,260003.0,69
L4.2,2783.0,4411532.0,0.5,4342269.0,104060.0,1.0,104060.0,276750.0,4037655.0,96993.0,96993.0,13
L4.3,8499.0,4411532.0,0.5,4324996.0,203742.0,1.0,203742.0,276750.0,3939635.0,191514.0,191514.0,48
L4.4,1869.0,4411532.0,0.5,4328080.0,104322.0,1.0,104322.0,276750.0,4028571.0,97138.0,97138.0,10
L4.5,872.0,4411532.0,0.5,4335074.0,98057.0,1.0,98057.0,276750.0,4036778.0,91279.0,91279.0,8


In [6]:
# Submit one FastTreeMP job per subtree; memory comes from the gb_memory estimate.
threads = 50
for subtree_name, subtree_data in fasta_stats.iterrows():
    # NOTE(review): the wall-time limit (hours) reuses the memory estimate, capped at 72 —
    # confirm this heuristic is intended.
    cmd = f'\
        sbatch -c {threads} -p sapphire -t {min(72, subtree_data.gb_memory.astype(int))}:00:00 --mem={subtree_data.gb_memory.astype(int)}G -o {work_dir}/{subtree_name}/tree-%j.out --wrap="\
        export OMP_NUM_THREADS={threads} && \
        FastTreeMP -rawdist -nt {work_dir}/{subtree_name}/{subtree_name}.fasta > {work_dir}/{subtree_name}/{subtree_name}.nwk"'
    # check=True: raise if sbatch rejects the submission rather than continuing silently
    subprocess.run(cmd, shell=True, check=True)

Submitted batch job 46567048
Submitted batch job 46567049
Submitted batch job 46567050
Submitted batch job 46567051
Submitted batch job 46567052
Submitted batch job 46567053
Submitted batch job 46567054
Submitted batch job 46567055
Submitted batch job 46567056
Submitted batch job 46567057
Submitted batch job 46567058
Submitted batch job 46567059
Submitted batch job 46567060
Submitted batch job 46567061


## Normalize Subtrees

Normalize, root and ladderize subtrees.

In [5]:
work_dir = 'scratch/supertree_construction/'
# collect fasta generation stats: one transposed results table per subtree, stacked row-wise
fasta_stats = pd.concat(
    [pd.read_csv(f'{work_dir}/{subtree_name}/{subtree_name}.results.csv', index_col=0).T
     for subtree_name in subtree_list],
    axis=0)
fasta_stats

Unnamed: 0,n_samples,genome_len,miss_threshold,pass_miss_threshold_sites,variant_sites,minimum_variant_strains_to_consider,considered_variant_sites,bed_mask_sites,passing_invariant_sites,passing_variant_sites,passing_output_sites
L1.1,2608.0,4411532.0,0.5,4318060.0,240888.0,1.0,240888.0,276750.0,3894806.0,227578.0,227578.0
L1.2,2034.0,4411532.0,0.5,4336432.0,163309.0,1.0,163309.0,276750.0,3976709.0,153275.0,153275.0
L2.1,390.0,4411532.0,0.5,4318472.0,64422.0,1.0,64422.0,276750.0,4069288.0,59716.0,59716.0
L2.2,16402.0,4411532.0,0.5,4313316.0,258399.0,1.0,258399.0,276750.0,3877983.0,243980.0,243980.0
L3,6313.0,4411532.0,0.5,4336143.0,306790.0,1.0,306790.0,276750.0,3841065.0,289219.0,289219.0
L4.1,9360.0,4411532.0,0.5,4323490.0,276656.0,1.0,276656.0,276750.0,3859384.0,260003.0,260003.0
L4.2,2783.0,4411532.0,0.5,4342269.0,104060.0,1.0,104060.0,276750.0,4037655.0,96993.0,96993.0
L4.3,8499.0,4411532.0,0.5,4324996.0,203742.0,1.0,203742.0,276750.0,3939635.0,191514.0,191514.0
L4.4,1869.0,4411532.0,0.5,4328080.0,104322.0,1.0,104322.0,276750.0,4028571.0,97138.0,97138.0
L4.5,872.0,4411532.0,0.5,4335074.0,98057.0,1.0,98057.0,276750.0,4036778.0,91279.0,91279.0


In [6]:
for subtree_name, subtree_data in fasta_stats.iterrows():
    subtree_dir = f'{work_dir}/{subtree_name}'
    # load the raw FastTreeMP output
    tree = dendropy.Tree.get_from_path(f'{subtree_dir}/{subtree_name}.nwk', 'newick')
    # root at the canettii node, then ask for canettii to be pruned
    # NOTE(review): prune_taxa_with_labels is handed a bare string rather than a list of
    # labels — confirm canettii is actually dropped from the written trees.
    outgroup_node = tree.find_node_with_taxon_label('canettii')
    tree.reroot_at_node(outgroup_node)
    tree.prune_taxa_with_labels('canettii')
    # ladderize
    tree.ladderize()
    # rescale every edge by passing_output_sites / genome_len
    scale = subtree_data.passing_output_sites / subtree_data.genome_len
    for edge in tree.edges():
        if edge.length is not None:
            edge.length = edge.length * scale
    # normalized tree, still containing every background strain
    tree.write(path=f'{subtree_dir}/{subtree_name}.norm.nwk', schema='newick')
    # finally, trim out the background strains that were drawn from other subtrees
    other_background = background_strains.loc[
        background_strains.subtree != subtree_name, 'strain'].values
    tree.prune_taxa_with_labels(other_background)
    tree.write(path=f'{subtree_dir}/{subtree_name}.prune.nwk', schema='newick')

## Generate scaffold tree

Load the pruned subtrees (`*.prune.nwk`) and sample representative strains from each for the scaffold tree.

In [4]:
# number of levels below each subtree's root at which clades are cut for leaf sampling
target_level = 6
# directory holding the per-subtree and scaffold outputs
work_dir = 'scratch/supertree_construction/'

def getLeafSampling(tree, target_level, seed=None):
    """Pick one leaf label for every node found `target_level` steps below the root.

    The tree is descended from `tree.seed_node`; descent stops at `target_level`
    or at a leaf, whichever comes first. A leaf contributes its own label; an
    internal node contributes the label of one randomly chosen descendant leaf.

    Parameters
    ----------
    tree : object exposing `seed_node`, whose nodes provide `is_leaf()`,
        `child_nodes()`, `leaf_nodes()` and `taxon.label` (e.g. a dendropy Tree).
    target_level : int
        Depth (root = 0) at which to stop descending.
    seed : int or None
        Seed for the random draws; None seeds from OS entropy.

    Returns
    -------
    list of str
        Selected leaf labels, in traversal order.
    """
    # recursive function to get to level
    def stepToLevel(target_node, target_level, current_level=0):
        if current_level == target_level or target_node.is_leaf():
            return [target_node]
        # flatten with plain lists so node objects are never coerced into numpy arrays
        reached = []
        for child in target_node.child_nodes():
            reached.extend(stepToLevel(child, target_level, current_level + 1))
        return reached

    # A private RandomState gives the same draws as np.random.seed(seed) followed by
    # np.random.choice, but leaves the notebook-wide RNG state untouched.
    rng = np.random.RandomState(seed)
    selected_leaves = []
    for node in stepToLevel(tree.seed_node, target_level):
        if node.is_leaf():
            selected_leaves.append(node.taxon.label)
        else:
            selected_leaves.append(
                rng.choice(node.leaf_nodes()).taxon.label)
    return selected_leaves

# select supertree leaves
seed = 42
scaffold_data = []
for subtree_name in subtree_list:
    loaded_subtree = dendropy.Tree.get_from_path(
        f'{work_dir}/{subtree_name}/{subtree_name}.prune.nwk','newick')
    target_leaves = getLeafSampling(loaded_subtree, target_level, seed=seed)
    scaffold_data.append(pd.DataFrame(
        data=[target_leaves, [subtree_name for i in target_leaves]],
        index=['strain', 'subtree']).T)
scaffold_strains = pd.concat(scaffold_data, axis=0)
scaffold_strains

Unnamed: 0,strain,subtree
0,SAMEA7525868,L1.1
1,SAMEA7527532,L1.1
2,SAMEA5607347,L1.1
3,SAMN36619493,L1.1
4,SAMEA7526267,L1.1
...,...,...
20,SAMN13568144,L4.9
21,SAMEA112800064,L4.9
22,SAMN09090410,L4.9
23,SAMEA112799821,L4.9


Generate fasta

In [16]:
# collect scaffold data: canettii outgroup + sampled scaffold strains + all background strains
scaffold_data = [[
    'canettii',
    '240805_qingyun_sra/SRR10522783/results/SRR10522783.breseq.vcf',
    '240805_qingyun_sra/SRR10522783/results/SRR10522783.miss.breseq.zarr']]
for index in np.unique(np.concatenate([scaffold_strains.strain.values, background_strains.strain.values])):
    # per-strain inputs live under <batch>/<strain>/results/
    row_data = [index,
         f'{subtree_inputs.loc[index, "batch"]}/{index}/results/{index}.breseq.vcf',
         f'{subtree_inputs.loc[index, "batch"]}/{index}/results/{index}.miss.breseq.zarr']
    scaffold_data.append(row_data)
scaffold_df = pd.DataFrame(
    data=scaffold_data,
    columns=['label', 'vcf_path', 'miss_path'])

# write df to working directory, but remove anything that's already there
# (this deletes any earlier scaffold_1 outputs)
if os.path.exists(f'{work_dir}/scaffold_1'):
    shutil.rmtree(f'{work_dir}/scaffold_1')
os.makedirs(f'{work_dir}/scaffold_1', exist_ok=True)
scaffold_df.to_csv(f'{work_dir}/scaffold_1/scaffold_inputs.csv')
# run fasta generation command for the scaffold
# NOTE(review): the job requests 24 cores (-c 24) but passes --local-threads 10 — confirm.
# NOTE(review): Python consumes the \" escapes, so $PATH / ${PYTHONPATH} are expanded by the
# submitting shell rather than inside the job.
cmd = f'\
    sbatch -c 24 -p sapphire -t 1:00:00 --mem=50G -o {work_dir}/scaffold_1/fasta-%j.out --wrap="\
    module load Mambaforge/22.11.1-fasrc01 && conda activate mtb_isolates && \
    export PATH=\"/n/boslfs02/LABS/sfortune_lab/Lab/culviner/bin/mtbvartools/scripts:/n/home12/pculviner/.conda/envs/mtb_isolates/bin/:$PATH\" && \
    export PYTHONPATH=\"${{PYTHONPATH}}:/n/boslfs02/LABS/sfortune_lab/Lab/culviner/bin/mtbvartools\" && \
    write_snp_fastas.py \
    --input-csv {work_dir}/scaffold_1/scaffold_inputs.csv \
    --input-fasta /n/boslfs02/LABS/sfortune_lab/Lab/culviner/notebooks/240807_isolate_tree_expansion/pipeline_inputs/Mtb_h37rv.fasta \
    --inputs-dir /n/boslfs02/LABS/sfortune_lab/Lab/culviner/notebooks/240807_isolate_tree_expansion/scratch/ \
    --out-dir {work_dir}/scaffold_1/ \
    --output scaffold \
    --mask rlc_plus_lowmap_marin.bed \
    --miss-threshold 0.5 \
    --local-threads 10"'
# check=True: raise if sbatch rejects the submission instead of failing silently
subprocess.run(cmd, shell=True, check=True)

Submitted batch job 46868504


CompletedProcess(args='    sbatch -c 24 -p sapphire -t 1:00:00 --mem=50G -o scratch/supertree_construction//scaffold_1/fasta-%j.out --wrap="    module load Mambaforge/22.11.1-fasrc01 && conda activate mtb_isolates &&     export PATH="/n/boslfs02/LABS/sfortune_lab/Lab/culviner/bin/mtbvartools/scripts:/n/home12/pculviner/.conda/envs/mtb_isolates/bin/:$PATH" &&     export PYTHONPATH="${PYTHONPATH}:/n/boslfs02/LABS/sfortune_lab/Lab/culviner/bin/mtbvartools" &&     write_snp_fastas.py     --input-csv scratch/supertree_construction//scaffold_1/scaffold_inputs.csv     --input-fasta /n/boslfs02/LABS/sfortune_lab/Lab/culviner/notebooks/240807_isolate_tree_expansion/pipeline_inputs/Mtb_h37rv.fasta     --inputs-dir /n/boslfs02/LABS/sfortune_lab/Lab/culviner/notebooks/240807_isolate_tree_expansion/scratch/     --out-dir scratch/supertree_construction//scaffold_1/     --output scaffold     --mask rlc_plus_lowmap_marin.bed     --miss-threshold 0.5     --local-threads 10"', returncode=0)

In [5]:
# FASTA generation stats for the scaffold alignment, as a Series keyed by stat name
fasta_data = (
    pd.read_csv(f'{work_dir}/scaffold_1/scaffold.results.csv', index_col=0)
    .T
    .loc['scaffold'])
# memory estimate in whole GB: 5 + 21 * samples * output sites / 1e9, rounded up
fasta_data['gb_memory'] = np.ceil(
    5 + 21*fasta_data.n_samples * fasta_data.passing_output_sites / 1e+9).astype(int)
fasta_data

n_samples                                  785.0
genome_len                             4411532.0
miss_threshold                               0.5
pass_miss_threshold_sites              4339859.0
variant_sites                           129450.0
minimum_variant_strains_to_consider          1.0
considered_variant_sites                129450.0
bed_mask_sites                          276750.0
passing_invariant_sites                4013003.0
passing_variant_sites                   121243.0
passing_output_sites                    121243.0
gb_memory                                    7.0
Name: scaffold, dtype: float64

In [19]:
# Submit the FastTreeMP job for the scaffold alignment; memory comes from the gb_memory estimate.
threads = 24
cmd = f'\
    sbatch -c {threads} -p sapphire -t 1:00:00 --mem={fasta_data.gb_memory.astype(int)}G -o {work_dir}/scaffold_1/tree-%j.out --wrap="\
    export OMP_NUM_THREADS={threads} && \
    FastTreeMP -rawdist -nt {work_dir}/scaffold_1/scaffold.fasta > {work_dir}/scaffold_1/scaffold.nwk"'
# check=True: raise if sbatch rejects the submission instead of failing silently
subprocess.run(cmd, shell=True, check=True)

Submitted batch job 46868540


CompletedProcess(args='    sbatch -c 24 -p sapphire -t 1:00:00 --mem=7G -o scratch/supertree_construction//scaffold_1/tree-%j.out --wrap="    export OMP_NUM_THREADS=24 &&     FastTreeMP -rawdist -nt scratch/supertree_construction//scaffold_1/scaffold.fasta > scratch/supertree_construction//scaffold_1/scaffold.nwk"', returncode=0)

In [6]:
# load the raw scaffold tree
tree = dendropy.Tree.get_from_path(f'{work_dir}/scaffold_1/scaffold.nwk', 'newick')

# root at canettii parent; canettii is not pruned in this cell
canettii_node = tree.find_node_with_taxon_label('canettii')
tree.reroot_at_node(canettii_node.parent_node)

# ladderize
tree.ladderize()
# rescale every edge by passing_output_sites / genome_len
scale = fasta_data.passing_output_sites / fasta_data.genome_len
for edge in tree.edges():
    if edge.length is not None:
        edge.length = edge.length * scale
tree.write(path=f'{work_dir}/scaffold_1/scaffold.outgroup.nwk', schema='newick')

Check for incorrect supertree assignments.

In [7]:
# For each subtree, list which subtrees have strains beneath the MRCA of its scaffold strains;
# a clean placement prints only the subtree's own name.
for subtree in subtree_list:
    own_strains = scaffold_strains.strain[scaffold_strains.subtree == subtree].values
    mrca_node = tree.mrca(taxon_labels=own_strains)
    mrca_leaves = [n.taxon.label for n in mrca_node.leaf_nodes()]
    # subtree labels of the scaffold / background strains found under the MRCA
    scaffold_hits = scaffold_strains.subtree[np.isin(scaffold_strains.strain, mrca_leaves)].values
    background_hits = background_strains.subtree[np.isin(background_strains.strain, mrca_leaves)].values
    subtree_mrca_child_nodes = np.unique(np.concatenate([scaffold_hits, background_hits]))
    print(f'{subtree}: {subtree_mrca_child_nodes}')

L1.1: ['L1.1']
L1.2: ['L1.2']
L2.1: ['L2.1']
L2.2: ['L2.2']
L3: ['L3']
L4.1: ['L4.1']
L4.2: ['L4.2']
L4.3: ['L4.3']
L4.4: ['L4.4']
L4.5: ['L4.5']
L4.6: ['L4.6']
L4.7: ['L4.7']
L4.8: ['L4.8']
L4.9: ['L4.9']


Assemble the supertree by grafting each pruned subtree onto its MRCA node in the scaffold tree.

In [8]:
# first remove scaffold branches
scaffold_tree = dendropy.Tree.get_from_path(
    f'{work_dir}/scaffold_1/scaffold.outgroup.nwk', 'newick')
node_lookups = {}
for subtree_name in subtree_list:
    mrca_node = scaffold_tree.mrca(taxon_labels=scaffold_strains.strain[scaffold_strains.subtree == subtree_name].values)
    # remove mrca child nodes
    for child_node in mrca_node.child_nodes():
        mrca_node.remove_child(child_node)
    node_lookups[subtree_name] = mrca_node

# verify number of nodes
print(len(node_lookups), len(subtree_list))

# now paste on subtrees to scaffold
for subtree_name in subtree_list:
    mrca_node = node_lookups[subtree_name]
    # load subtree
    loaded_subtree = dendropy.Tree.get_from_path(
        f'{work_dir}/{subtree_name}/{subtree_name}.prune.nwk', 'newick')
    for child_node in loaded_subtree.seed_node.child_nodes():
        mrca_node.add_child(child_node)

scaffold_tree.ladderize()
scaffold_tree.write(
    path=f'{work_dir}/scaffold_1/reconstructed_supertree.nwk',
    schema='newick')

14 14


# Tree Generation Testing

## Generate scaffold tree (2)

Load the pruned subtrees (`*.prune.nwk`) and sample representative strains from each for the scaffold tree.

In [8]:
# number of levels below each subtree's root at which clades are cut for leaf sampling
target_level = 6
# directory holding the per-subtree and scaffold outputs
work_dir = 'scratch/supertree_construction/'

def getLeafSampling(tree, target_level, seed=None):
    """Pick one leaf label for every node found `target_level` steps below the root.

    The tree is descended from `tree.seed_node`; descent stops at `target_level`
    or at a leaf, whichever comes first. A leaf contributes its own label; an
    internal node contributes the label of one randomly chosen descendant leaf.

    Parameters
    ----------
    tree : object exposing `seed_node`, whose nodes provide `is_leaf()`,
        `child_nodes()`, `leaf_nodes()` and `taxon.label` (e.g. a dendropy Tree).
    target_level : int
        Depth (root = 0) at which to stop descending.
    seed : int or None
        Seed for the random draws; None seeds from OS entropy.

    Returns
    -------
    list of str
        Selected leaf labels, in traversal order.
    """
    # recursive function to get to level
    def stepToLevel(target_node, target_level, current_level=0):
        if current_level == target_level or target_node.is_leaf():
            return [target_node]
        # flatten with plain lists so node objects are never coerced into numpy arrays
        reached = []
        for child in target_node.child_nodes():
            reached.extend(stepToLevel(child, target_level, current_level + 1))
        return reached

    # A private RandomState gives the same draws as np.random.seed(seed) followed by
    # np.random.choice, but leaves the notebook-wide RNG state untouched.
    rng = np.random.RandomState(seed)
    selected_leaves = []
    for node in stepToLevel(tree.seed_node, target_level):
        if node.is_leaf():
            selected_leaves.append(node.taxon.label)
        else:
            selected_leaves.append(
                rng.choice(node.leaf_nodes()).taxon.label)
    return selected_leaves

# select supertree leaves
seed = 42
scaffold_data = []
for subtree_name in subtree_list:
    loaded_subtree = dendropy.Tree.get_from_path(
        f'{work_dir}/{subtree_name}/{subtree_name}.prune.nwk','newick')
    target_leaves = getLeafSampling(loaded_subtree, target_level, seed=seed)
    scaffold_data.append(pd.DataFrame(
        data=[target_leaves, [subtree_name for i in target_leaves]],
        index=['strain', 'subtree']).T)
scaffold_strains = pd.concat(scaffold_data, axis=0)
scaffold_strains

Unnamed: 0,strain,subtree
0,SAMEA7525868,L1.1
1,SAMEA7528466,L1.1
2,SAMEA11225748,L1.1
3,SAMN36619493,L1.1
4,SAMEA11225683,L1.1
...,...,...
20,SAMN13568144,L4.9
21,SAMEA112800064,L4.9
22,SAMEA1101738,L4.9
23,SAMEA112799821,L4.9


Generate fasta

In [10]:
# collect scaffold data: canettii outgroup + sampled scaffold strains + all background strains
scaffold_data = [[
    'canettii',
    '240805_qingyun_sra/SRR10522783/results/SRR10522783.breseq.vcf',
    '240805_qingyun_sra/SRR10522783/results/SRR10522783.miss.breseq.zarr']]
for index in np.unique(np.concatenate([scaffold_strains.strain.values, background_strains.strain.values])):
    # per-strain inputs live under <batch>/<strain>/results/
    row_data = [index,
         f'{subtree_inputs.loc[index, "batch"]}/{index}/results/{index}.breseq.vcf',
         f'{subtree_inputs.loc[index, "batch"]}/{index}/results/{index}.miss.breseq.zarr']
    scaffold_data.append(row_data)
scaffold_df = pd.DataFrame(
    data=scaffold_data,
    columns=['label', 'vcf_path', 'miss_path'])

# write df to working directory, but remove anything that's already there
# NOTE(review): this reuses (and first deletes) the scaffold_1 directory written by the
# earlier scaffold section — confirm overwriting those outputs is intended.
if os.path.exists(f'{work_dir}/scaffold_1'):
    shutil.rmtree(f'{work_dir}/scaffold_1')
os.makedirs(f'{work_dir}/scaffold_1', exist_ok=True)
scaffold_df.to_csv(f'{work_dir}/scaffold_1/scaffold_inputs.csv')
# run fasta generation command for the scaffold
# NOTE(review): the job requests 24 cores (-c 24) but passes --local-threads 10 — confirm.
# NOTE(review): Python consumes the \" escapes, so $PATH / ${PYTHONPATH} are expanded by the
# submitting shell rather than inside the job.
cmd = f'\
    sbatch -c 24 -p sapphire -t 1:00:00 --mem=50G -o {work_dir}/scaffold_1/fasta-%j.out --wrap="\
    module load Mambaforge/22.11.1-fasrc01 && conda activate mtb_isolates && \
    export PATH=\"/n/boslfs02/LABS/sfortune_lab/Lab/culviner/bin/mtbvartools/scripts:/n/home12/pculviner/.conda/envs/mtb_isolates/bin/:$PATH\" && \
    export PYTHONPATH=\"${{PYTHONPATH}}:/n/boslfs02/LABS/sfortune_lab/Lab/culviner/bin/mtbvartools\" && \
    write_snp_fastas.py \
    --input-csv {work_dir}/scaffold_1/scaffold_inputs.csv \
    --input-fasta /n/boslfs02/LABS/sfortune_lab/Lab/culviner/notebooks/240807_isolate_tree_expansion/pipeline_inputs/Mtb_h37rv.fasta \
    --inputs-dir /n/boslfs02/LABS/sfortune_lab/Lab/culviner/notebooks/240807_isolate_tree_expansion/scratch/ \
    --out-dir {work_dir}/scaffold_1/ \
    --output scaffold \
    --mask rlc_plus_lowmap_marin.bed \
    --miss-threshold 0.5 \
    --local-threads 10"'
# check=True: raise if sbatch rejects the submission instead of failing silently
subprocess.run(cmd, shell=True, check=True)

Submitted batch job 46867813


CompletedProcess(args='    sbatch -c 24 -p sapphire -t 1:00:00 --mem=50G -o scratch/supertree_construction//scaffold_1/fasta-%j.out --wrap="    module load Mambaforge/22.11.1-fasrc01 && conda activate mtb_isolates &&     export PATH="/n/boslfs02/LABS/sfortune_lab/Lab/culviner/bin/mtbvartools/scripts:/n/home12/pculviner/.conda/envs/mtb_isolates/bin/:$PATH" &&     export PYTHONPATH="${PYTHONPATH}:/n/boslfs02/LABS/sfortune_lab/Lab/culviner/bin/mtbvartools" &&     write_snp_fastas.py     --input-csv scratch/supertree_construction//scaffold_1/scaffold_inputs.csv     --input-fasta /n/boslfs02/LABS/sfortune_lab/Lab/culviner/notebooks/240807_isolate_tree_expansion/pipeline_inputs/Mtb_h37rv.fasta     --inputs-dir /n/boslfs02/LABS/sfortune_lab/Lab/culviner/notebooks/240807_isolate_tree_expansion/scratch/     --out-dir scratch/supertree_construction//scaffold_1/     --output scaffold     --mask rlc_plus_lowmap_marin.bed     --miss-threshold 0.5     --local-threads 10"', returncode=0)

In [11]:
# FASTA generation stats for the scaffold alignment, as a Series keyed by stat name
fasta_data = (
    pd.read_csv(f'{work_dir}/scaffold_1/scaffold.results.csv', index_col=0)
    .T
    .loc['scaffold'])
# memory estimate in whole GB: 5 + 21 * samples * output sites / 1e9, rounded up
fasta_data['gb_memory'] = np.ceil(
    5 + 21*fasta_data.n_samples * fasta_data.passing_output_sites / 1e+9).astype(int)
fasta_data

n_samples                                  787.0
genome_len                             4411532.0
miss_threshold                               0.5
pass_miss_threshold_sites              4339863.0
variant_sites                           129150.0
minimum_variant_strains_to_consider          1.0
considered_variant_sites                129150.0
bed_mask_sites                          276750.0
passing_invariant_sites                4013258.0
passing_variant_sites                   120952.0
passing_output_sites                    120952.0
gb_memory                                    7.0
Name: scaffold, dtype: float64

In [12]:
# Submit the FastTreeMP job for the scaffold alignment; memory comes from the gb_memory estimate.
threads = 24
cmd = f'\
    sbatch -c {threads} -p sapphire -t 1:00:00 --mem={fasta_data.gb_memory.astype(int)}G -o {work_dir}/scaffold_1/tree-%j.out --wrap="\
    export OMP_NUM_THREADS={threads} && \
    FastTreeMP -rawdist -nt {work_dir}/scaffold_1/scaffold.fasta > {work_dir}/scaffold_1/scaffold.nwk"'
# check=True: raise if sbatch rejects the submission instead of failing silently
subprocess.run(cmd, shell=True, check=True)

Submitted batch job 46867848


CompletedProcess(args='    sbatch -c 24 -p sapphire -t 1:00:00 --mem=7G -o scratch/supertree_construction//scaffold_1/tree-%j.out --wrap="    export OMP_NUM_THREADS=24 &&     FastTreeMP -rawdist -nt scratch/supertree_construction//scaffold_1/scaffold.fasta > scratch/supertree_construction//scaffold_1/scaffold.nwk"', returncode=0)

Normalize the tree.

In [7]:
# load the raw scaffold tree
tree = dendropy.Tree.get_from_path(f'{work_dir}/scaffold_1/scaffold.nwk', 'newick')
# root at the canettii node, then ask for canettii to be pruned
# NOTE(review): prune_taxa_with_labels is handed a bare string rather than a list of
# labels — confirm canettii is actually dropped from the written tree.
outgroup_node = tree.find_node_with_taxon_label('canettii')
tree.reroot_at_node(outgroup_node)
tree.prune_taxa_with_labels('canettii')
# ladderize
tree.ladderize()
# rescale every edge by passing_output_sites / genome_len
scale = fasta_data.passing_output_sites / fasta_data.genome_len
for edge in tree.edges():
    if edge.length is not None:
        edge.length = edge.length * scale
tree.write(path=f'{work_dir}/scaffold_1/scaffold.norm.nwk', schema='newick')

### Generate a complete supertree

In [65]:
# Build the input table for a single tree over every strain in `all_leaves` plus canettii.
# NOTE(review): `all_leaves` is not defined anywhere in the visible notebook before this
# cell (a later cell only appends to it) — it must be defined for a fresh Run All to work.
supertree_data = [[
    'canettii',
    '240805_qingyun_sra/SRR10522783/results/SRR10522783.breseq.vcf',
    '240805_qingyun_sra/SRR10522783/results/SRR10522783.miss.breseq.zarr']]
for index in all_leaves:
    # per-strain inputs live under <batch>/<strain>/results/
    row_data = [index,
         f'{subtree_inputs.loc[index, "batch"]}/{index}/results/{index}.breseq.vcf',
         f'{subtree_inputs.loc[index, "batch"]}/{index}/results/{index}.miss.breseq.zarr']
    supertree_data.append(row_data)
supertree_df = pd.DataFrame(
    data=supertree_data,
    columns=['label', 'vcf_path', 'miss_path'])
# write df to working directory, but remove anything that's already there
if os.path.exists(f'{work_dir}/supertree'):
    shutil.rmtree(f'{work_dir}/supertree')
os.makedirs(f'{work_dir}/supertree', exist_ok=True)
supertree_df.to_csv(f'{work_dir}/supertree/supertree_inputs.csv')
# run fasta generation command for the supertree
# NOTE(review): Python consumes the \" escapes, so $PATH / ${PYTHONPATH} are expanded by the
# submitting shell rather than inside the job.
cmd = f'\
    sbatch -c 10 -p shared -t 1:00:00 --mem=20G -o {work_dir}/supertree/fasta-%j.out --wrap="\
    module load Mambaforge/22.11.1-fasrc01 && conda activate mtb_isolates && \
    export PATH=\"/n/boslfs02/LABS/sfortune_lab/Lab/culviner/bin/mtbvartools/scripts:/n/home12/pculviner/.conda/envs/mtb_isolates/bin/:$PATH\" && \
    export PYTHONPATH=\"${{PYTHONPATH}}:/n/boslfs02/LABS/sfortune_lab/Lab/culviner/bin/mtbvartools\" && \
    write_snp_fastas.py \
    --input-csv {work_dir}/supertree/supertree_inputs.csv \
    --input-fasta /n/boslfs02/LABS/sfortune_lab/Lab/culviner/notebooks/240807_isolate_tree_expansion/pipeline_inputs/Mtb_h37rv.fasta \
    --inputs-dir /n/boslfs02/LABS/sfortune_lab/Lab/culviner/notebooks/240807_isolate_tree_expansion/scratch/ \
    --out-dir {work_dir}/supertree/ \
    --output supertree \
    --mask rlc_plus_lowmap_marin.bed \
    --miss-threshold 0.5 \
    --local-threads 10"'
# check=True: raise if sbatch rejects the submission instead of failing silently
subprocess.run(cmd, shell=True, check=True)

Submitted batch job 46448454


CompletedProcess(args='    sbatch -c 10 -p shared -t 1:00:00 --mem=20G -o scratch/subtree_test//L4.9/fasta-%j.out --wrap="    module load Mambaforge/22.11.1-fasrc01 && conda activate mtb_isolates &&     export PATH="/n/boslfs02/LABS/sfortune_lab/Lab/culviner/bin/mtbvartools/scripts:/n/home12/pculviner/.conda/envs/mtb_isolates/bin/:$PATH" &&     export PYTHONPATH="${PYTHONPATH}:/n/boslfs02/LABS/sfortune_lab/Lab/culviner/bin/mtbvartools" &&     write_snp_fastas.py     --input-csv scratch/subtree_test//supertree/supertree_inputs.csv     --input-fasta /n/boslfs02/LABS/sfortune_lab/Lab/culviner/notebooks/240807_isolate_tree_expansion/pipeline_inputs/Mtb_h37rv.fasta     --inputs-dir /n/boslfs02/LABS/sfortune_lab/Lab/culviner/notebooks/240807_isolate_tree_expansion/scratch/     --out-dir scratch/subtree_test//supertree/     --output supertree     --mask rlc_plus_lowmap_marin.bed     --miss-threshold 0.5     --local-threads 10"', returncode=0)

In [69]:
# FASTA generation stats for the supertree alignment, as a Series keyed by stat name
fasta_data = (
    pd.read_csv(f'{work_dir}/supertree/supertree.results.csv', index_col=0)
    .T
    .loc['supertree'])
# memory estimate in whole GB: 21 * samples * output sites / 1e9, rounded up
# NOTE(review): unlike the subtree and scaffold estimates, no 5 GB baseline is added here — confirm.
fasta_data['gb_memory'] = np.ceil(
    21*fasta_data.n_samples * fasta_data.passing_output_sites / 1e+9).astype(int)

In [71]:
# Submit the FastTreeMP job for the supertree alignment; memory comes from the gb_memory estimate.
threads = 24
cmd = f'\
    sbatch -c {threads} -p shared -t 5:00:00 --mem={fasta_data.gb_memory.astype(int)}G -o {work_dir}/supertree/tree-%j.out --wrap="\
    export OMP_NUM_THREADS={threads} && \
    FastTreeMP -rawdist -nt {work_dir}/supertree/supertree.fasta > {work_dir}/supertree/supertree.nwk"'
# check=True: raise if sbatch rejects the submission instead of failing silently
subprocess.run(cmd, shell=True, check=True)

Submitted batch job 46449354


CompletedProcess(args='    sbatch -c 24 -p shared -t 5:00:00 --mem=5G -o scratch/subtree_test//supertree/tree-%j.out --wrap="    export OMP_NUM_THREADS=24 &&     FastTreeMP -rawdist -nt scratch/subtree_test//supertree/supertree.fasta > scratch/subtree_test//supertree/supertree.nwk"', returncode=0)

In [100]:
# Root, ladderize and rescale the supertree, then write it as supertree.norm.nwk.
# load the tree
tree = dendropy.Tree.get_from_path(
    f'{work_dir}/supertree/supertree.nwk', 'newick')
# root at canettii
tree.reroot_at_node(
    tree.find_node_with_taxon_label('canettii'))
# NOTE(review): a bare string is passed where a collection of labels is expected — confirm
# canettii is actually removed from the written tree.
tree.prune_taxa_with_labels('canettii')
# ladderize
tree.ladderize()
# rescale edges by passing_output_sites / genome_len (edges without a length are skipped)
for edge in tree.edges():
    if edge.length != None:
        edge.length = edge.length * (fasta_data.passing_output_sites / fasta_data.genome_len)
tree.write(
    path=f'{work_dir}/supertree/supertree.norm.nwk',
    schema='newick')
# NOTE(review): `all_leaves` is extended in place but never initialised in the visible
# notebook; re-running this cell appends the same labels again.
all_leaves += [l.taxon.label for l in tree.leaf_nodes()]

### Compare tree branch lengths

In [101]:
# normalized supertree used as the reference for the branch-length comparison
supertree = dendropy.Tree.get_from_path(f'{work_dir}/supertree/supertree.norm.nwk', 'newick')

In [102]:
# Pair each subtree leaf's terminal edge length with the same leaf's edge length in the supertree.
edge_comparisons = []
for st_name in subtree_list:
    subtree = dendropy.Tree.get_from_path(
        f'{work_dir}/{st_name}/{st_name}.norm.nwk', 'newick')
    for n in subtree.leaf_nodes():
        # the leaf is looked up in the supertree by taxon label
        supertree_len = supertree.find_node_with_taxon_label(n.taxon.label).edge_length
        subtree_len = n.edge_length
        edge_comparisons.append([supertree_len, subtree_len])

edge_comparisons = pd.DataFrame(edge_comparisons, columns=['super_len', 'sub_len'])

In [106]:
# display the supertree vs. subtree terminal edge lengths side by side
edge_comparisons

Unnamed: 0,super_len,sub_len
0,5.485550e-05,5.485035e-05
1,2.641357e-05,2.640955e-05
2,3.066918e-05,3.088379e-05
3,5.216077e-05,5.215450e-05
4,6.422262e-05,6.432705e-05
...,...,...
1395,0.000000e+00,0.000000e+00
1396,1.551332e-10,2.821129e-11
1397,1.134706e-06,1.134455e-06
1398,1.551332e-10,2.821129e-11


### Edit supertree to use subtree as branches

In [127]:
# reload a fresh copy of the normalized supertree before pruning it in the next cell
supertree = dendropy.Tree.get_from_path(
    f'{work_dir}/supertree/supertree.norm.nwk', 'newick')
supertree

<Tree object at 0x14f23a7125d0>

In [146]:
# For each subtree, find the MRCA of its leaves in the supertree and prune everything
# beneath that node.
# NOTE(review): the *.norm.nwk trees are written before other subtrees' background strains
# are trimmed, so these MRCAs may span more than one subtree — confirm *.prune.nwk was not intended.
for st_name in subtree_list:
    subtree = dendropy.Tree.get_from_path(  # get the subtree
        f'{work_dir}/{st_name}/{st_name}.norm.nwk', 'newick')
    mrca = supertree.mrca(  # get the supertree defining nodes on the subtree
        taxon_labels=[l.taxon.label for l in subtree.leaf_nodes()])
    # the bare `supertree` expression that followed was a no-op inside the loop and is dropped
    supertree.prune_nodes(mrca.child_nodes())

In [148]:
# print the pruned supertree as text
print(supertree)

[&R] ((:8.657455476918221e-06,:1.5513318275827988e-10)1.000:0.00011046742293833524,((:5.892035847750849e-05,(:5.193772084164866e-05,:3.073331072969662e-05)1.000:2.7092365956996346e-05)1.000:1.2485925240936708e-05,(:2.0194865411834256e-05,(:3.563195124165483e-05,(:2.66109256376243e-05,(:1.1800205546508559e-05,(:3.10831670834531e-05,(:3.176413970248884e-06,(:2.497613215771755e-06,(:1.633266969388412e-05,:4.766746279977115e-06)1.000:4.348600299170446e-06)1.000:2.0142368342788855e-05)1.000:2.716382030097481e-06)1.000:2.950478002879725e-06)0.997:1.5513318275827988e-10)0.424:1.3157217549368337e-05)1.000:2.1392865902366797e-07)0.740:3.701335018084421e-05)1.000:6.223754029779225e-05)canettii;
