In [1]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Inject a CSS rule that forces elements with class `container` to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [22]:
import os

import numpy as np
import pandas as pd
from Bio import Phylo, AlignIO, SeqIO
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from pairsnp import calculate_snp_matrix, calculate_distance_matrix

In [3]:
# Load the TreeGubbins-filtered cluster table and collect the BioSample value
# of every row, i.e. every sample that was assigned to a cluster.
treegubbins_csv = '/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/working_trees/mab/mab_upid_dropped_outgroup_and_outlier_distance_rooted_TreeGubbins_filtered.csv'
filtered_cluster_data = pd.read_csv(treegubbins_csv)
clustered_samples = list(filtered_cluster_data["BioSample"])

In [4]:
# Locations of the input alignments and the cluster membership lists.
# All four live under the same notebooks directory, so the shared prefix is
# spelled out once and the individual paths are built from it.
_notebooks_root = '/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS'
_cluster_metadata_dir = _notebooks_root + '/metadata/clusters'

# FASTA alignments
full_tree_path = _notebooks_root + '/working_trees/mab/20200731_mab_upid_droppedOutliers.fasta'
treemmer_tree_path = _notebooks_root + '/001_TEMPORAL_SIGNAL/treemmer/fasta/20200924_MAB_all_samples_treemer_trimmed95RTL.fasta'

# plain-text sample lists for cluster A and cluster B
clusterA_list_path = _cluster_metadata_dir + '/mab_clusterA.txt'
clusterB_list_path = _cluster_metadata_dir + '/mab_clusterB.txt'

In [5]:
# Parse both FASTA files (full tree and treemmer-trimmed tree) with AlignIO.
alignment_format = "fasta"
full_tree_aln = AlignIO.read(full_tree_path, alignment_format)
treemmer_tree_aln = AlignIO.read(treemmer_tree_path, alignment_format)

In [6]:
# read in lists of samples within clusters
# Each entry is one line of the file with surrounding whitespace stripped.
# The `with` blocks close each file handle as soon as its lines are read,
# rather than leaving an open handle for the garbage collector.
with open(clusterA_list_path) as clustA_handle:
    clustA_list = [line.strip() for line in clustA_handle]
with open(clusterB_list_path) as clustB_handle:
    clustB_list = [line.strip() for line in clustB_handle]

In [7]:
# subset the full tree fasta file including only the samples in clust A and clustB, respectively
# The ID lists are turned into sets first so that each `record.id in ...` test
# is a hash lookup instead of a scan over the whole list for every record.
clustA_ids = set(clustA_list)
clustB_ids = set(clustB_list)
clustA_seq_list = [record for record in full_tree_aln if record.id in clustA_ids]
clustB_seq_list = [record for record in full_tree_aln if record.id in clustB_ids]

# reformat into MSA
clustA_aln = MultipleSeqAlignment(clustA_seq_list)
clustB_aln = MultipleSeqAlignment(clustB_seq_list)

In [8]:
# subset the full tree fasta file for everything outside of a cluster
# A set makes the per-record exclusion test a hash lookup rather than a scan
# of the full `clustered_samples` list.
clustered_sample_ids = set(clustered_samples)
unclustered_seq_list = [record for record in full_tree_aln if record.id not in clustered_sample_ids]
unclustered_aln = MultipleSeqAlignment(unclustered_seq_list)

In [17]:
## write new MSAs to file:
# The output directory is created here (a no-op if it already exists) so the
# cell also succeeds on a fresh run where the directory has not been made yet.
subset_aln_dir = "/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/014-2_tree_simulation_msprime/tree_subset_alignments"
os.makedirs(subset_aln_dir, exist_ok=True)

# One FASTA file per subset; the last call's return value is the cell output.
SeqIO.write(clustA_aln, os.path.join(subset_aln_dir, "clustA_aln.fasta"), "fasta")
SeqIO.write(clustB_aln, os.path.join(subset_aln_dir, "clustB_aln.fasta"), "fasta")
SeqIO.write(unclustered_aln, os.path.join(subset_aln_dir, "unclustered_aln.fasta"), "fasta")

55

In [20]:
# Directory holding the three subset FASTA files read below.
subset_aln_dir = "/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/014-2_tree_simulation_msprime/tree_subset_alignments"

# Each subset keeps its own sequence-name list. Previously all three calls
# wrote to a single `seq_names`, so the cluster A and cluster B names were
# overwritten and only the unclustered names survived.

## snp dists for clusterA only:
clustA_sparse_matrix, clustA_consensus, clustA_seq_names = calculate_snp_matrix(subset_aln_dir + "/clustA_aln.fasta")
clustA_dist_matrix = calculate_distance_matrix(clustA_sparse_matrix, clustA_consensus, "dist", False)

## snp dists for clusterB only:
clustB_sparse_matrix, clustB_consensus, clustB_seq_names = calculate_snp_matrix(subset_aln_dir + "/clustB_aln.fasta")
clustB_dist_matrix = calculate_distance_matrix(clustB_sparse_matrix, clustB_consensus, "dist", False)

## snp dists for unclustered only:
unclustered_sparse_matrix, unclustered_consensus, unclustered_seq_names = calculate_snp_matrix(subset_aln_dir + "/unclustered_aln.fasta")
unclustered_dist_matrix = calculate_distance_matrix(unclustered_sparse_matrix, unclustered_consensus, "dist", False)

# Backward compatibility: `seq_names` keeps the value it ended up with before
# (the names from the last call, i.e. the unclustered alignment).
seq_names = unclustered_seq_names

In [26]:
# Overwrite the diagonal of each distance matrix with NaN, in place, so that
# NaN-aware statistics over the matrices skip the diagonal entries.
for _dist_matrix in (clustA_dist_matrix, clustB_dist_matrix, unclustered_dist_matrix):
    np.fill_diagonal(_dist_matrix, np.nan)

In [31]:
# Mean over all non-NaN entries of the cluster A distance matrix.
clustA_mean_dist = np.nanmean(clustA_dist_matrix)
clustA_mean_dist

56.95698618430423

In [32]:
# Mean over all non-NaN entries of the cluster B distance matrix.
clustB_mean_dist = np.nanmean(clustB_dist_matrix)
clustB_mean_dist

53.713871635610765

In [33]:
# Mean over all non-NaN entries of the unclustered distance matrix.
unclustered_mean_dist = np.nanmean(unclustered_dist_matrix)
unclustered_mean_dist

5267.358249158249

In [34]:
# Cluster A mean distance expressed as a fraction of the unclustered mean distance.
clustA_mean_dist / unclustered_mean_dist

0.010813197715079707

In [35]:
# Cluster B mean distance expressed as a fraction of the unclustered mean distance.
clustB_mean_dist / unclustered_mean_dist

0.010197497321203569