In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
from Bio import AlignIO
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import pickle
from scipy.stats import fisher_exact
from statsmodels.stats import multitest

In this analysis I'm testing the significance of SNP enrichment within clusters including all SNPs no matter how many times they occurred independently in the tree.

In [3]:
# read in summary of nonsynonymous hits in dominant clusters:
homoplasy_dominant_clust_enrich=pickle.load(open( "../vars/homoplasy_dominant_cluster_enrichment", "rb" ) )

In [4]:
homoplasy_dominant_clust_enrich=homoplasy_dominant_clust_enrich.sort_values(by='enrichment_p', ascending=True)

In [6]:
pd.set_option('display.max_colwidth', None)
homoplasy_dominant_clust_enrich.reset_index()

Unnamed: 0,index,Position,major,minor,Gene,homoplasy_count,Type,Product,enrichment_p,OR
0,53445,4770807,G,A,MAB_4690c,2,intragenic,Linear gramicidin synthase subunit D,1.923486e-71,inf
1,47313,4211627,C,T,MAB_4147c,2,intragenic,hypothetical protein,3.635388e-69,inf
2,13841,1217813,C,T,MAB_1203,3,intragenic,Mycothiol S-conjugate amidase,4.254548e-67,2914.000000
3,38491,3381893,C,G,MAB_3334c,4,intragenic,Aspartyl/glutamyl-tRNA(Asn/Gln) amidotransferase subunit B,1.034483e-65,inf
4,24054,2119889,G,T,MAB_2120c,6,intragenic,Putative inactive phenolphthiocerol synthesis polyketide synthase type I Pks15,1.709089e-64,inf
...,...,...,...,...,...,...,...,...,...,...
54903,12610,1098819,C,T,MAB_1086,1,intragenic,UTP--glucose-1-phosphate uridylyltransferase,1.000000e+00,
54904,45848,4082630,A,T,MAB_4037,2,intragenic,hypothetical protein,1.000000e+00,
54905,45849,4082799,A,G,MAB_4037,2,intragenic,hypothetical protein,1.000000e+00,
54906,45846,4082354,G,A,MAB_4037,2,intragenic,hypothetical protein,1.000000e+00,


In [7]:
# first let's merge with the mutation events table to get the change information so I can confirm which changes are synonymous
recombFree_mutation_events=pd.read_csv('../snppar_output/mab_recombinationFree/mab_recombFree_snppar_all_mutation_events.tsv', sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
recombFree_mutation_events.head()

Unnamed: 0,Position,Type,Ancestor_Node,Derived_Node,Ancestor_Call,Derived_Call,Gene,Strand,Codon,Codon_Position,...,Derived_Codon,Ancestor_A.A.,Derived_A.A.,Change,Up_Gene,Up_Gene_Strand,Up_Gene_Distance,Down_Gene,Down_Gene_Strand,Down_Gene_Distance
0,87,intragenic,NODE_0000002,SAMEA2259646,G,A,MAB_0001,1,29,3,...,TCA,S,S,S,-,-,-,-,-,-
1,87,intragenic,NODE_0000013,SAMEA2071363,G,A,MAB_0001,1,29,3,...,TCA,S,S,S,-,-,-,-,-,-
2,87,intragenic,NODE_0000039,NODE_0000040,G,A,MAB_0001,1,29,3,...,TCA,S,S,S,-,-,-,-,-,-
3,87,intragenic,NODE_0000057,NODE_0000058,G,A,MAB_0001,1,29,3,...,TCA,S,S,S,-,-,-,-,-,-
4,87,intragenic,NODE_0000061,SAMN09758982,G,A,MAB_0001,1,29,3,...,TCA,S,S,S,-,-,-,-,-,-


In [12]:
mut_events_sparse=recombFree_mutation_events[['Ancestor_Call', 'Derived_Call', 'Position', 'Change']].rename(columns={"Ancestor_Call": "major", "Derived_Call": "minor"})
mut_events_sparse=mut_events_sparse.drop_duplicates()
mut_events_sparse.head()

Unnamed: 0,major,minor,Position,Change
0,G,A,87,S
6,T,C,162,S
10,G,C,192,S
12,G,A,201,S
18,G,C,219,S


In [32]:
homoplasy_dominant_clust_enrich=homoplasy_dominant_clust_enrich.astype({'Position': 'int64'})
enriched_vars_change=homoplasy_dominant_clust_enrich.merge(mut_events_sparse, how="left", on=['major', 'minor', 'Position'])
enriched_vars_change.head()

Unnamed: 0,Position,major,minor,Gene,homoplasy_count,Type,Product,enrichment_p,OR,Change
0,4770807,G,A,MAB_4690c,2,intragenic,Linear gramicidin synthase subunit D,1.923486e-71,inf,S
1,4211627,C,T,MAB_4147c,2,intragenic,hypothetical protein,3.635388e-69,inf,S
2,1217813,C,T,MAB_1203,3,intragenic,Mycothiol S-conjugate amidase,4.254548e-67,2914.0,S
3,3381893,C,G,MAB_3334c,4,intragenic,Aspartyl/glutamyl-tRNA(Asn/Gln) amidotransferase subunit B,1.034483e-65,inf,NS
4,2119889,G,T,MAB_2120c,6,intragenic,Putative inactive phenolphthiocerol synthesis polyketide synthase type I Pks15,1.7090889999999997e-64,inf,S


In [22]:
# divide the data up into synonymous and nonsynonymous variation
ns_vars=enriched_vars_change[enriched_vars_change.Change=='NS']
s_vars=enriched_vars_change[enriched_vars_change.Change=='S']

In [33]:
# calculate FDR for nonsynonymous SNPs
ns_fdr=multitest.fdrcorrection(ns_vars.enrichment_p)
sum(ns_fdr[0])/len(ns_fdr[0])

0.5845002079578538

In [34]:
# calculate FDR for synonymous SNPs
s_fdr=multitest.fdrcorrection(s_vars.enrichment_p)
sum(s_fdr[0])/len(s_fdr[0])

0.6475437605872388

In [35]:
# calculate FDR for all SNPs combined
fdr=multitest.fdrcorrection(enriched_vars_change.enrichment_p)
sum(fdr[0])/len(fdr[0])

0.6274623827869449

In [36]:
enriched_vars_change.homoplasy_count.value_counts()

1     24535
2      9838
3      6341
4      4571
5      3525
6      2472
7      1592
0       841
8       798
9       359
10      112
11       38
12        4
20        1
13        1
Name: homoplasy_count, dtype: int64