In [8]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
from Bio import AlignIO
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import pickle
from scipy.stats import fisher_exact
from statsmodels.stats import multitest

In this analysis I'm testing the significance of SNP enrichment within clusters, but have only considered SNPs that are homoplasic (occur at least twice in the tree). 

### A. significance of NS SNPs in clusters:

In [3]:
# read in summary of nonsynonymous hits in dominant clusters:
homoplasy_dominant_clust_enrich=pd.read_csv("/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/hits_summaries/MAB_homoplasies_enriched_dominant_clusters_nonsynonymousOnly.csv", sep=',')


In [4]:
len(homoplasy_dominant_clust_enrich)

6481

In [5]:
homoplasy_dominant_clust_enrichment=homoplasy_dominant_clust_enrich[homoplasy_dominant_clust_enrich.homoplasy_count>=2].sort_values(by='enrichment_p', ascending=True)

In [7]:
homoplasy_dominant_clust_enrichment.homoplasy_count.value_counts()

2     2389
3     1456
4      953
5      669
6      489
7      288
8      145
9       60
10      22
11       8
20       1
12       1
Name: homoplasy_count, dtype: int64

In [11]:
pd.set_option('display.max_colwidth', None)
homoplasy_dominant_clust_enrichment.head(20)

Unnamed: 0.1,Unnamed: 0,Position,major,minor,Gene,homoplasy_count,Type,Product,Change,enrichment_p,OR
0,9820,3381893,C,G,MAB_3334c,4,intragenic,Aspartyl/glutamyl-tRNA(Asn/Gln) amidotransferase subunit B,NS,1.034483e-65,inf
1,9600,3280500,G,A,MAB_3244,4,intragenic,hypothetical protein,NS,1.71614e-64,inf
2,10171,3520983,G,T,MAB_3480,6,intragenic,hypothetical protein,NS,7.209222e-64,inf
3,12035,4207364,A,G,MAB_4141,2,intragenic,hypothetical protein,NS,1.57779e-62,inf
4,12022,4206743,A,G,MAB_4141,2,intragenic,hypothetical protein,NS,1.57779e-62,inf
5,12015,4206270,A,G,MAB_4141,2,intragenic,hypothetical protein,NS,1.57779e-62,inf
6,9593,3279401,C,T,MAB_3242,4,intragenic,Isopentenyl-diphosphate delta-isomerase,NS,8.071652e-62,inf
7,10174,3522759,A,C,MAB_3481,8,intragenic,hypothetical protein,NS,2.085937e-61,inf
8,12013,4206170,G,A,MAB_4141,2,intragenic,hypothetical protein,NS,1.009786e-60,inf
9,11720,4105392,T,A,MAB_4057c,3,intragenic,D-inositol 3-phosphate glycosyltransferase,NS,2.781329e-60,inf


In [None]:
# collapse variants in the same gene:
homoplasy_dominant_clust_enrichment

In [10]:
# significance threshold testing
fdr=multitest.fdrcorrection(homoplasy_dominant_clust_enrichment.enrichment_p)

In [63]:
pd.DataFrame(fdr)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6471,6472,6473,6474,6475,6476,6477,6478,6479,6480
0,True,True,True,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
1,6.70449e-62,5.56115e-61,1.55743e-60,1.70428e-59,1.70428e-59,1.70428e-59,7.4732e-59,1.68987e-58,7.27158e-58,1.80258e-57,...,1,1,1,1,1,1,1,1,1,1


In [64]:
sum(fdr[0])

5323

In [65]:
sum(fdr[0])/len(fdr[0])

0.8213238697731832

In [66]:
0.05/len(homoplasy_dominant_clust_enrichment)

7.71485881808363e-06

In [67]:
homoplasy_dominant_clust_enrichment.enrichment_p[homoplasy_dominant_clust_enrichment.enrichment_p<0.05/len(homoplasy_dominant_clust_enrichment)]

0       1.034483e-65
1       1.716140e-64
2       7.209222e-64
3       1.577790e-62
4       1.577790e-62
            ...     
3188    7.552118e-06
3189    7.556738e-06
3190    7.570790e-06
3191    7.639187e-06
3192    7.667898e-06
Name: enrichment_p, Length: 3193, dtype: float64

### A high proportion of NS variants are significantly enriched within clusters. Now I want to see if this is higher among NS variants v. S variants:

In [68]:
# read in summary of nonsynonymous hits in dominant clusters:
homoplasy_dominant_clust_all_enrich=pd.read_csv("/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/hits_summaries/MAB_homoplasies_enriched_dominant_clusters.csv", sep=',')


In [69]:
len(homoplasy_dominant_clust_all_enrich)

19741

In [70]:
homoplasy_dominant_clust_all_enrich.head()

Unnamed: 0.1,Unnamed: 0,Position,major,minor,Gene,homoplasy_count,Type,Product,enrichment_p,OR
0,13841,1217813,C,T,MAB_1203,3,intragenic,Mycothiol S-conjugate amidase,4.254548e-67,2914.0
1,38491,3381893,C,G,MAB_3334c,4,intragenic,Aspartyl/glutamyl-tRNA(Asn/Gln) amidotransferase subunit B,1.034483e-65,inf
2,24054,2119889,G,T,MAB_2120c,6,intragenic,Putative inactive phenolphthiocerol synthesis polyketide synthase type I Pks15,1.7090889999999997e-64,inf
3,37476,3280500,G,A,MAB_3244,4,intragenic,hypothetical protein,1.71614e-64,inf
4,37480,3280832,G,A,MAB_3244,4,intragenic,hypothetical protein,1.71614e-64,inf


In [71]:
# first let's merge with the mutation events table to get the change information so I can confirm which changes are synonymous
recombFree_mutation_events=pd.read_csv('../snppar_output/mab_recombinationFree/mab_recombFree_snppar_all_mutation_events.tsv', sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [72]:
recombFree_mutation_events.head()

Unnamed: 0,Position,Type,Ancestor_Node,Derived_Node,Ancestor_Call,Derived_Call,Gene,Strand,Codon,Codon_Position,...,Derived_Codon,Ancestor_A.A.,Derived_A.A.,Change,Up_Gene,Up_Gene_Strand,Up_Gene_Distance,Down_Gene,Down_Gene_Strand,Down_Gene_Distance
0,87,intragenic,NODE_0000002,SAMEA2259646,G,A,MAB_0001,1,29,3,...,TCA,S,S,S,-,-,-,-,-,-
1,87,intragenic,NODE_0000013,SAMEA2071363,G,A,MAB_0001,1,29,3,...,TCA,S,S,S,-,-,-,-,-,-
2,87,intragenic,NODE_0000039,NODE_0000040,G,A,MAB_0001,1,29,3,...,TCA,S,S,S,-,-,-,-,-,-
3,87,intragenic,NODE_0000057,NODE_0000058,G,A,MAB_0001,1,29,3,...,TCA,S,S,S,-,-,-,-,-,-
4,87,intragenic,NODE_0000061,SAMN09758982,G,A,MAB_0001,1,29,3,...,TCA,S,S,S,-,-,-,-,-,-


In [73]:
mut_events_sparse=recombFree_mutation_events[['Ancestor_Call', 'Derived_Call', 'Position', 'Change']].rename(columns={"Ancestor_Call": "major", "Derived_Call": "minor"})
mut_events_sparse=mut_events_sparse.drop_duplicates()


In [74]:
enriched_vars_change=homoplasy_dominant_clust_all_enrich.merge(mut_events_sparse, how="left", on=['major', 'minor', 'Position'])

In [75]:
enriched_vars_change[enriched_vars_change.Change=='NS']

Unnamed: 0.1,Unnamed: 0,Position,major,minor,Gene,homoplasy_count,Type,Product,enrichment_p,OR,Change
1,38491,3381893,C,G,MAB_3334c,4,intragenic,Aspartyl/glutamyl-tRNA(Asn/Gln) amidotransferase subunit B,1.034483e-65,inf,NS
3,37476,3280500,G,A,MAB_3244,4,intragenic,hypothetical protein,1.716140e-64,inf,NS
6,39890,3520983,G,T,MAB_3480,6,intragenic,hypothetical protein,7.209222e-64,inf,NS
10,37460,3279401,C,T,MAB_3242,4,intragenic,Isopentenyl-diphosphate delta-isomerase,8.071652e-62,inf,NS
11,39909,3522759,A,C,MAB_3481,8,intragenic,hypothetical protein,2.085937e-61,inf,NS
...,...,...,...,...,...,...,...,...,...,...,...
19796,27487,2426048,C,A,MAB_2369,4,intragenic,Segregation and condensation protein B,1.000000e+00,,NS
19803,11616,1023325,A,G,MAB_1014c,3,intragenic,D-inositol-3-phosphate glycosyltransferase,1.000000e+00,,NS
19804,11613,1023046,A,G,MAB_1014c,4,intragenic,D-inositol-3-phosphate glycosyltransferase,1.000000e+00,,NS
19805,11615,1023240,T,C,MAB_1014c,4,intragenic,D-inositol-3-phosphate glycosyltransferase,1.000000e+00,,NS


In [76]:
synonymous_vars=enriched_vars_change[enriched_vars_change.Change=='S']

In [77]:
synonymous_vars.homoplasy_count.value_counts()

3     4396
4     3257
5     2637
6     1802
7     1210
8      603
9      275
10      82
11      29
12       3
13       1
Name: homoplasy_count, dtype: int64

In [78]:
s_fdr=multitest.fdrcorrection(synonymous_vars.enrichment_p)

In [79]:
sum(s_fdr[0])/len(s_fdr[0])

0.8384749912556838