In [3]:
# Cosmetic only: inject a CSS rule so the notebook container spans the full
# browser width.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated and is
# reported to fail on newer IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [4]:
from Bio import AlignIO
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import pickle
from scipy.stats import fisher_exact

### A. Dominant clusters only

In [5]:
# Load the per-position summary table of nonsynonymous homoplasy hits
# (comma-separated; includes the enrichment p-value and odds ratio columns).
# NOTE(review): the file name says "all_clusters" while this section is about
# dominant clusters only -- confirm this is the intended input.
homoplasy_dominant_clust_enrich = pd.read_csv(
    "/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/hits_summaries/MAB_homoplasies_enriched_all_clusters_nonsynonymousOnly.csv",
    sep=',',
)


In [6]:
# Keep positions with more than two homoplasy events, order them by enrichment
# p-value (smallest first) and retain the first 50 rows of that ranking.
homoplasy_dominant_clust_enrich_top = (
    homoplasy_dominant_clust_enrich
    .loc[homoplasy_dominant_clust_enrich['homoplasy_count'] > 2]
    .sort_values(by='enrichment_p')
    .head(50)
)

In [7]:
# Disable column-width truncation so long text cells are rendered in full,
# then display the ranked table.
pd.options.display.max_colwidth = None
homoplasy_dominant_clust_enrich_top

Unnamed: 0.1,Unnamed: 0,Position,major,minor,Gene,homoplasy_count,Type,Product,Change,enrichment_p,OR
0,8089,2792483,A,G,MAB_2746c,8,intragenic,putative cysteine desulfurase,NS,9.187823000000001e-33,258.611111
1,2972,1015358,C,T,MAB_1008c,4,intragenic,hypothetical protein,NS,1.6911790000000002e-29,427.357143
2,11710,4101165,C,G,MAB_4053c,6,intragenic,putative oxidoreductase EphD,NS,8.107312e-28,378.0
3,8724,3015898,C,T,MAB_2960,5,intragenic,hypothetical protein,NS,1.6137419999999998e-26,143.0
4,12042,4208092,T,C,MAB_4142c,4,intragenic,hypothetical protein,NS,4.3564959999999996e-26,201.6
5,3480,1219200,G,C,MAB_1205,3,intragenic,hypothetical protein,NS,8.245759e-26,119.205882
6,13480,4723353,C,G,MAB_4638,6,intragenic,Galactokinase,NS,5.517847e-25,0.0
7,8111,2798122,G,A,MAB_2751,3,intragenic,Alpha-(1->6)-mannopyranosyltransferase,NS,3.624668e-24,154.098361
8,2994,1021957,C,A,MAB_1013,5,intragenic,hypothetical protein,NS,6.621221e-24,130.454545
9,3121,1057676,C,T,MAB_1049,5,intragenic,hypothetical protein,NS,7.042449e-24,113.890625


In [8]:
# Look-up from SNP position to gene name, shaped as
# {position: {'Gene': gene_name}} (one inner dict per position).
pos_gene_dict = (
    homoplasy_dominant_clust_enrich_top
    .set_index('Position')[['Gene']]
    .to_dict(orient='index')
)
pos_gene_dict

{2792483: {'Gene': 'MAB_2746c'},
 1015358: {'Gene': 'MAB_1008c'},
 4101165: {'Gene': 'MAB_4053c'},
 3015898: {'Gene': 'MAB_2960'},
 4208092: {'Gene': 'MAB_4142c'},
 1219200: {'Gene': 'MAB_1205'},
 4723353: {'Gene': 'MAB_4638'},
 2798122: {'Gene': 'MAB_2751'},
 1021957: {'Gene': 'MAB_1013'},
 1057676: {'Gene': 'MAB_1049'},
 1322464: {'Gene': 'MAB_1320c'},
 2885011: {'Gene': 'MAB_2833'},
 2548457: {'Gene': 'MAB_2493c'},
 4350415: {'Gene': 'MAB_4278'},
 1022317: {'Gene': 'MAB_1013'},
 4111442: {'Gene': 'MAB_4064'},
 3404413: {'Gene': 'MAB_3356c'},
 758980: {'Gene': 'MAB_0758'},
 1063885: {'Gene': 'MAB_1054'},
 4281931: {'Gene': 'MAB_4213c'},
 1022419: {'Gene': 'MAB_1013'},
 1068419: {'Gene': 'MAB_1058'},
 2881443: {'Gene': 'MAB_2829c'},
 4348186: {'Gene': 'MAB_4275c'},
 4106059: {'Gene': 'MAB_4058c'},
 2184462: {'Gene': 'MAB_2169c'},
 2803597: {'Gene': 'MAB_2756c'},
 2803099: {'Gene': 'MAB_2755c'},
 4207353: {'Gene': 'MAB_4141'},
 1060677: {'Gene': 'MAB_1052c'},
 3236506: {'Gene': 'MAB_31

In [9]:
# Build a dictionary mapping each SNP position in the genome to its index in the
# SNP alignment. Keys are the position strings exactly as read from the file
# (one per line, trailing newline removed); values are the 0-based line numbers.
# NOTE(review): this assumes the line order of the positions file matches the
# column order of the SNP alignment loaded below -- confirm.
# The file is read inside a `with` block so the handle is closed deterministically
# instead of being left open for the lifetime of the kernel.
with open('/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/vars/mab_masked_snp_positions.txt') as snp_pos_file:
    snp_pos=[line.rstrip('\n') for line in snp_pos_file]
snp_pos_series=pd.Series(snp_pos)
snp_pos_dict=dict(zip(snp_pos_series, snp_pos_series.index))

# Load the masked SNP alignment (FASTA) that the positions above index into.
masked_snpAln_unwrapped="/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/vars/mab_masked_snpAln_unwrapped.fasta"
aln_masked=AlignIO.read(masked_snpAln_unwrapped, "fasta")

In [10]:
# Build the raw material for a per-sample presence/absence table of the top
# variants: one list per variant holding 1 where the sample carries the minor
# allele and 0 otherwise, keyed by "<position>_<minor allele>".
# `gene_list` collects the gene of each variant in the same order as the keys.
allele_dict = {'name': [record.id for record in aln_masked]}  # sample names, in alignment order
gene_list = []

for pos, minor in zip(homoplasy_dominant_clust_enrich_top['Position'],
                      homoplasy_dominant_clust_enrich_top['minor']):
    # positions were read from a text file, so the look-up key is the string form
    snp_position = snp_pos_dict[str(pos)]
    # walk down the alignment column for this SNP, one entry per sample
    minor_allele_list = [int(base == minor) for base in aln_masked[:, snp_position]]
    allele_dict['_'.join((str(pos), minor))] = minor_allele_list
    gene_list.append(pos_gene_dict[pos]['Gene'])

In [11]:
# Turn the allele dictionary into a DataFrame indexed by sample name.
# The positional slice keeps at most the first 51 columns, i.e. 'name' plus
# up to 50 variant columns.
dominant_clust_snp_table = (
    pd.DataFrame(allele_dict)
    .iloc[:, :51]
    .set_index('name')
)
dominant_clust_snp_table.head()

Unnamed: 0_level_0,2792483_G,1015358_T,4101165_G,3015898_T,4208092_C,1219200_C,4723353_G,2798122_A,1021957_A,1057676_T,...,3630909_A,4278532_T,3520983_T,4405885_C,2182742_A,1857888_A,4207085_A,4108981_C,4422047_A,2105967_T
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SAMEA1317694,1,0,0,0,0,0,0,0,0,0,...,1,0,1,0,1,0,0,0,0,0
SAMEA1464939,1,1,1,1,1,1,0,1,1,1,...,0,1,1,1,1,0,1,1,0,1
SAMEA1464888,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
SAMEA1464902,1,1,1,1,1,1,0,1,1,1,...,0,1,1,1,1,0,1,1,0,1
SAMEA1464890,1,1,1,1,1,1,0,1,1,1,...,0,1,1,1,1,0,1,1,0,1


In [12]:
# Replace the "<position>_<allele>" column labels with the gene each variant
# falls in (same order as the columns were built).
# NOTE(review): several genes carry more than one top variant (e.g. MAB_1013
# occurs repeatedly in the exported header), so the resulting column labels
# are not unique -- confirm downstream consumers tolerate duplicates.
dominant_clust_snp_table = dominant_clust_snp_table.set_axis(gene_list, axis=1)
dominant_clust_snp_table

Unnamed: 0_level_0,MAB_2746c,MAB_1008c,MAB_4053c,MAB_2960,MAB_4142c,MAB_1205,MAB_4638,MAB_2751,MAB_1013,MAB_1049,...,MAB_3581c,MAB_4209c,MAB_3480,MAB_4325c,MAB_2167,MAB_1860,MAB_4141,MAB_4061c,MAB_4341,MAB_2104c
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SAMEA1317694,1,0,0,0,0,0,0,0,0,0,...,1,0,1,0,1,0,0,0,0,0
SAMEA1464939,1,1,1,1,1,1,0,1,1,1,...,0,1,1,1,1,0,1,1,0,1
SAMEA1464888,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
SAMEA1464902,1,1,1,1,1,1,0,1,1,1,...,0,1,1,1,1,0,1,1,0,1
SAMEA1464890,1,1,1,1,1,1,0,1,1,1,...,0,1,1,1,1,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAMEA5396772,1,0,0,0,0,0,0,0,0,0,...,1,0,1,1,1,0,0,0,0,0
SAMEA5396773,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
SAMEA5396791,1,1,1,1,1,1,0,1,1,1,...,1,1,1,1,1,1,1,1,0,1
SAMEA5396796,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [13]:
# Persist the sample-by-gene presence/absence table as comma-separated text;
# the sample-name index is written as the first column.
dominant_clust_snp_table.to_csv(
    '/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/data/20210902_dominant_cluster_enrichment_snp_table_NS.csv',
    sep=',',
)

In [14]:
!head -500 /n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/data/20210902_dominant_cluster_enrichment_snp_table_NS.csv

name,MAB_2746c,MAB_1008c,MAB_4053c,MAB_2960,MAB_4142c,MAB_1205,MAB_4638,MAB_2751,MAB_1013,MAB_1049,MAB_1320c,MAB_2833,MAB_2493c,MAB_4278,MAB_1013,MAB_4064,MAB_3356c,MAB_0758,MAB_1054,MAB_4213c,MAB_1013,MAB_1058,MAB_2829c,MAB_4275c,MAB_4058c,MAB_2169c,MAB_2756c,MAB_2755c,MAB_4141,MAB_1052c,MAB_3191,MAB_4276c,MAB_1046c,MAB_1052c,MAB_2834c,MAB_3607,MAB_4376c,MAB_3084c,MAB_4325c,MAB_4325c,MAB_3581c,MAB_4209c,MAB_3480,MAB_4325c,MAB_2167,MAB_1860,MAB_4141,MAB_4061c,MAB_4341,MAB_2104c
SAMEA1317694,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0
SAMEA1464939,1,1,1,1,1,1,0,1,1,1,1,1,0,0,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,0,0,0,1,1,1,1,0,1,1,1,1,0,1,1,0,1
SAMEA1464888,0,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
SAMEA1464902,1,1,1,1,1,1,0,1,1,1,1,1,0,0,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,0,0,0,1,1,1,1,0,1,1,1,1,0,1,1,0,1
SAMEA1464890,1,1,1,1,1,1,0,1,1,1,1,1,0,0,1,1,0,1,1,1,1,1,1,1