In [1]:
# Inject a CSS rule that sets the notebook's `.container` element to 100% width.
# `display` and `HTML` are imported from the public `IPython.display` module:
# importing `display` from `IPython.core.display` is deprecated (since IPython 7.14).
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
from Bio import AlignIO
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import pickle
from scipy.stats import fisher_exact

### A. Dominant clusters only: 

In [3]:
# Load the summary table of nonsynonymous homoplasy hits in the dominant clusters.
homoplasy_dominant_clust_enrich = pd.read_csv(
    "/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/hits_summaries/MAB_homoplasies_enriched_dominant_clusters_nonsynonymousOnly.csv",
    sep=',',
)


In [4]:
# Keep hits whose homoplasy_count is at least 2, order them by ascending
# enrichment_p, and retain the first 50 rows of that ordering.
homoplasy_dominant_clust_enrich_top = (
    homoplasy_dominant_clust_enrich
    .loc[homoplasy_dominant_clust_enrich['homoplasy_count'] >= 2]
    .sort_values(by='enrichment_p', ascending=True)
    .head(50)
)

In [5]:
# Remove the column-width cap so long text cells are not truncated,
# then show the selected top hits.
pd.options.display.max_colwidth = None
homoplasy_dominant_clust_enrich_top

Unnamed: 0.1,Unnamed: 0,Position,major,minor,Gene,homoplasy_count,Type,Product,Change,enrichment_p,OR
0,9820,3381893,C,G,MAB_3334c,4,intragenic,Aspartyl/glutamyl-tRNA(Asn/Gln) amidotransferase subunit B,NS,1.034483e-65,inf
1,9600,3280500,G,A,MAB_3244,4,intragenic,hypothetical protein,NS,1.71614e-64,inf
2,10171,3520983,G,T,MAB_3480,6,intragenic,hypothetical protein,NS,7.209222e-64,inf
3,12035,4207364,A,G,MAB_4141,2,intragenic,hypothetical protein,NS,1.57779e-62,inf
4,12022,4206743,A,G,MAB_4141,2,intragenic,hypothetical protein,NS,1.57779e-62,inf
5,12015,4206270,A,G,MAB_4141,2,intragenic,hypothetical protein,NS,1.57779e-62,inf
6,9593,3279401,C,T,MAB_3242,4,intragenic,Isopentenyl-diphosphate delta-isomerase,NS,8.071652e-62,inf
7,10174,3522759,A,C,MAB_3481,8,intragenic,hypothetical protein,NS,2.085937e-61,inf
8,12013,4206170,G,A,MAB_4141,2,intragenic,hypothetical protein,NS,1.009786e-60,inf
9,11720,4105392,T,A,MAB_4057c,3,intragenic,D-inositol 3-phosphate glycosyltransferase,NS,2.781329e-60,inf


In [6]:
# Lookup keyed by SNP position; each value is a one-entry dict of the form
# {'Gene': <gene name>}, so the gene is read as pos_gene_dict[position]['Gene'].
pos_gene_dict = (
    homoplasy_dominant_clust_enrich_top
    .set_index('Position')[['Gene']]
    .to_dict(orient='index')
)
pos_gene_dict

{3381893: {'Gene': 'MAB_3334c'},
 3280500: {'Gene': 'MAB_3244'},
 3520983: {'Gene': 'MAB_3480'},
 4207364: {'Gene': 'MAB_4141'},
 4206743: {'Gene': 'MAB_4141'},
 4206270: {'Gene': 'MAB_4141'},
 3279401: {'Gene': 'MAB_3242'},
 3522759: {'Gene': 'MAB_3481'},
 4206170: {'Gene': 'MAB_4141'},
 4105392: {'Gene': 'MAB_4057c'},
 3630909: {'Gene': 'MAB_3581c'},
 1063885: {'Gene': 'MAB_1054'},
 3557910: {'Gene': 'MAB_3515c'},
 3254305: {'Gene': 'MAB_3213c'},
 725333: {'Gene': 'MAB_0722'},
 3562503: {'Gene': 'MAB_3516c'},
 1060677: {'Gene': 'MAB_1052c'},
 726866: {'Gene': 'MAB_0723c'},
 1055007: {'Gene': 'MAB_1046c'},
 3563694: {'Gene': 'MAB_3516c'},
 3564177: {'Gene': 'MAB_3517'},
 3560298: {'Gene': 'MAB_3515c'},
 127291: {'Gene': 'MAB_0129c'},
 1057676: {'Gene': 'MAB_1049'},
 4106059: {'Gene': 'MAB_4058c'},
 4204784: {'Gene': 'MAB_4139'},
 4099736: {'Gene': 'MAB_4052c'},
 3015898: {'Gene': 'MAB_2960'},
 539141: {'Gene': 'MAB_0538'},
 1285623: {'Gene': 'MAB_1280c'},
 4204786: {'Gene': 'MAB_4139'

In [7]:
# Build a dictionary mapping each SNP position in the genome to its index in the
# SNP alignment. Positions are kept as the strings read from the file (one per
# line, newline stripped); the index is the 0-based line number.
# NOTE(review): assumes the positions file is in the same order as the alignment
# columns — confirm against how the two files were generated.
# The file is opened in a `with` block so the handle is closed deterministically
# (the bare open() inside a comprehension left it open).
with open('/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/vars/mab_masked_snp_positions.txt') as snp_pos_file:
    snp_pos=[line.rstrip('\n') for line in snp_pos_file]
snp_pos_series=pd.Series(snp_pos)
snp_pos_dict=dict(zip(snp_pos_series, snp_pos_series.index))

# Read the masked SNP alignment (FASTA) that the indices above refer to.
masked_snpAln_unwrapped="/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/vars/mab_masked_snpAln_unwrapped.fasta"
aln_masked=AlignIO.read(masked_snpAln_unwrapped, "fasta")

In [8]:
# For every selected variant, record per sample whether the minor allele is
# present (1) or not (0). Columns are keyed '<position>_<minor allele>'; the
# gene of each variant is collected in gene_list in the same order.
allele_dict = {'name': [record.id for record in aln_masked]}  # sample names come first
gene_list = []

for pos, minor in zip(homoplasy_dominant_clust_enrich_top['Position'],
                      homoplasy_dominant_clust_enrich_top['minor']):
    # genomic position -> column index in the SNP alignment (dictionary keys are strings)
    snp_position = snp_pos_dict[str(pos)]
    # one 0/1 flag per sample for this alignment column
    minor_allele_list = [int(base == minor) for base in aln_masked[:, snp_position]]
    allele_dict[str(pos) + '_' + minor] = minor_allele_list
    gene_list.append(pos_gene_dict[pos]['Gene'])

In [9]:
# Turn the allele dictionary into a table, keep its first 51 columns
# ('name' plus the SNP columns that follow it) and index the rows by sample name.
dominant_clust_snp_table = (
    pd.DataFrame(allele_dict)
    .iloc[:, :51]
    .set_index('name')
)
dominant_clust_snp_table.head()

Unnamed: 0_level_0,3381893_G,3280500_A,3520983_T,4207364_G,4206743_G,4206270_G,3279401_T,3522759_C,4206170_A,4105392_A,...,975717_G,3237743_A,4108981_C,4193860_G,1879643_G,2124705_T,127360_A,3562673_C,4198370_T,4206143_G
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SAMEA1317694,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,1,1,0,0,0,0
SAMEA1464939,1,1,1,1,1,1,1,1,1,1,...,1,0,1,1,1,0,1,0,1,0
SAMEA1464888,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
SAMEA1464902,1,1,1,1,1,1,1,1,1,1,...,1,0,1,1,1,0,1,0,1,0
SAMEA1464890,1,1,1,1,1,1,1,1,1,1,...,1,0,1,1,1,0,1,0,1,0


In [10]:
# Relabel every SNP column with the gene collected for it in gene_list (same order).
# A gene may occur more than once in gene_list, so the resulting column labels
# are not guaranteed to be unique.
dominant_clust_snp_table.columns = pd.Index(gene_list)
dominant_clust_snp_table

Unnamed: 0_level_0,MAB_3334c,MAB_3244,MAB_3480,MAB_4141,MAB_4141,MAB_4141,MAB_3242,MAB_3481,MAB_4141,MAB_4057c,...,MAB_0968c,MAB_3192c,MAB_4061c,MAB_4128c,MAB_1881c,MAB_2122,MAB_0129c,MAB_3516c,MAB_4133c,MAB_4141
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SAMEA1317694,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,1,1,0,0,0,0
SAMEA1464939,1,1,1,1,1,1,1,1,1,1,...,1,0,1,1,1,0,1,0,1,0
SAMEA1464888,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
SAMEA1464902,1,1,1,1,1,1,1,1,1,1,...,1,0,1,1,1,0,1,0,1,0
SAMEA1464890,1,1,1,1,1,1,1,1,1,1,...,1,0,1,1,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAMEA5396772,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,1,1,0,0,0,0
SAMEA5396773,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SAMEA5396791,1,1,1,1,1,1,1,1,1,1,...,1,0,1,1,1,1,1,0,1,0
SAMEA5396796,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [11]:
# Write the gene-labelled sample x SNP table to disk as a comma-separated file.
dominant_clust_snp_table.to_csv(
    '/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/data/20210902_dominant_cluster_enrichment_snp_table_NS.csv',
    sep=',',
)

In [14]:
!head -500 /n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/data/20210902_dominant_cluster_enrichment_snp_table_NS.csv

name,MAB_3334c,MAB_3244,MAB_3480,MAB_4141,MAB_4141,MAB_4141,MAB_3242,MAB_3481,MAB_4141,MAB_4057c,MAB_3581c,MAB_1054,MAB_3515c,MAB_3213c,MAB_0722,MAB_3516c,MAB_1052c,MAB_0723c,MAB_1046c,MAB_3516c,MAB_3517,MAB_3515c,MAB_0129c,MAB_1049,MAB_4058c,MAB_4139,MAB_4052c,MAB_2960,MAB_0538,MAB_1280c,MAB_4139,MAB_1268c,MAB_3000,MAB_1205,MAB_3531,MAB_2463,MAB_4148c,MAB_4914c,MAB_1870,MAB_0969,MAB_0968c,MAB_3192c,MAB_4061c,MAB_4128c,MAB_1881c,MAB_2122,MAB_0129c,MAB_3516c,MAB_4133c,MAB_4141
SAMEA1317694,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0
SAMEA1464939,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,1,0,1,0
SAMEA1464888,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
SAMEA1464902,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,1,0,1,0
SAMEA1464890,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1