In [1]:
# Widen the notebook container to the full browser width.
# `display` and `HTML` are imported from the public `IPython.display` module:
# importing `display` from `IPython.core.display` has been deprecated since
# IPython 7.14 and fails on recent IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# note ete3 is not fully compatible with python 3.7. I was able to use it still in my base environment, 
# but was unable to install it in the homoplasy environment. In the future I may need to use ete3 in a separate
# environment with an earlier version of python. 
from ete3 import Tree
from Bio import AlignIO
from cyvcf2 import VCF

###### Purpose: 
prepare input data for using SNPPar according to requirements described here: https://github.com/d-j-e/SNPPar

20210508NC

### 1. [Get full-length recombination-free alignment](#1)
### 2. [Get list of SNP positions](#2)
### 3. [Use SNP sites to get SNP alignments](#3)
### 4. [Convert residual Ns to -s](#4)

<a id="1"></a>
### 1. Get a full length, recombination-free alignment

Currently I only have the standard output of Gubbins, which is a recombination-free SNP alignment. Because I want to know the position of each SNP in the alignment, I need to start with a full-length alignment, but I also want to exclude recombination events. Here I use a script from Nick Croucher to mask the full-length alignment using the Gubbins output. The script can be found here: https://github.com/sanger-pathogens/gubbins/tree/masking_aln




In [3]:
# Print the command-line usage of the Gubbins masking script used in this section.
!python3 ../scripts/mask_gubbins_aln.py -h

usage: mask_gubbins_aln [-h] --aln ALN --gff GFF --out OUT [--out-fmt OUT_FMT]
                        [--missing-char MISSING_CHAR]

Mask recombinant regions detected by Gubbins from the input alignment

optional arguments:
  -h, --help            show this help message and exit
  --aln ALN             Input alignment (FASTA format)
  --gff GFF             GFF of recombinant regions detected by Gubbins
  --out OUT             Output file name
  --out-fmt OUT_FMT     Format of output alignment
  --missing-char MISSING_CHAR
                        Character used to replace recombinant sequence


In [4]:
# Full-length input alignments (one per lineage: MAS and BOL) to be masked with the
# Gubbins recombination predictions.
mas_input_alignment = '/n/data1/hms/dbmi/farhat/nikki/abscessus/fasta_for_gubbins/mas_MSA_for_Gubbins_w_outgroup_ref.fasta'
bol_input_alignment = '/n/data1/hms/dbmi/farhat/nikki/abscessus/fasta_for_gubbins/bol_MSA_for_Gubbins_w_outgroup_ref.fasta'

In [5]:
# GFF files of the recombinant regions predicted by Gubbins (passed to --gff of the masking script).
mas_gubbins_gff = '/n/data1/hms/dbmi/farhat/nikki/abscessus/gubbins/mas/raxml/Gubbins_run1/mas_raxml.recombination_predictions.gff'
bol_gubbins_gff = '/n/data1/hms/dbmi/farhat/nikki/abscessus/gubbins/bol/raxml/gubbins_run2_rem_lowq/bol_raxml_run2.recombination_predictions.gff'

In [6]:
# Output locations for the full-length, Gubbins-masked alignments (relative to this notebook).
MAS_full_masked_aln_for_snpPar = '../vars/mas_fullLengthAln_gubbinsMasked_for_snpPar_20210519.fasta'
BOL_full_masked_aln_for_snpPar = '../vars/bol_fullLengthAln_gubbinsMasked_for_snpPar_20210519.fasta'

In [8]:
#!python3 ../scripts/mask_gubbins_aln.py --aln $bol_input_alignment --gff $bol_gubbins_gff --out $BOL_full_masked_aln_for_snpPar --missing-char '-' --out-fmt 'fasta'

In [57]:
#!python3 ../scripts/mask_gubbins_aln.py --aln $mas_input_alignment --gff $mas_gubbins_gff --out $MAS_full_masked_aln_for_snpPar --missing-char '-' --out-fmt 'fasta'

**Note:** in this masked alignment, Ns represent sites I masked previously because of quality issues, and -s represent recombination. This is the opposite of how Gubbins outputs its alignment, so be sure to account for that later when tabulating the % of the genome that is predicted to be in recombination regions.

<a id="2"></a>
## 2. Use SNP sites to generate VCFs from the SNP data and get the SNP positions

### First make sure the isolates in the tree match the isolates in the fasta file:

Now I have two alignments, input alignment and full_masked_aln_for_snpPar

In [10]:
# Rooted IQ-TREE trees (outgroup removed, per the file names) for each lineage.
mas_tree_path = '/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/working_trees/mas/mas_iqtree_noOutgroup_ROOTED.tree'
bol_tree_path = '/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/working_trees/bol/bol_iqtree_noOutgroup_ROOTED.tree'

In [11]:
# Load both lineage trees with ete3.
# NOTE(review): format=0 should be ete3's default newick flavour (internal node
# values read as support) — confirm against the ete3 documentation.
mas_tree = Tree(mas_tree_path, format=0)
bol_tree = Tree(bol_tree_path, format=0)


#### [2.1] MAS:

In [20]:
# Collect the name of every leaf of the MAS tree (one leaf per isolate).
mas_isolates_in_tree = [leaf.name for leaf in mas_tree.get_leaves()]

In [21]:
# Number of leaves (isolates) found in the MAS tree.
len(mas_isolates_in_tree)

169

In [22]:
# write isolate list to a text file
# with open('../vars/isolates_in_mas_tree.txt', 'w') as filehandle:
#     for isolate in mas_isolates_in_tree:
#         filehandle.write('%s\n' % isolate)

In [19]:
# Destinations for the MAS alignments after subsetting to the isolates present in the tree:
# one for the unmasked input alignment, one for the Gubbins-masked alignment.
mas_input_aln_filtered = '/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/vars/mas_unmasked_msa_treeIsolatesFiltered.fasta'
mas_masked_aln_filtered = '/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/vars/mas_masked_msa_treeIsolatesFiltered.fasta'

2.1.A. Filter the unmasked alignment:

In [24]:
#use seqkit to subset the fasta file
#!seqkit grep -f ../vars/isolates_in_mas_tree.txt $mas_input_alignment > $mas_input_aln_filtered

In [25]:
# double check I have the right number of sequences in the fasta file
#!grep ">" $mas_input_aln_filtered | wc -l

2.1.B. Filter the recombination free alignment:

In [26]:
#use seqkit to subset the fasta file
#!seqkit grep -f ../vars/isolates_in_mas_tree.txt $MAS_full_masked_aln_for_snpPar > $mas_masked_aln_filtered

In [27]:
# double check I have the right number of sequences in the fasta file
#!grep ">" $mas_masked_aln_filtered | wc -l

#### [2.2] BOL:

In [12]:
# Collect the name of every leaf of the BOL tree (one leaf per isolate).
bol_isolates_in_tree = [leaf.name for leaf in bol_tree.get_leaves()]
# write isolate list to a text file
# with open('../vars/isolates_in_bol_tree.txt', 'w') as filehandle:
#     for isolate in bol_isolates_in_tree:
#         filehandle.write('%s\n' % isolate)

In [13]:
# Destinations for the BOL alignments after subsetting to the isolates present in the tree:
# one for the unmasked input alignment, one for the Gubbins-masked alignment.
bol_input_aln_filtered = '/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/vars/bol_unmasked_msa_treeIsolatesFiltered.fasta'
bol_masked_aln_filtered = '/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/vars/bol_masked_msa_treeIsolatesFiltered.fasta'

2.2.A. Filter the unmasked alignment:

In [14]:
# #use seqkit to subset the fasta file
# !seqkit grep -f ../vars/isolates_in_bol_tree.txt $bol_input_alignment > $bol_input_aln_filtered
# # double check I have the right number of sequences in the fasta file
# !grep ">" $bol_input_aln_filtered | wc -l

2.2.B. Filter the recombination-free alignment:

In [15]:
# Use seqkit to subset the masked BOL alignment to the isolates listed in
# ../vars/isolates_in_bol_tree.txt, writing the result to $bol_masked_aln_filtered.
# NOTE(review): the cells that write the isolate list and the masked alignment are
# commented out in this notebook, so both files must already exist on disk — confirm
# before re-running from a fresh kernel.
!seqkit grep -f ../vars/isolates_in_bol_tree.txt $BOL_full_masked_aln_for_snpPar > $bol_masked_aln_filtered
# double check I have the right number of sequences in the fasta file
# (counts the lines containing ">", i.e. the FASTA header lines)
!grep ">" $bol_masked_aln_filtered | wc -l

59


### Use snp-sites to get an output VCF:

##### [MAS]: 

A. Unmasked alignment:

In [20]:
# Run snp-sites on the unmasked, tree-filtered MAS alignment and write a VCF.
# NOTE(review): -v presumably selects VCF output and -o names the output file — confirm
# against the snp-sites help. The input FASTA is only produced by a commented-out
# seqkit cell earlier in this notebook, so it must already exist on disk.
mas_unmasked_vcf="/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/vars/mas_snpSites_unmasked.vcf"
!snp-sites -v -o $mas_unmasked_vcf $mas_input_aln_filtered

B. Masked alignment:

In [22]:
# Run snp-sites on the Gubbins-masked, tree-filtered MAS alignment and write a VCF.
# NOTE(review): this file name starts with "mab_" whereas every other MAS output uses
# "mas_" — possibly a typo; left unchanged because other code may read this exact path.
mas_masked_vcf="/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/vars/mab_snpSites_masked.vcf"
!snp-sites -v -o $mas_masked_vcf $mas_masked_aln_filtered 

##### [BOL]:

In [24]:
# Run snp-sites on the unmasked, tree-filtered BOL alignment and write a VCF.
# NOTE(review): the input FASTA is only produced by a commented-out seqkit cell earlier
# in this notebook, so it must already exist on disk.
bol_unmasked_vcf="/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/vars/bol_snpSites_unmasked.vcf"
!snp-sites -v -o $bol_unmasked_vcf $bol_input_aln_filtered

In [25]:
# Run snp-sites on the Gubbins-masked, tree-filtered BOL alignment and write a VCF.
bol_masked_vcf="/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/vars/bol_snpSites_masked.vcf"
!snp-sites -v -o $bol_masked_vcf $bol_masked_aln_filtered 

### parse VCF for all the SNP positions

Get SNP positions from the unmasked and masked alignments:

In [16]:
#pip install cyvcf2

##### [MAS]:

In [27]:
# Read the snp-sites VCF of the unmasked MAS alignment and keep the POS of every record.
mas_v1 = VCF(mas_unmasked_vcf)
mas_snp_pos = [record.POS for record in mas_v1]

# Save the positions to a text file, one position per line.
with open('../vars/mas_unmasked_snp_positions.txt', 'w') as filehandle:
    filehandle.writelines('%s\n' % snp_position for snp_position in mas_snp_pos)

In [28]:
# Number of SNP positions parsed from the unmasked MAS VCF.
len(mas_snp_pos)

163056

In [29]:
# Read the snp-sites VCF of the masked MAS alignment and keep the POS of every record.
mas_v2 = VCF(mas_masked_vcf)
mas_snp_pos_masked = [record.POS for record in mas_v2]

# Save the positions to a text file, one position per line.
with open('../vars/mas_masked_snp_positions.txt', 'w') as filehandle:
    filehandle.writelines('%s\n' % snp_position for snp_position in mas_snp_pos_masked)

In [30]:
# Number of SNP positions parsed from the masked MAS VCF.
len(mas_snp_pos_masked)

41718

##### [BOL]:

In [31]:
# Read the snp-sites VCF of the unmasked BOL alignment and keep the POS of every record.
bol_v1 = VCF(bol_unmasked_vcf)
bol_snp_pos = [record.POS for record in bol_v1]

# Save the positions to a text file, one position per line.
with open('../vars/bol_unmasked_snp_positions.txt', 'w') as filehandle:
    filehandle.writelines('%s\n' % snp_position for snp_position in bol_snp_pos)

In [32]:
# Number of SNP positions parsed from the unmasked BOL VCF.
len(bol_snp_pos)

113880

In [33]:
# Read the snp-sites VCF of the masked BOL alignment and keep the POS of every record.
bol_v2 = VCF(bol_masked_vcf)
bol_snp_pos_masked = [record.POS for record in bol_v2]

# Save the positions to a text file, one position per line.
with open('../vars/bol_masked_snp_positions.txt', 'w') as filehandle:
    filehandle.writelines('%s\n' % snp_position for snp_position in bol_snp_pos_masked)

In [34]:
# Number of SNP positions parsed from the masked BOL VCF.
len(bol_snp_pos_masked)

56914

<a id="3"></a>
## 3. Use SNP sites to get SNP alignments

### 3.1 MAS

3.1.A. get snps from the unmasked MAS alignment:

In [36]:
# Run snp-sites without -v on the unmasked, tree-filtered MAS alignment to write a
# SNP-only alignment to the path given with -o.
# NOTE(review): the input FASTA is only produced by a commented-out seqkit cell earlier
# in this notebook, so it must already exist on disk.
mas_unmasked_snp_alignment="/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/vars/mas_unmasked_snpAln.fasta"
!snp-sites -o $mas_unmasked_snp_alignment $mas_input_aln_filtered

163056

In [38]:
# Check that the SNP alignment length matches the number of SNP positions parsed from
# the unmasked MAS VCF. The check is enforced with an assert instead of being compared
# by eye against an earlier cell's output; the length is still displayed as the cell result.
mas_aln_unmasked=AlignIO.read(mas_unmasked_snp_alignment, "fasta")
mas_aln_unmasked_length = mas_aln_unmasked.get_alignment_length()
assert mas_aln_unmasked_length == len(mas_snp_pos), (
    'MAS unmasked SNP alignment has %d columns but %d SNP positions were parsed from the VCF'
    % (mas_aln_unmasked_length, len(mas_snp_pos))
)
mas_aln_unmasked_length

163056

3.1.B. get snps from the masked MAS alignment:

In [40]:
# Run snp-sites without -v on the Gubbins-masked, tree-filtered MAS alignment to write a
# SNP-only alignment to the path given with -o.
mas_masked_snp_alignment="/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/vars/mas_masked_snpAln.fasta"
!snp-sites -o $mas_masked_snp_alignment $mas_masked_aln_filtered

In [41]:
# Check that the SNP alignment length matches the number of SNP positions parsed from
# the masked MAS VCF. The check is enforced with an assert instead of being compared
# by eye against an earlier cell's output; the length is still displayed as the cell result.
mas_aln_masked=AlignIO.read(mas_masked_snp_alignment, "fasta")
mas_aln_masked_length = mas_aln_masked.get_alignment_length()
assert mas_aln_masked_length == len(mas_snp_pos_masked), (
    'MAS masked SNP alignment has %d columns but %d SNP positions were parsed from the VCF'
    % (mas_aln_masked_length, len(mas_snp_pos_masked))
)
mas_aln_masked_length

41718

### 3.2 BOL


3.2.A. get snps from the unmasked BOL alignment:

In [42]:
# Run snp-sites without -v on the unmasked, tree-filtered BOL alignment to write a
# SNP-only alignment to the path given with -o.
# NOTE(review): the input FASTA is only produced by a commented-out seqkit cell earlier
# in this notebook, so it must already exist on disk.
bol_unmasked_snp_alignment="/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/vars/bol_unmasked_snpAln.fasta"
!snp-sites -o $bol_unmasked_snp_alignment $bol_input_aln_filtered

In [43]:
# Check that the SNP alignment length matches the number of SNP positions parsed from
# the unmasked BOL VCF. The check is enforced with an assert instead of being compared
# by eye against an earlier cell's output; the length is still displayed as the cell result.
bol_aln_unmasked=AlignIO.read(bol_unmasked_snp_alignment, "fasta")
bol_aln_unmasked_length = bol_aln_unmasked.get_alignment_length()
assert bol_aln_unmasked_length == len(bol_snp_pos), (
    'BOL unmasked SNP alignment has %d columns but %d SNP positions were parsed from the VCF'
    % (bol_aln_unmasked_length, len(bol_snp_pos))
)
bol_aln_unmasked_length

113880

3.2.B. get snps from the masked BOL alignment:

In [44]:
# Run snp-sites without -v on the Gubbins-masked, tree-filtered BOL alignment to write a
# SNP-only alignment to the path given with -o.
bol_masked_snp_alignment="/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/vars/bol_masked_snpAln.fasta"
!snp-sites -o $bol_masked_snp_alignment $bol_masked_aln_filtered

In [45]:
# Check that the SNP alignment length matches the number of SNP positions parsed from
# the masked BOL VCF. The check is enforced with an assert instead of being compared
# by eye against an earlier cell's output; the length is still displayed as the cell result.
bol_aln_masked=AlignIO.read(bol_masked_snp_alignment, "fasta")
bol_aln_masked_length = bol_aln_masked.get_alignment_length()
assert bol_aln_masked_length == len(bol_snp_pos_masked), (
    'BOL masked SNP alignment has %d columns but %d SNP positions were parsed from the VCF'
    % (bol_aln_masked_length, len(bol_snp_pos_masked))
)
bol_aln_masked_length

56914

<a id="4"></a>
### 4. Convert remaining N to -

SNPPar wants an input fasta with all missing or ambiguous sites to be - but my alignments still have some Ns in them. Here I convert the 'N's to '-'s using the script convert_aln_char.py

In [27]:
# Print the command-line usage of the character-replacement script used in this section.
!python3 ../scripts/convert_aln_char.py -h

usage: convert_aln_char [-h] --in_aln IN_ALN --out_aln OUT_ALN --old_char
                        OLD_CHAR --new_char NEW_CHAR

Remove all instances of one characterfrom an alignment and replace them with a
new character

optional arguments:
  -h, --help           show this help message and exit
  --in_aln IN_ALN      Input alignment (FASTA format)
  --out_aln OUT_ALN    Output file name (FASTA format)
  --old_char OLD_CHAR  character we want to replace
  --new_char NEW_CHAR  character we want to insert instead of --from_char


In [46]:
# Replace every 'N' with '-' in the unmasked MAS SNP alignment and write the result to
# a new FASTA for SNPPar (the original SNP alignment is left untouched).
mas_unmasked_snp_aln_forSnpPar="/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/vars/mas_unmasked_snpAln_forSnpPar.fasta"
!python3 ../scripts/convert_aln_char.py --in_aln $mas_unmasked_snp_alignment --out_aln $mas_unmasked_snp_aln_forSnpPar --old_char 'N' --new_char '-' 

In [47]:
# Replace every 'N' with '-' in the masked MAS SNP alignment and write the result to
# a new FASTA for SNPPar (the original SNP alignment is left untouched).
mas_masked_snp_aln_forSnpPar="/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/vars/mas_masked_snpAln_forSnpPar.fasta"
!python3 ../scripts/convert_aln_char.py --in_aln $mas_masked_snp_alignment --out_aln $mas_masked_snp_aln_forSnpPar --old_char 'N' --new_char '-' 

In [48]:
# Replace every 'N' with '-' in the unmasked BOL SNP alignment and write the result to
# a new FASTA for SNPPar (the original SNP alignment is left untouched).
bol_unmasked_snp_aln_forSnpPar="/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/vars/bol_unmasked_snpAln_forSnpPar.fasta"
!python3 ../scripts/convert_aln_char.py --in_aln $bol_unmasked_snp_alignment --out_aln $bol_unmasked_snp_aln_forSnpPar --old_char 'N' --new_char '-' 

In [49]:
# Replace every 'N' with '-' in the masked BOL SNP alignment and write the result to
# a new FASTA for SNPPar (the original SNP alignment is left untouched).
bol_masked_snp_aln_forSnpPar="/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/vars/bol_masked_snpAln_forSnpPar.fasta"
!python3 ../scripts/convert_aln_char.py --in_aln $bol_masked_snp_alignment --out_aln $bol_masked_snp_aln_forSnpPar --old_char 'N' --new_char '-' 

#### 5. convert to MFASTA format (one entry per 2 lines)

In [51]:
# Output paths for the unwrapped SNP alignments, masked and unmasked, for each lineage.
mas_masked_snpAln_unwrapped = "/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/vars/mas_masked_snpAln_unwrapped.fasta"
mas_unmasked_snpAln_unwrapped = "/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/vars/mas_unmasked_snpAln_unwrapped.fasta"

bol_masked_snpAln_unwrapped = "/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/vars/bol_masked_snpAln_unwrapped.fasta"
bol_unmasked_snpAln_unwrapped = "/n/data1/hms/dbmi/farhat/nikki/abscessus/0_NOTEBOOKS/010_homoplasy/vars/bol_unmasked_snpAln_unwrapped.fasta"

In [52]:
# Rewrite the masked MAS SNP alignment with seqkit into the "unwrapped" output file.
# NOTE(review): -w 0 presumably disables line wrapping so each entry takes two lines — confirm.
!seqkit seq -w 0 $mas_masked_snp_aln_forSnpPar > $mas_masked_snpAln_unwrapped

In [53]:
# Rewrite the unmasked MAS SNP alignment with seqkit into the "unwrapped" output file.
# NOTE(review): -w 0 presumably disables line wrapping so each entry takes two lines — confirm.
!seqkit seq -w 0 $mas_unmasked_snp_aln_forSnpPar > $mas_unmasked_snpAln_unwrapped

In [54]:
# Rewrite the masked BOL SNP alignment with seqkit into the "unwrapped" output file.
# NOTE(review): -w 0 presumably disables line wrapping so each entry takes two lines — confirm.
!seqkit seq -w 0 $bol_masked_snp_aln_forSnpPar > $bol_masked_snpAln_unwrapped

In [55]:
# Rewrite the unmasked BOL SNP alignment with seqkit into the "unwrapped" output file.
# NOTE(review): -w 0 presumably disables line wrapping so each entry takes two lines — confirm.
!seqkit seq -w 0 $bol_unmasked_snp_aln_forSnpPar > $bol_unmasked_snpAln_unwrapped