# Module for identifying sites that lie within homopolymer regions

Gage found a few variants in the dataset that were within or near a homopolymer region. I decided to formalize that and write a set of functions for identifying homopolymer regions. The code below does the following:

1. Read in a fasta file. For each sequence in the fasta file, and for each base (A, T, C, G), find all regular-expression matches of at least 5 of that base in a row.
2. Given those coordinates, add all of the bases within those coordinates to a list. For each strain, we now have a list of sites that lie within homopolymer regions. 
3. Given a dataframe, for each unique variable site, if it is within a homopolymer region, annotate it as such. 
4. If a site is just outside of a homopolymer region (off by 1 base), annotate it as adjacent to a homopolymer region.
5. Return the dataframe with an extra column in it

In [92]:
import Bio
from Bio import SeqIO
import re
import pandas as pd 
import numpy as np

In [93]:
def return_homopolymer_dict(fasta_file, min_length=5):
    """Collect, per fasta record, the positions that lie inside homopolymer runs.

    Each sequence is lower-cased and searched, one base at a time, for
    stretches of at least ``min_length`` identical bases. Only a, t, c and g
    are searched; any other character never forms a run.

    Args:
        fasta_file: path or handle passed straight to ``SeqIO.parse`` with the
            "fasta" format.
        min_length: minimum number of identical consecutive bases that counts
            as a homopolymer run. Defaults to 5.

    Returns:
        dict keyed by record id. Each value is a dict with the keys 'all',
        'a', 't', 'c' and 'g', each holding a list of 1-based positions that
        fall inside a run ('all' is the per-base lists concatenated in
        a, t, c, g order).

    Raises:
        ValueError: if ``min_length`` is smaller than 1.
    """
    if min_length < 1:
        raise ValueError("min_length must be at least 1")

    # Build the per-base patterns once instead of once per record.
    run_patterns = [
        (base, re.compile('[%s]{%d,}' % (base, min_length)))
        for base in ['a', 't', 'c', 'g']
    ]

    homopolymer_dict = {}

    for seq in SeqIO.parse(fasta_file, "fasta"):
        sequence = str(seq.seq).lower()
        sets_of_homopolymers = {'all': []}

        for base, pattern in run_patterns:
            local_list = []
            for match in pattern.finditer(sequence):
                run_start, run_end = match.span()
                # span() is 0-based with an exclusive end; shifting both ends
                # by one gives the 1-based positions covered by the run.
                local_list.extend(range(run_start + 1, run_end + 1))

            sets_of_homopolymers['all'].extend(local_list)
            sets_of_homopolymers[base] = local_list

        homopolymer_dict[seq.id] = sets_of_homopolymers

    return homopolymer_dict

In [96]:
def return_homopolymer_annotation(row, homopolymer_dict):
    """Classify one row's site relative to its strain's homopolymer positions.

    Args:
        row: mapping-like object providing 'POS_x' (converted with ``int``)
            and 'strain_name'.
        homopolymer_dict: dict keyed by strain name whose values hold an
            'all' collection of positions lying inside homopolymer runs.

    Returns:
        "consensus not available" when the strain is missing from
        ``homopolymer_dict``; otherwise "in" when the site is one of the
        strain's positions, "adjacent" when the position one before or one
        after the site is, and "not" in every other case.
    """
    position = int(row['POS_x'])
    strain_id = row['strain_name']

    if strain_id not in homopolymer_dict:
        return "consensus not available"

    homopolymer_sites = homopolymer_dict[strain_id]['all']

    if position in homopolymer_sites:
        return "in"

    # Check the immediate neighbours on either side of the site.
    neighbours = (position - 1, position + 1)
    if any(neighbour in homopolymer_sites for neighbour in neighbours):
        return "adjacent"

    return "not"

In [97]:
def return_annotation(df, homopolymer_dict):
    """Add a 'homopolymer' column to ``df`` with one annotation per row.

    Every row is passed to ``return_homopolymer_annotation`` together with
    ``homopolymer_dict``; the returned values are stored, in row order, in
    the 'homopolymer' column.

    Args:
        df: DataFrame to annotate. It is modified in place.
        homopolymer_dict: lookup forwarded unchanged to
            ``return_homopolymer_annotation``.

    Returns:
        The same ``df`` object, now carrying the 'homopolymer' column.
    """
    # Build the column row by row rather than with df.apply(axis=1): on a
    # frame with no rows, apply returns a DataFrame instead of a Series,
    # which cannot be assigned to a single column. A list stays well-defined
    # (and simply empty) in that case.
    annotations = [
        return_homopolymer_annotation(row, homopolymer_dict)
        for _, row in df.iterrows()
    ]
    df['homopolymer'] = annotations
    return df

In [98]:
def add_homopolymer_annotation(fasta_file_path, df):
    """Annotate ``df`` using homopolymer positions derived from a fasta file.

    Calls ``return_homopolymer_dict`` on ``fasta_file_path`` and hands the
    resulting lookup, together with ``df``, to ``return_annotation``.

    Args:
        fasta_file_path: fasta input forwarded to ``return_homopolymer_dict``.
        df: DataFrame forwarded to ``return_annotation``.

    Returns:
        Whatever ``return_annotation`` returns for ``df``.
    """
    return return_annotation(df, return_homopolymer_dict(fasta_file_path))