Desired output format:
- target and margins - ID Chr Start Stop
- confounding SNPs in flanks - ID Chr	Start	Stop	ref	alt	Source	type

In [1]:
import pandas
import pybedtools
from collections import defaultdict
from Bio import AlignIO
# for MSA to variant conversion example
from Bio.Alphabet import generic_dna
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment

## Inputs

In [12]:
# input
# amplicon alignments; parsed as MAF with AlignIO
AMPL_FILE = "data/20180706_comb_ampl.maf"
# per-amplicon metadata workbook; read one sheet at a time with pandas.read_excel
PRIORITY_FILE = "data/20180710_priority_ampl_data.xlsx"
PRIORITY_TYPES = ('X', 'Var', 'Cons', 'Other') # sheet names in priority file
# reference genome FASTA; handed to coordinate_check as the `genome` argument
GENOME_FA = '../../../data/genome/AgamP3/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP3.fa'
# output
# interval columns (chr, start, end, target_start, target_end), one sheet per priority type
INTERVAL_FILE = "data/20180716_ampl_intervals.xlsx"
# all variants returned by confounding_vars, one sheet per priority type
ALL_SNP_FILE = "data/20180716_ampl_all_snps.xlsx"
# subset of the above whose alt alleles are carried by more than one genome
HF_SNP_FILE = "data/20180716_ampl_highfreq_snps.xlsx"

In [3]:
# alignments to list: materialise every alignment block of the MAF file so
# that blocks can be addressed by integer position (ampl_seq[i]) later on
ampl_seq = list(AlignIO.parse(AMPL_FILE, "maf"))
# number of alignment blocks read
len(ampl_seq)

591

In [4]:
# metadata: one DataFrame per sheet of the priority workbook, keyed by sheet name
ampl_data = {
    sheet: pandas.read_excel(PRIORITY_FILE, sheet_name=sheet)
    for sheet in PRIORITY_TYPES
}
ampl_data['X']

Unnamed: 0,chr,start,end,seqid,num_aligned_species,unaligned_species,aligned_len,len_agam,total_indels,total_snvs,...,CpipJ2_identity,CpipJ2_q_span,CpipJ2_s_span,CpipJ2_expect,BDGP6_hits,BDGP6_length,BDGP6_identity,BDGP6_q_span,BDGP6_s_span,BDGP6_expect
575,X,876693,876909,AgamP3.chrX,20,AgamM1,277,216,122,32,...,,,,,,,,,,
576,X,961289,961516,AgamP3.chrX,21,,231,227,7,42,...,,,,,,,,,,
577,X,1026174,1026456,AgamP3.chrX,21,,282,282,0,55,...,0.900709,1-282,supercont3.123:864150-864431,2.2743900000000002e-99,,,,,,
579,X,1815894,1816086,AgamP3.chrX,18,AgamM1;AsinS1;AdarC2,192,192,5,28,...,,,,,,,,,,
582,X,10000121,10000346,AgamP3.chrX,20,AgamM1,225,225,0,32,...,,,,,,,,,,
587,X,18742149,18742401,AgamP3.chrX,20,AchrA1,276,252,24,47,...,,,,,,,,,,


## File with intervals

In [5]:
# example of interval data for the 'X' sheet: these five columns are exactly
# the ones exported to the interval workbook
ampl_data['X'][['chr', 'start', 'end', 'target_start', 'target_end']]

Unnamed: 0,chr,start,end,target_start,target_end
575,X,876693,876909,876749,876867
576,X,961289,961516,961335,961447
577,X,1026174,1026456,1026221,1026414
579,X,1815894,1816086,1815984,1816045
582,X,10000121,10000346,10000169,10000304
587,X,18742149,18742401,18742234,18742321


In [6]:
# Export the interval columns of every priority sheet into one workbook,
# one sheet per priority type.
# The context manager saves and closes the workbook even when a sheet fails
# to write; the explicit ExcelWriter.save() call is gone in pandas 2.0.
interval_cols = ['chr', 'start', 'end', 'target_start', 'target_end']
with pandas.ExcelWriter(INTERVAL_FILE) as writer:
    for p in PRIORITY_TYPES:
        ampl_data[p][interval_cols].to_excel(writer, sheet_name=p)

## Parse variants from multiple sequence alignment

In [7]:
# check genomic coordinates
def coordinate_check(alignment, meta, genome):
    '''
    Assert that the first row of an alignment and the genome interval described
    by the metadata have the same underlying sequence.

    Parameters:
    - alignment: alignment whose first record (alignment[0]) is compared; gap
      characters ('-') are removed from it before the comparison
    - meta: mapping / Series providing 'chr', 'start' and 'end'
    - genome: passed to bedtools as the `fi` (FASTA) argument

    Returns 0 when both sequences are identical (exact, case-sensitive match).

    Raises ValueError with both sequences in the message when they differ.
    '''
    bed_str = '\t'.join(str(meta[key]) for key in ('chr', 'start', 'end'))
    bed = pybedtools.BedTool(bed_str, from_string=True)
    try:
        fasta = bed.sequence(fi=genome)
        # second line of the extracted FASTA holds the sequence itself
        # (the first one is the header); close the file once it is read
        with open(fasta.seqfn) as handle:
            genome_seq = handle.read().split('\n')[1]
    finally:
        # remove pybedtools temporary files even if the extraction failed
        pybedtools.cleanup(remove_all=True)
    aln_seq = str(alignment[0].seq).replace("-", "")
    if genome_seq != aln_seq:
        raise ValueError('Unequal sequences:\n{}\n{}\n{}'.format(genome_seq, aln_seq, alignment[0].seq))
    return 0
    
# sanity check on one amplicon: raises ValueError on a sequence mismatch,
# evaluates to 0 otherwise
coordinate_check(ampl_seq[575], ampl_data['X'].loc[575], GENOME_FA)

0

In [8]:
def msa_to_var(alignment, ref=0):
    '''
    Transform MultipleSeqAlignment into a table of variants relative to the
    specified reference row.

    Parameters:
    - alignment: alignment whose record ids follow <genome>.<chromosome>; the
      reference record must provide annotations['start']
    - ref: row index of the reference sequence (default 0)

    Returns a pandas.DataFrame with one row per variant and the columns:
    - chr: id of the reference record
    - pos: annotations['start'] of the reference plus the number of reference
      bases preceding the variant; inserts are placed at the previous
      reference base
    - ref: reference allele
    - alt_alleles: alternative alleles joined with ';'
    - alt_genomes: one group per alt allele (same order, joined with ';'),
      each group being the genomes carrying it joined with ','
    - num_genomes: number of rows in the alignment
    - %_alt: fraction (0-1, not a percentage) of rows carrying an alt allele

    Caveats:
    - deletions are per-base denoted as '-', while insertions are relative to previous reference base
    - inserts at start and end of reference sequence are not displayed
    - substitutions prior to inserts are reported as separate variants
    - insert columns in which no genome differs from the reference (e.g. all-gap
      columns) do not produce a row
    '''

    def var_data(alts, seqid, pos, ref_allele, num_genomes):
        # one output row summarising every alt allele observed at a site
        num_alt = sum(len(carriers) for carriers in alts.values())
        return {
            'chr': seqid,
            'pos': pos,
            'ref': ref_allele,
            'alt_alleles': ';'.join(alts),
            'alt_genomes': ';'.join(','.join(carriers) for carriers in alts.values()),
            'num_genomes': num_genomes,
            '%_alt': num_alt / num_genomes
        }

    variants = []

    seqid = alignment[ref].id  # id of the reference record
    pos = alignment[ref].annotations['start']  # position in reference genome
    ref_allele = ''  # current reference nucleotide ('' until the first one is seen)
    inserts = dict()  # per-genome insert genotype: previous base + inserted bases
    # genome names from seqids (expected format <genome>.<chromosome>)
    genomes = [s.id.split('.')[0] for s in alignment]
    num_genomes = len(alignment)
    for i in range(alignment.get_alignment_length()):
        # extract alignment column
        col = alignment[:, i]
        # insertion into reference - only collect insert data
        if col[ref] == '-':
            if ref_allele == '':
                # beginning of reference sequence - skip
                continue
            if not inserts:
                # first base of insert - per-genome genotypes with previous base and insert (if any)
                prev_col = alignment[:, i - 1]
                for (j, gt) in enumerate(col):
                    inserts[genomes[j]] = prev_col[j] + gt.replace("-", "")
            else:
                # further inserts - extend insert genotypes
                for (j, gt) in enumerate(col):
                    inserts[genomes[j]] += gt.replace("-", "")
            continue

        # back at a reference base: flush insert variants of the previous reference base
        if inserts:
            # the reference carries only the anchoring base, never inserted bases
            ref_insert = inserts[genomes[ref]]
            assert len(ref_insert) == 1
            # collect alt genotypes from insert dict
            alt_inserts = defaultdict(list)
            for (genome, genotype) in inserts.items():
                if genotype != ref_insert:
                    alt_inserts[genotype].append(genome)
            # report only when some genome actually differs from the reference,
            # mirroring the guard used for substitutions and deletions below;
            # pos was already advanced, hence pos - 1 for the previous base
            if alt_inserts:
                variants.append(var_data(alt_inserts, seqid, pos - 1, ref_insert, num_genomes))
            inserts = dict()

        # set reference allele
        ref_allele = col[ref]
        # collect alt genotypes (subst & del)
        alts = defaultdict(list)
        for (j, gt) in enumerate(col):
            if gt != ref_allele:
                alts[gt].append(genomes[j])
        # write alt genotypes (subst & del), summarised over alt alleles
        if alts:
            variants.append(var_data(alts, seqid, pos, ref_allele, num_genomes))
        pos += 1

    return pandas.DataFrame(variants, columns=['chr', 'pos', 'ref', 'alt_alleles',
                                               'alt_genomes', 'num_genomes', '%_alt'])
        
# test example: three-row toy alignment, first row acts as the reference (ref=0).
# It contains a leading insert, a substitution, a deletion, an internal insert
# and a trailing insert relative to the reference row.
# NOTE(review): generic_dna comes from Bio.Alphabet, which newer Biopython
# releases (1.78 and later) no longer ship - confirm the pinned Biopython version
a = SeqRecord(Seq("-AAC-T-", generic_dna), id="Alpha.chr1", annotations={'start':100})
b = SeqRecord(Seq("CC-CGTA", generic_dna), id="Beta.chr2", annotations={'start':234})
c = SeqRecord(Seq("GTAGGTA", generic_dna), id="Gamma.chr3", annotations={'start':345})
align = MultipleSeqAlignment([a, b, c])
print(align)
msa_to_var(align)

DNAAlphabet() alignment with 3 rows and 7 columns
-AAC-T- Alpha.chr1
CC-CGTA Beta.chr2
GTAGGTA Gamma.chr3


Unnamed: 0,chr,pos,ref,alt_alleles,alt_genomes,num_genomes,%_alt
0,Alpha.chr1,100,A,T;C,Gamma;Beta,3,0.666667
1,Alpha.chr1,101,A,-,Beta,3,0.333333
2,Alpha.chr1,102,C,G,Gamma,3,0.333333
3,Alpha.chr1,102,C,CG;GG,Beta;Gamma,3,0.666667


In [9]:
# real data examples
# NOTE(review): the printed sequence is the first row of alignment 575, while
# the variant table is computed from alignment 577 - confirm that the differing
# indices are intended
print(ampl_seq[575][0].seq)
# last five variants only
msa_to_var(ampl_seq[577]).tail()

ATGACCAGCGCAGACTTTCAGGCCATGATACCGTCTCACTTTATCAGCGGCGGCGGGCGGCAGGTAACC---GTGGACAAGTCTGACCACGGCCCGT----------------------------------------------------------CGGTAACGGTGACGGTGCAGAAGGCCGTGCCGGACATGCCGATGTTGCCGCCGGCACCGACCGAGCTCGGCACCGTCACGGAAACGATTACCAAGTCAACGTTCACTGAGACGGTAATGACC


Unnamed: 0,chr,pos,ref,alt_alleles,alt_genomes,num_genomes,%_alt
50,AgamP3.chrX,1026422,C,T,AfunF1,21,0.047619
51,AgamP3.chrX,1026425,G,A,"AchrA1,AminM1,AculA1,AfunF1,AsteS1,AsteI2,Amac...",21,0.380952
52,AgamP3.chrX,1026431,C,T,"AchrA1,AdarC2",21,0.095238
53,AgamP3.chrX,1026449,G,A,"AculA1,AfunF1,AsteS1,AsteI2",21,0.190476
54,AgamP3.chrX,1026455,C,T;A,AculA1;AchrA1,21,0.095238


In [10]:
def confounding_vars(alignment, meta, genome):
    '''
    Return the variants of an alignment that fall outside the target interval,
    i.e. in the flanking sequence.

    Parameters:
    - alignment: alignment passed on to coordinate_check and msa_to_var
    - meta: mapping / Series; 'target_start' and 'target_end' are used here,
      'chr', 'start' and 'end' by coordinate_check
    - genome: genome passed on to coordinate_check

    Returns a pandas.DataFrame in the msa_to_var format, restricted to rows with
    pos < target_start or pos >= target_end. The frame is an independent copy,
    so callers may modify it freely.

    Raises ValueError (from coordinate_check) when metadata coordinates and
    alignment do not describe the same sequence.
    '''
    # does metadata have correct genomic coordinates for alignment?
    coordinate_check(alignment, meta, genome)
    # all variants relative to the first row of the alignment
    variants = msa_to_var(alignment)
    # keep flanking variants only
    in_flank = (variants['pos'] < meta['target_start']) | (variants['pos'] >= meta['target_end'])
    # copy so that later column / index assignments by the caller do not act on
    # a slice of `variants` (avoids pandas SettingWithCopyWarning)
    return variants[in_flank].copy()

# start annotation and sequence of the first row of alignment 576, printed for
# comparison with the flanking variants returned for the same amplicon
print(ampl_seq[576][0].annotations['start'])
print(ampl_seq[576][0].seq)

confounding_vars(ampl_seq[576], ampl_data['X'].loc[576], GENOME_FA)

961289
T-GCAGGAAACTAAGCGGCCTTCAAAGGAGATGCAAGTTACTATCGCTCGTCAGCTGGGACTGGAGCCCACGACGGTCGGGAACTTTTTCATGAACGCACGGCGACGCTCGATGGACAAATGGAAGGACGAG---TCGATGAAGGGCCGGAGCAGCAGTAGTGACAATGTTATACTAGAGCACCTGAAGGACGATTCGTCCGACGAGAACCTGGGCTACGATAGTGCGTTC


Unnamed: 0,chr,pos,ref,alt_alleles,alt_genomes,num_genomes,%_alt
0,AgamP3.chrX,961289,T,TC,AfarF1,21,0.047619
1,AgamP3.chrX,961290,G,T;C;A,AalbS1;AmacM1;AchrA1,21,0.142857
2,AgamP3.chrX,961291,C,T,"AfarF1,AdarC2,AalbS1",21,0.142857
3,AgamP3.chrX,961311,A,T;C,"AquaS1;AdarC2,AalbS1",21,0.142857
4,AgamP3.chrX,961317,G,A,"AdarC2,AalbS1",21,0.095238
5,AgamP3.chrX,961332,C,T,AculA1,21,0.047619
39,AgamP3.chrX,961449,C,A,"AdarC2,AalbS1",21,0.095238
40,AgamP3.chrX,961452,T,C,"AdarC2,AalbS1",21,0.095238
41,AgamP3.chrX,961454,T,C,"AdarC2,AalbS1",21,0.095238
42,AgamP3.chrX,961479,T,C,"AdirW1,AatrE1",21,0.095238


## Files with confounding variants

In [13]:
# Export flanking variants of every priority type to two workbooks (one sheet
# per priority type): all variants, and only those whose alt alleles are
# carried by more than one genome.
# The context managers save and close both workbooks even if a sheet fails;
# the explicit ExcelWriter.save() call is gone in pandas 2.0.
with pandas.ExcelWriter(ALL_SNP_FILE) as all_writer, \
        pandas.ExcelWriter(HF_SNP_FILE) as hf_writer:
    for p in PRIORITY_TYPES:
        ampl_vars = []
        for i in ampl_data[p].index:
            avd = confounding_vars(ampl_seq[i], ampl_data[p].loc[i], GENOME_FA)
            # drop the first 10 characters of the sequence id (presumably the
            # 'AgamP3.chr' prefix - verify for other references); assign()
            # builds a new frame instead of writing into a filtered slice
            avd = avd.assign(chr=avd['chr'].str.slice(10))
            # row labels: <amplicon index>_<running variant number>
            avd.index = ['{}_{}'.format(i, j) for j in range(len(avd))]
            ampl_vars.append(avd)
        av_pd = pandas.concat(ampl_vars, axis=0)
        av_pd.to_excel(all_writer, sheet_name=p)
        # over 1 genome with an alt allele: a ';' (several alt alleles) or a ','
        # (several genomes sharing an allele) in alt_genomes
        multi_genome = ((av_pd['alt_genomes'].str.count(';') > 0) |
                        (av_pd['alt_genomes'].str.count(',') > 0))
        av_pd[multi_genome].to_excel(hf_writer, sheet_name=p)

In [None]:
## 