## Phylogenetic info from targets 

### Uses output of 20180223 notebook. 
* Existing targets: try extending the starting coordinate and construct distance matrices, listing non-segregating species, write trees to file
* Maximum informative combinations: 1x, 2x ...

## Next - 20180327 - annotate genes and repeats for amplicons 

In [1]:
from Bio import AlignIO
import numpy as np
import pandas as pd

In [2]:
# Get amplicon and target coordinates from 20180223 notebook
aln = AlignIO.parse('/Users/am60/malaria/20180129_phylo_ampliseq/20180226_amplicons.maf', "maf")
aln = list(aln)
tgt = pd.read_table('/Users/am60/malaria/20180129_phylo_ampliseq/20180226_targets.txt', header=None)
tgt.columns = ['start', 'end']
print(len(aln), len(tgt))

126 126


In [3]:
# long insert amplicons only processed, other 52 markers dropped
long = tgt[tgt.start > 0]
len(long)


74

In [4]:
# Extend variable insert for long insert amplicons until the variability level is above 50% 

def seq_repr(alignment):
    '''
    Given multiple sequence alignment, return first sequence with Ns for ambiguous chars and X's for indels.'''
    seq = ''
    for i in range(alignment.get_alignment_length()):
        col = alignment[:, i]
        if '-' in col: # indel stronger than substitution
            seq += 'X'
        elif len(set(col)) == 1:
            seq += col[0]
        else:
            seq += 'N'
    return seq

def prop_var(seq):
    '''
    Return propotion of variable nucleotides in seq_repr of alignment'''
    return (seq.count('N') + seq.count('X'))/len(seq)

def extend_variable(seq, start, end, min_ambig=0.5):
    '''
    Extends flanks of variable insert. Works only if seq[0:start] and seq[end:len(seq)] are conserved.
    This should be true for pre-selected amplicons (see 20180223).
    Parameters - sequence, start and end of variable target to be extended,
    minimum proportion of variable sites for extended region. '''
    var_start = False
    for i in range(0, start - 1):
        if prop_var(seq[i:start]) >= min_ambig:
            #print(seq[i:start])
            var_start = True
        if var_start:
            if seq[i] in 'NX':
                ext_start = i
                #print(ext_start)
                break
    else:
        ext_start = start
    
    var_end = False
    for i in reversed(range(end + 1,len(seq))):
        if prop_var(seq[end:i]) >= min_ambig:
            #print(seq[end:i])
            var_end = True
        if var_end:
            if seq[i - 1] in 'NX':
                ext_end = i
                #print(ext_end)
                break
    else:
        ext_end = end
    
    return (ext_start, ext_end)

long_alns = []
short_alns = []
for (i, a) in enumerate(aln):
    seq = seq_repr(a)
    if tgt.start[i] > 0: # insert was inferred previously
        (start, end) = extend_variable(seq, tgt.start[i], tgt.end[i])
        insert = seq[start:end]
        long_alns.append([i, a, seq, start, end, insert])
    else:
        short_alns.append([i, a, seq])
                         
long_alns[0]

[1,
 <<class 'Bio.Align.MultipleSeqAlignment'> instance (21 records of length 405, SingleLetterAlphabet()) at 10e1e99b0>,
 'AGNNGNACCGANTCGTCCAANCAGCTGGACGCGAAGATGAAGCACAGTAAGGANCTGGACNNGXXXGNNGCNGGNGGXXXCCCNGGNGACCACGGCTATAACCCGAACCACTATATGCGTTCNATTCCAGGTCTNATNTATCACGGCACGCAATCGAGCACNTCNTCCGATTTNTCGCCGATGTCCGAACAAAAGTCNTTACCNCGGCGTGGCCGTTCAAGGTACCATCATCTTCANCTTCATANCACTAACACCACACCANGNCACAAGNCCTCCAAAGCTCAGTCACCANTAGCGTCNCCCCGTAGTAGNNNXXXCCNGXXXXXXXXXNACNAXXXNAACGTNCCGATACAGCTTCCCCGCATGCCGTCCCAGTTTCGNCCNATNCACAGTAGTNGAACNATN',
 311,
 345,
 'NNNXXXCCNGXXXXXXXXXNACNAXXXNAACGTN']

In [5]:
# Add identity information to long insert amplicons

def identical_clusters(aln):
    '''
    Given alignment, return list of sets with species IDs with identical sequences'''
    from Bio.Phylo.TreeConstruction import DistanceCalculator

    ids = [set()]
    dm = DistanceCalculator('identity').get_distance(aln)
    dm.names = [n.split('.')[0] for n in dm.names]
    for i in range(len(dm)):
        for j in range(i + 1, len(dm)):
            if dm[i,j] == 0:
                n1 = dm.names[i]
                n2 = dm.names[j]
                for cl in ids:
                    if (n1 in cl):
                        if (n2 in cl):
                            break
                        if (n2 not in cl):
                            cl.add(n2)
                            break
                else:
                    ids.append(set((n1, n2)))
        
    return ids[1:]

 
    
for aln in long_alns:
    target = aln[1][:, aln[3]:aln[4]]
    aln.append(identical_clusters(target))

print(long_alns[-1])

[124, <<class 'Bio.Align.MultipleSeqAlignment'> instance (21 records of length 536, SingleLetterAlphabet()) at 10e5ef438>, 'GANGAGCTGCGNGCCGANCAGGACCANGCCCAGACNCAGGAGAAGCTGCGCAAGGCNCTNGAGCAGCAGATCAAGGANCTNCAGGTCCGCNTGGANGANGCNGANTCGAANGCNCTNAAGGGNGGNAAGAAGGCNATNCAGAAGCTGGANCAGCGCGTNCGCGAGCTNGANTCNGANCTGGACAGCGANCAGNGACGACATGCCGANGCCCAGAAGAACCTGCGCAAGTCGGAGCGNCGCATCAAGGAGCTGANCTTCCAGTCGGAGGANGACCGCAAGAACCACGANCGCATGCAGGATCTGGTNGANAAGCTGCAGCAGAAGATCAAGACTTACAAGAGGCAGATNGAGGANGCNGANGAGATNGCCGCCCTNAACNTGGCCAAGTTCCGNAAGGCCCAGCAGGAGCTNGAGGANGCCGAGGAGCGNGCNGANATTGCCGANCAAGCTGCCACCAAATTCCGNACCAAGGGAGGACGTGCCGGTTCCGTNCAGCGTGGTGCTAGCCCAGCAGTAAGTACCATNTXANAXNXXXX', 96, 171, 'GANGCNGANTCGAANGCNCTNAAGGGNGGNAAGAAGGCNATNCAGAAGCTGGANCAGCGCGTNCGCGAGCTNGAN', [{'AgamM1', 'AgamS1', 'AgamP3'}, {'AmelC1', 'AsteI2', 'AsteS1'}, {'AmacM1', 'AminM1'}]]


In [19]:
# test tree construction
from Bio import Phylo
def phylo_tree(aln):
    '''
    Given alignment, return NJ tree in nwk format'''
    from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(aln)     
    dm.names = [n.split('.')[0] for n in dm.names]
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm)
    return tree

for aln in long_alns:
    t = phylo_tree(aln[1])
    with open('/Users/am60/malaria/20180129_phylo_ampliseq/20180309_amplicon_trees.txt', 'a') as o:
        Phylo.write(t, o, 'newick')
   
    

In [11]:
# list unidentifiable species for each marker
unids = [sum([len(id) for id in aln[-1]]) for aln in long_alns]
print(unids)
# best marker:
# Gene: AGAP010750, ryanodine receptor 2, short intron, 
# 3L:9810324-9810546, Ag1000g variable 9810444 and 9810445
print(long_alns[unids.index(4)])
print(long_alns[unids.index(4)][1][0].id)
print(long_alns[unids.index(4)][1][0].annotations)

[16, 16, 16, 19, 14, 15, 20, 9, 13, 16, 16, 10, 12, 15, 12, 18, 18, 10, 18, 11, 16, 14, 18, 7, 6, 13, 15, 12, 13, 19, 8, 16, 13, 20, 6, 7, 21, 16, 21, 17, 15, 9, 15, 8, 8, 11, 15, 18, 16, 17, 4, 12, 16, 20, 21, 10, 21, 13, 13, 17, 18, 17, 21, 19, 20, 14, 11, 15, 10, 15, 21, 21, 19, 8]
[81, <<class 'Bio.Align.MultipleSeqAlignment'> instance (21 records of length 263, SingleLetterAlphabet()) at 112bd1550>, 'GTNAAGAAACGNTGNGACAGNATGTTCCANAAGGAGGAAAGCTTGCGTGGTTTCCTCAAGGTNNGXXXXXXXXXXXXNNNNNNXXXXXXXNNNNNXXXXXXXXXXXXXXXXXXNNNXXXXXXXXXNNNXXXXXXXXXXXXXXXXXXXXXXXXNNNNNNNXXNXNXXXXXNNXNAGTCTTCCACNGATGATACCTCCCANATTGAAGCNCANATCCAGGAGGACTGGCAACTGCTGGTNCGCGANATNTACTCGTTCTANCCNN', 62, 173, 'NNGXXXXXXXXXXXXNNNNNNXXXXXXXNNNNNXXXXXXXXXXXXXXXXXXNNNXXXXXXXXXNNNXXXXXXXXXXXXXXXXXXXXXXXXNNNNNNNXXNXNXXXXXNNXN', [{'AgamP3', 'AgamM1'}, {'AsteS1', 'AsteI2'}], [{'AgamP3', 'AgamM1'}, {'AsteS1', 'AsteI2'}]]
AgamP3.chr3L
{'start': 9810324, 'size': 222, 'strand': 1, 'srcSize': 41963435}


In [12]:
print(long_alns[unids.index(4)][1][:,62:173])

SingleLetterAlphabet() alignment with 21 rows and 111 columns
AAGT-GT--TGAAGCATACGAGATTCCTCCGGG-----TCTTG-...TGT AgamP3.chr3L
AAGT-GT--TGAAGCATACGAGATTCCTCCGGG-----TCTTG-...TGT AgamS1.chrscf_1106392397136
AAGT-GT--TGAAGCATACGAGATTCCTCCGGG-----TCTTG-...TGT AgamM1.chrscf_1925491374
AAGT-GT--TGAAGCATACGAGATTCCTCCGGGCAACATCATT-...TGC AmerM1.chrsupercont1.107
AAGT-GT--TGAAGCATACGAGATTCCTCCGGG-----TCTTG-...TGT AaraD1.chrKB704440
GAGT-GT--TGAAGCATACGAGATTCCTCCGGGTAACATCATG-...TGC AquaS1.chrKB667589
AAGT-GT--TGAAGCATACTAGATTCCTCCGGG------CATT-...TGC AmelC1.chrsupercont1.458
AAGT-G---TGAAGCGTTTCA---CCTTCCGAAAGACATCATT-...TGC AchrA1.chrKB683868
GAGT-G---------GGGCGATCTTAATCAGAACGTCGTCATG-...CAC AepiE1.chrKB670503
GAGT-CT--GAAGTTACTTGAGCTTTTTTTCGA-----------...TTC AminM1.chrKB663666
GAGC-CT--AGAGTAACTTAACC-----TTAGA-----------...TAC AculA1.chrsupercont1.163
GAGC-------AGGAACTCAATTTCTTTTAGGT-----------...TTC AfunF1.chrKB668444
GAGT-TG--GCAGTGTGAAGATATTTCTAACGA-----------...TTC AsteS1.chrKB665287
G

In [13]:
def gapped_coord(aln, coord, ref=0):
    '''
    Transforms coordinate in maf alignment according to number of gaps in ref (i-th seq in alignment)
    '''
    ngaps = str(aln[ref, :coord].seq).count('-')
    return aln[ref].annotations['start'] + coord - ngaps

len(long_alns)

74

In [22]:
# generate statistics for long inserts - use pd.data_frame instead
long_aln_stats = []
for aln in long_alns:
    long_aln_stats.append({
        'seqid': aln[1][0].id,
        'start': aln[1][0].annotations['start'],
        'end': gapped_coord(aln[1], aln[1].get_alignment_length()),
        'aligned_len': aln[1].get_alignment_length(),
        'snvs': aln[2].count('N'),
        'indels': aln[2].count('X'),
        'target_start': gapped_coord(aln[1], aln[3]),
        'target_end': gapped_coord(aln[1], aln[4]),
        'target_aligned_len': aln[4] - aln[3],
        'target_snvs': aln[5].count('N'),
        'target_indels': aln[5].count('X'),
        'unid_species': aln[6]
    })
las = pd.DataFrame(long_aln_stats)

In [23]:
las

Unnamed: 0,aligned_len,end,indels,seqid,snvs,start,target_aligned_len,target_end,target_indels,target_snvs,target_start,unid_species
0,405,5759934,21,AgamP3.chr2L,43,5759550,34,5759874,15,8,5759855,"[{AquaS1, AgamP3, AaraD1, AgamM1, AmelC1, Agam..."
1,275,6534240,1,AgamP3.chr2L,36,6533965,43,6534097,0,13,6534054,"[{AquaS1, AgamP3, AepiE1, AaraD1, AgamM1, Amel..."
2,240,6534337,0,AgamP3.chr2L,34,6534097,32,6534272,0,8,6534240,"[{AfunF1, AquaS1, AgamP3, AsteS1, AmacM1, Aara..."
3,220,6993612,7,AgamP3.chr2L,28,6993399,37,6993506,0,13,6993469,"[{AquaS1, AgamP3, AaraD1, AgamM1, AgamS1, Amer..."
4,225,13325363,16,AgamP3.chr2L,23,13325151,29,13325273,6,7,13325250,"[{AquaS1, AgamP3, AaraD1, AgamM1, AmelC1, Achr..."
5,218,13862997,105,AgamP3.chr2L,13,13862846,93,13862950,93,0,13862913,"[{AquaS1, AgamP3, AsteS1, AaraD1, AgamM1, Amel..."
6,197,14603959,10,AgamP3.chr2L,24,14603764,17,14603871,0,5,14603854,"[{AquaS1, AgamP3, AepiE1, AaraD1, AgamM1, Amel..."
7,194,16043069,0,AgamP3.chr2L,32,16042875,72,16043000,0,18,16042928,"[{AquaS1, AgamP3, AaraD1, AgamM1, AmelC1, Agam..."
8,216,16193829,0,AgamP3.chr2L,33,16193613,57,16193769,0,14,16193712,"[{AquaS1, AgamP3, AaraD1, AgamM1, AmelC1, Achr..."
9,198,16193967,0,AgamP3.chr2L,29,16193769,58,16193883,0,15,16193825,"[{AgamP3, AsinS1, AculA1}, {AquaS1, AaraD1, Ag..."


In [27]:
las = las[['seqid', 'start', 'end', 'aligned_len', 'snvs', 'indels', 'target_start', 'target_end', 
     'target_aligned_len', 'target_snvs', 'target_indels', 'unid_species']]

In [28]:
# write files
stats_file = '/Users/am60/malaria/20180129_phylo_ampliseq/20180324_long_ampl_stats.csv'
aln_file = '/Users/am60/malaria/20180129_phylo_ampliseq/20180324_long_ampl.maf'
las.to_csv(stats_file)

with open(aln_file, "w") as o:
    for aln in long_alns:
        AlignIO.write(aln[1], o, "maf")

In [196]:
# Pairwise combinations of long_alns to produce favorable level of resolution
max_identical = 5
def concatenate_alns(alns):
    '''
    Given list of alignments with seqrecord.id=species.seqid
    yield concatenated alignment with seqrecord.id=species'''
    for aln in alns:
        for s in aln:
            s.id = s.id.split('.')[0]
    concat_aln = alns[0]
    if len(alns) > 1:
        for i in range(1, len(alns)):
            concat_aln += alns[i]
    return concat_aln

pair_alns = []
for (i, aln1) in enumerate(long_alns):
    for aln2 in long_alns[i+1:]:
        tgt1 = aln1[1][:, aln1[3]:aln1[4]]
        tgt2 = aln2[1][:, aln2[3]:aln2[4]]
        concat_tgt = concatenate_alns([tgt1, tgt2])
        idc = identical_clusters(concat_tgt)
        if sum([len(id) for id in idc]) <= max_identical and aln2[0] != 81:
            pair_alns.append([aln1[0], aln2[0], idc])

print(pair_alns[-5:])


[[116, 124, [{'AgamP3', 'AgamM1', 'AgamS1'}, {'AsteS1', 'AsteI2'}]], [117, 124, [{'AgamP3', 'AgamM1', 'AgamS1'}, {'AsteS1', 'AsteI2'}]], [118, 124, [{'AgamP3', 'AgamM1', 'AgamS1'}, {'AsteS1', 'AsteI2'}]], [119, 124, [{'AgamP3', 'AgamM1', 'AgamS1'}, {'AsteS1', 'AsteI2'}]], [121, 124, [{'AgamP3', 'AgamM1', 'AgamS1'}, {'AsteS1', 'AsteI2'}]]]


In [197]:
print(len(pair_alns))

121


In [171]:
# For short alignments, produce identity information for each variable position and for each indel stretch
# for aln in short_alns:
#     indel = False
#     for (i, nt) in enumerate(aln[2]):
#         if (i > 50) and (i < len(aln[2]) - 50):
#             if nt == 'N':
#                 print(i, nt, aln[2])
#                 break
#             elif nt == 'X':
                
#     break

53 N AGNNGNACCGANTCGTCCAANCAGCTGGACGCGAAGATGAAGCACAGTAAGGANCTGGACNNGXXXGNNGCNGGNGGXXXCCCNGGNGACCACGGCTATAACCCGAACCACTATATGCGTTCNATTCCAGGTCTNATNTATCACGGCACGCAATCGAGCACNTCNTCCGATTTNTCGCCGATGTCCGAACAAAAGTCNTTACCNCGGCGTGGCCGTTCAAGGTACCATCATCTTCANCTTCATANCACTAACACCACACCANGNCACAAGNCCTCCAAAGCTCAGTCACCANTAGCGTCNCCCCGTAGTAGNN
