In [7]:
import numpy as np
import blosum as bl
import networkx as nx
import matplotlib.pyplot as plt
from itertools import chain
from collections import defaultdict
import random


def alignmentScoreDPG(s1, s2, gapPenalty, match):
    m = np.zeros((len(s1) + 1, len(s2) + 1))
    m[0, 0] = 0
    for i in range(1, len(s1) + 1):
        m[i, 0] = gapPenalty(i)
    for j in range(1, len(s2) + 1):
        m[0, j] = gapPenalty(j)
    for i in range(1, len(s1) + 1):
        for j in range(1, len(s2) + 1):         
            m[i, j] = max(chain((gapPenalty(g) + m[i, j - g] for g in range(1, j + 1)),
                                (gapPenalty(g) + m[i - g, j] for g in range(1, i + 1)),   
                                [(match(s1[i - 1], s2[j - 1]) + m[i - 1, j - 1])]))
    return m
    
def readAlignmentG(s1, s2, m, gapPenalty, match):
    i = len(s1)
    j = len(s2)
    s1a = ""
    s2a = ""
    score = 0
    while i > 0 or j > 0:
        if i > 0 and j > 0 and m[i, j] == m[i - 1, j - 1] + match(s1[i - 1], s2[j - 1]):
            i = i - 1
            j = j - 1
            s1a = s1[i] + s1a
            s2a = (s2[j] if s1[i] == s2[j] else s2[j].upper()) + s2a
            score += match(s1[i], s2[j])
        else:
            foundit = False
            for g in range(1, i + 1):
                if m[i, j] == m[i - g, j] + gapPenalty(g):
                    s1a = s1[i - g:i] + s1a
                    s2a = ('-' * g) + s2a
                    i = i - g
                    score += gapPenalty(g)
                    foundit = True
                    break
            if not foundit:
                for g in range(1, j + 1):
                    if m[i, j] == m[i, j - g] + gapPenalty(g):
                        s1a = ('-' * g) + s1a
                        s2a = s2[j - g:j] + s2a
                        j = j - g
                        score += gapPenalty(g)
                        foundit = True
                        break
            assert foundit
    return (s1a, s2a, score)

def showAlignmentG(s1, s2, gapPenalty, match):
    m = alignmentScoreDPG(s1, s2, gapPenalty, match)
    r = readAlignmentG(s1, s2, m, gapPenalty, match)
    site_map = create_map(r[1])
    print (r[0] + "\n" + r[1] + "\n" + str(r[2]))
    print(site_map)
    return (m, r, site_map)

def affineGap(n, gp = -1, gn = -0.2):
    return gp + (n - 1) * gn

def simpleMatch(a, b):
    return 1 if a == b else -1

def create_map(s):
    m = {}
    j = 1
    for i, char in enumerate(s):
        if char != '-':
            m[i+1] = j
            j+=1
        else:
            m[i+1] = '-'
        
    return m
    

s1 = "AAAGAATTCA"
s2 = "AAATGA"
r = showAlignmentG(s1, s2, affineGap, simpleMatch)




AAAGAATTCA
AAA----TGA
2.4
{1: 1, 2: 2, 3: 3, 4: '-', 5: '-', 6: '-', 7: '-', 8: 4, 9: 5, 10: 6}


In [8]:
data = np.genfromtxt(fname="nextstrain_dengue_denv1_diversity.tsv", delimiter="\t", skip_header=1, filling_values=1)  # change filling_values as req'd to fill in missing values

weights_arr = []
for i,d in enumerate(data):
    weights_arr.append(d[1])
print(weights_arr)
    

[10.0, 21.0, 24.0, 31.0, 9.0, 6.0, 14.0, 11.0, 10.0, 16.0, 15.0, 14.0, 46.0, 32.0, 14.0, 6.0, 6.0, 4.0, 5.0, 1.0, 5.0, 4.0, 9.0, 5.0, 3.0, 3.0, 1.0, 5.0, 3.0, 5.0, 2.0, 3.0, 5.0, 2.0, 8.0, 18.0, 23.0, 25.0, 13.0, 19.0, 18.0, 13.0, 7.0, 18.0, 4.0, 3.0, 6.0, 6.0, 18.0, 4.0, 8.0, 4.0, 11.0, 16.0, 9.0, 6.0, 4.0, 2.0, 6.0, 6.0, 6.0, 13.0, 5.0, 3.0, 22.0, 5.0, 3.0, 3.0, 2.0, 2.0, 1.0, 2.0, 1.0, 7.0, 1.0, 5.0, 1.0, 2.0, 1.0, 13.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 5.0, 3.0, 2.0, 1.0, 2.0, 62.0, 1.0, 13.0, 2.0, 3.0, 1.0, 7.0, 7.0, 3.0, 9.0, 6.0, 2.0, 2.0, 1.0, 4.0, 2.0, 5.0, 13.0, 1.0, 2.0, 1.0, 2.0, 2.0, 1.0, 1.0, 1.0, 7.0, 1.0, 1.0, 4.0, 2.0, 4.0, 1.0, 8.0, 15.0, 6.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 5.0, 2.0, 1.0, 2.0, 12.0, 1.0, 20.0, 5.0, 1.0, 8.0, 4.0, 1.0, 3.0, 1.0, 9.0, 2.0, 2.0, 1.0, 18.0, 3.0, 6.0, 1.0, 14.0, 1.0, 44.0, 3.0, 1.0, 3.0, 1.0, 1.0, 8.0, 6.0, 1.0, 2.0, 3.0, 1.0, 5.0, 1.0, 1.0, 2.0, 17.0, 1.0, 1.0, 1.0, 2.0, 1.0, 8.0, 4.0, 8.0, 6.0, 1.0, 2.0, 2.0, 2.0, 12

In [9]:
# (TODO) Replace with actual counted values
counter = {'A':
            {
                'A':112,
                'T':34,
                'G':54,
                'C':12
            },
         'T':
            {
                'A':23,
                'T':113,
                'G':44,
                'C':66
            },
         'G':
            {
                'A':14,
                'T':22,
                'G':142,
                'C':65
            },
         'C':
            {
                'A':33,
                'T':48,
                'G':12,
                'C':111
            }
        }

probs = defaultdict(list)

for n in counter:
    total = sum(counter[n].values())
    for m in counter[n]:
        probs[n].append(counter[n][m]/total*100)
print(probs)

defaultdict(<class 'list'>, {'A': [52.83018867924528, 16.037735849056602, 25.471698113207548, 5.660377358490567], 'T': [9.34959349593496, 45.9349593495935, 17.88617886178862, 26.82926829268293], 'G': [5.761316872427984, 9.053497942386832, 58.43621399176955, 26.74897119341564], 'C': [16.176470588235293, 23.52941176470588, 5.88235294117647, 54.41176470588235]})


In [27]:
def mutate(seq, weight_arr, probs):
    seq = seq.upper()
    chosen_loc = random.choices(
  range(len(seq)), weights=weight_arr, k=1)
    
    nucl = seq[chosen_loc[0]]
    print(nucl)
    list_prob = probs[nucl]
    
    nuc_list = ['A','T','G','C']
    
    new_nucl = random.choices(nuc_list, weights=list_prob, k=1)
    
    return new_nucl


# testing functionality of mutute
w = [111,23,45,66]
q = 'agct'

print(mutate(q,w,probs))
    

T
['A']
