# Gene Sequences

In [2]:
# The ten gene sequences under study, stored as (index, sequence) pairs.
# Later cells read the sequence via element [1] of each pair.
Set_Strings = [(0, 'CAGCGGGTGCGTAATTTGGAGAAGTTATTCTGCAACGAAATCAATCCTGTTTCGTTAGCTTACGGACTACGACGAGAGGGTACTTCCCTGATATAGTCAC'),
(1, 'CAAGTCGGGCGTATTGGAGAATATTTAAATCGGAAGATCATGTTACTATGCGTTAGCTCACGGACTGAAGAGGATTCTCTCTTAATGCAA'),
(2, 'CATGGGTGCGTCGATTTTGGCAGTAAAGTGGAATCGTCAGATATCAATCCTGTTTCGTAGAAAGGAGCTACCTAGAGAGGATTACTCTCACATAGTA'),
(3, 'CAAGTCCGCGATAAATTGGAATATTTGTCAATCGGAATAGTCAACTTAGCTGGCGTTAGCTTTACGACTGACAGAGAGAAACCTGTCCATCACACA'),
(4, 'CAGTCCGGCGTAATTGGAGAATATTTTGCAATCGGAAGATCAATCTTGTTAGCGTTAGCTTACGACTGACGAGAGGGATACTCTCTCTAATACAA'),
(5, 'CACGGGCTCCGCATCTATTTTGGGTCAAGTTGCATATCAGTCATCGACAATCAAACACTGTTTTGCGGTAGATAAGATACGACTGAGAGAGGACGTTCGCTCGAATATAGTTAC'),
(6, 'CACGGGTCCAATTTTGGAGTAAGTTGATATCGTCACGAAATCAATCCTGTTTCGGTAGTATAGGACTACGACGAGAGAGGACGTTCCTCTGATATAGTTAC'), 
(7, 'GGTCCGTCAATTTTGGAGTAAGTTGATATCGTCACGAAATCAATCCTGTTTCGGTAGTATAGGACTACGACGAGAGAGGACGTTCCTCTGATATAGTTAC'), 
(8, 'CACGGGAATCCGTCAATTTTGGAGTAAGTTGATATCGTCACGAAATCAATCCTGTTTCGGTAGTATAGGACTACGACGAGAGAGGACGTTCCTCTGATATAGTTAC'), 
(9, 'CACGGGTCCGTCAATTTTGGAGTAAGTTGATATCGTCACGAAATCAATCCTGTTTCGGTAGTATAGGACTACGACGAGAGAGGACGTTCCTCTGATATAGTTAC')]

# 1. LCS 

In [3]:
# bottom up dynamic programming
def lcs(X, Y):
    """Fill and return the longest-common-subsequence table for X and Y.

    The result has len(X) + 1 rows and len(Y) + 1 columns. Entry [i][j]
    is the LCS length of the first i items of X and the first j items of
    Y, so the bottom-right entry is the LCS length of the full inputs.
    """
    rows = len(X) + 1
    cols = len(Y) + 1

    # Row 0 and column 0 stand for an empty prefix, whose LCS is 0, so
    # starting from an all-zero table already covers the base cases.
    table = [[0] * cols for _ in range(rows)]

    for i, x_item in enumerate(X, start=1):
        above = table[i - 1]
        current = table[i]
        for j, y_item in enumerate(Y, start=1):
            if x_item == y_item:
                # matching last items extend the LCS of both shorter prefixes
                current[j] = above[j - 1] + 1
            else:
                # otherwise drop the last item of X or of Y, whichever is better
                current[j] = max(above[j], current[j - 1])
    return table

# 2. LCS for each Gene Pair

In [6]:
# LCS length for every ordered pair of genes (diagonal included); the
# length is the bottom-right entry of the table returned by lcs.
sim_table = [
    [lcs(gene_a[1], gene_b[1])[-1][-1] for gene_b in Set_Strings]
    for gene_a in Set_Strings
]

In [92]:
def similarityTable(strings):
    """Build the table of pairwise LCS lengths for a set of labelled strings.

    Args:
        strings: sequence of pairs whose element [1] is the string to
            compare (element [0] is not used here).

    Returns:
        A square list of lists with one row and one column per entry of
        ``strings``. Entry [i][j] is the LCS length of strings i and j;
        the diagonal (a string compared with itself) is left as 0.
    """
    count = len(strings)
    sim_table = [[0] * count for _ in range(count)]

    # LCS length does not depend on argument order, so each unordered
    # pair is computed once and mirrored across the diagonal instead of
    # running the quadratic DP twice.
    for i in range(count):
        for j in range(i + 1, count):
            # the LCS length is the last entry of the DP table
            length = lcs(strings[i][1], strings[j][1])[-1][-1]
            sim_table[i][j] = length
            sim_table[j][i] = length
    return sim_table

In [93]:
# Build the pairwise LCS table for the gene set and print it.
t = similarityTable(Set_Strings)
print(t)

[[0, 74, 76, 73, 82, 84, 89, 87, 91, 91], [74, 0, 67, 72, 79, 71, 69, 68, 71, 71], [76, 67, 0, 65, 69, 82, 82, 81, 84, 84], [73, 72, 65, 0, 80, 72, 68, 67, 69, 69], [82, 79, 69, 80, 0, 74, 74, 73, 75, 75], [84, 71, 82, 72, 74, 0, 95, 93, 97, 97], [89, 69, 82, 68, 74, 95, 0, 97, 101, 101], [87, 68, 81, 67, 73, 93, 97, 0, 100, 100], [91, 71, 84, 69, 75, 97, 101, 100, 0, 104], [91, 71, 84, 69, 75, 97, 101, 100, 104, 0]]


# 3. Manually infer relationships

In [8]:
# length of every gene sequence, in Set_Strings order
lengths = [len(gene[1]) for gene in Set_Strings]
lengths

[100, 90, 97, 96, 95, 114, 101, 100, 106, 104]

In [17]:
# LCS length as a fraction of the longer string of each pair, rounded to
# two decimals; a string compared with itself is recorded as 0.
sim_percentage = [
    [
        0 if col == row
        else round(sim_table[row][col] / max(lengths[col], lengths[row]), 2)
        for col in range(len(lengths))
    ]
    for row in range(len(sim_table))
]

In [18]:
sim_percentage

[[0, 0.74, 0.76, 0.73, 0.82, 0.74, 0.88, 0.87, 0.86, 0.88],
 [0.74, 0, 0.69, 0.75, 0.83, 0.62, 0.68, 0.68, 0.67, 0.68],
 [0.76, 0.69, 0, 0.67, 0.71, 0.72, 0.81, 0.81, 0.79, 0.81],
 [0.73, 0.75, 0.67, 0, 0.83, 0.63, 0.67, 0.67, 0.65, 0.66],
 [0.82, 0.83, 0.71, 0.83, 0, 0.65, 0.73, 0.73, 0.71, 0.72],
 [0.74, 0.62, 0.72, 0.63, 0.65, 0, 0.83, 0.82, 0.85, 0.85],
 [0.88, 0.68, 0.81, 0.67, 0.73, 0.83, 0, 0.96, 0.95, 0.97],
 [0.87, 0.68, 0.81, 0.67, 0.73, 0.82, 0.96, 0, 0.94, 0.96],
 [0.86, 0.67, 0.79, 0.65, 0.71, 0.85, 0.95, 0.94, 0, 0.98],
 [0.88, 0.68, 0.81, 0.66, 0.72, 0.85, 0.97, 0.96, 0.98, 0]]

In [21]:
# zero out every similarity at or below 75%, keep the rest unchanged
trimmed_sim = [
    [0 if score <= .75 else score for score in scores]
    for scores in sim_percentage
]

trimmed_sim

[[0, 0, 0.76, 0, 0.82, 0, 0.88, 0.87, 0.86, 0.88],
 [0, 0, 0, 0, 0.83, 0, 0, 0, 0, 0],
 [0.76, 0, 0, 0, 0, 0, 0.81, 0.81, 0.79, 0.81],
 [0, 0, 0, 0, 0.83, 0, 0, 0, 0, 0],
 [0.82, 0.83, 0, 0.83, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0.83, 0.82, 0.85, 0.85],
 [0.88, 0, 0.81, 0, 0, 0.83, 0, 0.96, 0.95, 0.97],
 [0.87, 0, 0.81, 0, 0, 0.82, 0.96, 0, 0.94, 0.96],
 [0.86, 0, 0.79, 0, 0, 0.85, 0.95, 0.94, 0, 0.98],
 [0.88, 0, 0.81, 0, 0, 0.85, 0.97, 0.96, 0.98, 0]]

In [23]:
# print names


for row, scores in enumerate(trimmed_sim):
    print('sims for', row)
    # indices of the strings whose similarity survived the trimming
    sims = [col for col, score in enumerate(scores) if score != 0]
    print(sims)


sims for 0
[2, 4, 6, 7, 8, 9]
sims for 1
[4]
sims for 2
[0, 6, 7, 8, 9]
sims for 3
[4]
sims for 4
[0, 1, 3]
sims for 5
[6, 7, 8, 9]
sims for 6
[0, 2, 5, 7, 8, 9]
sims for 7
[0, 2, 5, 6, 8, 9]
sims for 8
[0, 2, 5, 6, 7, 9]
sims for 9
[0, 2, 5, 6, 7, 8]


In [38]:
# zero out every similarity at or below 80%, keep the rest unchanged
trimmed_sim2 = [
    [0 if score <= .80 else score for score in scores]
    for scores in sim_percentage
]

# for each string, list the strings that are still similar to it
for row, scores in enumerate(trimmed_sim2):
    print('sims for', row)
    sims = [col for col, score in enumerate(scores) if score != 0]
    print(sims)


sims for 0
[4, 6, 7, 8, 9]
sims for 1
[4]
sims for 2
[6, 7, 9]
sims for 3
[4]
sims for 4
[0, 1, 3]
sims for 5
[6, 7, 8, 9]
sims for 6
[0, 2, 5, 7, 8, 9]
sims for 7
[0, 2, 5, 6, 8, 9]
sims for 8
[0, 5, 6, 7, 9]
sims for 9
[0, 2, 5, 6, 7, 8]


str8 is likely the first generation (origin), since it has the most similarity to all other strings, indicating it is high up the genealogy tree.

str9 and str6 are likely the second generation since they have 100% similarity with str8 and have the second and fourth highest similarity to all other strings.

str9's children are likely str0 and str7 since they have 91% and 100% similarity with str9 and have relatively high similarity to all other strings (fifth and sixth).

str6's children are likely str2 and str5 since they have 85% and 83% similarity while having relatively high similarity to all other strings (third and seventh)

str0 is likely the parent of str1 and str4 since it has 82% and 86% similarity with each.

str2 is likely the parent of str3 since it has 67% similarity with it.

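*Correction: the string with the highest overall similarity is probably in the middle of the tree rather than at the root, so the tree below may be wrong. My bad.*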

                                  str8
              str9                                str6
         str0      str7                    str2         str5
    str1    str4                      str3

# 4. Genealogy Algo

In [None]:
def create_tree(sim_table):
    """Build a genealogy tree from a pairwise similarity table.

    Not implemented yet: only the planned heuristic is recorded below.

    Args:
        sim_table: pairwise similarity table (currently unused).

    Raises:
        NotImplementedError: always, until the algorithm is written.
    """
    # assume the one with the most LCS with
    # all the other strings is the origin
    raise NotImplementedError("create_tree is not implemented yet")
    
    
# try with each node as root

# greedily get best tree with that root

# keep best tree

In [52]:
# calculate similarities
def editDist(s1, s2):
    """Fill and return the edit-distance table for s1 and s2.

    The result has len(s1) + 1 rows and len(s2) + 1 columns. Entry [i][j]
    is the minimum number of single-character insertions, removals and
    replacements turning the first i characters of s1 into the first j
    characters of s2, so the bottom-right entry is the edit distance of
    the full strings.
    """
    rows = len(s1) + 1
    cols = len(s2) + 1

    # Base cases: against an empty prefix, the cost is the length of the
    # other prefix (insert or remove every character). Row 0 holds
    # 0..len(s2) and every later row starts with its own index.
    table = [list(range(cols))]
    table.extend([i] + [0] * (cols - 1) for i in range(1, rows))

    for i in range(1, rows):
        ch1 = s1[i - 1]
        above = table[i - 1]
        current = table[i]
        for j in range(1, cols):
            if ch1 == s2[j - 1]:
                # same last character: cost of the prefixes without it
                current[j] = above[j - 1]
            else:
                # one operation plus the cheapest of the three sub-problems
                current[j] = 1 + min(
                    current[j - 1],  # insert
                    above[j],        # remove
                    above[j - 1],    # replace
                )
    return table
  
# Driver program: run editDist on a small example and print the full
# table it returns (the last entry of the last row is the edit distance).
str1 = "railway"
str2 = "runway"
  
print(editDist(str1, str2)) 

[[0, 1, 2, 3, 4, 5, 6], [1, 0, 1, 2, 3, 4, 5], [2, 1, 1, 2, 3, 3, 4], [3, 2, 2, 2, 3, 4, 4], [4, 3, 3, 3, 3, 4, 5], [5, 4, 4, 4, 3, 4, 5], [6, 5, 5, 5, 4, 3, 4], [7, 6, 6, 6, 5, 4, 3]]


In [74]:
# Row i holds one (j, distance) pair per gene j: the edit distance from
# gene i to gene j (last entry of the editDist table), or infinity on the
# diagonal.
ed_table = [
    [
        (j, float('inf')) if i == j
        else (j, editDist(Set_Strings[i][1], Set_Strings[j][1])[-1][-1])
        for j in range(len(Set_Strings))
    ]
    for i in range(len(Set_Strings))
]
ed_table

[[(0, inf),
  (1, 35),
  (2, 34),
  (3, 41),
  (4, 25),
  (5, 36),
  (6, 20),
  (7, 23),
  (8, 21),
  (9, 19)],
 [(0, 35),
  (1, inf),
  (2, 42),
  (3, 31),
  (4, 19),
  (5, 52),
  (6, 43),
  (7, 43),
  (8, 45),
  (9, 43)],
 [(0, 34),
  (1, 42),
  (2, inf),
  (3, 45),
  (4, 38),
  (5, 38),
  (6, 25),
  (7, 26),
  (8, 25),
  (9, 23)],
 [(0, 41),
  (1, 31),
  (2, 45),
  (3, inf),
  (4, 24),
  (5, 52),
  (6, 48),
  (7, 47),
  (8, 49),
  (9, 47)],
 [(0, 25),
  (1, 19),
  (2, 38),
  (3, 24),
  (4, inf),
  (5, 48),
  (6, 39),
  (7, 38),
  (8, 41),
  (9, 39)],
 [(0, 36),
  (1, 52),
  (2, 38),
  (3, 52),
  (4, 48),
  (5, inf),
  (6, 23),
  (7, 25),
  (8, 22),
  (9, 21)],
 [(0, 20),
  (1, 43),
  (2, 25),
  (3, 48),
  (4, 39),
  (5, 23),
  (6, inf),
  (7, 6),
  (8, 5),
  (9, 3)],
 [(0, 23),
  (1, 43),
  (2, 26),
  (3, 47),
  (4, 38),
  (5, 25),
  (6, 6),
  (7, inf),
  (8, 6),
  (9, 4)],
 [(0, 21),
  (1, 45),
  (2, 25),
  (3, 49),
  (4, 41),
  (5, 22),
  (6, 5),
  (7, 6),
  (8, inf),
  (9, 2)],
 

In [84]:
# store min 3

def takeSecond(elem):
    """Sort key: return the item at index 1 of ``elem``."""
    second = elem[1]
    return second

# one list of three indices per row gets appended here
minThree = []

# for every row, keep the indices of the three entries with the smallest
# second element (ties keep their original order, as sorted is stable)
minThree.extend(
    [pair[0] for pair in sorted(row, key=takeSecond)[:3]]
    for row in ed_table
)
minThree


[[9, 6, 8],
 [4, 3, 0],
 [9, 6, 8],
 [4, 1, 0],
 [1, 3, 0],
 [9, 8, 6],
 [9, 8, 7],
 [9, 6, 8],
 [9, 6, 7],
 [8, 6, 7]]

# 5. Critique of Genealogy Algo

**Good**           

**Bad**       

# 6. Complexity of Genealogy Algo

# 7. Estimating i/d/c Probability

# 8. HCs

______