<span style="float:left;">Licence CC BY-NC-ND</span><span style="float:right;">François Rechenmann &amp; Thierry Parmentelat&nbsp;<img src="media/inria-25.png" style="display:inline"></span><br/>

# Algorithme UPGMA

In [None]:
# la formule magique pour utiliser print() en python2 et python3
from __future__ import print_function
# pour que la division se comporte en python2 comme en python3
from __future__ import division

### Distance de Needleman et Wunsch

In [None]:
from w4_s09_c1_needleman_wunsh_iter import needleman_wunsch, distance

### Format du fichier d'entrée

In [None]:
cat data/named-species.txt

### L'algorithme UPGMA

In [None]:
def UPGMA(filename, verbose=False):
    """
    Build a filiation tree from a file of named DNA sequences.

    Each non-blank line of the file holds a species name and its DNA,
    separated by whitespace. The table of pairwise distances is computed
    with distance(), then the two closest clusters are merged repeatedly
    until a single cluster remains.

    Parameters:
        filename: path of the input file
        verbose: when true, print the initial distance table, then the
            remaining clusters and the distance table after each merge

    Returns:
        the tree as nested 2-tuples of species names (a bare name when
        the file holds a single species), or None when the file holds
        no species at all
    """
    from itertools import combinations

    named_adns = {}

    # read the file, ignoring blank lines (e.g. a trailing newline)
    with open(filename) as species_file:
        for line in species_file:
            if not line.strip():
                continue
            name, adn = line.split()
            named_adns[name] = adn

    # compute the distance table; each unordered pair is stored once,
    # under the key (name1, name2) in dictionary iteration order
    all_distances = {}
    for name1, name2 in combinations(named_adns, 2):
        all_distances[(name1, name2)] = distance(named_adns[name1],
                                                 named_adns[name2])

    # the initial clusters; this must be a real list because clusters
    # are removed and appended below (dict.keys() is a view in python3)
    keys = list(named_adns)

    if verbose:
        print(10*'+', 'Initial distances', all_distances )
    while len(keys) > 1:
        bro1, bro2 = minimal_couple(all_distances, keys)
        new_key = bro1, bro2
        keys.remove(bro1)
        keys.remove(bro2)
        for old_key in keys:
            # dist((F,C), A) = (dist(F,A) + dist(C,A)) / 2
            # NOTE(review): this is a plain average of the two distances,
            # not weighted by cluster sizes - confirm this is intended
            all_distances[(old_key, new_key)] = (
                get_distance(all_distances, bro1, old_key)
                + get_distance(all_distances, bro2, old_key)) / 2
        keys.append(new_key)
        if verbose:
            print(10*'=', "keys = ", keys)
            print(all_distances)

    # the only remaining cluster is the whole tree
    return keys[0] if keys else None

In [None]:
def get_distance(all_distances, k1, k2):
    """
    Look up the distance between k1 and k2 in a table that stores
    each pair under only one of its two possible orientations.

    Returns 0 when both keys are equal; otherwise the value stored
    under (k1, k2) if that key exists, else the one under (k2, k1).
    Raises KeyError when neither orientation is in the table.
    """
    if k1 == k2:
        return 0
    direct = (k1, k2)
    # fall back on the mirrored key when the direct one is absent
    lookup = direct if direct in all_distances else (k2, k1)
    return all_distances[lookup]

def minimal_couple(distances, keys):
    """
    Find the two keys that are closest to each other.

    Parameters:
        distances: the distance table, as understood by get_distance()
        keys: the keys (clusters) to choose from

    Returns:
        a tuple (k1, k2) with the smallest distance, in the order the
        keys appear in keys; on ties the first such pair wins.
        None when keys holds fewer than two distinct keys.
    """
    from itertools import combinations

    couple, min_value = None, None
    # the distance is symmetric, so each unordered pair is examined once
    for k1, k2 in combinations(keys, 2):
        if k1 == k2:
            continue
        value = get_distance(distances, k1, k2)
        # no arbitrary upper bound: the first pair always qualifies
        if min_value is None or value < min_value:
            couple, min_value = (k1, k2), value
    return couple

In [None]:
# run the algorithm on the sample file, with verbose tracing turned on
UPGMA("data/named-species.txt", True)