# Computed scores for the section on the role of the stemma codicum

In [2]:
from stemmadist.utils.utils import load_tree

## Define the different trees as the Newick format

In [6]:
#stemma_ref = "((F,(X, Y)E)C,(D,(W, Z)H)B)A;"
stemma_ref = "((F,E)C,(D,H)B)A;"

print(load_tree(stemma_ref).ascii_art())

        ┌─F
    ┌─C─┤
    │   └─E
──A─┤
    │   ┌─D
    └─B─┤
        └─H


In [7]:
stemma_1 = "((F,E)B,(D,H)C)A;"
print(load_tree(stemma_1).ascii_art())

        ┌─F
    ┌─B─┤
    │   └─E
──A─┤
    │   ┌─D
    └─C─┤
        └─H


In [8]:
stemma_2 = "((E,C)F,(H,B)D)A;"

print(load_tree(stemma_2).ascii_art())

        ┌─E
    ┌─F─┤
    │   └─C
──A─┤
    │   ┌─H
    └─D─┤
        └─B


## Compute the different scores between the stemma (matrix)

In [9]:
from stemmadist.main import compute_distance, AVAILABLE_DISTANCES

In [11]:
distances = {}
for distance in AVAILABLE_DISTANCES:
    if distance not in ["matching_split", "rf_jaccard"]:
        distances[distance] = {}
        for ix, stemma in enumerate([stemma_1, stemma_2]):
            print("Stemma")
            print(ix + 1)
            print(distance)
            print(compute_distance(stemma_ref, stemma, distance_name=distance))
            distances[distance]["id"+str(ix+1)] = compute_distance(stemma_ref, stemma, distance_name=distance)



Stemma
1
rf
2.0
Stemma
2
rf
0.0
Stemma
1
adjacency_matrix
0.5714285714285714
Stemma
2
adjacency_matrix
0.5714285714285714
Stemma
1
graph_edit_distance
1.0
Stemma
2
graph_edit_distance
3.0
Stemma
1
signed_similarity
0.34285714285714286
Stemma
2
signed_similarity
0.34285714285714286


In [12]:
import pandas as pd


distance_scores = pd.DataFrame(distances)

distance_scores

Unnamed: 0,rf,adjacency_matrix,graph_edit_distance,signed_similarity
id1,2.0,0.571429,1.0,0.342857
id2,0.0,0.571429,3.0,0.342857


In [13]:
print(distance_scores.iloc[:,:].assign(**distance_scores.iloc[:,:].rank(axis = 0, ascending = True).astype(int)).to_latex())

\begin{tabular}{lrrrr}
\toprule
 & rf & adjacency_matrix & graph_edit_distance & signed_similarity \\
\midrule
id1 & 2 & 1 & 1 & 1 \\
id2 & 1 & 1 & 2 & 1 \\
\bottomrule
\end{tabular}



In [10]:
from ete3 import Tree
t1 = Tree("(((F,E)),(((D,H))));")
t2 = Tree("(((D,E)),((F,H)));")

# t1 = Tree('(((a,b),c), ((e, f), g));')
# t2 = Tree('(((a,c),b), ((e, f), g));')


# t1 = Tree("((F,E)C,(D,H)B)A;")
# t2 = Tree("((D,E)C,(F,H)B)A;")
rf, max_rf, common_leaves, parts_t1, parts_t2, _, _ = t1.robinson_foulds(t2, unrooted_trees=True)
print(t1, t2)
print("RF distance is %s over a total of %s" %(rf, max_rf))
print("Partitions in tree2 that were not found in tree1:", parts_t1 - parts_t2)
print("Partitions in tree1 that were not found in tree2:", parts_t2 - parts_t1)

# We can also compare trees sharing only part of their labels

t1 = Tree('(((a,b),c), ((e, f), g));')
t2 = Tree('(((a,c),b), (g, H));')
rf, max_rf, common_leaves, parts_t1, parts_t2 = t1.robinson_foulds(t2)

print(t1, t2)
print("Same distance holds even for partially overlapping trees")
print("RF distance is %s over a total of %s" %(rf, max_rf))
print("Partitions in tree2 that were not found in tree1:", parts_t1 - parts_t2)
print("Partitions in tree1 that were not found in tree2:", parts_t2 - parts_t1)


         /-F
   /- /-|
  |      \-E
--|
  |         /-D
   \- /- /-|
            \-H 
         /-D
   /- /-|
  |      \-E
--|
  |      /-F
   \- /-|
         \-H
RF distance is 2 over a total of 2
Partitions in tree2 that were not found in tree1: {(('D', 'H'), ('E', 'F'))}
Partitions in tree1 that were not found in tree2: {(('D', 'E'), ('F', 'H'))}


ValueError: too many values to unpack (expected 5)