In [2]:
import pandas as pd
from plotly import graph_objects as go
from plotly.subplots import make_subplots
import math
from ete3 import Tree
from Bio import AlignIO
from Bio.AlignIO.PhylipIO import RelaxedPhylipWriter
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
import plotly.express as px

In [3]:
def rf_distance_ete(t1, t2):
    """Return the Robinson-Foulds distance between two trees relative to its maximum.

    Calls ``t1.robinson_foulds(t2, unrooted_trees=True)`` and divides the
    first returned value (the distance) by the second (the maximum distance).

    Args:
        t1: Tree object exposing ``robinson_foulds`` (an ete3 ``Tree`` in this notebook).
        t2: Tree passed to ``t1.robinson_foulds`` as the comparison target.

    Returns:
        ``rf / max_rf``, or ``0`` (after printing ``"?!"``) when the maximum
        distance is zero and the ratio is undefined.
    """
    # Only the first two of the seven returned values are used.
    rf, max_rf, _common, _parts_t1, _parts_t2, _discard_t1, _discard_t2 = t1.robinson_foulds(
        t2, unrooted_trees=True
    )
    if max_rf != 0:
        return rf / max_rf
    print("?!")
    return 0


In [7]:
# Pairs of dataset names (columns "phlorest_name" / "lexibank_name" are read later on).
duplicates = pd.read_csv("duplicates.csv")
# One row per dataset, looked up later via its "verbose_name" column.
lang_data = pd.read_parquet("training_data/lang/binary.parquet")
# Number of duplicate pairs.
print(len(duplicates.index))

18


In [56]:
# Per-pair metrics, appended in the row order of `duplicates`.
taxa_diff, site_diff, pattern_diff, rfd, difficult_diff = [], [], [], [], []

alignment_dir = "alignments/lang/bin/"
verbose_names = lang_data["verbose_name"]

for i, row in duplicates.iterrows():
    phlorest_name, lexibank_name = row["phlorest_name"], row["lexibank_name"]
    print(phlorest_name)
    print(lexibank_name)

    # Taxon sets are the record ids of the two alignments.
    phlorest_align = AlignIO.read(alignment_dir + phlorest_name, "phylip-relaxed")
    lexibank_align = AlignIO.read(alignment_dir + lexibank_name, "phylip-relaxed")
    phlorest_taxa = {record.id for record in phlorest_align}
    lexibank_taxa = {record.id for record in lexibank_align}
    common_taxa = phlorest_taxa & lexibank_taxa
    phlorest_taxa_only = phlorest_taxa - common_taxa
    lexibank_taxa_only = lexibank_taxa - common_taxa
    if phlorest_taxa_only or lexibank_taxa_only:
        print("Different Taxa:")
        print(phlorest_taxa_only)
        print(lexibank_taxa_only)
    taxa_diff.append(len(phlorest_taxa_only) + len(lexibank_taxa_only))

    # First metadata row whose verbose name matches each dataset of the pair.
    phlorest_data = lang_data.loc[verbose_names == phlorest_name].iloc[0]
    lexibank_data = lang_data.loc[verbose_names == lexibank_name].iloc[0]

    # Site and pattern counts: difference relative to the lexibank value.
    for label, column, sink in (
        ("Sites", "num_sites", site_diff),
        ("Patterns", "num_patterns", pattern_diff),
    ):
        print(label)
        print(phlorest_data[column])
        print(lexibank_data[column])
        sink.append((phlorest_data[column] - lexibank_data[column]) / lexibank_data[column])
        if column == "num_sites":
            # Blank-indented line kept between the two sections has no runtime effect;
            # nothing is printed here.
            pass

    print("RF Distance eval trees")
    t_phlorest = Tree(phlorest_data["newick_eval"])
    t_lexibank = Tree(lexibank_data["newick_eval"])
    rf = rf_distance_ete(t_phlorest, t_lexibank)
    print(rf)
    rfd.append(rf)

    print("Difficult")
    print(phlorest_data["difficult"])
    print(lexibank_data["difficult"])
    difficult_diff.append(abs(phlorest_data["difficult"] - lexibank_data["difficult"]))

    print("_______________________________________________________________")
    print("")

# Attach the collected metrics as new columns of `duplicates` (same order as before).
for column_name, values in (
    ("taxa_diff", taxa_diff),
    ("site_diff", site_diff),
    ("pattern_diff", pattern_diff),
    ("rfd_eval", rfd),
    ("difficult_diff", difficult_diff),
):
    duplicates[column_name] = values
    
    

gray_and_atkinson2003.BIN.cc.phy
dyenindoeuropean.BIN.cc.phy
Different Taxa:
{'Tocharian_A', 'Hittite', 'Tocharian_B'}
{'88', '92', '89', '90', '85', '93', '95', '91', '94', '87', '86'}
Sites
2449
2388
Patterns
1188
2388
RF Distance eval trees
0.19753086419753085
Difficult
0.11816176543209875
0.049344
_______________________________________________________________

koile_et_al2022.BIN.cc.phy
grollemundbantu.BIN.cc.phy
Different Taxa:
set()
{'d305nyangali', 'd308ebodo', 'd304homa1919', 'd20bvamba1919', 'd308bodo2'}
Sites
3859
3853
Patterns
3845
3853
RF Distance eval trees
0.04567307692307692
Difficult
0.4931812000000001
0.5060485974025973
_______________________________________________________________

grollemund_et_al2015.BIN.cc.phy
grollemundbantu.BIN.cc.phy
Sites
3859
3853
Patterns
3859
3853
RF Distance eval trees
0.06413301662707839
Difficult
0.514471
0.5060485974025973
_______________________________________________________________

kitchen_et_al2009.BIN.cc.phy
kitchensemitic.BIN.c

In [14]:
# Histogram of the per-pair count of taxa that are not shared.
taxa_hist_titles = {
    "xaxis_title": "Number of different taxa",
    "yaxis_title": "Number of duplicate pairs",
}
fig = px.histogram(data_frame=duplicates, x="taxa_diff", nbins=20)
fig.update_layout(**taxa_hist_titles)


In [51]:
# Scatter of the relative site-count difference against the relative pattern-count difference.
site_pattern_titles = {
    "xaxis_title": "Difference in number of sites",
    "yaxis_title": "Difference in number of patterns",
}
fig = px.scatter(data_frame=duplicates, x="site_diff", y="pattern_diff")
fig.update_layout(**site_pattern_titles)

In [61]:
# Scatter of the absolute difficulty difference against the RF distance of the eval trees.
# The axis titles previously described sites/patterns (copied from the cell above);
# they now describe the columns that are actually plotted.
fig = px.scatter(duplicates, x="difficult_diff", y="rfd_eval")
fig.update_layout(xaxis_title = "Absolute difference of difficulty score",
                  yaxis_title = "RF Distance of best trees")

In [59]:
# Histogram of the absolute difficulty difference per duplicate pair.
difficulty_hist_titles = {
    "xaxis_title": "Absolute difference of difficulty score",
    "yaxis_title": "Number of duplicate pairs",
}
fig = px.histogram(data_frame=duplicates, x="difficult_diff", nbins=100)
fig.update_layout(**difficulty_hist_titles)

In [58]:
# Histogram of the RF distance between the eval trees of each duplicate pair.
rf_hist_titles = {
    "xaxis_title": "RF Distance of best trees",
    "yaxis_title": "Number of duplicate pairs",
}
fig = px.histogram(data_frame=duplicates, x="rfd_eval", nbins=100)
fig.update_layout(**rf_hist_titles)
fig.show()