# For each version of gene trees, I will try to find the matching number of dominant quartets wrt the true gene trees

In [1]:
from Quartet import Quartet
import os

In [2]:
base_folder = "/home/navid/comparative_study/Alignments-WeightedQuartets-SpeciesTree-main"

In [3]:
input_wqrts = ["GTF", "GTF-boot", "GTF-bucky-boot"]
REPLICATES = 10
configs = ["100gene-100bp", "100gene-1000bp", "1000gene-100bp", "1000gene-1000bp"]
result_file = f"{base_folder}/Results/data/Quartet-Match/15-taxon/matches.json"
result_df_file = f"{base_folder}/Results/data/Quartet-Match/15-taxon/match-pct.xlsx"
result_df_file_2 = f"{base_folder}/Results/data/Quartet-Match/15-taxon/match-pct-truegt-only.xlsx"
mismatch_df_file = f"{base_folder}/Results/data/Quartet-Match/15-taxon/mismatch-count.xlsx"
mismatch_df_file_2 = f"{base_folder}/Results/data/Quartet-Match/15-taxon/mismatch-count-truegt-only.xlsx"

In [4]:
def get_quartet_set(quartets_file):
    quartets = set()
    with open(quartets_file, "r") as fp:
        lines = fp.readlines()
        for line in lines:
            l = line.split()
            quartets.add(Quartet(l[0]))
    
    return quartets            

In [5]:
# generate the set for species tree quartets first

quartets_file = f"{base_folder}/15-taxon/true-species.qrts"
sq = get_quartet_set(quartets_file)

In [6]:
print(len(sq))

1365


In [9]:
results = {}
for configuration in configs:
    results[configuration] = {}
    tmp = configuration.split("gene")
    true_config = tmp[0]
    
    for input_wqrt in input_wqrts:
        results[configuration][input_wqrt] = {}
        
        GENES = int(true_config)
        for replicate_num in range(1, REPLICATES+1):
            print(f"################ {configuration} {input_wqrt} {replicate_num} ########################")
            true_qfile = f"{base_folder}/15-taxon/Output/true_gt/{true_config}/R{replicate_num}-GTF-true.dqrts"
            if not os.path.exists(true_qfile):
                print("$$$$$$$$$$$$$$ Error: tqrts not found $$$$$$$$$$$$$$$")
                continue
                
            tq = get_quartet_set(true_qfile)
            
            original_qfile = f"{base_folder}/15-taxon/Output/{configuration}/R{replicate_num}-{input_wqrt}.dqrts"
            
            if not os.path.exists(original_qfile):
                print("$$$$$$$$$$$$$$ Error: eqrts not found $$$$$$$$$$$$$$$")
                continue
                    
            oq = get_quartet_set(original_qfile)
            
            if "base" not in results[configuration][input_wqrt]:
                results[configuration][input_wqrt]["base"] = {}
                results[configuration][input_wqrt]["base"]["true_gene_cnt"] = 0
                results[configuration][input_wqrt]["base"]["species_cnt"] = 0
                results[configuration][input_wqrt]["base"]["true_gene_pct"] = 0
                results[configuration][input_wqrt]["base"]["species_pct"] = 0
            
            results[configuration][input_wqrt]["base"]["true_gene_cnt"] += len(oq.intersection(tq)) / REPLICATES
            results[configuration][input_wqrt]["base"]["species_cnt"] += len(oq.intersection(sq)) / REPLICATES
            results[configuration][input_wqrt]["base"]["true_gene_pct"] += len(oq.intersection(tq)) * 100.0 / (len(oq) * REPLICATES)
            results[configuration][input_wqrt]["base"]["species_pct"] += len(oq.intersection(sq)) * 100.0 / (len(oq) * REPLICATES)
            
            print(results)
                

################ 100gene-100bp GTF 1 ########################
{'100gene-100bp': {'GTF': {'base': {'true_gene_cnt': 118.8, 'species_cnt': 118.6, 'true_gene_pct': 8.703296703296703, 'species_pct': 8.688644688644688}}}}
################ 100gene-100bp GTF 2 ########################
{'100gene-100bp': {'GTF': {'base': {'true_gene_cnt': 236.8, 'species_cnt': 235.7, 'true_gene_pct': 17.347985347985347, 'species_pct': 17.267399267399266}}}}
################ 100gene-100bp GTF 3 ########################
{'100gene-100bp': {'GTF': {'base': {'true_gene_cnt': 358.70000000000005, 'species_cnt': 356.29999999999995, 'true_gene_pct': 26.278388278388277, 'species_pct': 26.102564102564102}}}}
################ 100gene-100bp GTF 4 ########################
{'100gene-100bp': {'GTF': {'base': {'true_gene_cnt': 483.1, 'species_cnt': 480.09999999999997, 'true_gene_pct': 35.39194139194139, 'species_pct': 35.17216117216117}}}}
################ 100gene-100bp GTF 5 ########################
{'100gene-100bp': {'GTF': {

{'100gene-100bp': {'GTF': {'base': {'true_gene_cnt': 1212.8, 'species_cnt': 1209.9, 'true_gene_pct': 88.84981684981685, 'species_pct': 88.63736263736263}}, 'GTF-boot': {'base': {'true_gene_cnt': 1256.3, 'species_cnt': 1253.6, 'true_gene_pct': 92.03663003663004, 'species_pct': 91.83882783882784}}, 'GTF-bucky-boot': {'base': {'true_gene_cnt': 1009.5000000000001, 'species_cnt': 1005.8, 'true_gene_pct': 73.95604395604397, 'species_pct': 73.6849816849817}}}}
################ 100gene-100bp GTF-bucky-boot 9 ########################
{'100gene-100bp': {'GTF': {'base': {'true_gene_cnt': 1212.8, 'species_cnt': 1209.9, 'true_gene_pct': 88.84981684981685, 'species_pct': 88.63736263736263}}, 'GTF-boot': {'base': {'true_gene_cnt': 1256.3, 'species_cnt': 1253.6, 'true_gene_pct': 92.03663003663004, 'species_pct': 91.83882783882784}}, 'GTF-bucky-boot': {'base': {'true_gene_cnt': 1139.0, 'species_cnt': 1136.1, 'true_gene_pct': 83.44322344322346, 'species_pct': 83.23076923076924}}}}
################ 100ge

{'100gene-100bp': {'GTF': {'base': {'true_gene_cnt': 1212.8, 'species_cnt': 1209.9, 'true_gene_pct': 88.84981684981685, 'species_pct': 88.63736263736263}}, 'GTF-boot': {'base': {'true_gene_cnt': 1256.3, 'species_cnt': 1253.6, 'true_gene_pct': 92.03663003663004, 'species_pct': 91.83882783882784}}, 'GTF-bucky-boot': {'base': {'true_gene_cnt': 1265.5, 'species_cnt': 1262.6999999999998, 'true_gene_pct': 92.71062271062273, 'species_pct': 92.50549450549451}}}, '100gene-1000bp': {'GTF': {'base': {'true_gene_cnt': 1335.3000000000002, 'species_cnt': 1331.4000000000003, 'true_gene_pct': 97.82417582417582, 'species_pct': 97.53846153846153}}, 'GTF-boot': {'base': {'true_gene_cnt': 1324.3000000000002, 'species_cnt': 1323.1000000000001, 'true_gene_pct': 97.01831501831501, 'species_pct': 96.93040293040292}}, 'GTF-bucky-boot': {'base': {'true_gene_cnt': 134.8, 'species_cnt': 134.3, 'true_gene_pct': 9.875457875457876, 'species_pct': 9.83882783882784}}}}
################ 100gene-1000bp GTF-bucky-boot 2 

{'100gene-100bp': {'GTF': {'base': {'true_gene_cnt': 1212.8, 'species_cnt': 1209.9, 'true_gene_pct': 88.84981684981685, 'species_pct': 88.63736263736263}}, 'GTF-boot': {'base': {'true_gene_cnt': 1256.3, 'species_cnt': 1253.6, 'true_gene_pct': 92.03663003663004, 'species_pct': 91.83882783882784}}, 'GTF-bucky-boot': {'base': {'true_gene_cnt': 1265.5, 'species_cnt': 1262.6999999999998, 'true_gene_pct': 92.71062271062273, 'species_pct': 92.50549450549451}}}, '100gene-1000bp': {'GTF': {'base': {'true_gene_cnt': 1335.3000000000002, 'species_cnt': 1331.4000000000003, 'true_gene_pct': 97.82417582417582, 'species_pct': 97.53846153846153}}, 'GTF-boot': {'base': {'true_gene_cnt': 1324.3000000000002, 'species_cnt': 1323.1000000000001, 'true_gene_pct': 97.01831501831501, 'species_pct': 96.93040293040292}}, 'GTF-bucky-boot': {'base': {'true_gene_cnt': 1349.4, 'species_cnt': 1348.0, 'true_gene_pct': 98.85714285714286, 'species_pct': 98.75457875457876}}}, '1000gene-100bp': {'GTF': {'base': {'true_gene

{'100gene-100bp': {'GTF': {'base': {'true_gene_cnt': 1212.8, 'species_cnt': 1209.9, 'true_gene_pct': 88.84981684981685, 'species_pct': 88.63736263736263}}, 'GTF-boot': {'base': {'true_gene_cnt': 1256.3, 'species_cnt': 1253.6, 'true_gene_pct': 92.03663003663004, 'species_pct': 91.83882783882784}}, 'GTF-bucky-boot': {'base': {'true_gene_cnt': 1265.5, 'species_cnt': 1262.6999999999998, 'true_gene_pct': 92.71062271062273, 'species_pct': 92.50549450549451}}}, '100gene-1000bp': {'GTF': {'base': {'true_gene_cnt': 1335.3000000000002, 'species_cnt': 1331.4000000000003, 'true_gene_pct': 97.82417582417582, 'species_pct': 97.53846153846153}}, 'GTF-boot': {'base': {'true_gene_cnt': 1324.3000000000002, 'species_cnt': 1323.1000000000001, 'true_gene_pct': 97.01831501831501, 'species_pct': 96.93040293040292}}, 'GTF-bucky-boot': {'base': {'true_gene_cnt': 1349.4, 'species_cnt': 1348.0, 'true_gene_pct': 98.85714285714286, 'species_pct': 98.75457875457876}}}, '1000gene-100bp': {'GTF': {'base': {'true_gene

{'100gene-100bp': {'GTF': {'base': {'true_gene_cnt': 1212.8, 'species_cnt': 1209.9, 'true_gene_pct': 88.84981684981685, 'species_pct': 88.63736263736263}}, 'GTF-boot': {'base': {'true_gene_cnt': 1256.3, 'species_cnt': 1253.6, 'true_gene_pct': 92.03663003663004, 'species_pct': 91.83882783882784}}, 'GTF-bucky-boot': {'base': {'true_gene_cnt': 1265.5, 'species_cnt': 1262.6999999999998, 'true_gene_pct': 92.71062271062273, 'species_pct': 92.50549450549451}}}, '100gene-1000bp': {'GTF': {'base': {'true_gene_cnt': 1335.3000000000002, 'species_cnt': 1331.4000000000003, 'true_gene_pct': 97.82417582417582, 'species_pct': 97.53846153846153}}, 'GTF-boot': {'base': {'true_gene_cnt': 1324.3000000000002, 'species_cnt': 1323.1000000000001, 'true_gene_pct': 97.01831501831501, 'species_pct': 96.93040293040292}}, 'GTF-bucky-boot': {'base': {'true_gene_cnt': 1349.4, 'species_cnt': 1348.0, 'true_gene_pct': 98.85714285714286, 'species_pct': 98.75457875457876}}}, '1000gene-100bp': {'GTF': {'base': {'true_gene

{'100gene-100bp': {'GTF': {'base': {'true_gene_cnt': 1212.8, 'species_cnt': 1209.9, 'true_gene_pct': 88.84981684981685, 'species_pct': 88.63736263736263}}, 'GTF-boot': {'base': {'true_gene_cnt': 1256.3, 'species_cnt': 1253.6, 'true_gene_pct': 92.03663003663004, 'species_pct': 91.83882783882784}}, 'GTF-bucky-boot': {'base': {'true_gene_cnt': 1265.5, 'species_cnt': 1262.6999999999998, 'true_gene_pct': 92.71062271062273, 'species_pct': 92.50549450549451}}}, '100gene-1000bp': {'GTF': {'base': {'true_gene_cnt': 1335.3000000000002, 'species_cnt': 1331.4000000000003, 'true_gene_pct': 97.82417582417582, 'species_pct': 97.53846153846153}}, 'GTF-boot': {'base': {'true_gene_cnt': 1324.3000000000002, 'species_cnt': 1323.1000000000001, 'true_gene_pct': 97.01831501831501, 'species_pct': 96.93040293040292}}, 'GTF-bucky-boot': {'base': {'true_gene_cnt': 1349.4, 'species_cnt': 1348.0, 'true_gene_pct': 98.85714285714286, 'species_pct': 98.75457875457876}}}, '1000gene-100bp': {'GTF': {'base': {'true_gene

{'100gene-100bp': {'GTF': {'base': {'true_gene_cnt': 1212.8, 'species_cnt': 1209.9, 'true_gene_pct': 88.84981684981685, 'species_pct': 88.63736263736263}}, 'GTF-boot': {'base': {'true_gene_cnt': 1256.3, 'species_cnt': 1253.6, 'true_gene_pct': 92.03663003663004, 'species_pct': 91.83882783882784}}, 'GTF-bucky-boot': {'base': {'true_gene_cnt': 1265.5, 'species_cnt': 1262.6999999999998, 'true_gene_pct': 92.71062271062273, 'species_pct': 92.50549450549451}}}, '100gene-1000bp': {'GTF': {'base': {'true_gene_cnt': 1335.3000000000002, 'species_cnt': 1331.4000000000003, 'true_gene_pct': 97.82417582417582, 'species_pct': 97.53846153846153}}, 'GTF-boot': {'base': {'true_gene_cnt': 1324.3000000000002, 'species_cnt': 1323.1000000000001, 'true_gene_pct': 97.01831501831501, 'species_pct': 96.93040293040292}}, 'GTF-bucky-boot': {'base': {'true_gene_cnt': 1349.4, 'species_cnt': 1348.0, 'true_gene_pct': 98.85714285714286, 'species_pct': 98.75457875457876}}}, '1000gene-100bp': {'GTF': {'base': {'true_gene

In [10]:
print(results)

{'100gene-100bp': {'GTF': {'base': {'true_gene_cnt': 1212.8, 'species_cnt': 1209.9, 'true_gene_pct': 88.84981684981685, 'species_pct': 88.63736263736263}}, 'GTF-boot': {'base': {'true_gene_cnt': 1256.3, 'species_cnt': 1253.6, 'true_gene_pct': 92.03663003663004, 'species_pct': 91.83882783882784}}, 'GTF-bucky-boot': {'base': {'true_gene_cnt': 1265.5, 'species_cnt': 1262.6999999999998, 'true_gene_pct': 92.71062271062273, 'species_pct': 92.50549450549451}}}, '100gene-1000bp': {'GTF': {'base': {'true_gene_cnt': 1335.3000000000002, 'species_cnt': 1331.4000000000003, 'true_gene_pct': 97.82417582417582, 'species_pct': 97.53846153846153}}, 'GTF-boot': {'base': {'true_gene_cnt': 1324.3000000000002, 'species_cnt': 1323.1000000000001, 'true_gene_pct': 97.01831501831501, 'species_pct': 96.93040293040292}}, 'GTF-bucky-boot': {'base': {'true_gene_cnt': 1349.4, 'species_cnt': 1348.0, 'true_gene_pct': 98.85714285714286, 'species_pct': 98.75457875457876}}}, '1000gene-100bp': {'GTF': {'base': {'true_gene

In [11]:
# import json
# with open(result_file, "w") as fp:
#     fp.write(json.dumps(results))

In [7]:
import json
with open(result_file, "r") as fp:
    results = json.load(fp)

In [8]:
results

{'100gene-100bp': {'GTF': {'base': {'true_gene_cnt': 1212.8,
    'species_cnt': 1209.9,
    'true_gene_pct': 88.84981684981685,
    'species_pct': 88.63736263736263}},
  'GTF-boot': {'base': {'true_gene_cnt': 1256.3,
    'species_cnt': 1253.6,
    'true_gene_pct': 92.03663003663004,
    'species_pct': 91.83882783882784}},
  'GTF-bucky-boot': {'base': {'true_gene_cnt': 1265.5,
    'species_cnt': 1262.6999999999998,
    'true_gene_pct': 92.71062271062273,
    'species_pct': 92.50549450549451}}},
 '100gene-1000bp': {'GTF': {'base': {'true_gene_cnt': 1335.3000000000002,
    'species_cnt': 1331.4000000000003,
    'true_gene_pct': 97.82417582417582,
    'species_pct': 97.53846153846153}},
  'GTF-boot': {'base': {'true_gene_cnt': 1324.3000000000002,
    'species_cnt': 1323.1000000000001,
    'true_gene_pct': 97.01831501831501,
    'species_pct': 96.93040293040292}},
  'GTF-bucky-boot': {'base': {'true_gene_cnt': 1349.4,
    'species_cnt': 1348.0,
    'true_gene_pct': 98.85714285714286,
    's

In [9]:
# make a dataframe
import pandas as pd

In [10]:
def get_val(dct, keys):
    for key in keys:
        if key in dct:
            dct = dct[key]
        else:
            return -1
    return dct

In [28]:
data = []
for configuration in configs:
    for ref in ["true_gene", "species"]:
        tmp = [configuration, ref]
        for input_wqrt in input_wqrts:
            c = results[configuration][input_wqrt]["base"]
#             tmp.extend([c[ref+"_cnt"], c[ref+"_pct"]])
            tmp.append(c[ref+"_pct"])
        data.append(tmp)

In [29]:
columns = ["config", "reference"] + input_wqrts
df = pd.DataFrame(data, columns=columns)

In [30]:
df

Unnamed: 0,config,reference,GTF,GTF-boot,GTF-bucky-boot
0,100gene-100bp,true_gene,88.849817,92.03663,92.710623
1,100gene-100bp,species,88.637363,91.838828,92.505495
2,100gene-1000bp,true_gene,97.824176,97.018315,98.857143
3,100gene-1000bp,species,97.538462,96.930403,98.754579
4,1000gene-100bp,true_gene,94.666667,95.194139,96.175824
5,1000gene-100bp,species,94.666667,95.194139,96.175824
6,1000gene-1000bp,true_gene,99.589744,98.102564,99.948718
7,1000gene-1000bp,species,99.589744,98.102564,99.948718


In [31]:
df.to_excel(result_df_file, index=False)

In [32]:
df.query("reference == 'true_gene'").to_excel(result_df_file_2, index=False)

## Count of mismatches

In [11]:
total = len(sq)

In [12]:
total

1365

In [13]:
data = []
for configuration in configs:
    for ref in ["true_gene", "species"]:
        tmp = [configuration, ref]
        for input_wqrt in input_wqrts:
            c = results[configuration][input_wqrt]["base"]
            tmp.append(round(total - c[ref+"_cnt"]))
        data.append(tmp)

In [14]:
columns = ["config", "reference"] + input_wqrts
df2 = pd.DataFrame(data, columns=columns)

In [15]:
df2.to_excel(mismatch_df_file, index=False)
df2.query("reference == 'true_gene'").to_excel(mismatch_df_file_2, index=False)