# HPLT2 dedup register distribution

In [23]:
# Two-level label hierarchy: every top-level register code maps to the list of
# child codes that may be attached to it (an empty list means "no children").
# The insertion order of the keys matters: later cells iterate over it.
LABEL_HIERARCHY = {
    "MT": [],
    "LY": [],
    "SP": ["it"],
    "ID": [],
    "NA": ["ne", "sr", "nb"],
    "HI": ["re"],
    "IN": ["en", "ra", "dtp", "fi", "lt"],
    "OP": ["rv", "ob", "rs", "av"],
    "IP": ["ds", "ed"],
}

# Reverse lookup: child code -> the top-level code it belongs to.
LABEL_PARENT = {
    child: parent
    for parent, children in LABEL_HIERARCHY.items()
    for child in children
}


def is_error(labels):
    """Return True when `labels` is one lone label that is not a top-level code.

    Only single-element lists are judged here. Lists of any other length
    (including the empty list) yield False; multi-label combinations are
    classified by is_hybrid instead.
    """
    return len(labels) == 1 and labels[0] not in LABEL_HIERARCHY

def is_hybrid(labels):
    """Return True when `labels` is a combination other than one parent/child pair.

    - more than two labels: always hybrid
    - exactly two labels: hybrid unless one of them is the registered parent
      of the other (checked in both orders)
    - zero or one label: never hybrid
    """
    count = len(labels)
    if count > 2:
        return True
    if count != 2:
        return False

    first, second = labels
    # Try (child, parent) in both orientations; any match means a plain
    # parent/child pair, i.e. not a hybrid.
    for child, parent in ((first, second), (second, first)):
        if child in LABEL_PARENT and LABEL_PARENT[child] == parent:
            return False
    return True

In [27]:
import os
import csv

directory = "results"
results = {}  # register key -> number of rows counted under that key


def _normalize_register(row):
    """Map one parsed CSV row to the key it is counted under in `results`.

    Args:
        row: list of fields produced by csv.reader for one line.

    Returns:
        "no-label" for a row with no fields, "error" for a row with more than
        one field, otherwise the normalized form of the single field:
        "hybrid" / "error" as decided by is_hybrid / is_error, or the
        "-"-separated labels joined back together in sorted order.
    """
    if len(row) == 0:
        return "no-label"
    if len(row) > 1:
        return "error"

    register = row[0]
    # A cell that literally reads "error" or "no-label" is counted as-is and
    # is not split on "-".
    if register in ("error", "no-label"):
        return register

    labels = register.split("-")
    if is_hybrid(labels):
        labels = ["hybrid"]
    elif "hybrid" not in labels and is_error(labels):
        # A lone literal "hybrid" label stays "hybrid"; any other unknown
        # single label is an error.
        labels = ["error"]
    # Sorting makes the key independent of the order the labels were written in.
    return "-".join(sorted(labels))


# os.listdir returns entries in arbitrary order; sorting keeps the insertion
# order of `results` (and therefore the printed output) reproducible.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".csv"):
        continue
    file_path = os.path.join(directory, filename)
    # newline="" lets the csv module do its own newline handling, as its
    # documentation requires for files passed to csv.reader.
    with open(file_path, newline="") as csvfile:
        for line in csv.reader(csvfile):
            register = _normalize_register(line)
            results[register] = results.get(register, 0) + 1

In [28]:
import json

# Pretty-print the per-register counts held in `results`.
print(json.dumps(results, indent=4))

{
    "OP": 35136,
    "OP-rs": 26383,
    "hybrid": 747569,
    "MT": 105402,
    "ID": 210110,
    "IN-dtp": 519885,
    "NA-nb": 154504,
    "NA-sr": 84838,
    "NA-ne": 291931,
    "NA": 43257,
    "IN-lt": 11835,
    "IP-ds": 314294,
    "HI": 34354,
    "OP-rv": 71734,
    "OP-av": 10546,
    "IN": 268530,
    "IP": 29968,
    "IN-ra": 23125,
    "OP-ob": 26831,
    "HI-re": 23860,
    "SP-it": 6324,
    "no-label": 80836,
    "IN-en": 30029,
    "IN-fi": 1052,
    "LY": 14964,
    "SP": 1347,
    "error": 4
}


In [48]:
# Build [full_register, main_register, count] rows for plotting.
parsed = []
for main_register in LABEL_HIERARCHY:
    for full_register, value in results.items():
        # Compare against the "-"-separated tokens of the key. A plain
        # substring test would also match a code that merely occurs inside
        # a longer token or spans a token boundary.
        if main_register in full_register.split("-"):
            parsed.append([full_register, main_register, value])

# "hybrid" and "error" each form their own top-level group. A run that never
# produced one of them simply has no such row (instead of raising KeyError).
# Keys matching none of the groups (e.g. "no-label") are not included.
for special in ("hybrid", "error"):
    if special in results:
        parsed.append([special, special, results[special]])


In [50]:
import pandas as pd

# One row per counted register key: the full key, the top-level group it is
# filed under, and its count.
df = pd.DataFrame(
    data=parsed,
    columns=["full", "main", "value"],
)

df

Unnamed: 0,full,main,value
0,MT,MT,105402
1,LY,LY,14964
2,SP-it,SP,6324
3,SP,SP,1347
4,ID,ID,210110
5,NA-nb,,154504
6,NA-sr,,84838
7,NA-ne,,291931
8,,,43257
9,HI,HI,34354


In [51]:
import plotly.express as px 

# Sunburst over the hierarchy main -> full, with sectors sized by the
# "value" column.
fig = px.sunburst(
    df,
    path=["main", "full"],
    values="value",
)

In [52]:
# Display the sunburst figure.
fig.show()