# HPLT2 dedup register distribution

In [1]:
# Main register labels mapped to the sub-register labels that belong to them.
# An empty list means the main label has no sub-registers. "no-label" is kept
# as a main label so that unlabelled rows get their own bucket.
LABEL_HIERARCHY = {
    "MT": [],
    "LY": [],
    "SP": ["it"],
    "ID": [],
    "NA": ["ne", "sr", "nb"],
    "HI": ["re"],
    "IN": ["en", "ra", "dtp", "fi", "lt"],
    "OP": ["rv", "ob", "rs", "av"],
    "IP": ["ds", "ed"],
    "no-label": [],
}

# Reverse lookup: sub-register label -> the main label it belongs to.
LABEL_PARENT = {
    child: parent
    for parent in LABEL_HIERARCHY
    for child in LABEL_HIERARCHY[parent]
}

# Every known label: main labels first (in hierarchy order), then sub-labels.
ALL_LABELS = [*LABEL_HIERARCHY, *LABEL_PARENT]

def is_error(labels):
    """Return True when ``labels`` is a single label that is not a main label.

    Only one-element label lists are judged here: a lone label must be a key
    of ``LABEL_HIERARCHY`` (a sub-register on its own, or any unknown label,
    is an error). Lists of any other length always yield False; multi-label
    combinations are the responsibility of ``is_hybrid``.
    """
    if len(labels) != 1:
        # Not a singleton: nothing to flag here (see is_hybrid).
        return False
    return labels[0] not in LABEL_HIERARCHY

def is_hybrid(labels):
    """Return True when ``labels`` is a multi-label mix rather than one register.

    - More than two labels: always hybrid.
    - Exactly two labels: hybrid unless one of them is a sub-register whose
      parent in ``LABEL_PARENT`` is the other label (order does not matter).
    - Zero or one label: never hybrid.
    """
    label_count = len(labels)
    if label_count > 2:
        return True
    if label_count != 2:
        return False

    first, second = labels
    # Try both orderings: (child, parent) and (parent, child).
    forms_parent_child_pair = any(
        child in LABEL_PARENT and LABEL_PARENT[child] == parent
        for child, parent in ((first, second), (second, first))
    )
    return not forms_parent_child_pair

In [2]:
import os
import csv

directory = "results"
results = {}


def normalize_register(fields):
    """Map one parsed CSV row to the register key it is counted under.

    Parameters
    ----------
    fields : list of str
        A row as produced by ``csv.reader``.

    Returns
    -------
    str
        - ``"no-label"`` for an empty row,
        - ``"error"`` for a row with more than one field, for any label that
          is not in ``ALL_LABELS``, or for a single label rejected by
          ``is_error``,
        - ``"hybrid"`` when ``is_hybrid`` flags the label combination,
        - otherwise the labels sorted and re-joined with ``"-"``.
    """
    if len(fields) == 0:
        return "no-label"
    if len(fields) != 1:
        # The whole register is expected in a single field.
        return "error"

    register = fields[0]
    if register in ("error", "no-label"):
        # Already a bucket name; "no-label" must not be split on its hyphen.
        return register

    register_list = register.split("-")
    if any(label not in ALL_LABELS for label in register_list):
        return "error"
    if is_hybrid(register_list):
        return "hybrid"
    if is_error(register_list):
        return "error"
    # Sorting makes the key independent of the label order in the file.
    return "-".join(sorted(register_list))


# Sorted so that the insertion order of `results` is the same on every run
# (os.listdir returns entries in arbitrary order).
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".csv"):
        continue
    file_path = os.path.join(directory, filename)
    # newline="" lets the csv module handle line endings itself, as its
    # documentation requires for file objects passed to csv.reader.
    with open(file_path, newline="") as csvfile:
        for line in csv.reader(csvfile):
            register = normalize_register(line)
            results[register] = results.get(register, 0) + 1

In [3]:
import json

# Show the per-register counts as indented JSON.
results_json = json.dumps(results, indent=4)
print(results_json)

{
    "IP-ds": 314630,
    "MT": 106152,
    "IN-dtp": 525798,
    "hybrid": 750722,
    "IN": 270598,
    "NA-sr": 85184,
    "no-label": 81476,
    "NA": 43535,
    "NA-ne": 292596,
    "NA-nb": 156087,
    "ID": 208017,
    "OP-av": 10661,
    "HI-re": 23917,
    "IN-ra": 23360,
    "OP": 35292,
    "OP-rs": 26311,
    "OP-rv": 71956,
    "OP-ob": 26874,
    "IN-lt": 11858,
    "HI": 34454,
    "IP": 30147,
    "IN-en": 29511,
    "SP-it": 6363,
    "SP": 1360,
    "LY": 14883,
    "IN-fi": 1046,
    "error": 7
}


In [4]:
# Build [full_register, main_register, count] rows, grouped by main register
# in LABEL_HIERARCHY order.
parsed = []
for main_register in LABEL_HIERARCHY:
    sub_register_prefix = main_register + "-"
    for full_register, value in results.items():
        # Match the main label exactly or as the leading "<MAIN>-" component.
        # A plain substring test could attach a register to the wrong main
        # label whenever one label happens to occur inside another key.
        if full_register == main_register or full_register.startswith(sub_register_prefix):
            parsed.append([full_register, main_register, value])

# "hybrid" and "error" are not part of the hierarchy; they act as their own
# main register. Skip them when the run produced no such rows instead of
# failing with a KeyError.
for special_register in ("hybrid", "error"):
    if special_register in results:
        parsed.append([special_register, special_register, results[special_register]])



In [52]:
import pandas as pd

# One row per entry of `parsed`: full register, its main register, and count.
column_names = ["full", "main", "value"]
df = pd.DataFrame(data=parsed, columns=column_names)

df

Unnamed: 0,full,main,value
0,MT,MT,106152
1,LY,LY,14883
2,SP-it,SP,6363
3,SP,SP,1360
4,ID,ID,208017
5,NA-sr,NA,85184
6,NA,NA,43535
7,NA-ne,NA,292596
8,NA-nb,NA,156087
9,HI-re,HI,23917


In [41]:
import plotly.express as px 

# Sunburst chart with two hierarchy levels (main register, then full
# register), with sectors sized by the "value" column.
fig = px.sunburst(
    data_frame=df,
    path=["main", "full"],
    values="value",
)

In [42]:
# Render the sunburst figure built in the previous cell.
fig.show()

In [53]:
# Put the columns in main/full/value order and sort rows by main register,
# then by full register.
df = (
    df[["main", "full", "value"]]
    .sort_values(by=["main", "full"], ascending=True)
)

In [55]:

# Column-wise totals of the numeric columns; `summ` is indexed by column name.
summ = df.sum(numeric_only=True)
# Look the total up by label: integer keys on a label-indexed Series
# (`summ[0]`) rely on a positional fallback that pandas has deprecated.
total = summ["value"]
print(total)
# Despite the column name this is a fraction in [0, 1], not a percentage.
df["percentage"] = df["value"] / total

3182795


In [56]:
# Display the sorted table including the new "percentage" column.
df

Unnamed: 0,main,full,value,percentage
10,HI,HI,34454,0.010825
9,HI,HI-re,23917,0.007514
4,ID,ID,208017,0.065357
12,IN,IN,270598,0.085019
11,IN,IN-dtp,525798,0.1652
15,IN,IN-en,29511,0.009272
16,IN,IN-fi,1046,0.000329
14,IN,IN-lt,11858,0.003726
13,IN,IN-ra,23360,0.007339
23,IP,IP,30147,0.009472


In [67]:
# Share of the grand total per main register, printed one "<label> & <share> \ "
# row per main register.
total = df["value"].sum()
d = {}
for main_label, value in zip(df["main"], df["value"]):
    # Accumulate per-row shares so that sub-registers roll up into their main label.
    d[main_label] = d.get(main_label, 0) + value / total
for k, v in d.items():
    # "\\ " prints a single backslash followed by a space, exactly as before,
    # without relying on the invalid escape sequence "\ ".
    print(f'{k} & {v} \\ ')

HI & 0.018339541189426274 \ 
ID & 0.06535670691954713 \ 
IN & 0.270884866917285 \ 
IP & 0.10832522986871602 \ 
LY & 0.004676078729544316 \ 
MT & 0.03335181813468979 \ 
NA & 0.18141350605364154 \ 
OP & 0.05375589693963953 \ 
SP & 0.0024264836409507995 \ 
error & 2.1993248072841636e-06 \ 
hybrid & 0.23586878828199742 \ 
no-label & 0.02559888399975493 \ 
