# Statistics about annotations

In [1]:
from typing import Dict, List, Set

import json
from glob import glob
import itertools

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from util import story_tokenize, collect_tokens
from create import tokenize_values, load_source, calc_occurences

In [2]:
# One-time setup: uncomment to download the NLTK WordNet corpus.
# import nltk
# nltk.download('wordnet')

In [3]:
from stemmers import stemmers

# Key into `stemmers` selecting the tokenizer/stemmer used throughout this
# notebook; it is also embedded in the name of the generated SVG file.
tkn = "sb"

In [4]:
from corpora import corpora
from template import venn_templ

In [6]:
# Build the value vocabulary from the flat value list, passing the tokenizer
# key `tkn`. `values` is used as a mapping below (its keys are the candidate
# keywords); `values_backref` is the second result of `tokenize_values`
# (presumably maps tokens back to the original value words - TODO confirm in create.py).
values, values_backref = tokenize_values(tkn, fname="values-edited.flat")
# Alternative input file (non-flat variant), kept for reference:
# values, values_backref = tokenize_values(tkn, fname="values-edited")
# Debug helpers: total number of entries across all values, and the raw mapping.
# print(sum(len(v) for v in values.values()))
# values

In [7]:
# Load the source texts for `corpora`, handing `load_source` the stemmer
# selected by `tkn`. The exact structure of `fulltexts` and `tokenized` is
# defined in create.load_source and is not visible from this notebook.
fulltexts, tokenized = load_source(stemmers[tkn], corpora)
# Debug helper: inspect the loaded raw texts.
# fulltexts

In [8]:
# Count value occurrences in the tokenized sources, then show two summary
# figures: the sum of all counts and the number of entries in the back-reference.
occurences, occurences_tv, occurences_backref = calc_occurences(values, tokenized)
(
    sum(occurences.values()),
    len(occurences_backref.keys()),
)

(3995, 68)

In [9]:
# For every corpus, gather the union of keyword keys over all entries of
# `occurences_tv` whose own key starts with the corpus name.
ckeywords: Dict[str, Set[str]] = {}
for c in corpora:
    ckeywords[c] = {
        keyword
        for entry_name, hits in occurences_tv.items()
        if entry_name.startswith(c)
        for keyword in hits.keys()
    }

In [10]:
# Distinct keys of `occurences_backref`, in alphabetical order.
keywords = list(set(occurences_backref.keys()))
keywords.sort()
print(len(keywords))
keywords

68


['abl',
 'accept',
 'angel',
 'brother',
 'claim',
 'clever',
 'compass',
 'confid',
 'convers',
 'correct',
 'curios',
 'curious',
 'equal',
 'evid',
 'fair',
 'faith',
 'father',
 'free',
 'gentl',
 'god',
 'good',
 'gracious',
 'harmoni',
 'help',
 'honest',
 'honor',
 'hospit',
 'husband',
 'innoc',
 'jewel',
 'judg',
 'just',
 'justic',
 'kind',
 'king',
 'know',
 'knowledg',
 'law',
 'liberti',
 'love',
 'marri',
 'marriag',
 'mother',
 'pay',
 'peac',
 'permiss',
 'pieti',
 'pious',
 'piti',
 'pray',
 'prize',
 'punish',
 'pure',
 'queen',
 'reason',
 'reward',
 'right',
 'saint',
 'sister',
 'support',
 'togeth',
 'treasur',
 'trial',
 'truth',
 'virgin',
 'wed',
 'wife',
 'wise']

In [11]:
# Keys of `values` that do not appear among `keywords`.
# A set difference is used instead of removing keywords one at a time:
# `set.remove` raises KeyError as soon as a keyword is not a key of `values`,
# which would abort the cell halfway through.
unused_values = set(values.keys()) - set(keywords)
unused_values

{'abil',
 'allianc',
 'altruism',
 'author',
 'benefit',
 'benevol',
 'bounti',
 'chariti',
 'compens',
 'cooper',
 'courtesi',
 'devot',
 'dialog',
 'dialogu',
 'disciplin',
 'emancip',
 'empathi',
 'frank',
 'freedom',
 'generos',
 'generous',
 'honesti',
 'independ',
 'ingenu',
 'intellig',
 'loyal',
 'loyalti',
 'major',
 'payment',
 'puriti',
 'revel',
 'rule',
 'sacriﬁc',
 'selfless',
 'smart',
 'solidar',
 'spous',
 'sympathi',
 'talent',
 'toler',
 'wisdom'}

In [12]:
# `IPython.display` is the public import location; importing these names from
# `IPython.core.display` is deprecated and emits a warning.
from IPython.display import HTML, display

# Toy sets for checking the diagram layout by hand:
# de_keywords = {'a', 'd'}
# it_keywords = {'b', 'd'}
# pt_keywords = {'c', 'd'}

de_keywords = ckeywords["Germany"]
it_keywords = ckeywords["Italy"]
pt_keywords = ckeywords["Portugal"]

# Split the keywords into the seven disjoint regions of a three-set Venn
# diagram: one triple overlap, three single-country regions and three
# pairwise overlaps (each excluding the triple overlap).
in_all_three = de_keywords & it_keywords & pt_keywords
regions = {
    "de_it_pt": in_all_three,
    "de": de_keywords - it_keywords - pt_keywords,
    "it": it_keywords - de_keywords - pt_keywords,
    "pt": pt_keywords - de_keywords - it_keywords,
    "de_it": (de_keywords & it_keywords) - in_all_three,
    "de_pt": (de_keywords & pt_keywords) - in_all_three,
    "it_pt": (it_keywords & pt_keywords) - in_all_three,
}

# Render every region as a stack of SVG <tspan> lines, one keyword per line,
# with the y offset growing by 10 per line. Keywords are sorted because set
# iteration order is not stable between runs, which would otherwise make the
# generated file differ on every execution.
params = {
    region: "".join(
        f'<tspan x="0" y="{i * 10}">{keyword}</tspan>'
        for i, keyword in enumerate(sorted(members))
    )
    for region, members in regions.items()
}

# print(params)
result = venn_templ.format(**params)
# Explicit UTF-8 so the output does not depend on the platform default
# encoding; `write` (not `writelines`) because `result` is a single string.
with open(f"venn.{tkn}.svg", "w", encoding="utf-8") as f:
    f.write(result)
display(HTML(result))
# display(HTML(table_templ.format(**params)))

  from IPython.core.display import display, HTML
