# Extraction - Sections & Templates

This Notebook explores the interesting sections in the example dump (20220601).

It answers the following questions:
* How many sections of each type exist?
* Can we rely on section names?
* Which templates are characteristic of a section, and what is the ratio of normal wikilinks?
* For Etymology: how much can we rely on the template mechanism?

In [None]:
import re
from collections import defaultdict, Counter
from pathlib import Path
import logging
import json

logging.basicConfig(level=logging.DEBUG)
logging.getLogger("matplotlib").setLevel(logging.WARNING)

import numpy as np
import matplotlib as plt
import pandas as pd
from tqdm import tqdm
import wikitextparser as wtp

from etymmap.wiktionary import Wiktionary, MongoEntryStore
from etymmap.specific_en.utils import strip_etymology_header
from etymmap.specific_en import configure, consts

from utils import cached
from utils.sections import *


# Directory that holds the cached pickles and the text exports written by
# the cells of this notebook.
DATA_PATH = Path("./data/enwiktionary-20220601-pages-meta-current")
# configure language module
configure()
# Entry store read from a local MongoDB; the collection name matches the
# dump date (20220601).
store = MongoEntryStore.from_config(
    {
        "address": "mongodb://localhost:27017",
        "dbname": "enwiktionary",
        "collection": "20220601",
    }
)
# NOTE(review): namespaces 0 and 118 are presumably the main and the
# Reconstruction namespace of en.wiktionary -- confirm.
enw = Wiktionary(store, default_namespaces=(0, 118))

# Can we rely on section names?

In [None]:
# Section counts, wrapped in `cached` together with a pickle path
# (presumably computed once and reloaded from the pickle afterwards --
# confirm in utils.cached).
section_count = cached(lambda: count_sections(enw), DATA_PATH / "section_names.pickle")
section_count

In [None]:
# Sum of all counts recorded under the "etym" key.
sum(section_count["etym"].values())

# Template type frequencies by section

In [None]:
# Template counts per section. `to_df` indexes this mapping by section name
# and calls `most_common()` on the value, so the values are Counter-like.
templates_by_section = cached(
    lambda: template_counts_by_section(enw), DATA_PATH / "section_templates.pickle"
)

In [None]:
def to_df(name):
    """Build a frequency table of the items counted for one section.

    Args:
        name: Key into the module-level ``templates_by_section`` mapping.

    Returns:
        A DataFrame indexed by template name in descending order of count,
        with the columns ``count``, ``ratio`` (share of all counted items)
        and ``ratio_no_wikilink`` (count divided by the total without the
        ``WIKILINK`` entry).
    """
    df = pd.Series(
        dict(templates_by_section[name].most_common()),
        dtype="uint32",
        name="count",
    ).to_frame()
    # Use plain Python ints for the totals so the subtraction below is not
    # subject to unsigned-integer arithmetic.
    total = int(df["count"].sum())
    # A section without a single wikilink has no "WIKILINK" row; count it as
    # 0 instead of failing with a KeyError.
    wikilinks = int(df["count"].get("WIKILINK", 0))
    df["ratio"] = df["count"] / total
    df["ratio_no_wikilink"] = df["count"] / (total - wikilinks)
    return df

## Derived terms

In [None]:
# Print the example strings returned for the derived-terms section, one per line.
print("\n".join(section_examples(enw, consts.DERIVED_TERMS)))

In [None]:
# Frequency table for the "Derived" entry; rows are ordered by descending count,
# so the bar chart shows the 40 most frequent items.
df = to_df("Derived")
df["count"].iloc[:40].plot(kind="bar", figsize=(16, 5))

In [None]:
# grouping
def _derived_group(t):
    """Return the coarse group of a template name in the derived-terms table."""
    # The order of the checks matters: the first matching rule wins.
    if t == "l":
        return "link"
    if t == "WIKILINK":
        return "wikilink"
    if t.startswith(("rel", "col")):
        return "col"
    if t in {"q", "qualifier"}:
        return "qualifier"
    if "zh-" in t:
        return "zh"
    if "ja-" in t:
        return "ja"
    if t in {"vern", "taxlink"}:
        return "zoo"
    return "other"


# Sum both ratio columns per group and list the largest group first.
groups = df[["ratio", "ratio_no_wikilink"]].groupby(by=_derived_group).sum()
groups = groups.sort_values("ratio", ascending=False)
# Wikilinks are excluded from the denominator of this ratio, so the value of
# the wikilink group itself is not meaningful; blank it.
groups.loc["wikilink", "ratio_no_wikilink"] = None
groups

## Related terms

In [None]:
# Print the example strings returned for the related-terms section, one per line.
print("\n".join(section_examples(enw, consts.RELATED_TERMS)))

In [None]:
# Frequency table for the "Related" entry: number of distinct items, then a
# bar chart of the 40 most frequent ones (rows are ordered by descending count).
df = to_df("Related")
print(len(df))
df["count"].iloc[:40].plot(kind="bar", figsize=(16, 5))

In [None]:
# grouping
def _related_group(t):
    """Return the coarse group of a template name in the related-terms table."""
    # The order of the checks matters: the first matching rule wins.
    if t == "l":
        return "link"
    if t == "WIKILINK":
        return "wikilink"
    if t.startswith(("rel", "col")):
        return "col"
    if t in {"q", "qualifier"}:
        return "qualifier"
    if "zh-" in t:
        return "zh"
    if "ja-" in t:
        return "ja"
    return "other"


# Sum both ratio columns per group and list the largest group first.
groups = df[["ratio", "ratio_no_wikilink"]].groupby(by=_related_group).sum()
groups = groups.sort_values("ratio", ascending=False)
# Wikilinks are excluded from the denominator of this ratio, so the value of
# the wikilink group itself is not meaningful; blank it.
groups.loc["wikilink", "ratio_no_wikilink"] = None
groups

## Descendants

In [None]:
# Print the example strings returned for the descendants section, one per line.
print("\n".join(section_examples(enw, consts.DESCENDANTS)))

In [None]:
# Frequency table for the "Descendants" entry: number of distinct items, then
# a bar chart of the 40 most frequent ones (rows are ordered by descending count).
df = to_df("Descendants")
print(len(df))
df["count"].iloc[:40].plot(kind="bar", figsize=(16, 5))

In [None]:
# grouping
# Every rule for this section is an exact name match, so a lookup table is
# enough; any name not listed here falls into "other".
_descendant_groups = {
    "desc": "desc",
    "desctree": "desc",
    "WIKILINK": "wikilink",
    "l": "l",
    "q": "qualifier",
    "qualifier": "qualifier",
    "c": "cat",
    "C": "cat",
    "topics": "cat",
    "CJKV": "CJKV",
}

# Sum both ratio columns per group and list the largest group first.
groups = (
    df[["ratio", "ratio_no_wikilink"]]
    .groupby(by=lambda t: _descendant_groups.get(t, "other"))
    .sum()
)
groups = groups.sort_values("ratio", ascending=False)
# Wikilinks are excluded from the denominator of this ratio, so the value of
# the wikilink group itself is not meaningful; blank it.
groups.loc["wikilink", "ratio_no_wikilink"] = None
groups

## Etymology

In [None]:
# Print the example strings returned for the etymology section, one per line.
print("\n".join(section_examples(enw, consts.ETYMOLOGY_SECTION)))

In [None]:
# Frequency table for the "Etym" entry: number of distinct items, then a bar
# chart of the 40 most frequent ones (rows are ordered by descending count).
df = to_df("Etym")
print(len(df))
df["count"].iloc[:40].plot(kind="bar", figsize=(16, 5))

In [None]:
# The 30 most frequent items of the etymology table.
df.head(30)

In [None]:
# Rows whose template name is one of the names registered with the template
# handler; the column sums give their combined count and ratios.
# NOTE(review): get_names() can also contain compiled regex patterns
# (to_specificity_type handles them separately). `isin` only matches exact
# labels, so names that are covered by a pattern only are not selected here
# -- confirm that this is intended.
known_templates = df[
    df.index.isin(
        [v for vals in Specific.template_handler.get_names().values() for v in vals]
    )
]
known_templates.sum()

In [None]:
# Counts of the first 50 registered templates.
known_templates["count"].head(50)

In [None]:
# Complement of the selection above: rows whose name is not among the names
# registered with the template handler (this includes the WIKILINK row unless
# that label is registered).
# NOTE(review): names that are covered only by a regex entry of get_names()
# are not matched by `isin` and therefore show up here -- confirm.
unknown_templates = df[
    ~df.index.isin(
        [v for vals in Specific.template_handler.get_names().values() for v in vals]
    )
]
unknown_templates.sum()

In [None]:
# The first 30 rows that are not registered with the template handler.
unknown_templates.head(30)

In [None]:
def to_specificity_type(values: pd.Series):
    """Aggregate per-template counts into coarse relation categories.

    Each template name is resolved to its representative name, first by exact
    lookup and otherwise by the first registered regex that matches. The
    representative is then mapped to a relation type and the count is added
    to one of these categories:

    * ``plaintext_registered`` - the representative maps to a falsy relation type
    * ``origin`` / ``sibling`` - the relation type is an ORIGIN / a SIBLING
    * ``related`` - any other relation type
    * ``unknown_template`` - the representative has no relation mapping

    Args:
        values: Counts indexed by template name. The ``WIKILINK`` entry is
            skipped.

    Returns:
        A Series of summed counts per category, named like ``values``.
    """
    handler = Specific.template_handler

    # Split the registered names into exact strings and compiled patterns.
    exact_names = {}
    regex_names = {}
    for representative, aliases in handler.get_names().items():
        for alias in aliases:
            if isinstance(alias, str):
                exact_names[alias] = representative
            elif isinstance(alias, re.Pattern):
                regex_names[alias] = representative

    # Invert relation type -> representatives into representative -> relation type.
    relation_of = {}
    for relation_type, representatives in handler.get_relation_mapping().items():
        for representative in representatives:
            relation_of[representative] = relation_type

    totals = defaultdict(int)
    for template_name, count in values.items():
        if template_name == "WIKILINK":
            continue

        if template_name in exact_names:
            representative = exact_names[template_name]
        else:
            # Fall back to the first pattern that matches; "" marks "no match".
            representative = next(
                (
                    candidate
                    for pattern, candidate in regex_names.items()
                    if pattern.match(template_name)
                ),
                "",
            )

        if representative not in relation_of:
            category = "unknown_template"
        else:
            relation = relation_of[representative]
            if not relation:
                category = "plaintext_registered"
            elif relation.is_a(RelationType.ORIGIN):
                category = "origin"
            elif relation.is_a(RelationType.SIBLING):
                category = "sibling"
            else:
                category = "related"
        totals[category] += count
    return pd.Series(totals, name=values.name)

In [None]:
# Counts per relation category for the current table, largest category first.
relations = to_specificity_type(df["count"]).sort_values(ascending=False)

In [None]:
# Put the absolute counts next to their share of the total.
rd = pd.concat([relations, relations / relations.sum()], axis=1)
rd.columns = ["count", "ratio"]
rd

In [None]:
def count_etymology_section_types(wiktionary):
    """Sort every etymology section into at most one bucket.

    A section whose text is blank after removing the header goes to
    ``empty``; a section without any template goes to ``no_templates``.
    Otherwise the templates are classified with ``template_spec_type`` and
    the section is filed under the first entry of ``spec_types`` that
    occurred at least once. Classification stops early at the first
    ``origin`` template.

    Args:
        wiktionary: Object whose ``sections(...)`` yields
            ``(_, text, ctx)`` triples, where ``ctx`` provides ``"title"``
            and ``"language"``.

    Returns:
        A dict from bucket name to a list. ``empty`` holds
        ``(title, language)`` keys only; every other bucket holds
        ``((title, language), text)`` pairs.
    """
    buckets = {
        name: []
        for name in (
            "origin",
            "sibling",
            "related",
            "stub",
            "empty",
            "unhandled",
            "no_templates",
        )
    }
    sections = wiktionary.sections(consts.ETYMOLOGY_SECTION)
    # `total` only feeds the progress bar.
    for _, text, ctx in tqdm(sections, total=1.67 * 10**6):
        key = ctx["title"], ctx["language"]

        if not strip_etymology_header(text).strip():
            buckets["empty"].append(key)
            continue

        templates = wtp.parse(text).templates
        if not templates:
            buckets["no_templates"].append((key, text))
            continue

        occurrences = dict.fromkeys(spec_types, 0)
        for template in templates:
            kind = template_spec_type(template)
            occurrences[kind] += 1
            if kind == "origin":
                break

        # The order of spec_types decides which type wins.
        winner = next((kind for kind in spec_types if occurrences[kind] > 0), None)
        if winner is not None:
            buckets[winner].append((key, text))
    return buckets

In [None]:
# Etymology sections bucketed by type, wrapped in `cached` together with a
# pickle path (presumably computed once and reloaded afterwards -- confirm
# in utils.cached).
by_type = cached(
    lambda: count_etymology_section_types(enw), DATA_PATH / "etymology_sections.pickle"
)

In [None]:
# Export the sections of four buckets to one text file per bucket: a
# "<title> <language>" line, the section text without its header, and a
# blank line as separator.
for k in ("sibling", "related", "unhandled", "no_templates"):
    # The section text contains arbitrary Unicode, so write UTF-8 explicitly
    # instead of relying on the platform's default encoding.
    with open(DATA_PATH / f"by_type_{k}.txt", "w", encoding="utf-8") as dest:
        for key, section in by_type[k]:
            dest.write(f"{key[0]} {key[1]}\n")
            dest.write(strip_etymology_header(section).strip())
            dest.write("\n\n")

In [None]:
# Number of bucketed etymology sections over all buckets.
total = sum(map(len, by_type.values()))

# Per bucket: (absolute size, percentage of all sections), followed by the total.
{
    bucket: (len(entries), 100 * len(entries) / total)
    for bucket, entries in by_type.items()
}, total

In [None]:
def count_unknown_templates():
    """Count the templates in all sections bucketed as "related" or "unhandled".

    Reads the module-level ``by_type`` mapping and parses every section of
    the two buckets.

    Returns:
        A Counter from stripped template name to its number of occurrences.
    """
    return Counter(
        template.name.strip()
        for bucket in ("related", "unhandled")
        for _key, section in by_type[bucket]
        for template in wtp.parse(section).templates
    )

In [None]:
# NOTE: this rebinds `unknown_templates`, which held a DataFrame in an
# earlier cell, to a Counter.
unknown_templates = count_unknown_templates()

In [None]:
# All counted template names, most frequent first.
unknown_templates.most_common()