In [None]:
import pickle

from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV

import etymmap.specific_en


from etymmap.wiktionary import Wiktionary, MongoEntryStore
from etymmap.extraction import init_and_configure

from utils.gloss_similarity import *

In [None]:
# Run etymmap's `specific_en` configuration hook before touching any entries.
etymmap.specific_en.configure()
DATA_PATH = Path("./data/enwiktionary-20220601-pages-meta-current/")

# Connection settings for the MongoDB-backed entry store.
mongo_config = {
    "address": "mongodb://localhost:27017",
    "dbname": "enwiktionary",
    "collection": "20220601",
}
store = MongoEntryStore.from_config(mongo_config)
# NOTE(review): presumably 0 and 118 are the MediaWiki namespace ids to read
# by default -- confirm against the Wiktionary wrapper.
enw = Wiktionary(store, default_namespaces=(0, 118))

In [None]:
# The collector is built on `enw` and handed a pickle path under DATA_PATH as its cache.
lexicon_cache = DATA_PATH / "lexicon.pickle"
item_collector = GlossSimilarityItemCollector(enw, cache=lexicon_cache)

In [None]:
# NOTE(review): `data` is not assigned in any cell visible above this one;
# presumably it is provided by `from utils.gloss_similarity import *` or by a
# cell that has since been removed. Confirm this runs on a fresh kernel.
item_collector.run(data)

In [None]:
# get the relevant data

train_cols = ["match", "temp_gloss", "definition", "sense_idx", "term", "lang"]


def _read_training_columns(filename):
    """Read one CSV below `data` and keep only the columns listed in `train_cols`."""
    return pd.read_csv(data / filename)[train_cols]


gold_std_sense = _read_training_columns("gold_standard_sense.csv")
gold_std_etym = _read_training_columns("gold_standard_etym.csv")
# we use previously annotated data here
annotated = _read_training_columns("annotate20210401.csv")

# A row counts as annotated only when `match` is exactly 0.0 or 1.0;
# everything else is kept separately in `not_annotated`.
with_value = annotated.match.eq(0.0) | annotated.match.eq(1.0)
not_annotated = annotated.loc[~with_value]
annotated = annotated.loc[with_value]
annotated

In [None]:
# select actual training data here

# Stack the annotated rows on the sense gold standard (fresh 0..n-1 index),
# then keep only rows that have a definition.
all_data = pd.concat([annotated, gold_std_sense], ignore_index=True).loc[
    lambda frame: frame.definition.notnull()
]
tuple(len(frame) for frame in (annotated, gold_std_sense, all_data))

In [None]:
# train/test by entries

# Split on (term, lang) entries rather than on rows, so all rows of one entry
# end up on the same side of the split. The seed keeps the split reproducible.
by_lex = all_data.set_index(["term", "lang"])
all_entries = by_lex.index.drop_duplicates()
train_lex = all_entries.to_frame().sample(frac=0.75, random_state=33)
train = by_lex.loc[train_lex.index]
test = by_lex.loc[~by_lex.index.isin(train_lex.index)]


def _add_sense_match(split):
    """Return `split` with an extra `sense_match` column.

    `sense_match` is the maximum of `match` over all rows that share the same
    (term, lang, sense_idx, temp_gloss), i.e. mapping a gloss to the correct
    sense is considered correct for every row of that sense.

    Parameters
    ----------
    split : pandas.DataFrame
        Frame indexed by (term, lang) with `sense_idx`, `temp_gloss` and
        `match` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame, indexed by (term, lang) again, with `sense_match` added.
    """
    # `.max()` instead of `.agg(max)`: passing the builtin to `agg` is
    # deprecated in pandas and currently resolves to the same reduction.
    sense_match = (
        split.reset_index()
        .groupby(["term", "lang", "sense_idx", "temp_gloss"])["match"]
        .max()
    )
    # Temporarily extend the index so the assignment aligns on all four keys.
    keyed = split.set_index(["sense_idx", "temp_gloss"], append=True)
    keyed["sense_match"] = sense_match
    return keyed.reset_index(level=["sense_idx", "temp_gloss"])


# also, for evaluation, create test set where mapping to the correct sense is considered correct
train, test = _add_sense_match(train), _add_sense_match(test)

In [None]:
# number of items and entries

# Items are rows; entries are distinct (term, lang) index values.
splits = (train, test)
n_items = [len(split) for split in splits]
n_entries = [len(split.index.drop_duplicates()) for split in splits]
(*n_items, *n_entries)

In [None]:
# featurize

# One feature record per (definition, temp_gloss) pair, in row order, built the
# same way for the train and the test split.
trainF, testF = (
    pd.DataFrame.from_records(
        [featurize(*pair) for pair in zip(split.definition, split.temp_gloss)]
    )
    for split in (train, test)
)

In [None]:
# evaluate all single-variable models
univariate_results = get_univariate_predictions(trainF, train, testF, test, cv=10)
# Only the second element of the result is used below; the first is discarded.
_, predictions = univariate_results
predictions

In [None]:
def _score_feature_predictions(df):
    """Score one column group of `predictions` against both evaluation targets.

    `df` holds the columns of `predictions` that share one level-0 label. For
    each target (`test.match` as "pairwise", `test.sense_match` as "sense")
    and each prediction kind ("pairwise", "argmax") the first three values of
    `precision_recall_fscore_support` are stored under the column
    "<prediction kind>_<target>".

    Returns a frame with rows "prec", "rec", "f1".
    """
    targets = [("pairwise", test.match), ("sense", test.sense_match)]
    scores = {}
    for target_name, compare in targets:
        for pred in ["pairwise", "argmax"]:
            result = precision_recall_fscore_support(
                compare, df.loc[:, (slice(None), pred)], average="binary"
            )
            # Drop the trailing support entry; keep precision, recall, F-score.
            scores[f"{pred}_{target_name}"] = result[:3]
    return pd.DataFrame(scores, index=["prec", "rec", "f1"])


# NOTE(review): `DataFrame.groupby(axis=1)` is deprecated in recent pandas
# releases; this call will need a transpose-based rewrite when upgrading.
performance = predictions.groupby(level=0, axis=1).apply(_score_feature_predictions)
performance.head()

In [None]:
def _performance_with_prefix(pref):
    """Columns of `performance` whose level-0 label starts with `pref`."""
    labels = [
        name
        for name in performance.columns.get_level_values(0)
        if name.startswith(pref)
    ]
    return performance.loc[:, (labels, slice(None))]


fuzzy_tversky = _performance_with_prefix("fuzzy")
tversky = _performance_with_prefix("tversky")

In [None]:
# Per evaluation setting (column level 1): for each metric row find the column
# with the highest value and show that column's full set of scores.
best_labels = ["best_prec", "best_rec", "best_f1"]
fuzzy_by_setting = fuzzy_tversky.groupby(level=1, axis=1)
fuzzy_by_setting.apply(
    lambda scores: scores[scores.idxmax(axis=1)].set_axis(best_labels, axis=1)
)

In [None]:
# Per evaluation setting (column level 1): for each metric row find the column
# with the highest value and show that column's full set of scores.
best_labels = ["best_prec", "best_rec", "best_f1"]
tversky_by_setting = tversky.groupby(level=1, axis=1)
tversky_by_setting.apply(
    lambda scores: scores[scores.idxmax(axis=1)].set_axis(best_labels, axis=1)
)

In [None]:
# Scores of the `tversky_0.32` model as percentages; columns 0, 1 and 3 are
# selected by position for the LaTeX table.
tversky_032_pct = tversky["tversky_0.32"].multiply(100).iloc[:, [0, 1, 3]]
print(tversky_032_pct.to_latex(float_format="{:.2f}".format))

In [None]:
# univariate performances
fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True)

# Dedicated variable names are used throughout this cell: the loading cells
# use `data` as the base directory for the CSV files, so rebinding `data` to a
# DataFrame here would break them when they are re-run.
alpha_panels = [
    ("tversky", tversky.loc[:, (slice(None), "pairwise_pairwise")]),
    ("fuzzy tversky", fuzzy_tversky.loc[:, (slice(None), "pairwise_pairwise")]),
]
# zip stops after the two line panels; the third axis is the bar chart below.
for ax, (label, alpha_scores) in zip(axes, alpha_panels):
    ax.set_title(label)
    # One line per metric row, one x position per model column.
    ax.plot(alpha_scores.T.values, label=["prec", "rec", "F1"])
    # NOTE(review): tick positions 0/25/51 presumably assume ~51 alpha steps
    # between 0 and 1 -- confirm against the feature columns.
    ax.set_xticks([0, 25, 51])
    ax.set_xlabel("alpha")
    ax.set_xticklabels([0, 0.5, 1.0])
    ax.legend()
    ax.grid()

# Third panel: precision and recall of the non-tversky features plus the two
# selected tversky variants.
bar_ax = axes[2]
plotted_features = [f for f in trainF.columns if "tversky" not in f] + [
    "tversky_0.32",
    "fuzzy_tversky_0.06",
]
feature_scores = performance.loc[:, (plotted_features, "pairwise_pairwise")]
# Take the bar labels from the selected columns themselves so every bar is
# labelled with the feature its value belongs to, whatever order `.loc` returns.
bar_labels = feature_scores.columns.get_level_values(0).to_list()
bar_ax.xaxis.set_major_locator(plt.FixedLocator(range(len(bar_labels))))
for tick in bar_ax.get_xticklabels():
    tick.set_rotation(90)
bar_ax.bar(
    bar_labels, feature_scores.loc["prec"].to_list(), label="prec", width=0.5, zorder=3
)
bar_ax.bar(
    bar_labels,
    feature_scores.loc["rec"].to_list(),
    label="rec",
    width=0.5,
    align="edge",
    zorder=2,
)
bar_ax.legend()
bar_ax.grid(axis="y", zorder=1)

In [None]:
# Every non-tversky feature column plus the two selected tversky variants.
base_features = [
    *(column for column in trainF.columns if "tversky" not in column),
    "tversky_0.32",
    "fuzzy_tversky_0.06",
]

In [None]:
# all with only best tversky
evaluation_args = (trainF, train, testF, test, base_features)
all_features = get_evaluation(*evaluation_args)
all_features

In [None]:
# no fuzzy features
# Drop every base feature whose name mentions levenshtein or fuzzy.
feats = [
    name
    for name in base_features
    if not any(tag in name for tag in ("levenshtein", "fuzzy"))
]
no_fuzzy = get_evaluation(trainF, train, testF, test, feats)
no_fuzzy

In [None]:
# Export the test entries that contain at least one argmax error.
# `feats` is whatever feature list was assigned last; in notebook order that is
# the one set by the preceding cell.
*_, p = get_multivariate_predictions(trainF, train, testF, test, feats)
t2 = pd.concat([test.reset_index(), p], axis=1).set_index(["term", "lang"])

# Bracket access: `argmax` is also the name of a pandas/numpy method, so
# attribute access to this column is fragile.
predicted = p["argmax"].astype(bool).values
actual = t2["match"].astype(bool).values


def _entries_with_errors(error_mask):
    """All rows of `t2` whose (term, lang) entry has a row flagged in `error_mask`.

    A membership mask is used instead of `t2.loc[<error index>]`: indexing
    with an index that repeats a key returns that entry's rows once per
    repetition, which duplicated rows for entries with several errors.
    """
    return t2.loc[t2.index.isin(t2.index[error_mask])]


# get all entries surrounding errors
_entries_with_errors(~predicted & actual).to_csv(DATA_PATH / "false_negatives.csv")
_entries_with_errors(predicted & ~actual).to_csv(DATA_PATH / "false_positives.csv")

In [None]:
# only cutoff levenshtein
# Leave out the two plain levenshtein features; all other base features stay.
plain_levenshtein = ("char_levenshtein", "word_levenshtein")
feats = [name for name in base_features if name not in plain_levenshtein]
get_evaluation(trainF, train, testF, test, feats)

In [None]:
# only character features
feats = list(filter(lambda name: name.startswith("char"), base_features))
get_evaluation(trainF, train, testF, test, feats)

In [None]:
# only tversky
tversky_only = ["tversky_0.32", "fuzzy_tversky_0.06"]
get_evaluation(trainF, train, testF, test, tversky_only)

In [None]:
# no tversky
feats = list(filter(lambda name: "tversky" not in name, base_features))
get_evaluation(trainF, train, testF, test, feats)

In [None]:
# only ratios and lcsm
# Keep the base features whose name mentions "ratio" or "longest_match".
feats = [
    name
    for name in base_features
    if any(tag in name for tag in ("ratio", "longest_match"))
]
get_evaluation(trainF, train, testF, test, feats)

In [None]:
# export models

# Featurize every row of `all_data`: the exported models are fit on the full
# labelled set, not only on the train split.
all_train = pd.DataFrame.from_records(
    [
        featurize(*pair)
        for pair in zip(all_data.definition, all_data.temp_gloss)
    ]
)

# The same hyper-parameters are used for every exported model.
params = dict(
    Cs=10,
    cv=10,
    random_state=33,
    solver="lbfgs",
    multi_class="ovr",
    max_iter=100,
    class_weight="balanced",
)

# Export name -> feature columns the model is trained on.
exported_feature_sets = {
    "all_features": base_features,
    "no_fuzzy": [
        f for f in base_features if "levenshtein" not in f and "fuzzy" not in f
    ],
}

for name, features in exported_feature_sets.items():
    train_ = all_train[features]
    # The scaler is fit on exactly the columns its model sees and is pickled
    # next to the model.
    scaler = StandardScaler().fit(train_)
    LR = LogisticRegressionCV(**params)
    LR.fit(scaler.transform(train_), all_data.match)
    with (DATA_PATH / f"{name}.model").open("wb") as dest:
        pickle.dump(LR, dest)
    with (DATA_PATH / f"{name}.scaler").open("wb") as dest:
        pickle.dump(scaler, dest)

In [None]:
# other models

from sklearn import tree

# Fit the scaler on the training features only and reuse it for the test set.
scaler = StandardScaler().fit(trainF[base_features])
# A fixed seed makes the fitted tree, and therefore the reported scores,
# reproducible across runs; 33 is the seed used for the train/test split and
# the logistic-regression models.
clf = tree.DecisionTreeClassifier(random_state=33).fit(
    scaler.transform(trainF[base_features]), train.match
)
multi_tree_predictions = clf.predict(scaler.transform(testF[base_features]))
precision_recall_fscore_support(test.match, multi_tree_predictions, average="binary")

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit the scaler on the training features only and reuse it for the test set.
scaler = StandardScaler().fit(trainF[base_features])
# A fixed seed makes the fitted forest, and therefore the reported scores,
# reproducible across runs; 33 is the seed used for the train/test split and
# the logistic-regression models.
clf = RandomForestClassifier(random_state=33).fit(
    scaler.transform(trainF[base_features]), train.match
)
multi_tree_predictions = clf.predict(scaler.transform(testF[base_features]))
precision_recall_fscore_support(test.match, multi_tree_predictions, average="binary")

In [None]:
from sklearn import svm

# Scale with statistics from the training features, then fit a default SVC.
train_features = trainF[base_features]
scaler = StandardScaler().fit(train_features)
clf = svm.SVC()
clf.fit(scaler.transform(train_features), train.match)
# The variable keeps the name used by the tree-based cells, although it holds
# the SVC predictions here.
multi_tree_predictions = clf.predict(scaler.transform(testF[base_features]))
precision_recall_fscore_support(test.match, multi_tree_predictions, average="binary")