<a href="https://colab.research.google.com/github/pkolachi/lexicalnormalization/blob/master/exptnbs/LexicalNormalization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[WNUT21 Shared Task Website](http://noisy-text.github.io/2021/multi-lexnorm.html)

### Setup and Configuration

In [None]:
# We no longer clone the github repository. Instead this notebook
# is part of the repository itself
# !git clone https://github.com/pkolachi/lexicalnormalization

In [None]:
%pip install --user -U pandas==1.1.5
%pip install --user -U scikit-learn==0.22.2.post1
%pip install --user -U sklearn-crfsuite

In [None]:
import itertools as it
import os.path
from collections import defaultdict
from operator import itemgetter

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import (
    KFold,
    RepeatedKFold,
    cross_validate,
    train_test_split,
)

In [None]:
# Name of the directory that holds the shared-task data; it is joined into
# the per-language data paths when the corpora are loaded.
REPO_NAME = "multilexnorm"
# Language code -> human-readable language name. The codes double as the
# per-language sub-directory names under the data directory.
LANGS = {
    "da": "Danish",
    "en": "English",
    "es": "Spanish",
    "hr": "Croatian",
    "iden": "Indonesian-English",
    "it": "Italian",
    "nl": "Dutch",
    "sl": "Slovenian",
    "sr": "Serbian",
    "tr": "Turkish",
    "trde": "Turkish-German",
}
# Language codes iterated by the per-language loops in this notebook.
# NOTE: this is a live view over the keys of LANGS, not an independent copy.
SMPLS = LANGS.keys()
# Sentinel strings for special tokens and labels.
# EMPTY_TOKEN is not referenced in the visible part of the notebook.
EMPTY_TOKEN = "+-#EMPTOK#-+"
# Label substituted by the loader for tokens whose normalized field is
# missing or blank.
EMPTY_LABEL = "+-#MERGE#-+"  #''
# Default padding token/label handed to the baseline estimators.
PAD_TOKEN = "+-#NULL#-+"
PAD_LABEL = "+-#DROP#-+"
TST_RATIO = 0.15  # use 15% of training data as held-out data for evaluation
CVFOLDS = 4  # use 4-folds for cross-fold training throughout experiments

### Load Data

In [None]:
def load_data(inpfile, empty_label=""):
    """Read a blank-line-delimited, tab-separated corpus file.

    Each non-blank line is one token, split on the first tab into at most two
    fields. Consecutive non-blank lines form one sentence. Sentences are
    separated by one *or more* blank (whitespace-only) lines, so leading,
    trailing and repeated blank lines are all tolerated.

    Parameters
    ----------
    inpfile : str or path-like
        Path of the corpus file; it is decoded as UTF-8.
    empty_label : str, optional
        When non-empty, every token is returned as a ``(raw, label)`` tuple and
        a missing or whitespace-only second field is replaced by this value.
        When empty (the default), the split fields are returned unchanged as
        lists, which may hold one or two items.

    Returns
    -------
    list
        One list per sentence, each holding that sentence's tokens.
    """
    # The data is multilingual, so do not rely on the platform's locale default.
    with open(inpfile, encoding="utf-8") as inf:
        # Group consecutive lines by "has visible content". Every run of
        # content lines is a sentence, every run of blank lines a separator.
        # Unlike cutting the stream at the first empty block, this cannot
        # lose the rest of the file after a double blank line.
        blocks = it.groupby(inf, key=lambda lne: bool(lne.strip()))
        crs = [
            [lne.strip("\n").split("\t", 1) for lne in block]
            for has_text, block in blocks
            if has_text
        ]
    if not empty_label:
        return crs
    return [
        [
            (tok[0], tok[1] if len(tok) > 1 and tok[1].strip() else empty_label)
            for tok in sent
        ]
        for sent in crs
    ]


def sanitize_crps(sent):
    """Return True when every token entry of `sent` has exactly two fields.

    Used as a filter predicate to drop sentences that do not follow the
    expected (raw, normalized) format.
    """
    for fields in sent:
        if len(fields) != 2:
            return False
    return True


def get_rawtokens(sent):
    """Return the inputs of a sentence: the first field of every entry."""
    return [fields[0] for fields in sent]


def get_nrmtokens(sent):
    """Return the outputs/labels of a sentence: the second field of every entry."""
    return [fields[1] for fields in sent]

# DATA[lang][split] -> (raw sentences, normalized sentences); a split that
# was never stored reads as a pair of empty lists.
DATA = defaultdict(lambda: defaultdict(lambda: ([], [])))
for lang in SMPLS:
    datadir = os.path.join("..", REPO_NAME, "data", lang)
    for dts, fname in (
        ("fulltrn", "train.norm"),
        ("dev", "dev.norm"),
        ("tst", "test.norm"),
    ):
        dtf = os.path.join(datadir, fname)
        # splits that are not present on disk are skipped without a message
        if not (os.path.isdir(datadir) and os.path.isfile(dtf)):
            continue
        loaded = list(load_data(dtf, empty_label=EMPTY_LABEL))
        # keep only the sentences that follow the expected two-field format
        kept = [sent for sent in loaded if sanitize_crps(sent)]
        dropped = len(loaded) - len(kept)
        if dropped:
            print(f"Removed {dropped} sentences from {dtf}")
        X = [get_rawtokens(sent) for sent in kept]
        Y = [get_nrmtokens(sent) for sent in kept]
        DATA[lang][dts] = (X, Y)

In [None]:
# %rm -rf $REPO_NAME

### Data statistics

In [None]:
# Split every available full training set into a training part ("trn") and a
# held-out part ("hld"); shuffle=False keeps the sentences in file order.
for lang in SMPLS:
    if "fulltrn" not in DATA[lang]:
        continue
    full_x, full_y = DATA[lang]["fulltrn"]
    trn_x, hld_x, trn_y, hld_y = train_test_split(
        full_x, full_y, test_size=TST_RATIO, random_state=0, shuffle=False
    )
    DATA[lang]["trn"] = (trn_x, trn_y)
    DATA[lang]["hld"] = (hld_x, hld_y)

In [None]:
# One row per language: its display name followed by the number of sentences
# in the training, held-out, development and test splits.
columns = ["Language", "Training", "Held-out", "Devel", "Test"]
size_rows = []
for lang in SMPLS:
    row = [LANGS[lang]]
    for crp in ("trn", "hld", "dev", "tst"):
        row.append(len(DATA[lang][crp][0]))
    size_rows.append(row)
datasizes = pd.DataFrame.from_records(size_rows, columns=columns)

datasizes

In [None]:
def _token_types(lang, splits, sides):
    """Return the set of distinct tokens of `lang` in the given splits.

    `splits` is an iterable of split names ("trn", "hld", "dev", "tst").
    `sides` is an iterable of tuple positions: 0 selects the raw sentences,
    1 the normalized sentences (labels).
    """
    return {
        tok
        for dts in splits
        for typ in sides
        for sent in DATA[lang][dts][typ]
        for tok in sent
    }


_all_splits = ("trn", "hld", "dev", "tst")
_eval_splits = ("hld", "dev", "tst")

# distinct strings over raw and normalized text together
datasizes["AllToks#"] = [
    len(_token_types(lang, _all_splits, (0, 1))) for lang in SMPLS
]

# distinct raw tokens / distinct labels over all splits
datasizes["Vocab#"] = [len(_token_types(lang, _all_splits, (0,))) for lang in SMPLS]
datasizes["Labels#"] = [len(_token_types(lang, _all_splits, (1,))) for lang in SMPLS]

# distinct raw tokens / distinct labels in the training split only
datasizes["Trn. Vocab#"] = [
    len(_token_types(lang, ("trn",), (0,))) for lang in SMPLS
]
datasizes["Trn. Label#"] = [
    len(_token_types(lang, ("trn",), (1,))) for lang in SMPLS
]

# types seen in held-out/dev/test but never in training
datasizes["Ood. Vocab%"] = [
    len(_token_types(lang, _eval_splits, (0,)) - _token_types(lang, ("trn",), (0,)))
    for lang in SMPLS
]
# Labels are compared against the training *labels* (side 1), mirroring the
# vocabulary column, which compares raw tokens against training raw tokens.
datasizes["Ood. Label%"] = [
    len(_token_types(lang, _eval_splits, (1,)) - _token_types(lang, ("trn",), (1,)))
    for lang in SMPLS
]
# turn the counts into percentages of the overall vocabulary / label inventory
datasizes["Ood. Vocab%"] = 100 * datasizes["Ood. Vocab%"] / datasizes["Vocab#"]
datasizes["Ood. Label%"] = 100 * datasizes["Ood. Label%"] / datasizes["Labels#"]

datasizes

### Task Baselines

In [None]:
# Pre-compute the cross-validation index splits once per language so that
# every experiment can be evaluated on the same folds.
kcvs = RepeatedKFold(n_splits=CVFOLDS, n_repeats=5, random_state=0)
FOLDS = defaultdict(list)
for lang in SMPLS:
    FOLDS[lang] += list(kcvs.split(DATA[lang]["trn"][0]))

In [None]:
# Leave-As-Is i.e. return input as output
class LAI(BaseEstimator, ClassifierMixin):
    """Leave-As-Is baseline: predicts every input token unchanged.

    Parameters
    ----------
    pad_tok : str
        Padding token; stored but not used by this baseline.
    pad_lbl : str
        Padding label; stored but not used by this baseline.
    """

    def __init__(self, pad_tok=PAD_TOKEN, pad_lbl=PAD_LABEL):
        # scikit-learn's get_params()/clone() read constructor arguments back
        # as attributes of the same name, so they must not be name-mangled.
        self.pad_tok = pad_tok
        self.pad_lbl = pad_lbl
        self.__scores = defaultdict(float)

    @property
    def scores(self):
        """Copy of the metrics from the last `score` call.

        Keys are "accuracy", "lai" and "err"; the dict is empty before the
        first call to `score`.
        """
        return dict(self.__scores)

    def fit(self, X, Y):
        """Nothing is learned; return `self` as the estimator API expects."""
        return self

    def predict(self, X):
        """Return the input sequences unchanged."""
        return X

    def score(self, X, Y, ignoreCase=False):
        """Return token-level accuracy of the predictions for X against Y.

        Sentences whose gold and predicted lengths differ are skipped.
        Besides accuracy, the share of tokens whose gold form equals the raw
        form ("lai") and the error reduction over it ("err") are recorded and
        available through `scores`.

        Parameters
        ----------
        X : sequence of token sequences (raw input)
        Y : sequence of token sequences (gold normalization)
        ignoreCase : bool
            Compare lower-cased tokens when true.

        Returns
        -------
        float
            Accuracy, or 0.0 when no token could be scored.
        """
        prdY = self.predict(X)
        correct, changed, total = 0, 0, 0
        for raw_seq, gld_seq, prd_seq in zip(X, Y, prdY):
            # eliminate instances if lengths do not match
            if len(gld_seq) != len(prd_seq):
                continue
            for rawW, gldW, prdW in zip(raw_seq, gld_seq, prd_seq):
                total += 1
                if ignoreCase:
                    rawW = rawW.lower()
                    gldW = gldW.lower()
                    prdW = prdW.lower()
                if rawW != gldW:
                    changed += 1
                if gldW == prdW:
                    correct += 1
        if total == 0:
            # nothing to evaluate: report zeros instead of dividing by zero
            self.__scores["accuracy"] = 0.0
            self.__scores["lai"] = 0.0
            self.__scores["err"] = 0.0
            return 0.0
        # evaluation used in the shared task
        self.__scores["accuracy"] = correct / total
        self.__scores["lai"] = (total - changed) / total
        if self.__scores["lai"] == 1:
            # no token needs normalization, so there is no error to reduce
            self.__scores["err"] = 0.0
        else:
            self.__scores["err"] = (
                self.__scores["accuracy"] - self.__scores["lai"]
            ) / (1 - self.__scores["lai"])
        return self.__scores["accuracy"]

In [None]:
# Most-Frequent-Replacement
class MFR(LAI):
    """Most-Frequent-Replacement baseline.

    Counts, per raw token, how often each normalized token was paired with it
    during `fit`. It predicts the most frequent pairing; tokens never seen in
    `fit` are returned unchanged.
    """

    def __init__(self):
        super().__init__()
        # raw token -> {normalized token -> number of times seen together}
        self.__counts = defaultdict(lambda: defaultdict(int))

    def fit(self, X, Y):
        """Count the (raw, normalized) token pairs of X and Y; return `self`.

        Tokens are paired position by position within each sentence. Counts
        from an earlier `fit` are discarded so refitting starts from scratch.
        """
        super().fit(X, Y)
        self.__counts = defaultdict(lambda: defaultdict(int))
        for iseg, oseg in zip(X, Y):
            for itok, otok in zip(iseg, oseg):
                self.__counts[itok][otok] += 1
        return self

    def predict(self, X):
        """Replace every token of X by its most frequent replacement."""
        X = super().predict(X)
        prdY = []
        for iseg in X:
            oseg = []
            for itok in iseg:
                # .get() avoids inserting unseen tokens into the count table
                lns = self.__counts.get(itok)
                if not lns:
                    oseg.append(itok)
                else:
                    # highest count wins; ties go to the pairing seen first
                    oseg.append(max(lns.items(), key=itemgetter(1))[0])
            prdY.append(oseg)
        return prdY

In [None]:
# Cross-validate the Leave-As-Is baseline on each language's training split
# and print the mean test score over the pre-computed folds.
for lang in SMPLS:
    raw_sents, nrm_sents = DATA[lang]["trn"]
    cvres = cross_validate(
        LAI(), raw_sents, nrm_sents, cv=FOLDS[lang], return_train_score=True
    )
    tst_scores = cvres["test_score"]
    print(lang, sum(tst_scores) / len(tst_scores))

### Sequence classification using PyTorch

##### Preprocessing

In this step, we convert the sequence of tokens into an embedding matrix. 
This step relies on tokenizers and pre-trained models from ``huggingface``.
Keeping preprocessing separate should also allow training non-neural 
classifiers with ``scikit-learn`` or other packages.

In [None]:
# Placeholder: walks every (raw, normalized) token pair of every stored split.
# Nothing is computed yet; the inner loop body is still empty.
for lang in SMPLS:
    # load tokenizer model from ``huggingface``
    for dts in DATA[lang]:
        raw_sents, nrm_sents = DATA[lang][dts]
        for inp, out in zip(raw_sents, nrm_sents):
            tinp, tout = [], []
            for intok, outok in zip(inp, out):
                pass

### Sequence classification using HMM models