[WNUT21 Shared Task Website](http://noisy-text.github.io/2021/multi-lexnorm.html)

### Setup and Configuration

In [1]:
# The GitHub repository is no longer cloned here; this notebook now
# lives inside the repository itself.
# !git clone https://github.com/pkolachi/lexicalnormalization

In [2]:
# %pip install --user -U pandas==1.1.5
# %pip install --user -U scikit-learn==0.22.2.post1
# %pip install --user -U sklearn-crfsuite

In [3]:
import itertools as it
import os.path
import random
from collections import defaultdict
from dataclasses import dataclass
from operator import itemgetter

In [4]:
import numpy as np
import pandas as pd
from IPython.display import Markdown, display
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import (
    KFold,
    RepeatedKFold,
    cross_validate,
    train_test_split,
)



In [5]:
# One shared seed for both global random number generators used in this
# notebook: NumPy's legacy global RNG and Python's `random` module.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

In [6]:
# Directory name of the data repository; joined into the data path when
# the corpora are loaded.
REPO_NAME = "multilexnorm"

# Language code -> human-readable language name (used to label the
# statistics tables). Built from ordered pairs so insertion order is kept.
LANGS = dict(
    (
        ("da", "Danish"),
        ("en", "English"),
        ("es", "Spanish"),
        ("hr", "Croatian"),
        ("iden", "Indonesian-English"),
        ("it", "Italian"),
        ("nl", "Dutch"),
        ("sl", "Slovenian"),
        ("sr", "Serbian"),
        ("tr", "Turkish"),
        ("trde", "Turkish-German"),
    )
)

# Subset of language codes that the cells below actually load and report on.
SAMPLE_LANGS = ["en", "es", "it", "da"]

# use 15% of training data as held-out data for evaluation
TST_RATIO = 0.15
# use 4-folds for cross-fold training throughout experiments
CVFOLDS = 4

### Load Data

In [7]:
@dataclass
class AlignedParSent:
    """A source/target sentence pair together with its token alignments.

    When `tlang` is left empty it is filled in from `slang` after
    initialisation, so the target language follows the source language of
    the *instance*. (A class-level default of `tlang = slang` is evaluated
    only once, when the class is defined, and would always be "en".)
    """

    # source sentence as a single space-joined string
    src: str = ""
    # target sentence as a single space-joined string
    tgt: str = ""
    # tuple of (source token index, target token index) pairs
    alignments: tuple = ()
    # source language; presumably a language code such as "en" -- TODO confirm
    slang: str = "en"
    # target language; empty means "same as slang" (resolved in __post_init__)
    tlang: str = ""

    def __post_init__(self):
        """Default the target language to the instance's source language."""
        if not self.tlang:
            self.tlang = self.slang

In [8]:
def block2sent(block):
    """Turn one block of tab-separated lines into an AlignedParSent.

    Each entry of `block` is split once on a tab into a source part and a
    target part, and both parts are whitespace-tokenised. All tokens of an
    entry's source part are aligned with all tokens of its target part,
    using token positions counted over the whole sentence.

    Raises ValueError if an entry contains no tab.
    """
    src_words, tgt_words, links = [], [], []
    for line in block:
        raw, norm = line.split("\t", 1)
        raw_toks = raw.split()
        norm_toks = norm.split()
        # sentence-level offsets are simply the number of tokens seen so far
        src_start = len(src_words)
        tgt_start = len(tgt_words)
        src_words += raw_toks
        tgt_words += norm_toks
        # cross product of this entry's source and target positions
        links += [
            (i, j)
            for i in range(src_start, src_start + len(raw_toks))
            for j in range(tgt_start, tgt_start + len(norm_toks))
        ]
    return AlignedParSent(
        " ".join(src_words), " ".join(tgt_words), tuple(links)
    )

In [9]:
def load_data(inpfile):
    """Read a file of blank-line-separated blocks into a list of sentences.

    Consecutive non-blank lines form one block; a blank (or whitespace-only)
    line ends the current block, and repeated blank lines are ignored. Each
    block is converted with `block2sent`.

    The file is decoded as UTF-8 explicitly, so the result does not depend
    on the platform's default encoding.

    Parameters
    ----------
    inpfile : path of the file to read.

    Returns
    -------
    list with one `block2sent` result per block, in file order.
    """
    corpus = []
    block = []
    with open(inpfile, encoding="utf-8") as infile:
        for line in infile:
            if line.strip():
                # keep inner tabs/spaces; only the line terminator is removed
                block.append(line.strip("\n"))
            elif block:
                corpus.append(block2sent(block))
                block = []
    # flush the last block when the file does not end with a blank line
    if block:
        corpus.append(block2sent(block))
    return corpus

In [10]:
def is_proper(sent):
    """Check that a sentence has alignments and that they are in range.

    Returns False when `sent.alignments` is empty, or when the largest
    source (resp. target) index among the alignments is not smaller than
    the number of whitespace-separated tokens in `sent.src` (resp.
    `sent.tgt`). Returns True otherwise.
    """
    if len(sent.alignments) == 0:
        return False
    # the source side is checked first; the target side only if it passes
    if max(link[0] for link in sent.alignments) >= len(sent.src.split()):
        return False
    return max(link[1] for link in sent.alignments) < len(sent.tgt.split())

In [11]:
# DATA maps language code -> partition key -> list of loaded sentences.
# The nested defaultdict returns an empty list for any partition that was
# never loaded.
DATA = defaultdict(lambda: defaultdict(list))
for lang in SAMPLE_LANGS:
    data_directory = os.path.join("..", REPO_NAME, "data", lang)
    train_file = os.path.join(data_directory, "train.norm")
    dev_file = os.path.join(data_directory, "dev.norm")
    test_file = os.path.join(data_directory, "test.norm")
    for dts, dtf in zip(
        ("fulltrn", "dev", "tst"), (train_file, dev_file, test_file)
    ):
        # silently skip partitions whose file is not present on disk
        if not (os.path.isdir(data_directory) and os.path.isfile(dtf)):
            continue
        corpus_all = list(load_data(dtf))
        # sanitize corpus: keep only the sentences accepted by is_proper
        corpus_filtered = [sent for sent in corpus_all if is_proper(sent)]
        if len(corpus_all) != len(corpus_filtered):
            print(
                f"Removed {len(corpus_all) - len(corpus_filtered)} sentences from {dtf}"
            )
        DATA[lang][dts] = corpus_filtered

In [12]:
# Carve a held-out set out of every loaded training corpus. The split is
# made without shuffling, i.e. in the order the sentences were loaded.
for lang in SAMPLE_LANGS:
    if "fulltrn" not in DATA[lang]:
        continue
    corpus_train, corpus_heldout = train_test_split(
        DATA[lang]["fulltrn"], test_size=TST_RATIO, shuffle=False
    )
    DATA[lang]["trn"], DATA[lang]["hld"] = corpus_train, corpus_heldout

In [13]:
# %rm -rf $REPO_NAME

### Data statistics

In [14]:
# Tabulate the number of sentences per language and partition.
columns = ["Language", "Dataset", "#Instances"]
# partition key -> label shown in the table; the key order is the row order
partition_keys = dict(
    trn="Training",
    hld="Held-out",
    dev="Development",
    tst="Test",
)
datasizes = []
# languages vary slowest, partitions fastest
for lang, data_set in it.product(SAMPLE_LANGS, partition_keys):
    datasizes.append(
        [LANGS[lang], partition_keys[data_set], len(DATA[lang][data_set])]
    )

DATA_STATS = pd.DataFrame.from_records(datasizes, columns=columns)

display(Markdown(DATA_STATS.to_markdown(index=False)))

| Language   | Dataset     |   #Instances |
|:-----------|:------------|-------------:|
| English    | Training    |         2006 |
| English    | Held-out    |          354 |
| English    | Development |          590 |
| English    | Test        |         1967 |
| Spanish    | Training    |          482 |
| Spanish    | Held-out    |           86 |
| Spanish    | Development |            0 |
| Spanish    | Test        |          531 |
| Italian    | Training    |          504 |
| Italian    | Held-out    |           89 |
| Italian    | Development |            0 |
| Italian    | Test        |          100 |
| Danish     | Training    |          611 |
| Danish     | Held-out    |          108 |
| Danish     | Development |            0 |
| Danish     | Test        |          181 |

### EDA: Exploratory Data Analysis

In [17]:
DATA_PARTITIONS = ("trn", "hld", "dev", "tst")


def _vocab(sentences, side):
    """Return the set of distinct whitespace-separated tokens of `sentences`.

    `side` is the name of the string attribute to read from each sentence
    ("src" or "tgt").
    """
    return {tok for sent in sentences for tok in getattr(sent, side).split()}


# One (language, partition) pair per DATA_STATS row: languages vary slowest,
# partitions fastest, matching the order in which DATA_STATS was built.
_stat_rows = [(lang, part) for lang in SAMPLE_LANGS for part in DATA_PARTITIONS]

# Build every vocabulary exactly once instead of once per table row.
_src_vocab = {key: _vocab(DATA[key[0]][key[1]], "src") for key in _stat_rows}
_tgt_vocab = {key: _vocab(DATA[key[0]][key[1]], "tgt") for key in _stat_rows}
# Per language: union of source and target vocabularies over all partitions.
_all_vocab = {
    lang: set().union(
        *(
            _src_vocab[lang, part] | _tgt_vocab[lang, part]
            for part in DATA_PARTITIONS
        )
    )
    for lang in SAMPLE_LANGS
}

# Same value on every row of a language (it spans all of its partitions).
DATA_STATS["All Vocab#"] = [len(_all_vocab[lang]) for lang, _part in _stat_rows]

DATA_STATS["Src. Vocab#"] = [len(_src_vocab[key]) for key in _stat_rows]

DATA_STATS["Tgt. Vocab#"] = [len(_tgt_vocab[key]) for key in _stat_rows]

# Out-of-vocabulary types: seen in the partition but not in the "trn"
# partition of the same language (hence always empty for "trn" itself).
DATA_STATS["Oov. src. Vocab#"] = [
    len(_src_vocab[lang, part] - _src_vocab[lang, "trn"])
    for lang, part in _stat_rows
]

DATA_STATS["Oov. tgt. Vocab#"] = [
    len(_tgt_vocab[lang, part] - _tgt_vocab[lang, "trn"])
    for lang, part in _stat_rows
]

# Out-of-vocabulary source (noisy) types as a percentage of the language's
# full vocabulary ("All Vocab#"), not of the partition's own vocabulary.
DATA_STATS["Oov. src. Vocab%"] = (
    100 * DATA_STATS["Oov. src. Vocab#"] / DATA_STATS["All Vocab#"]
)
# Out-of-vocabulary target (normalized) types, relative to "All Vocab#" too.
DATA_STATS["Oov. tgt. Vocab%"] = (
    100 * DATA_STATS["Oov. tgt. Vocab#"] / DATA_STATS["All Vocab#"]
)
# Replace all zero values with nan. This applies to the whole frame, so a
# zero in "#Instances" (partition not loaded) becomes nan as well.
DATA_STATS = DATA_STATS.replace(0, np.nan)

display(Markdown(DATA_STATS.to_markdown(index=False)))

| Language   | Dataset     |   #Instances |   All Vocab# |   Src. Vocab# |   Tgt. Vocab# |   Oov. src. Vocab# |   Oov. tgt. Vocab# |   Oov. src. Vocab% |   Oov. tgt. Vocab% |
|:-----------|:------------|-------------:|-------------:|--------------:|--------------:|-------------------:|-------------------:|-------------------:|-------------------:|
| English    | Training    |         2006 |        19461 |          9589 |          9049 |                nan |                nan |          nan       |          nan       |
| English    | Held-out    |          354 |        19461 |          2494 |          2369 |               1337 |               1255 |            6.87015 |            6.4488  |
| English    | Development |          590 |        19461 |          3906 |          3714 |               2280 |               2158 |           11.7157  |           11.0888  |
| English    | Test        |         1967 |        19461 |          9400 |          8867 |               6620 |               6256 |           34.0168  |           32.1463  |
| Spanish    | Training    |          482 |         5680 |          2693 |          2502 |                nan |                nan |          nan       |          nan       |
| Spanish    | Held-out    |           86 |         5680 |           614 |           585 |                369 |                329 |            6.49648 |            5.79225 |
| Spanish    | Development |          nan |         5680 |           nan |           nan |                nan |                nan |          nan       |          nan       |
| Spanish    | Test        |          531 |         5680 |          2991 |          2784 |               2332 |               2116 |           41.0563  |           37.2535  |
| Italian    | Training    |          504 |         5441 |          4046 |          3831 |                nan |                nan |          nan       |          nan       |
| Italian    | Held-out    |           89 |         5441 |          1038 |          1012 |                573 |                542 |           10.5312  |            9.9614  |
| Italian    | Development |          nan |         5441 |           nan |           nan |                nan |                nan |          nan       |          nan       |
| Italian    | Test        |          100 |         5441 |          1004 |           970 |                523 |                491 |            9.6122  |            9.02408 |
| Danish     | Training    |          611 |         6100 |          3454 |          3175 |                nan |                nan |          nan       |          nan       |
| Danish     | Held-out    |          108 |         6100 |          1993 |          1959 |               1404 |               1316 |           23.0164  |           21.5738  |
| Danish     | Development |          nan |         6100 |           nan |           nan |                nan |                nan |          nan       |          nan       |
| Danish     | Test        |          181 |         6100 |          1413 |          1348 |                839 |                773 |           13.7541  |           12.6721  |