[WNUT21 Shared Task Website](http://noisy-text.github.io/2021/multi-lexnorm.html)

### Setup and Configuration

In [1]:
# We no longer clone the GitHub repository; this notebook
# is now part of the repository itself.
# !git clone https://github.com/pkolachi/lexicalnormalization

In [2]:
# %pip install --user -U pandas==1.1.5
# %pip install --user -U scikit-learn==0.22.2.post1
# %pip install --user -U sklearn-crfsuite

In [3]:
import itertools as it
import os.path
import random
from collections import Counter, defaultdict
from dataclasses import dataclass
from operator import itemgetter

In [4]:
import numpy as np
import pandas as pd
from IPython.display import display, HTML, Markdown
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import (
    KFold,
    RepeatedKFold,
    cross_validate,
    train_test_split,
)



In [5]:
# Seed both Python's `random` module and NumPy's legacy global generator
# with the same value so that randomized steps can be repeated across runs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [6]:
# Name of the sibling directory that holds the data; the loading cell reads
# files from ../<REPO_NAME>/data/<lang>/.
REPO_NAME = "multilexnorm"
# Language code (also the data sub-directory name) -> display name used in
# the statistics tables.
LANGS = {
    "da": "Danish",
    "en": "English",
    "es": "Spanish",
    "hr": "Croatian",
    "iden": "Indonesian-English",
    "it": "Italian",
    "nl": "Dutch",
    "sl": "Slovenian",
    "sr": "Serbian",
    "tr": "Turkish",
    "trde": "Turkish-German",
}
# Subset of language codes that the cells below actually load and analyse.
SAMPLE_LANGS = ["en", "es", "it", "da"]
TST_RATIO = 0.15  # use 15% of training data as held-out data for evaluation
CVFOLDS = 4  # use 4-folds for cross-fold training throughout experiments

### Load Data

In [7]:
@dataclass
class AlignedParSent:
    """A noisy/normalized sentence pair together with its token alignments.

    Attributes:
        src: space-joined source (noisy) tokens.
        tgt: space-joined target (normalized) tokens.
        alignments: tuple of ``(source_index, target_index)`` pairs.
        slang: source language code.
        tlang: target language code; when left empty it follows ``slang``.
    """

    src: str = ""
    tgt: str = ""
    alignments: tuple = ()
    slang: str = "en"
    # The previous default ``tlang: str = slang`` was evaluated once in the
    # class body and therefore always meant "en", regardless of the slang
    # passed to the constructor. An empty sentinel resolved in __post_init__
    # makes the target language actually follow the source language.
    tlang: str = ""

    def __post_init__(self):
        """Default the target language to the source language."""
        if not self.tlang:
            self.tlang = self.slang

In [8]:
def block2sent(block):
    """Convert one block of ``source<TAB>target`` lines into an aligned pair.

    Each entry contributes the whitespace-separated tokens before the first
    tab to the source sentence and those after it to the target sentence.
    Every source token of an entry is aligned with every target token of the
    same entry. An entry with a blank target is aligned with the most
    recently emitted target token instead.

    Args:
        block: iterable of strings, one per entry of the sentence.

    Returns:
        An ``AlignedParSent`` holding the space-joined source sentence, the
        space-joined target sentence and a tuple of
        ``(source_index, target_index)`` pairs.
    """
    src_sent, tgt_sent = [], []
    alignments = []
    src_idx, tgt_idx = 0, 0
    for entry in block:
        # partition() also accepts an entry without a tab (treated as a blank
        # target); unpacking split("\t", 1) raised ValueError in that case.
        src, _, tgt = entry.partition("\t")
        # get source information
        src_toks = src.split()
        src_tok_ids = range(src_idx, src_idx + len(src_toks))
        # get target information
        tgt_toks = tgt.split()
        if tgt_toks:
            # align this token to one-or-more tokens on target
            tgt_tok_ids = range(tgt_idx, tgt_idx + len(tgt_toks))
        else:
            # align this token to previous token on target
            # NOTE(review): when nothing has been emitted on the target side
            # yet (tgt_idx == 0) this produces target index -1; kept as-is so
            # that every source token still carries an alignment.
            tgt_tok_ids = range(tgt_idx - 1, tgt_idx)
        src_sent.extend(src_toks)
        tgt_sent.extend(tgt_toks)
        alignments.extend(it.product(src_tok_ids, tgt_tok_ids))
        # Advance by the number of tokens consumed. The former
        # ``max(src_tok_ids) + 1`` raised ValueError on an entry whose source
        # side is blank (empty range); the result is otherwise identical.
        src_idx += len(src_toks)
        tgt_idx += len(tgt_toks)
    par_sent = AlignedParSent(
        " ".join(src_sent), " ".join(tgt_sent), tuple(alignments)
    )
    return par_sent

In [9]:
%%script false --no-raise-error
# Disabled demo cell: builds one tab-separated sentence block by hand and
# prints both the raw entries and the result of block2sent on them.
sample_block = """@ayu98srivastava	@ayu98srivastava
i	i
couldnt	couldn't
find	find
it	it
.	.
.	.
can	can
u	you
send	send
me	me
screen	screenshot
shot	""".split("\n")
print(sample_block)
print(block2sent(sample_block))

In [10]:
def load_data(inpfile):
    """Read a file of blank-line-separated sentence blocks.

    Consecutive non-blank lines form one block; each block is converted with
    ``block2sent``. Runs of blank lines between blocks and a missing trailing
    blank line are both tolerated.

    Args:
        inpfile: path of the file to read.

    Returns:
        List with one ``block2sent`` result per block, in file order.
    """
    corpus = []
    block = []
    # The phrases displayed further down contain non-ASCII characters, so do
    # not depend on the platform's default encoding.
    # NOTE(review): assumes the data files are UTF-8 -- confirm.
    with open(inpfile, encoding="utf-8") as infile:
        for line in infile:
            if line.strip():
                # keep tabs and inner spaces, drop only the line terminator
                block.append(line.strip("\n"))
            elif block:
                # a blank line closes the current block
                corpus.append(block2sent(block))
                block = []
    # flush the last block when the file does not end with a blank line
    if block:
        corpus.append(block2sent(block))
    return corpus

In [11]:
def is_proper(sent):
    """Tell whether every alignment of a sentence pair points at a real token.

    A pair is proper when it has at least one alignment and every
    ``(source_index, target_index)`` pair lies inside the whitespace-tokenized
    source and target sentences. Negative indices are rejected as well: they
    used to pass the upper-bound check and would later select tokens from the
    end of the sentence through Python's negative indexing.

    Args:
        sent: object exposing ``src``, ``tgt`` and ``alignments``.

    Returns:
        True when the sentence pair is usable, False otherwise.
    """
    if len(sent.alignments) == 0:
        return False
    n_src = len(sent.src.split())
    n_tgt = len(sent.tgt.split())
    return all(
        0 <= src_idx < n_src and 0 <= tgt_idx < n_tgt
        for src_idx, tgt_idx in sent.alignments
    )

In [12]:
# DATA[lang][partition] -> list of sentence pairs that passed is_proper.
# Partitions whose file is missing are simply never filled in.
DATA = defaultdict(lambda: defaultdict(list))
for lang in SAMPLE_LANGS:
    data_directory = os.path.join("..", REPO_NAME, "data", lang)
    train_file, dev_file, test_file = (
        os.path.join(data_directory, file_name)
        for file_name in ("train.norm", "dev.norm", "test.norm")
    )
    partition_files = zip(
        ("fulltrn", "dev", "tst"), (train_file, dev_file, test_file)
    )
    for dts, dtf in partition_files:
        if not (os.path.isdir(data_directory) and os.path.isfile(dtf)):
            continue
        corpus_all = list(load_data(dtf))
        # keep only the sentence pairs whose alignments are consistent
        corpus_filtered = [sent for sent in corpus_all if is_proper(sent)]
        removed = len(corpus_all) - len(corpus_filtered)
        if removed:
            print(f"Removed {removed} sentences from {dtf}")
        DATA[lang][dts] = corpus_filtered

In [13]:
# Carve a held-out set off the end of each full training set; the split is
# not shuffled, so the original sentence order is kept in both parts.
for lang in SAMPLE_LANGS:
    lang_data = DATA[lang]
    if "fulltrn" not in lang_data:
        continue
    corpus_train, corpus_heldout = train_test_split(
        lang_data["fulltrn"], test_size=TST_RATIO, shuffle=False
    )
    lang_data["trn"] = corpus_train
    lang_data["hld"] = corpus_heldout

In [14]:
# %rm -rf $REPO_NAME

### Data statistics

In [15]:
columns = ["Language", "Dataset", "#Instances"]
# partition key -> label shown in the tables (insertion order = row order)
partition_keys = dict(
    trn="Training",
    hld="Held-out",
    dev="Development",
    tst="Test",
)
# one row per (language, partition) with the number of sentence pairs
datasizes = [
    [LANGS[lang], partition_label, len(DATA[lang][partition])]
    for lang in SAMPLE_LANGS
    for partition, partition_label in partition_keys.items()
]

DATA_STATS = pd.DataFrame.from_records(datasizes, columns=columns)

display(Markdown(DATA_STATS.to_markdown(index=False)))

| Language   | Dataset     |   #Instances |
|:-----------|:------------|-------------:|
| English    | Training    |         2006 |
| English    | Held-out    |          354 |
| English    | Development |          590 |
| English    | Test        |         1967 |
| Spanish    | Training    |          482 |
| Spanish    | Held-out    |           86 |
| Spanish    | Development |            0 |
| Spanish    | Test        |          531 |
| Italian    | Training    |          504 |
| Italian    | Held-out    |           89 |
| Italian    | Development |            0 |
| Italian    | Test        |          100 |
| Danish     | Training    |          611 |
| Danish     | Held-out    |          108 |
| Danish     | Development |            0 |
| Danish     | Test        |          181 |

### EDA: Exploratory Data Analysis

#### Data sizes and vocabulary

In [16]:
DATA_PARTITIONS = ("trn", "hld", "dev", "tst")


def _side_vocab(corpus, side):
    """Return the set of whitespace-separated tokens on one side of a corpus.

    Args:
        corpus: iterable of sentence pairs exposing ``src`` and ``tgt``.
        side: attribute to read, either "src" or "tgt".
    """
    return {tok for item in corpus for tok in getattr(item, side).split()}


# Column name -> one count per (language, partition), collected in the same
# language-major / partition-minor order as the rows of DATA_STATS.
vocab_counts = {
    "All Vocab#": [],
    "Src. Vocab#": [],
    "Tgt. Vocab#": [],
    "Oov. src. Vocab#": [],
    "Oov. tgt. Vocab#": [],
}
for lang in SAMPLE_LANGS:
    # The training vocabulary is the OOV reference for every partition of the
    # language, so build it once instead of once per partition.
    trn_src_vocab = _side_vocab(DATA[lang]["trn"], "src")
    trn_tgt_vocab = _side_vocab(DATA[lang]["trn"], "tgt")
    for data_part in DATA_PARTITIONS:
        src_vocab = _side_vocab(DATA[lang][data_part], "src")
        tgt_vocab = _side_vocab(DATA[lang][data_part], "tgt")
        vocab_counts["All Vocab#"].append(len(src_vocab | tgt_vocab))
        vocab_counts["Src. Vocab#"].append(len(src_vocab))
        vocab_counts["Tgt. Vocab#"].append(len(tgt_vocab))
        vocab_counts["Oov. src. Vocab#"].append(len(src_vocab - trn_src_vocab))
        vocab_counts["Oov. tgt. Vocab#"].append(len(tgt_vocab - trn_tgt_vocab))

for column_name, counts in vocab_counts.items():
    DATA_STATS[column_name] = counts

# Percentage of out-of-vocabulary tokens in noisy sentences
DATA_STATS["Oov. src. Vocab%"] = (
    100 * DATA_STATS["Oov. src. Vocab#"] / DATA_STATS["Src. Vocab#"]
)
# Percentage of out-of-vocabulary tokens in normalized sentences
DATA_STATS["Oov. tgt. Vocab%"] = (
    100 * DATA_STATS["Oov. tgt. Vocab#"] / DATA_STATS["Tgt. Vocab#"]
)
# replace all zero values with nan (rebinding instead of inplace=True)
DATA_STATS = DATA_STATS.replace(0, np.nan)

display(Markdown(DATA_STATS.to_markdown(index=False)))

| Language   | Dataset     |   #Instances |   All Vocab# |   Src. Vocab# |   Tgt. Vocab# |   Oov. src. Vocab# |   Oov. tgt. Vocab# |   Oov. src. Vocab% |   Oov. tgt. Vocab% |
|:-----------|:------------|-------------:|-------------:|--------------:|--------------:|-------------------:|-------------------:|-------------------:|-------------------:|
| English    | Training    |         2006 |         9757 |          9589 |          9049 |                nan |                nan |           nan      |           nan      |
| English    | Held-out    |          354 |         2570 |          2494 |          2369 |               1337 |               1255 |            53.6087 |            52.9759 |
| English    | Development |          590 |         4002 |          3906 |          3714 |               2280 |               2158 |            58.3717 |            58.1045 |
| English    | Test        |         1967 |         9562 |          9400 |          8867 |               6620 |               6256 |            70.4255 |            70.5537 |
| Spanish    | Training    |          482 |         2885 |          2693 |          2502 |                nan |                nan |           nan      |           nan      |
| Spanish    | Held-out    |           86 |          672 |           614 |           585 |                369 |                329 |            60.0977 |            56.2393 |
| Spanish    | Development |          nan |          nan |           nan |           nan |                nan |                nan |           nan      |           nan      |
| Spanish    | Test        |          531 |         3185 |          2991 |          2784 |               2332 |               2116 |            77.9672 |            76.0057 |
| Italian    | Training    |          504 |         4306 |          4046 |          3831 |                nan |                nan |           nan      |           nan      |
| Italian    | Held-out    |           89 |         1124 |          1038 |          1012 |                573 |                542 |            55.2023 |            53.5573 |
| Italian    | Development |          nan |          nan |           nan |           nan |                nan |                nan |           nan      |           nan      |
| Italian    | Test        |          100 |         1055 |          1004 |           970 |                523 |                491 |            52.0916 |            50.6186 |
| Danish     | Training    |          611 |         3884 |          3454 |          3175 |                nan |                nan |           nan      |           nan      |
| Danish     | Held-out    |          108 |         2053 |          1993 |          1959 |               1404 |               1316 |            70.4466 |            67.1771 |
| Danish     | Development |          nan |          nan |           nan |           nan |                nan |                nan |           nan      |           nan      |
| Danish     | Test        |          181 |         1537 |          1413 |          1348 |                839 |                773 |            59.3772 |            57.3442 |

#### Visualizing different edits

In [17]:
@dataclass(frozen=True)
class AlignedPhrase:
    """Immutable, hashable (source phrase, target phrase, context) record.

    Being frozen, instances can be used as keys of a ``Counter``.
    """

    # noisy phrase
    src: str = ""
    # normalized phrase
    tgt: str = ""
    # source tokens preceding the phrase; the extraction code stores None
    # here when there is no preceding context
    context: str = ""

In [18]:
def normalized_phrases_from_sentence(sent, context_len=3):
    """Collect the phrases of a sentence pair that normalization changed.

    The lower-cased source sentence is walked left to right. For the current
    source token, the phrase consists of every source token sharing one of
    its target tokens, paired with every target token aligned to any of
    those source tokens. A phrase is reported only when its source and
    target text differ.

    Args:
        sent: object exposing ``src``, ``tgt`` and ``alignments``
            (``(source_index, target_index)`` pairs).
        context_len: maximum number of source tokens preceding the phrase to
            keep as context.

    Returns:
        List of ``AlignedPhrase``; ``context`` is None for a phrase that
        starts the sentence.
    """
    src_tokens = sent.src.lower().split()
    tgt_tokens = sent.tgt.lower().split()
    alignments = defaultdict(set)
    alignments_inv = defaultdict(set)
    for src_idx, tgt_idx in sent.alignments:
        alignments[src_idx].add(tgt_idx)
        alignments_inv[tgt_idx].add(src_idx)
    normalized_phrases = []
    src_idx_cur = 0
    while src_idx_cur < len(src_tokens):
        # get source span of phrase
        # The current token always belongs to its own phrase. Seeding the
        # span with it keeps an unaligned source token from producing an
        # empty span, on which the later max() call raised ValueError.
        src_idx_ext = {src_idx_cur}
        for tidx in alignments.get(src_idx_cur, ()):
            src_idx_ext |= alignments_inv[tidx]
        # get target span of phrase
        tgt_idx_ext = set()
        for sidx in src_idx_ext:
            tgt_idx_ext |= alignments.get(sidx, set())
        src_idx_ext = sorted(src_idx_ext)
        # A negative target index means "no target token" (it arises when a
        # blank target precedes any emitted target token); ignoring it stops
        # Python's negative indexing from selecting the last target token.
        tgt_idx_ext = sorted(tidx for tidx in tgt_idx_ext if tidx >= 0)
        # get context span
        src_idx_prev = max(src_idx_ext[0] - context_len, 0)
        # get context
        if src_idx_prev < src_idx_cur:
            context = " ".join(src_tokens[src_idx_prev:src_idx_cur])
        else:
            context = None
        # get source phrase
        src_phrase = " ".join(src_tokens[sidx] for sidx in src_idx_ext)
        # get target phrase
        tgt_phrase = " ".join(tgt_tokens[tidx] for tidx in tgt_idx_ext)
        # add entry to table only if phrase has been modified
        if src_phrase != tgt_phrase:
            normalized_phrases.append(
                AlignedPhrase(src_phrase, tgt_phrase, context)
            )
        # continue after the last source token covered by this phrase
        src_idx_cur = src_idx_ext[-1] + 1
    return normalized_phrases

In [19]:
def corpus2phrases(corpus):
    """Count the modified phrases over all sentences of a corpus.

    Args:
        corpus: iterable of sentence pairs accepted by
            ``normalized_phrases_from_sentence``.

    Returns:
        ``Counter`` mapping each extracted phrase to its number of occurrences.
    """
    return Counter(
        phrase
        for sentence in corpus
        for phrase in normalized_phrases_from_sentence(sentence)
    )

In [20]:
columns = [
    "Language",
    "Dataset",
    "Context",
    "Noisy text",
    "Normalized text",
    "Counts",
]
# One record per distinct (phrase, context) of every language and of the
# training, held-out and development partitions.
EDITED_PHRASES = []
for lang in SAMPLE_LANGS:
    for data_set in ("trn", "hld", "dev"):
        phrase_table = corpus2phrases(DATA[lang][data_set])
        EDITED_PHRASES.extend(
            [
                LANGS[lang],
                partition_keys[data_set],
                phrase.context,
                phrase.src,
                phrase.tgt,
                count,
            ]
            for phrase, count in phrase_table.items()
        )

In [21]:
EDITED_PHRASES = pd.DataFrame(EDITED_PHRASES, columns=columns)
# Merge the entries of a phrase over all contexts and partitions, then order
# the dictionary by language and normalized text, most frequent noisy variant
# first. A single multi-key sort states this ordering explicitly; the earlier
# pair of chained sort_values calls kept the Counts ordering only if the
# second sort happened to be stable.
PHRASE_DICT = (
    EDITED_PHRASES
    .groupby(by=["Language", "Noisy text", "Normalized text"])["Counts"]
    .sum()
    .reset_index()
    .sort_values(
        by=["Language", "Normalized text", "Counts"],
        ascending=[True, True, False],
    )
)
# rendering this table as Markdown raised errors, so HTML is used instead
display(HTML(PHRASE_DICT.to_html(index=False)))

Language,Noisy text,Normalized text,Counts
Danish,1.virusstreng,1. virusstreng,1
Danish,3.verdenskrig,3. verdenskrig,1
Danish,afhngig,afhængig,1
Danish,aktespterre,aktespterer,1
Danish,alrdig,aldrig,1
Danish,aldriq,aldrig,1
Danish,altsa,altså,3
Danish,alts,altså,1
Danish,ansøqt,ansøgt,1
Danish,arb.markedet,arbejsmarkedet,1


In [22]:
# Persist the per-context phrase table and the merged phrase dictionary.
PHRASE_TABLE_FILE = os.path.join("..", "expt-data", "phrase-table.csv")
PHRASE_DICT_FILE = os.path.join("..", "expt-data", "phrase-dict.csv")

# to_csv does not create missing parent directories, so make sure the shared
# output directory exists before writing either file.
os.makedirs(os.path.dirname(PHRASE_TABLE_FILE), exist_ok=True)

EDITED_PHRASES.to_csv(PHRASE_TABLE_FILE)
PHRASE_DICT.to_csv(PHRASE_DICT_FILE)