<a href="https://colab.research.google.com/github/pkolachi/lexicalnormalization/blob/master/exptnbs/LexicalNormalization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[WNUT21 Shared Task Website](http://noisy-text.github.io/2021/multi-lexnorm.html)

### Setup and Configuration

In [1]:
!git clone https://github.com/pkolachi/lexicalnormalization
%pip install --user -U pandas==1.1.5
%pip install --user -U scikit-learn==0.22.2.post1
%pip install --user -U sklearn-crfsuite

fatal: destination path 'lexicalnormalization' already exists and is not an empty directory.
Requirement already up-to-date: pandas==1.1.5 in /usr/local/lib/python3.7/dist-packages (1.1.5)
Requirement already up-to-date: scikit-learn==0.22.2.post1 in /usr/local/lib/python3.7/dist-packages (0.22.2.post1)
Requirement already up-to-date: sklearn-crfsuite in /root/.local/lib/python3.7/site-packages (0.3.6)


In [2]:
REPO_NAME = 'lexicalnormalization'
LANGS = {'da': 'Danish',
         'en': 'English',
         'es': 'Spanish',
         'hr': 'Croatian',
         'iden': 'Indonesian-English',
         'it': 'Italian',
         'nl': 'Dutch',
         'sl': 'Slovenian',
         'sr': 'Serbian',
         'tr': 'Turkish',
         'trde': 'Turkish-German',
         }
SMPLS = LANGS.keys()
EMPTY_LABEL = '+-#MERGE#-+' #''
PAD_LABEL   = '+-#DROP#-+'

### Load Data

In [3]:
from collections import defaultdict 
import itertools as it 
from operator import itemgetter
import os.path 
import pandas as pd

def load_data_v0(inpfile, empty_label=''): 
  with open(inpfile) as ins:
    sent = []
    for lne in ins:
      if not lne.strip() and len(sent):
        yield sent
        sent = []
      else:
        fields = tuple(lne.strip('\n').split('\t', 1))
        if len(fields) > 1 and not fields[1].strip() and empty_label:
          fields = (fields[0], empty_label)
        sent.append(fields)
    # this shouldn't be necessary if the files are correctly formatted
    # but if EOF is encountered without a blank line at the end
    if len(sent):
      yield sent
      sent = []

def load_data(inpfile, empty_label=''):
  with open(inpfile) as inf:
    # break lines into sentence blocks
    snb = (list(it.takewhile(lambda lne: lne.strip(), inf)) for _ in it.count(1))
    # deal with errors in file format especially turkish
    snc = it.dropwhile(lambda snt: len(snt) == 0 or
                       (len(snt) == 1 and not snt.strip()), 
                      snb)
    # terminate this infinite stream 
    snd = it.takewhile(lambda snt: len(snt)  > 0, snc)
    # deal with errors in file format where sentences are empty
    #snf = filter(lambda snt: not (len(snt) == 1 and snt[0].strip() == ''), snd)
    # split into fields
    crs = ([t.strip('\n').split('\t', 1) for t in s] for s in snd)
    if empty_label:
      crp = [[(tok[0], tok[1] if len(tok) > 1 and tok[1].strip() else empty_label)
              for tok in sent]
             for sent in crs]
    else:
      crp = list(crs)
    return crp

# remove sentences that do not follow the expected format
sanitize_crps = lambda sent: all(len(fields) == 2 for fields in sent)
# get input from tuple (raw sentences)
get_rawtokens = lambda sent: list(map(itemgetter(0), sent)) 
# get output/labels from tuple (normalized sentences)
get_nrmtokens = lambda sent: list(map(itemgetter(1), sent))

DATA = defaultdict(lambda: defaultdict(lambda: ([], [])))
for lang in SMPLS:
  datadir = os.path.join(REPO_NAME, 'data', lang)
  trnfile = os.path.join(datadir, 'train.norm')
  devfile = os.path.join(datadir, 'dev.norm')
  tstfile = os.path.join(datadir, 'test.norm')
  for dts, dtf in [('fulltrn', trnfile), ('dev', devfile), ('tst', tstfile)]:
    if os.path.isdir(datadir) and os.path.isfile(dtf):
      ocrp0 = list(load_data_v0(dtf, empty_label=EMPTY_LABEL))
      ocrp = list(load_data(dtf, empty_label=EMPTY_LABEL))
      if any(s1 != s2 for s1, s2 in zip(ocrp0, ocrp)):
        print('Corpus reader functions do not match')
      # sanitize corpus to make sure
      fcrp = list(filter(sanitize_crps, ocrp))
      if len(ocrp) != len(fcrp): print("Removed {0} sentences from {1}".format(len(ocrp)-len(fcrp), dtf))
      X = list(map(get_rawtokens, fcrp))  
      Y = list(map(get_nrmtokens, fcrp)) 
      print(dtf, len(ocrp), len(fcrp), len(ocrp0))
      DATA[lang][dts] = (X, Y)

lexicalnormalization/data/da/train.norm 207 207 207
lexicalnormalization/data/en/train.norm 2360 2360 2360
lexicalnormalization/data/en/dev.norm 590 590 590
lexicalnormalization/data/es/train.norm 568 568 568
lexicalnormalization/data/hr/train.norm 449 449 4762
lexicalnormalization/data/hr/dev.norm 1588 1588 1588
lexicalnormalization/data/iden/train.norm 495 495 495
lexicalnormalization/data/iden/dev.norm 165 165 165
lexicalnormalization/data/it/train.norm 593 593 593
Corpus reader functions do not match
lexicalnormalization/data/nl/train.norm 939 939 939
lexicalnormalization/data/nl/dev.norm 314 314 314
lexicalnormalization/data/sl/train.norm 4670 4670 4670
lexicalnormalization/data/sl/dev.norm 1557 1557 1557
lexicalnormalization/data/sr/train.norm 4138 4138 4138
lexicalnormalization/data/sr/dev.norm 1327 1327 1380
Corpus reader functions do not match
lexicalnormalization/data/tr/train.norm 570 570 571
lexicalnormalization/data/trde/train.norm 800 800 800


In [4]:
fls = ('lexicalnormalization/data/nl/train.norm', \
       'lexicalnormalization/data/tr/train.norm')
for f in fls:
  ocrp0 = list(load_data_v0(f, empty_label=EMPTY_LABEL))
  ocrp1 = list(load_data(f, empty_label=EMPTY_LABEL))
  diff1 = [(i,s) for i,s in enumerate(ocrp1) if s not in ocrp0]
  diff2 = [(i,s) for i,s in enumerate(ocrp0) if s not in ocrp1]
  for i,s in diff1:
    print(i, s)
  for i,s in diff2:
    print(i, s)

796 [('@DeJakke', '@DeJakke'), ('.', '.'), ('Thks', 'Thks'), ('.', '.'), ('Opmerking', 'Opmerking'), ('terecht', 'terecht'), ('.', '.'), ('kgebruikte', 'Ik gebruikte'), ('minder-validen', 'minder-validen'), ('omdat', 'omdat'), ('organisatoren', 'organisatoren'), ('zich', 'zich'), ('Somival', 'Somival'), ('noemen', 'noemen'), ('.', '.'), ('Sport', 'Sport'), ('en', 'en'), ('ontspanning', 'ontspanning'), ('minder-validen', 'minder-validen'), ('.', '+-#MERGE#-+')]
796 [('@DeJakke', '@DeJakke'), ('.', '.'), ('Thks', 'Thks'), ('.', '.'), ('Opmerking', 'Opmerking'), ('terecht', 'terecht'), ('.', '.'), ('kgebruikte', 'Ik gebruikte'), ('minder-validen', 'minder-validen'), ('omdat', 'omdat'), ('organisatoren', 'organisatoren'), ('zich', 'zich'), ('Somival', 'Somival'), ('noemen', 'noemen'), ('.', '.'), ('Sport', 'Sport'), ('en', 'en'), ('ontspanning', 'ontspanning'), ('minder-validen', 'minder-validen'), ('.',)]
0 [('',)]


In [5]:
#%rm -rf $REPO_NAME

### Data statistics

In [6]:
TST_RATIO = 0.15
from sklearn.model_selection import train_test_split 

for lang in SMPLS:
  if 'fulltrn' in DATA[lang]:
    trn_x, hld_x, trn_y, hld_y = train_test_split(DATA[lang]['fulltrn'][0], 
                                                  DATA[lang]['fulltrn'][1], 
                                                  test_size=TST_RATIO, 
                                                  random_state=0, 
                                                  shuffle=False)
    DATA[lang]['trn'] = (trn_x, trn_y)
    DATA[lang]['hld'] = (hld_x, hld_y)

In [7]:
columns = ['Language', 'Training', 'Held-out', 'Devel', 'Test']
datasizes = [[LANGS[lang]]+[len(DATA[lang][crp][0]) 
                     for crp in ('trn', 'hld', 'dev', 'tst')] 
             for lang in SMPLS]
datasizes = pd.DataFrame.from_records(datasizes, columns=columns)

datasizes

Unnamed: 0,Language,Training,Held-out,Devel,Test
0,Danish,175,32,0,0
1,English,2006,354,590,0
2,Spanish,482,86,0,0
3,Croatian,381,68,1588,0
4,Indonesian-English,420,75,165,0
5,Italian,504,89,0,0
6,Dutch,798,141,314,0
7,Slovenian,3969,701,1557,0
8,Serbian,3517,621,1327,0
9,Turkish,484,86,0,0


In [8]:
datasizes['AllToks#'] = [len(set(tok for dts in ('trn', 'hld', 'dev', 'tst')
                                 for typ in (0, 1)
                                 for sent in DATA[lang][dts][typ]
                                 for tok in sent)) for lang in SMPLS]

datasizes['Vocab#']  = [len(set(tok for dts in ('trn', 'hld', 'dev', 'tst')
                                for sent in DATA[lang][dts][0] for tok in sent))
                        for lang in SMPLS]
datasizes['Labels#'] = [len(set(tok for dts in ('trn', 'hld', 'dev', 'tst')
                                for sent in DATA[lang][dts][1] for tok in sent))
                        for lang in SMPLS]

datasizes['Trn. Vocab#'] = [len(set(tok for sent in DATA[lang]['trn'][0]
                                    for tok in sent)) for lang in SMPLS]
datasizes['Trn. Label#'] = [len(set(tok for sent in DATA[lang]['trn'][1]
                                    for tok in sent)) for lang in SMPLS]

datasizes['Ood. Vocab%'] = [len(set(tok for dts in ('hld', 'dev', 'tst')
                                    for sent in DATA[lang][dts][0]
                                    for tok in sent).difference(
                                        set(tok for sent in DATA[lang]['trn'][0]
                                            for tok in sent)
                                        )) for lang in SMPLS]
datasizes['Ood. Label%'] = [len(set(tok for dts in ('hld', 'dev', 'tst')
                                    for sent in DATA[lang][dts][1]
                                    for tok in sent).difference(
                                        set(tok for sent in DATA[lang]['trn'][0]
                                            for tok in sent)
                                        )) for lang in SMPLS]
datasizes['Ood. Vocab%'] = 100 * datasizes['Ood. Vocab%'] / datasizes['Vocab#']
datasizes['Ood. Label%'] = 100 * datasizes['Ood. Label%'] / datasizes['Labels#']

datasizes 

Unnamed: 0,Language,Training,Held-out,Devel,Test,AllToks#,Vocab#,Labels#,Trn. Vocab#,Trn. Label#,Ood. Vocab%,Ood. Label%
0,Danish,175,32,0,0,3421,3333,3280,2935,2886,11.941194,12.195122
1,English,2006,354,590,0,13437,13127,12521,9589,9130,26.952083,27.561696
2,Spanish,482,86,0,0,3285,3064,2833,2695,2504,12.043081,12.072008
3,Croatian,381,68,1588,0,9609,8886,8434,2110,2085,76.254783,75.492056
4,Indonesian-English,420,75,165,0,6185,5754,5284,4256,3887,26.034063,27.346707
5,Italian,504,89,0,0,4941,4619,4383,4046,3836,12.405283,12.616929
6,Dutch,798,141,314,0,7885,6651,5394,3591,2902,46.008119,49.536522
7,Slovenian,3969,701,1557,0,17673,15963,14501,11229,10276,29.65608,30.315151
8,Serbian,3517,621,1327,0,20385,18658,17443,13705,12912,26.546254,27.019435
9,Turkish,484,86,0,0,5837,4326,3992,3760,3501,13.08368,13.677355


### Sequence classification using PyTorch

##### Preprocessing

In this step, we convert the sequence of tokens into an embedding matrix. 
This step relies on tokenizers and pre-trained models from ``huggingface``.
This seperation of preprocessing should allow for training other classifiers 
than neural versions using ``scikit-learn`` or other packages.

In [9]:
for lang in SMPLS:
  # load tokenizer model from ``huggingface``
  for dts in DATA[lang]:
    for inp, out in zip(DATA[lang][dts][0], DATA[lang][dts][1]):
      tinp = []
      tout = []
      for intok, outok in zip(inp, out):
        pass

### Sequence classification using HMM models