<a href="https://colab.research.google.com/github/pkolachi/lexicalnormalization/blob/master/exptnbs/LexicalNormalization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[WNUT21 Shared Task Website](http://noisy-text.github.io/2021/multi-lexnorm.html)

### Setup and Configuration

In [1]:
!git clone https://github.com/pkolachi/lexicalnormalization
%pip install --user -U pandas==1.1.5
%pip install --user -U scikit-learn==0.22.2.post1

Cloning into 'lexicalnormalization'...
remote: Enumerating objects: 80, done.[K
remote: Counting objects: 100% (80/80), done.[K
remote: Compressing objects: 100% (70/70), done.[K
remote: Total 80 (delta 10), reused 54 (delta 3), pack-reused 0[K
Unpacking objects: 100% (80/80), done.
Requirement already up-to-date: pandas==1.1.5 in /usr/local/lib/python3.7/dist-packages (1.1.5)
Requirement already up-to-date: scikit-learn==0.22.2.post1 in /usr/local/lib/python3.7/dist-packages (0.22.2.post1)


In [2]:
REPO_NAME = 'lexicalnormalization'
LANGS = {'da': 'Danish',
         'en': 'English',
         'es': 'Spanish',
         'hr': 'Croatian',
         'iden': 'Indonesian-English',
         'it': 'Italian',
         'nl': 'Dutch',
         'sl': 'Slovenian',
         'sr': 'Serbian',
         'tr': 'Turkish',
         'trde': 'Turkish-German',
         }
SMPLS = LANGS.keys()
EMPTY_LABEL = '+-#MERGE#-+' #''
PAD_LABEL   = '+-#DROP#-+'

### Load Data

In [3]:
from collections import defaultdict 
from operator import itemgetter
import os.path 
import pandas as pd

def load_data(inpfile, empty_label=''): 
  with open(inpfile) as ins:
    sent = []
    for lne in ins:
      if not lne.strip():
        yield sent
        sent = []
      else:
        fields = tuple(lne.strip('\n').split('\t', 1))
        if len(fields) > 1 and not fields[1].strip() and empty_label:
          fields = (fields[0], empty_label)
        sent.append(fields)
    # this shouldn't be necessary if the files are correctly formatted
    # but if EOF is encountered without a blank line at the end
    if len(sent):
      yield sent
      sent = []

# remove sentences that do not follow the expected format
sanitize_crps = lambda sent: all(len(fields) == 2 for fields in sent)
# get input from tuple (raw sentences)
get_rawtokens = lambda sent: list(map(itemgetter(0), sent)) 
# get output/labels from tuple (normalized sentences)
get_nrmtokens = lambda sent: list(map(itemgetter(1), sent))

DATA = defaultdict(lambda: defaultdict(lambda: ([], [])))
for lang in SMPLS:
  datadir = os.path.join(REPO_NAME, 'data', lang)
  trnfile = os.path.join(datadir, 'train.norm')
  devfile = os.path.join(datadir, 'dev.norm')
  tstfile = os.path.join(datadir, 'test.norm')
  for dts, dtf in [('fulltrn', trnfile), ('dev', devfile), ('tst', tstfile)]:
    if os.path.isdir(datadir) and os.path.isfile(dtf):
      ocrp = list(load_data(dtf, empty_label=EMPTY_LABEL))
      # sanitize corpus to make sure
      fcrp = list(filter(sanitize_crps, ocrp))
      if len(ocrp) != len(fcrp): print("Removed {0} sentences from {1}".format(len(ocrp)-len(fcrp), dtf))
      X = list(map(get_rawtokens, fcrp))  
      Y = list(map(get_nrmtokens, fcrp)) 
      DATA[lang][dts] = (X, Y)

Removed 1 sentences from lexicalnormalization/data/nl/train.norm


In [4]:
%rm -rf $REPO_NAME

### Data statistics

In [5]:
TST_RATIO = 0.15
from sklearn.model_selection import train_test_split 

for lang in SMPLS:
  if 'fulltrn' in DATA[lang]:
    trn_x, hld_x, trn_y, hld_y = train_test_split(DATA[lang]['fulltrn'][0], 
                                                  DATA[lang]['fulltrn'][1], 
                                                  test_size=TST_RATIO, 
                                                  random_state=0, 
                                                  shuffle=False)
    DATA[lang]['trn'] = (trn_x, trn_y)
    DATA[lang]['hld'] = (hld_x, hld_y)

In [6]:
columns = ['Language', 'Training', 'Held-out', 'Devel', 'Test']
datasizes = [[LANGS[lang]]+[len(DATA[lang][crp][0]) 
                     for crp in ('trn', 'hld', 'dev', 'tst')] 
             for lang in SMPLS]
datasizes = pd.DataFrame.from_records(datasizes, columns=columns)

datasizes

Unnamed: 0,Language,Training,Held-out,Devel,Test
0,Danish,175,32,0,0
1,English,2006,354,590,0
2,Spanish,482,86,0,0
3,Croatian,4049,715,1588,0
4,Indonesian-English,420,75,165,0
5,Italian,504,89,0,0
6,Dutch,797,141,314,0
7,Slovenian,3969,701,1557,0
8,Serbian,3517,621,1381,0
9,Turkish,486,86,0,0


In [7]:
datasizes['AllToks#'] = [len(set(tok for dts in ('trn', 'hld', 'dev', 'tst')
                                 for typ in (0, 1)
                                 for sent in DATA[lang][dts][typ]
                                 for tok in sent)) for lang in SMPLS]

datasizes['Vocab#']  = [len(set(tok for dts in ('trn', 'hld', 'dev', 'tst')
                                for sent in DATA[lang][dts][0] for tok in sent))
                        for lang in SMPLS]
datasizes['Labels#'] = [len(set(tok for dts in ('trn', 'hld', 'dev', 'tst')
                                for sent in DATA[lang][dts][1] for tok in sent))
                        for lang in SMPLS]

datasizes['Trn. Vocab#'] = [len(set(tok for sent in DATA[lang]['trn'][0]
                                    for tok in sent)) for lang in SMPLS]
datasizes['Trn. Label#'] = [len(set(tok for sent in DATA[lang]['trn'][1]
                                    for tok in sent)) for lang in SMPLS]

datasizes['Ood. Vocab%'] = [len(set(tok for dts in ('hld', 'dev', 'tst')
                                    for sent in DATA[lang][dts][0]
                                    for tok in sent).difference(
                                        set(tok for sent in DATA[lang]['trn'][0]
                                            for tok in sent)
                                        )) for lang in SMPLS]
datasizes['Ood. Label%'] = [len(set(tok for dts in ('hld', 'dev', 'tst')
                                    for sent in DATA[lang][dts][1]
                                    for tok in sent).difference(
                                        set(tok for sent in DATA[lang]['trn'][0]
                                            for tok in sent)
                                        )) for lang in SMPLS]
datasizes['Ood. Vocab%'] = 100 * datasizes['Ood. Vocab%'] / datasizes['Vocab#']
datasizes['Ood. Label%'] = 100 * datasizes['Ood. Label%'] / datasizes['Labels#']

datasizes 

Unnamed: 0,Language,Training,Held-out,Devel,Test,AllToks#,Vocab#,Labels#,Trn. Vocab#,Trn. Label#,Ood. Vocab%,Ood. Label%
0,Danish,175,32,0,0,3421,3333,3280,2935,2886,11.941194,12.195122
1,English,2006,354,590,0,13437,13127,12521,9589,9130,26.952083,27.561696
2,Spanish,482,86,0,0,3285,3064,2833,2695,2504,12.043081,12.072008
3,Croatian,4049,715,1588,0,23265,21769,20371,15146,14365,30.423997,30.091797
4,Indonesian-English,420,75,165,0,6185,5754,5284,4256,3887,26.034063,27.346707
5,Italian,504,89,0,0,4941,4619,4383,4046,3836,12.405283,12.616929
6,Dutch,797,141,314,0,7875,6642,5385,3581,2892,46.085516,49.637883
7,Slovenian,3969,701,1557,0,17673,15963,14501,11229,10276,29.65608,30.315151
8,Serbian,3517,621,1381,0,20521,18777,17551,13705,12912,27.01177,27.491311
9,Turkish,486,86,0,0,5837,4326,3992,3760,3501,13.08368,13.677355


### Sequence classification using PyTorch

##### Preprocessing
In this step, we convert the sequence of tokens into an embedding matrix. 
This step relies on tokenizers and pre-trained models from ``huggingface``.
This seperation of preprocessing should allow for training other classifiers 
than neural versions using ``sklearn`` or other packages.

In [8]:
for lang in SMPLS:
  # load tokenizer model from ``huggingface``
  for dts in DATA[lang]:
    for inp, out in zip(DATA[lang][dts][0], DATA[lang][dts][1]):
      tinp = []
      tout = []
      for intok, outok in zip(inp, out):
        pass

In [9]:
import torch
print(torch.__version__)

1.8.1+cu101


### Sequence classification using HMM models