<a href="https://colab.research.google.com/github/pkolachi/lexicalnormalization/blob/master/exptnbs/LexicalNormalization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[WNUT21 Shared Task Website](http://noisy-text.github.io/2021/multi-lexnorm.html)

### Setup and Configuration

In [None]:
!git clone https://github.com/pkolachi/lexicalnormalization
%pip install --user -U pandas==1.1.5
%pip install --user -U scikit-learn==0.22.2.post1

fatal: destination path 'lexicalnormalization' already exists and is not an empty directory.
Requirement already up-to-date: pandas==1.1.5 in /usr/local/lib/python3.7/dist-packages (1.1.5)
Requirement already up-to-date: scikit-learn==0.22.2.post1 in /usr/local/lib/python3.7/dist-packages (0.22.2.post1)


In [None]:
# Name of the cloned repository directory; the corpora are read from
# <REPO_NAME>/data/<language code>/ by the loading cell.
REPO_NAME = 'lexicalnormalization'

# Language code -> human-readable language name.
LANGS = dict(
  da='Danish',
  en='English',
  es='Spanish',
  hr='Croatian',
  iden='Indonesian-English',
  it='Italian',
  nl='Dutch',
  sl='Slovenian',
  sr='Serbian',
  tr='Turkish',
  trde='Turkish-German',
)
# Language codes to process: a live key view over LANGS.
SMPLS = LANGS.keys()

# Passed to load_data() as the stand-in for a missing/blank normalization
# column (the alternative value considered here was the empty string '').
EMPTY_LABEL = '+-#MERGE#-+'
# Not referenced by any of the visible cells yet.
PAD_LABEL = '+-#DROP#-+'

### Load Data

In [None]:
from collections import defaultdict 
from operator import itemgetter
import os.path 
import pandas as pd

def load_data(inpfile, empty_label=''):
  """Lazily read a ``token<TAB>normalization`` corpus, one sentence at a time.

  Sentences are separated by blank lines; every other line holds a raw token,
  optionally followed by a tab and its normalized form.

  Args:
    inpfile: path of the corpus file (read as UTF-8).
    empty_label: value stored as the normalized form when the second column
      is missing. When it is non-empty it also replaces a second column that
      is present but blank.

  Yields:
    One list per sentence, each entry a ``(raw_token, normalized)`` 2-tuple.
    Runs of blank lines never produce empty sentences, and a final sentence
    that is not followed by a blank line is still emitted.
  """
  # Explicit encoding: the corpora are multilingual, so do not rely on the
  # platform default.
  with open(inpfile, encoding='utf-8') as ins:
    sent = []
    for lne in ins:
      if not lne.strip():
        # Sentence boundary; only flush when something was collected.
        if sent:
          yield sent
          sent = []
        continue
      # Split on the first tab only, so the normalization may itself contain tabs.
      fields = tuple(lne.strip('\n').split('\t', 1))
      if len(fields) < 2:
        # No tab on the line: keep the token itself (not the 1-tuple) as first field.
        fields = (fields[0], empty_label)
      elif not fields[1].strip() and empty_label:
        fields = (fields[0], empty_label)
      sent.append(fields)
    # The file may end without a trailing blank line: flush the last sentence.
    if sent:
      yield sent

def sanitize_crps(sent):
  """Return True when every entry of ``sent`` has exactly two fields."""
  for fields in sent:
    if len(fields) != 2:
      return False
  return True

def get_rawtokens(sent):
  """Return the first field of every entry: the input (un-normalized) tokens."""
  return [fields[0] for fields in sent]

def get_nrmtokens(sent):
  """Return the second field of every entry: the output (normalized) tokens."""
  return [fields[1] for fields in sent]

# DATA[lang][split] -> (X, Y): parallel lists of raw and normalized sentences.
# Looking up an unseen language or split yields (and stores) a pair of empty lists.
DATA = defaultdict(lambda: defaultdict(lambda: ([], [])))
for lang in SMPLS:
  datadir = os.path.join(REPO_NAME, 'data', lang)
  trnfile, devfile, tstfile = (os.path.join(datadir, fname)
                               for fname in ('train.norm', 'dev.norm', 'test.norm'))
  for dts, dtf in zip(('fulltrn', 'dev', 'tst'), (trnfile, devfile, tstfile)):
    # Skip splits whose file is not present for this language.
    if not (os.path.isdir(datadir) and os.path.isfile(dtf)):
      continue
    ocrp = list(load_data(dtf, empty_label=EMPTY_LABEL))
    # Keep only the sentences that pass the sanitize_crps check.
    fcrp = [sent for sent in ocrp if sanitize_crps(sent)]
    ndropped = len(ocrp) - len(fcrp)
    if ndropped:
      print("Removed {0} sentences from {1}".format(ndropped, dtf))
    X = [get_rawtokens(sent) for sent in fcrp]
    Y = [get_nrmtokens(sent) for sent in fcrp]
    DATA[lang][dts] = (X, Y)

In [None]:
TST_RATIO = 0.15
from sklearn.model_selection import train_test_split 

# Carve a held-out set (TST_RATIO of the sentences) out of each language's
# full training corpus; languages without a 'fulltrn' split are left alone.
for lang in SMPLS:
  if 'fulltrn' not in DATA[lang]:
    continue
  full_x, full_y = DATA[lang]['fulltrn']
  # shuffle=False: the corpus order is preserved by the split.
  trn_x, hld_x, trn_y, hld_y = train_test_split(full_x, full_y,
                                                test_size=TST_RATIO,
                                                random_state=0,
                                                shuffle=False)
  DATA[lang]['trn'] = (trn_x, trn_y)
  DATA[lang]['hld'] = (hld_x, hld_y)

In [None]:
# Summary table: number of sentences per language in each split.
columns = ['Language', 'Training', 'Held-out', 'Development', 'Testing']
datasizes = []
for lang in SMPLS:
  row = [LANGS[lang]]
  # A split that was never loaded resolves to the default pair of empty lists, i.e. size 0.
  for crp in ('trn', 'hld', 'dev', 'tst'):
    row.append(len(DATA[lang][crp][0]))
  datasizes.append(row)
datasizes = pd.DataFrame.from_records(datasizes, columns=columns)

datasizes

Unnamed: 0,Language,Training,Held-out,Development,Testing
0,Danish,175,32,0,0
1,English,2006,354,590,0
2,Spanish,482,86,0,0
3,Croatian,4049,715,1588,0
4,Indonesian-English,420,75,165,0
5,Italian,504,89,0,0
6,Dutch,797,141,313,0
7,Slovenian,3969,701,1557,0
8,Serbian,3517,621,1381,0
9,Turkish,486,86,0,0


In [None]:
# Delete the cloned repository directory (REPO_NAME); the corpora it contains
# were already read into DATA by the loading cell earlier in the notebook.
%rm -rf $REPO_NAME

### Sequence classification using PyTorch

##### Preprocessing
In this step, we convert the sequence of tokens into an embedding matrix. 
This step relies on tokenizers and pre-trained models from ``huggingface``.
This separation of preprocessing should allow for training classifiers other
than neural ones, using ``sklearn`` or other packages.

In [None]:
# Scaffold for token-level preprocessing: walks every (raw, normalized) token
# pair of every sentence, for every split of every language in DATA.
# The innermost body is still a placeholder (`pass`), so this cell currently
# computes nothing.
for lang in SMPLS:
  # load tokenizer model from ``huggingface``
  # TODO: not implemented yet -- no tokenizer is actually loaded here
  for dts in DATA[lang]:
    # DATA[lang][dts] is an (X, Y) pair; iterate its sentences in parallel
    for inp, out in zip(DATA[lang][dts][0], DATA[lang][dts][1]):
      # per-sentence accumulators; neither is filled nor read by the current code
      tinp = []
      tout = []
      for intok, outok in zip(inp, out):
        pass

In [None]:
import torch
# Show the installed PyTorch version (the saved output of this cell reads 1.8.1+cu101).
print(torch.__version__)

1.8.1+cu101


### Sequence classification using HMM models