<a href="https://colab.research.google.com/github/pkolachi/lexicalnormalization/blob/master/exptnbs/LexicalNormalization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setup and Configuration

In [1]:
# Fetch the project repository; the data files are later read from this checkout.
!git clone https://github.com/pkolachi/lexicalnormalization
# Install pinned versions of pandas and scikit-learn for the current user.
%pip install --user -U pandas==1.1.5
%pip install --user -U scikit-learn==0.22.2.post1

Cloning into 'lexicalnormalization'...
remote: Enumerating objects: 59, done.[K
remote: Counting objects:   1% (1/59)[Kremote: Counting objects:   3% (2/59)[Kremote: Counting objects:   5% (3/59)[Kremote: Counting objects:   6% (4/59)[Kremote: Counting objects:   8% (5/59)[Kremote: Counting objects:  10% (6/59)[Kremote: Counting objects:  11% (7/59)[Kremote: Counting objects:  13% (8/59)[Kremote: Counting objects:  15% (9/59)[Kremote: Counting objects:  16% (10/59)[Kremote: Counting objects:  18% (11/59)[Kremote: Counting objects:  20% (12/59)[Kremote: Counting objects:  22% (13/59)[Kremote: Counting objects:  23% (14/59)[Kremote: Counting objects:  25% (15/59)[Kremote: Counting objects:  27% (16/59)[Kremote: Counting objects:  28% (17/59)[Kremote: Counting objects:  30% (18/59)[Kremote: Counting objects:  32% (19/59)[Kremote: Counting objects:  33% (20/59)[Kremote: Counting objects:  35% (21/59)[Kremote: Counting objects:  37% (22/59)[Kremo

In [2]:
# Directory name of the cloned repository; data paths are built relative to it.
REPO_NAME = 'lexicalnormalization'
# Short code of each language (or language pair) -> human-readable name.
# The codes double as the per-language sub-directory names under <repo>/data.
LANGS = dict(
    da='Danish',
    en='English',
    es='Spanish',
    hr='Croatian',
    iden='Indonesian-English',
    it='Italian',
    nl='Dutch',
    sl='Slovenian',
    sr='Serbian',
    tr='Turkish',
    trde='Turkish-German',
)
# The language codes to process (a live view over the keys of LANGS).
SMPLS = LANGS.keys()

### Load Data

In [3]:
from collections import defaultdict 
from operator import itemgetter
import os.path 
import pandas as pd

def read_data(inpfile):
  """Lazily read a token-per-line file and yield one sentence at a time.

  Each non-blank line is split on the first tab into a tuple of fields
  (two fields when a tab is present, a single field otherwise). Blank
  (whitespace-only) lines separate sentences.

  Args:
    inpfile: path of the file to read.

  Yields:
    list of tuples, one tuple per token line of the sentence. Empty
    sentences (e.g. produced by consecutive blank lines) are skipped, and
    the final sentence is yielded even when the file does not end with a
    blank line.
  """
  sent = []
  # Explicit encoding so non-ASCII tokens decode the same on every platform.
  # NOTE(review): assumes the data files are UTF-8 -- confirm.
  with open(inpfile, encoding='utf-8') as ins:
    for lne in ins:
      if lne.strip():
        # only the newline is stripped, so other whitespace inside fields survives
        sent.append(tuple(lne.strip('\n').split('\t', 1)))
      elif sent:
        # blank line closes the current sentence; repeated blanks add nothing
        yield sent
        sent = []
  # flush a trailing sentence that is not followed by a blank line
  if sent:
    yield sent

def sanitize_crp(sent):
  """Return True when every token entry of `sent` has exactly two fields."""
  for fields in sent:
    if len(fields) != 2:
      return False
  return True

def get_rawtokens(sent):
  """Return the first field of every token entry: the input (un-normalized) tokens."""
  return [fields[0] for fields in sent]

def get_nrmtokens(sent):
  """Return the second field of every token entry: the output (normalized) tokens."""
  return [fields[1] for fields in sent]

# DATA[lang][split] -> (raw sentences, normalized sentences).
# Looking up a language or split that was never loaded creates and returns
# a pair of empty lists instead of raising KeyError.
DATA = defaultdict(lambda: defaultdict(lambda: ([], [])))
for lang in SMPLS:
  datadir = os.path.join(REPO_NAME, 'data', lang)
  trnfile, devfile, tstfile = (os.path.join(datadir, fname)
                               for fname in ('train.norm', 'dev.norm', 'test.norm'))
  for dts, dtf in zip(('fulltrn', 'dev', 'tst'), (trnfile, devfile, tstfile)):
    # languages/splits without a file in the checkout are simply skipped
    if not (os.path.isdir(datadir) and os.path.isfile(dtf)):
      continue
    ocrp = list(read_data(dtf))
    # keep only the sentences accepted by sanitize_crp and report the rest
    fcrp = [sent for sent in ocrp if sanitize_crp(sent)]
    dropped = len(ocrp) - len(fcrp)
    if dropped:
      print("Removed {0} sentences from {1}".format(dropped, dtf))
    X = [get_rawtokens(sent) for sent in fcrp]
    Y = [get_nrmtokens(sent) for sent in fcrp]
    DATA[lang][dts] = (X, Y)

Removed 1 sentences from lexicalnormalization/data/nl/train.norm


In [4]:
# Fraction of each language's full training set that goes to the held-out split.
TST_RATIO = 0.15
from sklearn.model_selection import train_test_split

for lang in SMPLS:
  # the membership test does not create a 'fulltrn' entry, so languages
  # without a loaded training file are left untouched
  if 'fulltrn' not in DATA[lang]:
    continue
  full_x, full_y = DATA[lang]['fulltrn']
  # shuffle=False: the split is made in corpus order, without shuffling
  trn_x, hld_x, trn_y, hld_y = train_test_split(full_x,
                                                full_y,
                                                test_size=TST_RATIO,
                                                random_state=0,
                                                shuffle=False)
  DATA[lang]['trn'] = (trn_x, trn_y)
  DATA[lang]['hld'] = (hld_x, hld_y)

In [5]:
columns = ['Language', 'Training', 'Held-out', 'Development', 'Testing']
# One row per language: its display name followed by the number of sentences
# in each split (splits that were never loaded count as 0).
size_rows = []
for lang in SMPLS:
  counts = [len(DATA[lang][crp][0]) for crp in ('trn', 'hld', 'dev', 'tst')]
  size_rows.append([LANGS[lang]] + counts)
datasizes = pd.DataFrame.from_records(size_rows, columns=columns)

datasizes

Unnamed: 0,lang,Training,Held-out,Development,Testing
0,Danish,175,32,0,0
1,English,2006,354,590,0
2,Spanish,482,86,0,0
3,French,0,0,0,0
4,Croatian,4049,715,1588,0
5,Indonesian-English,420,75,165,0
6,Italian,504,89,0,0
7,Dutch,796,141,313,0
8,Slovenian,3969,701,1557,0
9,Serbian,3517,621,1381,0


In [6]:
# Delete the cloned repository directory; the corpora were already read into DATA.
%rm -rf $REPO_NAME

### Preprocessing

### Sequence classification using HMM models