<a href="https://colab.research.google.com/github/pkolachi/lexicalnormalization/blob/master/exptnbs/LexicalNormalization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setup and Configuration

In [1]:
!git clone https://github.com/pkolachi/lexicalnormalization
%pip install --user -U pandas
%pip install --user -U sklearn

fatal: destination path 'lexicalnormalization' already exists and is not an empty directory.
Requirement already up-to-date: pandas in /home/pkolachi/.local/lib/python3.8/site-packages (1.2.4)
Note: you may need to restart the kernel to use updated packages.
Requirement already up-to-date: sklearn in /home/pkolachi/.local/lib/python3.8/site-packages (0.0)
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Directory name of the cloned repository; data paths are built relative to it.
REPO_NAME = 'lexicalnormalization'

# Language code (also the name of the per-language data directory) -> display name.
LANGS = dict([
  ('da',   'Danish'),
  ('en',   'English'),
  ('es',   'Spanish'),
  ('fr',   'French'),
  ('hr',   'Croatian'),
  ('iden', 'Indonesian-English'),
  ('it',   'Italian'),
  ('nl',   'Dutch'),
  ('sl',   'Slovenian'),
  ('sr',   'Serbian'),
  ('tr',   'Turkish'),
  ('trde', 'Turkish-German'),
])

# Language codes that the cells below actually load and process.
SMPLS = ['en']

### Load Data

In [3]:
from collections import defaultdict 
from operator import itemgetter
import os.path 
import pandas as pd

def read_data(inpfile):
  """Yield the sentences of a blank-line-delimited, tab-separated corpus file.

  Every non-blank line is one token. It is split on the first tab only, so
  ``raw<TAB>norm`` becomes ``['raw', 'norm']``; a line without a tab becomes a
  one-element list. Whitespace-only lines separate sentences.

  Args:
    inpfile: path of the file to read.

  Yields:
    One non-empty list per sentence, holding that sentence's token field
    lists in file order.
  """
  # Explicit encoding so the result does not depend on the machine's locale;
  # the corpus files are presumably UTF-8 -- confirm against the data.
  with open(inpfile, encoding='utf-8') as ins:
    sent = []
    for lne in ins:
      if lne.strip():
        sent.append(lne.rstrip('\n').split('\t', 1))
      elif sent:
        # Only emit at a separator when tokens were collected, so runs of
        # blank lines do not produce empty sentences.
        yield sent
        sent = []
    # The file may end without a trailing blank line; do not lose the last sentence.
    if sent:
      yield sent

def _split_columns(sents):
  """Turn sentences of token field lists into parallel raw/normalized sentences.

  Args:
    sents: list of sentences as produced by read_data, each a list of
      per-token field lists (``[raw, norm]`` or just ``[raw]``).

  Returns:
    A tuple ``(raw, nrm)`` of two lists with one entry per sentence: the
    first-column tokens and the second-column tokens. A token that has no
    second column contributes ``None`` to ``nrm``.
  """
  raw = [[tok[0] for tok in sent] for sent in sents]
  nrm = [[tok[1] if len(tok) > 1 else None for tok in sent] for sent in sents]
  return raw, nrm

# DATA[lang][split] -> (input sentences, output sentences); a split that was
# never loaded reads as a pair of empty lists.
DATA = defaultdict(lambda: defaultdict(lambda: ([], [])))
for lang in SMPLS:
  datadir = os.path.join(REPO_NAME, 'data', lang)
  trnfile = os.path.join(datadir, 'train.norm')
  devfile = os.path.join(datadir, 'dev.norm')
  tstfile = os.path.join(datadir, 'test.norm')
  # isfile() is already False when the directory is missing, so no separate
  # isdir() check is needed.
  if os.path.isfile(trnfile):
    crp = list(read_data(trnfile))
    # The columns must be taken per token: indexing a sentence with 0/1 would
    # pick its first/second token instead of the raw/normalized column.
    X, Y = _split_columns(crp)  # un-normalized / normalized sentences
    DATA[lang]['fulltrn'] = (X, Y)
  if os.path.isfile(devfile):
    crp = list(read_data(devfile))
    DATA[lang]['dev'] = _split_columns(crp)
  if os.path.isfile(tstfile):
    crp = list(read_data(tstfile))
    DATA[lang]['tst'] = _split_columns(crp)

In [4]:
from sklearn.model_selection import train_test_split 

# Fraction of each language's full training set that is set aside as held-out data.
TST_RATIO = 0.15

for lang in SMPLS:
  if 'fulltrn' not in DATA[lang]:
    continue  # no training file was loaded for this language
  full_x, full_y = DATA[lang]['fulltrn']
  # shuffle=False: the split is made without reordering the sentences.
  trn_x, hld_x, trn_y, hld_y = train_test_split(
      full_x, full_y,
      test_size=TST_RATIO, random_state=0, shuffle=False)
  DATA[lang]['trn'] = (trn_x, trn_y)
  DATA[lang]['hld'] = (hld_x, hld_y)

In [5]:
# One row per loaded language: its display name followed by the number of
# sentences in each split.
columns = ['lang', 'Training', 'Held-out', 'Development', 'Testing']
size_rows = []
for lang_code in SMPLS:
  counts = [len(DATA[lang_code][part][0])
            for part in ('trn', 'hld', 'dev', 'tst')]
  size_rows.append([LANGS[lang_code]] + counts)
datasizes = pd.DataFrame.from_records(size_rows, columns=columns)
datasizes.head()

Unnamed: 0,lang,Training,Held-out,Development,Testing
0,English,2006,354,590,0


In [6]:
# Delete the cloned repository directory; the corpora read from it are already held in DATA.
%rm -rf $REPO_NAME

### Preprocessing