<a href="https://colab.research.google.com/github/marcociav/lango/blob/master/model-building/model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lango - Model Training

## Imports

### Packages

In [None]:
# TODO: retrain model and SAVE tokenizer too!
import pandas as pd
import tensorflow as tf
import pickle

from sklearn.model_selection import train_test_split
from utils import tokenize_and_sequence, LangoModel

### Data

In [None]:
%%time
df = pd.read_csv('data/sentences.csv', sep='\t', names=["id", "lan_code", "sentence"])

CPU times: total: 11.6 s
Wall time: 11.6 s


## Data Cleaning

In [None]:
# Debug switch: when True, the notebook continues with a 1% sample of the rows.
DEBUG = False
# The sample is seeded so a debug run is reproducible; with DEBUG off the
# frame is left exactly as it is.
df = df.sample(frac=0.01, random_state=42) if DEBUG else df

In [None]:
# Only `lan_code` and `sentence` are used from here on, so the id column goes.
# Note: this cell is not re-runnable on its own -- a second run raises KeyError
# because the column is already gone.
df = df.drop('id', axis='columns')

In [None]:
# Inspect the distinct language codes to spot invalid labels before cleaning.
df['lan_code'].unique()

array(['cmn', 'deu', 'rus', 'fra', 'eng', 'jpn', 'spa', 'ita', 'kor',
       'vie', 'nld', 'epo', 'por', 'tur', 'heb', 'hun', 'ell', 'ind',
       'ara', 'arz', 'fin', 'bul', 'yue', 'swe', 'ukr', 'bel', 'que',
       'ces', 'swh', 'nno', 'wuu', 'nob', 'zsm', 'est', 'kat', 'pol',
       'lat', 'urd', 'sqi', 'isl', 'fry', 'afr', 'ron', 'fao', 'san',
       'bre', 'tat', 'yid', 'uig', 'uzb', 'srp', 'qya', 'dan', 'pes', nan,
       'slk', 'eus', 'cycl', 'acm', 'tgl', 'lvs', 'kaz', 'hye', 'hin',
       'lit', 'ben', 'cat', 'bos', 'hrv', 'tha', 'orv', 'cha', 'mon',
       'lzh', 'scn', 'gle', 'mkd', 'slv', 'frm', 'glg', 'vol', 'ain',
       'jbo', 'tok', 'ina', 'nds', 'mal', 'tlh', 'roh', 'ltz', 'oss',
       'ido', 'gla', 'mlt', 'sco', 'ast', 'jav', 'oci', 'ile', 'ota',
       '\\N', 'xal', 'tel', 'sjn', 'nov', 'khm', 'tpi', 'ang', 'aze',
       'tgk', 'tuk', 'chv', 'hsb', 'dsb', 'bod', 'sme', 'cym', 'mri',
       'ksh', 'kmr', 'ewe', 'kab', 'ber', 'tpw', 'udm', 'lld', 'pms',
       'lad', 

In [None]:
# Keep only rows with a usable language label: the code must be present and
# must not be the literal '\N' placeholder seen in the listing above.
df = df.loc[df.lan_code.notna() & (df.lan_code != '\\N')]

In [None]:
%%time
df.sentence = df.sentence.str.replace('[^\w\s]', '')



CPU times: total: 15.3 s
Wall time: 15.4 s


In [None]:
# Sentences that consisted solely of punctuation are now empty strings; drop them.
df = df.loc[df.sentence != '']

## Preprocessing

### Encoding Classes

In [None]:
# Distinct language codes in order of first appearance in the frame.
lan_list = df.lan_code.unique().tolist()
# One output class per distinct language code.
num_classes = len(lan_list)

num_classes

404

In [None]:
# Language code -> integer class id, where the id is the code's position in lan_list.
lan_to_num = dict(zip(lan_list, range(len(lan_list))))
# Inverse lookup (class id -> language code), derived from the forward mapping.
num_to_lan = {idx: code for code, idx in lan_to_num.items()}

In [None]:
# Replace each language code with its integer class id.
# Note: not re-runnable on its own -- on a second run the already-encoded ids
# are not keys of lan_to_num, so map() yields NaN and astype(int) fails.
df['lan_code'] = df['lan_code'].map(lan_to_num).astype(int)

In [None]:
# Show the frame after label encoding: `lan_code` now holds integer class ids
# next to the cleaned sentences (pandas renders only the head and tail rows).
df

Unnamed: 0,lan_code,sentence
0,0,我們試試看
1,0,我该去睡觉了
2,0,你在干什麼啊
3,0,這是什麼啊
4,0,今天是６月１８号也是Muiriel的生日
...,...,...
10342205,6,Quiero este libro por favor
10342206,6,Los han hecho huir
10342207,6,Los botaron
10342208,6,Los hicieron correr


### Train Test Split

In [None]:
# Split the frame into model inputs and targets.
# pop() removes `sentence` from df, leaving `lan_code` as its only column.
X = df.pop('sentence').to_numpy()
# Targets: an independent 1-D copy of that remaining first column.
y = df.iloc[:, 0].to_numpy(copy=True)
# The frame is no longer needed once both arrays exist; drop the reference.
del df
X

array(['我們試試看', '我该去睡觉了', '你在干什麼啊', ..., 'Los botaron',
       'Los hicieron correr', 'Los corrieron'], dtype=object)

In [None]:
# Shuffled 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, shuffle=True, random_state=42
)
# Drop the references to the unsplit arrays; only the four splits are used below.
del X
del y

In [None]:
# Peek at the training targets: integer class ids assigned via lan_to_num.
y_train

array([ 7,  1,  5, ...,  6, 82, 15])

### Sentences to Sequences

In [None]:
%%time
# Rebind X_train / X_test to the first two values returned by the project
# helper from utils; the third value, `tok`, is pickled by the save cell at
# the end of the notebook.
# NOTE(review): presumably `tok` is a tokenizer fitted on the training split
# only and the outputs are integer sequences -- confirm in
# utils.tokenize_and_sequence.
# Note: not re-runnable on its own, because the raw sentence arrays are
# overwritten by this assignment.
X_train, X_test, tok = tokenize_and_sequence(X_train, X_test)

CPU times: total: 4min 12s
Wall time: 4min 15s


## Model

In [None]:
# LangoModel is imported from the project's utils module; it is given the
# number of distinct language codes computed during class encoding.
model = LangoModel(num_classes=num_classes)

In [None]:
# Targets are integer class ids (not one-hot vectors), hence the sparse
# variant of categorical cross-entropy.
# NOTE(review): the loss keeps its default from_logits=False, which expects the
# model to output probabilities -- confirm that LangoModel ends in a softmax.
adam = tf.keras.optimizers.Adam(learning_rate=1e-3)
sparse_xent = tf.keras.losses.SparseCategoricalCrossentropy()
model.compile(optimizer=adam, loss=sparse_xent, metrics=['accuracy'])

In [None]:
%%time
model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    batch_size=256,
    epochs=2
)

Epoch 1/2
Epoch 2/2
CPU times: total: 2h 16min 31s
Wall time: 2h 18min 9s


<keras.callbacks.History at 0x22818d92bb0>

In [None]:
# Persistence switch and the version tag embedded in both artifact paths.
SAVE = True
v = 'v1'
if SAVE:
    # Save the trained model first, then pickle the object returned as `tok`
    # by tokenize_and_sequence, so both artifacts carry the same version tag.
    # NOTE(review): the class-id -> language mapping (num_to_lan) is not
    # written by this cell -- confirm it is persisted elsewhere, since it is
    # derived from the data order in this run.
    model.save(f'models/lango_model_{v}')
    with open(f'models/tokenizer_{v}.pickle', 'wb') as tokenizer_file:
        pickle.dump(tok, tokenizer_file, pickle.HIGHEST_PROTOCOL)



INFO:tensorflow:Assets written to: models/lango_model_v1\assets


INFO:tensorflow:Assets written to: models/lango_model_v1\assets
