# <center>Masakhane - Machine Translation for African Languages (Using JoeyNMT)</center>
## <leftalign> Author : Ari Ramkilowan</leftalign>
## <leftalign> Language Pair : English - isiXhosa</leftalign>
## <leftalign> Corpus : JW300 </leftalign>

<hr>

## Install JoeyNMT

In [3]:
! git clone https://github.com/joeynmt/joeynmt.git
! cd joeynmt; pip3 install .

Cloning into 'joeynmt'...
remote: Enumerating objects: 2184, done.[K
Receiving objects:   0% (1/2184)   Receiving objects:   1% (22/2184)   Receiving objects:   2% (44/2184)   Receiving objects:   3% (66/2184)   Receiving objects:   4% (88/2184)   Receiving objects:   5% (110/2184)   Receiving objects:   6% (132/2184)   Receiving objects:   7% (153/2184)   Receiving objects:   8% (175/2184)   Receiving objects:   9% (197/2184)   Receiving objects:  10% (219/2184)   Receiving objects:  11% (241/2184)   Receiving objects:  12% (263/2184)   Receiving objects:  13% (284/2184)   Receiving objects:  14% (306/2184)   Receiving objects:  15% (328/2184)   Receiving objects:  16% (350/2184)   Receiving objects:  17% (372/2184)   Receiving objects:  18% (394/2184)   Receiving objects:  19% (415/2184)   Receiving objects:  20% (437/2184)   Receiving objects:  21% (459/2184)   Receiving objects:  22% (481/2184)   Receiving objects:  23% (503/2184)   Receiving objects:  24% 

## Mount Google Drive

In [2]:
# If running on Google Colab - mount google drive

from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [1]:
import torch

device_num = torch.cuda.current_device()
torch.cuda.get_device_name(device_num)
# torch.cuda.is_available()

'Tesla K80'

## Set your source and target languages

In [0]:
import os
import numpy as np
import pandas as pd

source_language = "en"
target_language = "xh" 
lc = True  # If True, lowercase the data.
seed = 42  # Random seed for shuffling.
tag = "baseline" # Give a unique name to your folder - this is to ensure you don't rewrite any models you've already submitted
vocab_size=4000
corpus = "JW300"

os.environ["src"] = source_language # Sets them in bash as well, since we often use bash scripts
os.environ["tgt"] = target_language
os.environ["tag"] = tag
os.environ["vocab_size"] = str(vocab_size)
os.environ["corpus"] = corpus

In [5]:
# This will save it to a folder in our gdrive instead!
# !mkdir -p "/content/drive/My Drive/masakhane/$src-$trg-$tag"
gdrive_path = f"/content/drive/My Drive/masakhane/{source_language}-{target_language}-{tag}/"
os.environ["gdrive_path"] = gdrive_path
! echo $gdrive_path

/content/drive/My Drive/masakhane/en-xh-baseline/


In [7]:
# create path to joeynmt executables scripts, configs etc

joey_path = f"/content/joeynmt"
os.environ["joey_path"] = joey_path
! ls $joey_path/configs

iwslt14_deen_bpe.yaml		   transformer_reverse.yaml
iwslt_deen_bahdanau.yaml	   transformer_small.yaml
iwslt_envi_luong.yaml		   transformer_wmt17_ende.yaml
iwslt_envi_xnmt.yaml		   transformer_wmt17_lven.yaml
reverse.yaml			   wmt_ende_best.yaml
small.yaml			   wmt_ende_default.yaml
transformer_copy.yaml		   wmt_lven_best.yaml
transformer_iwslt14_deen_bpe.yaml  wmt_lven_default.yaml


## Download the global test set.

***(This changes from time to time, do this just to make sure you have the most recent version)***

In [0]:
! wget https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-any.en
  
! wget https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-$tgt.en
! mv test.en-$tgt.en test.en

! wget https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-$tgt.$tgt 
! mv test.en-$tgt.$tgt test.$tgt

In [10]:
# Read the test data to filter from train and dev splits.
# Store english portion in set for quick filtering checks.
en_test_sents = set()
filter_test_sents = "test.en-any.en"
j = 0
blanks = [] # sometimes blank lines creep innto test set - store which lines these are
with open(filter_test_sents) as f:
  for line in f:
    en_test_sents.add(line.strip())
    if len(line)<=1:
      blanks.append(j)
    j += 1
print('Loaded {} global test sentences to filter from the training/dev data.'.format(j))
print(f'There are {len(blanks)} blank lines in the test set')

Loaded 3571 global test sentences to filter from the training/dev data.
There are 0 blank lines in the test set


In [0]:
# filter test set

source_file = f"test.{source_language}"
target_file = f"test.{target_language}"

source = []
target = []

with open(source_file) as f:
  source = f.readlines()
            
with open(target_file) as f:
  target = f.readlines()

df = pd.DataFrame(zip(source, target), columns=['source_sentence', 'target_sentence'])

# remove trailing newline chars
df['source_sentence'] = df['source_sentence'].str.rstrip('" \n')
df['target_sentence'] = df['target_sentence'].str.rstrip('" \n')

# remove leading newline chars
df['source_sentence'] = df['source_sentence'].str.lstrip('"')
df['target_sentence'] = df['target_sentence'].str.lstrip('"')

# remove rows with really short sentences
df = df[~(df['source_sentence'].str.len() <8)] # remove rows wher esource text len <8 characters
df = df[~(df['target_sentence'].str.len() <8)] # remove rows wher esource text len <8 characters

# save the filtered test set
df['source_sentence'].to_csv(f'{source_file}', index=False, header=False, doublequote=False)
df['target_sentence'].to_csv(f'{target_file}', index=False, header=False, doublequote=False)

In [0]:
# copy test sets to gdrive
! cp test.$src "$gdrive_path"
! cp test.$tgt "$gdrive_path"
! cp test.$src-any.$src "$gdrive_path"

## Import prepared dataset

In [0]:
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [0]:
# This csv has extra columns added but no preprocessing done. all preprocessing should be captured in the NMT modelling notebook

input_file = f"{gdrive_path}/{source_language}-{target_language}-{corpus}-new.csv"
df = pd.read_csv(input_file)

In [16]:
df.head()

Unnamed: 0,source_sentence,target_sentence
0,How One Marriage Was Saved,Indlela Owasindiswa Ngayo Lo Mtshato
1,“ The application of the counsel in the book M...,Omnye umfundi onoxabiso nowaseMzantsi Afrika w...
2,"“ Chapter 5 , ‘ A Wife Who Is Dearly Loved , ’...","“ Isahluko 5 , esithi ‘ Umfazi Othandwa Kunene..."
3,I never imagined in my wildest dreams that I c...,Andizange ndicinge nasephupheni ukuba ndandise...
4,"Thank you very , very much .",Enkosi kakhulu .


In [17]:
# How many samples
size = len(df)
print(f"\n {size} samples in original text")
  


 876189 samples in original text


## Preprocess input data

In [18]:
## Preprocessing - Step 1 : Drop NaNs

df_pp = df.dropna()
df_pp.info(memory_usage='deep')
new_size = len(df_pp)
print(f"\n {size-new_size}({100*(size-new_size)/size :.2f} %) samples removed by dropping all NaNs")
size = new_size

<class 'pandas.core.frame.DataFrame'>
Int64Index: 867276 entries, 0 to 876188
Data columns (total 2 columns):
source_sentence    867276 non-null object
target_sentence    867276 non-null object
dtypes: object(2)
memory usage: 332.4 MB

 8913(1.02 %) samples removed by dropping all NaNs


In [19]:
## Preprocessing - Step 2a : Drop all duplicates in Source (en) text

df_pp = df_pp.drop_duplicates(subset='source_sentence')
df_pp.info(memory_usage='deep')
new_size = len(df_pp)
print(f"\n {size-new_size}({100*(size-new_size)/size :.2f} %) samples removed by dropping Source sentence duplicates")
size = new_size

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800546 entries, 0 to 876188
Data columns (total 2 columns):
source_sentence    800546 non-null object
target_sentence    800546 non-null object
dtypes: object(2)
memory usage: 356.2 MB

 66730(7.69 %) samples removed by dropping Source sentence duplicates


In [20]:
## Preprocessing - Step 2b : Drop all duplicates in Target (zu) text

df_pp = df_pp.drop_duplicates(subset='target_sentence')
df_pp.info(memory_usage='deep')
new_size = len(df_pp)
print(f"\n {size-new_size}({100*(size-new_size)/size :.2f} %) samples removed by dropping Target sentence duplicates")
size = new_size

<class 'pandas.core.frame.DataFrame'>
Int64Index: 795646 entries, 0 to 876188
Data columns (total 2 columns):
source_sentence    795646 non-null object
target_sentence    795646 non-null object
dtypes: object(2)
memory usage: 385.9 MB

 4900(0.61 %) samples removed by dropping Target sentence duplicates


In [0]:
##  Preprocessing - Step 3 : Remove all numeric entries

pattern = r"([0-9]*\.?[0-9]*)"  # catch integers and decimals
import re
r = re.compile(pattern)

In [22]:
%%time
##  Preprocessing - Step 3a : Remove all numeric entries - Source text

df_pp['source_sentence'] = df_pp['source_sentence'].str.replace(pattern,"")
df_pp['source_sentence'] = df_pp['source_sentence'].replace("",np.nan)

df_pp = df_pp.dropna()
df_pp.info(memory_usage='deep')
new_size = len(df_pp)

print(f"\n {size-new_size}({100*(size-new_size)/size :.2f} %) samples removed by dropping nummeric entries from source text")
size = new_size

<class 'pandas.core.frame.DataFrame'>
Int64Index: 795478 entries, 0 to 876188
Data columns (total 2 columns):
source_sentence    795478 non-null object
target_sentence    795478 non-null object
dtypes: object(2)
memory usage: 347.2 MB

 168(0.02 %) samples removed by dropping nummeric entries from source text
CPU times: user 14.2 s, sys: 89.4 ms, total: 14.3 s
Wall time: 14.3 s


In [23]:
%%time
##  Preprocessing - Step 3b : Remove all numeric entries - Target text

df_pp['target_sentence'] = df_pp['target_sentence'].str.replace(r,"")
df_pp['target_sentence'] = df_pp['target_sentence'].replace("",np.nan)

df_pp = df_pp.dropna()
df_pp.info(memory_usage='deep')
new_size = len(df_pp)

print(f"\n {size-new_size}({100*(size-new_size)/size :.2f} %) samples removed by dropping nummeric entries from target text")
size = new_size

<class 'pandas.core.frame.DataFrame'>
Int64Index: 795471 entries, 0 to 876188
Data columns (total 2 columns):
source_sentence    795471 non-null object
target_sentence    795471 non-null object
dtypes: object(2)
memory usage: 314.4 MB

 7(0.00 %) samples removed by dropping nummeric entries from target text
CPU times: user 14.7 s, sys: 95.4 ms, total: 14.8 s
Wall time: 14.8 s


#### Preprocessing - Step 4 :Get length of sentences and then drop really short sentences

In [24]:
%%time
# add length columns


df_pp['source_ch_len'] = df_pp['source_sentence'].str.len()
df_pp['source_w_len'] = [len(text.split()) for text in df_pp['source_sentence']] 
df_pp['target_ch_len'] = df_pp['target_sentence'].str.len()
df_pp['target_w_len'] = [len(text.split()) for text in df_pp['target_sentence']] 
df_pp.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 795471 entries, 0 to 876188
Data columns (total 6 columns):
source_sentence    795471 non-null object
target_sentence    795471 non-null object
source_ch_len      795471 non-null int64
source_w_len       795471 non-null int64
target_ch_len      795471 non-null int64
target_w_len       795471 non-null int64
dtypes: int64(4), object(2)
memory usage: 338.7 MB
CPU times: user 3.91 s, sys: 31.2 ms, total: 3.94 s
Wall time: 3.96 s


In [0]:
# # character len distrn - source text - 
# df_pp['source_ch_len'].value_counts().sort_index()

In [0]:
# # character len distrn - target text
# df_pp['target_ch_len'].value_counts().sort_index()

In [0]:
## how many rows with source text <=2chars and what do they look like ?

In [0]:
# # how many single character sentences from source ?
# f"{df_pp['source_ch_len'].value_counts()[1]} single character source sentences"

# df_pp[df_pp['source_ch_len']<=1]

In [0]:
# # how many 2-character sentences from source ?
# f"{df_pp['source_ch_len'].value_counts()[2]} 2-character source sentences"

# df_pp[df_pp['source_ch_len']==2]

In [25]:
##  Preprocessing - Step 4a :  drop everything where the ch_len <=2 in source text

df_pp = df_pp[~(df_pp['source_ch_len'] <=2) ]

df_pp.info(memory_usage='deep')
new_size = len(df_pp)
print(f"\n {size-new_size}({100*(size-new_size)/size :.2f} %) samples removed by dropping rows with source sentences <= 2 characters")
size = new_size

<class 'pandas.core.frame.DataFrame'>
Int64Index: 795184 entries, 0 to 876188
Data columns (total 6 columns):
source_sentence    795184 non-null object
target_sentence    795184 non-null object
source_ch_len      795184 non-null int64
source_w_len       795184 non-null int64
target_ch_len      795184 non-null int64
target_w_len       795184 non-null int64
dtypes: int64(4), object(2)
memory usage: 338.7 MB

 287(0.04 %) samples removed by dropping rows with source sentences <= 2 characters


In [26]:
##  Preprocessing - Step 4b :  drop everything where the ch_len <=2 in target text

df_pp = df_pp[~(df_pp['target_ch_len'] <=2) ]

df_pp.info(memory_usage='deep')
new_size = len(df_pp)
print(f"\n {size-new_size}({100*(size-new_size)/size :.2f} %) samples removed by dropping rows with target sentences <= 2 characters")
size = new_size

<class 'pandas.core.frame.DataFrame'>
Int64Index: 795182 entries, 0 to 876188
Data columns (total 6 columns):
source_sentence    795182 non-null object
target_sentence    795182 non-null object
source_ch_len      795182 non-null int64
source_w_len       795182 non-null int64
target_ch_len      795182 non-null int64
target_w_len       795182 non-null int64
dtypes: int64(4), object(2)
memory usage: 338.7 MB

 2(0.00 %) samples removed by dropping rows with target sentences <= 2 characters


In [27]:
%%time
##  Preprocessing - Step 5 :  remove text from test set

with open(f"{gdrive_path}/test.en-any.en") as f:
    rows = f.readlines()
test_set_en = [row.strip() for row in rows]


df_pp = df_pp[~df_pp['source_sentence'].str.strip().isin(test_set_en)]

df_pp.info(memory_usage='deep')
new_size = len(df_pp)
print(f"\n {size-new_size}({100*(size-new_size)/size :.2f} %) samples removed by dropping rows from test set")
size = new_size

<class 'pandas.core.frame.DataFrame'>
Int64Index: 794113 entries, 0 to 876188
Data columns (total 6 columns):
source_sentence    794113 non-null object
target_sentence    794113 non-null object
source_ch_len      794113 non-null int64
source_w_len       794113 non-null int64
target_ch_len      794113 non-null int64
target_w_len       794113 non-null int64
dtypes: int64(4), object(2)
memory usage: 338.3 MB

 1069(0.13 %) samples removed by dropping rows from test set
CPU times: user 2 s, sys: 49.9 ms, total: 2.05 s
Wall time: 2.07 s


In [28]:
%%time
##  Preprocessing - Step 6 :  remove the extra "
df_pp['source_sentence'] = df_pp['source_sentence'].map(lambda x: x.lstrip('"').rstrip('"'))
df_pp['target_sentence'] = df_pp['target_sentence'].map(lambda x: x.lstrip('"').rstrip('"'))


df_pp.info(memory_usage='deep')
new_size = len(df_pp)
print(f"\n {size-new_size}({100*(size-new_size)/size :.2f} %) samples removed by dropping rows with extra quotes")
size = new_size

<class 'pandas.core.frame.DataFrame'>
Int64Index: 794113 entries, 0 to 876188
Data columns (total 6 columns):
source_sentence    794113 non-null object
target_sentence    794113 non-null object
source_ch_len      794113 non-null int64
source_w_len       794113 non-null int64
target_ch_len      794113 non-null int64
target_w_len       794113 non-null int64
dtypes: int64(4), object(2)
memory usage: 338.3 MB

 0(0.00 %) samples removed by dropping rows with extra quotes
CPU times: user 2.09 s, sys: 5.8 ms, total: 2.1 s
Wall time: 2.11 s


## create dev df 

In [29]:
df_dev = df_pp[['source_sentence', 'target_sentence']]
# Shuffle the data to remove bias in dev set selection.
seed=42
df_dev = df_dev.sample(frac=1, random_state=seed).reset_index(drop=True)
df_dev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 794113 entries, 0 to 794112
Data columns (total 2 columns):
source_sentence    794113 non-null object
target_sentence    794113 non-null object
dtypes: object(2)
memory usage: 12.1+ MB


## Create train test dev sets

In [30]:
%%time
# This section does the split between train/dev for the parallel corpora then saves them as separate files
# We use 1000 dev test and the given test set.

# Do the split between dev/train and create parallel corpora
num_dev_patterns = 1000

# Optional: lower case the corpora - this will make it easier to generalize, but without proper casing.
if lc:  # Julia: making lowercasing optional
    df_dev["source_sentence"] = df_dev["source_sentence"].str.lower()
    df_dev["target_sentence"] = df_dev["target_sentence"].str.lower()

# Julia: test sets are already generated
dev = df_dev.tail(num_dev_patterns) # Herman: Error in original
stripped = df_dev.drop(df_dev.tail(num_dev_patterns).index)

with open(f"{gdrive_path}/train."+source_language, "w") as src_file, open(f"{gdrive_path}/train."+target_language, "w") as tgt_file:
  for index, row in stripped.iterrows():
    src_file.write(row["source_sentence"]+"\n")
    tgt_file.write(row["target_sentence"]+"\n")
    
with open(f"{gdrive_path}/dev."+source_language, "w") as src_file, open(f"{gdrive_path}/dev."+target_language, "w") as tgt_file:
  for index, row in dev.iterrows():
    src_file.write(row["source_sentence"]+"\n")
    tgt_file.write(row["target_sentence"]+"\n")

CPU times: user 1min 19s, sys: 238 ms, total: 1min 19s
Wall time: 1min 20s


In [31]:
# Doublecheck the format below. There should be no extra quotation marks or weird characters.
! head "$gdrive_path/train.$src"
! echo "================================="
! head "$gdrive_path/dev.$src"
! echo "================================="
! head "$gdrive_path/test.$src"

negative feelings may likewise be aroused when the media spotlight racial conflicts , police brutality , and protest rallies or when they portray ethnic groups in a negative light 
after catching your breath , you move another lever back and forth , slowly inching your way across the other half 
the fear - inspiring day of jehovah
 “ my dream has come true ”
satan uses the spirit of the world to control people , but we can break free of its influence
he is recorded as saying : “ i am jehovah , that is my name  ”
some have taken training courses that have opened up job opportunities enabling them to engage in or resume full - time service 
what , though , does god’s word say about exercising parental authority ?
because god is a person , he also has a personality with likes and dislikes ​ — even feelings 
all in the congregation cooperated , including the children 
a prophecy in the bible book of revelation provides the answer 
• for what conditions and trials is satan responsible ?
to 

In [32]:
! head "$gdrive_path/train.$tgt"
! echo "================================="
! head "$gdrive_path/dev.$tgt"
! echo "================================="
! head "$gdrive_path/test.$tgt"

iimvakalelo ezingafanelekanga ngokufanayo zisenokubakho xa amajelo eendaba ebalaselisa ungquzulwano lobuhlanga , inkohlakalo yamapolisa , nokuqhankqalaza kwamaqela okanye xa kugxekwa isizwe esithile 
emva kokunqumama kancinane uthoba izibilini , ushukumisa enye intonga ende uyisa ngemva nangaphambili , uhamba kancinane ukuwela esinye isiqingatha 
imini eyoyikekayo kayehova
 “ umnqweno wam uzalisekile ”
usathana usebenzisa umoya wehlabathi ukulawula abantu , kodwa sinokuyiphepha impembelelo yawo
wathi : “ ndinguyehova  lilo elo igama lam  ”
bambi baye bafumana uqeqesho kwizifundo ezibavulele amathuba emisebenzi ebavumelayo ukuba bangenele okanye babuyele kwinkonzo yexesha elizeleyo 
noko ke , ithini ibhayibhile ngokusebenzisa igunya lobuzali ?
ngenxa yokuba uthixo engumntu , ukwanabo nobuntu , kwaye unezinto azithandayo nangazithandiyo — nkqu neemvakalelo unazo 
bonke ebandleni bafak ’ isandla kuquka nabantwana 
isiprofeto esikwincwadi yebhayibhile yesityhilelo sinikela impendulo 
• ziz

## Preprocessing the Data into Subword BPE Tokens

- One of the most powerful improvements for agglutinative languages (a feature of most Bantu languages) is using BPE tokenization [ (Sennrich, 2015) ](https://arxiv.org/abs/1508.07909).

- It was also shown that by optimizing the umber of BPE codes we significantly improve results for low-resourced languages [(Sennrich, 2019)](https://www.aclweb.org/anthology/P19-1021) [(Martinus, 2019)](https://arxiv.org/abs/1906.05685)

- Below we have the scripts for doing BPE tokenization of our data. We use 4000 tokens as recommended by [(Sennrich, 2019)](https://www.aclweb.org/anthology/P19-1021). You do not need to change anything. Simply running the below will be suitable. 

In [33]:
%%time
! subword-nmt learn-joint-bpe-and-vocab --input  "$gdrive_path"train.$src  "$gdrive_path"train.$tgt -s $vocab_size -o  "$gdrive_path"bpe.codes.$vocab_size --write-vocabulary  "$gdrive_path"vocab.$src  "$gdrive_path"vocab.$tgt

# Apply BPE splits to the train, development and test data.
! subword-nmt apply-bpe -c "$gdrive_path"bpe.codes.$vocab_size --vocabulary "$gdrive_path"vocab.$src < "$gdrive_path"train.$src > "$gdrive_path"train.bpe.$src
! subword-nmt apply-bpe -c "$gdrive_path"bpe.codes.$vocab_size --vocabulary "$gdrive_path"vocab.$tgt < "$gdrive_path"train.$tgt > "$gdrive_path"train.bpe.$tgt

! subword-nmt apply-bpe -c "$gdrive_path"bpe.codes.$vocab_size --vocabulary "$gdrive_path"vocab.$src < "$gdrive_path"dev.$src > "$gdrive_path"dev.bpe.$src
! subword-nmt apply-bpe -c "$gdrive_path"bpe.codes.$vocab_size --vocabulary "$gdrive_path"vocab.$tgt < "$gdrive_path"dev.$tgt > "$gdrive_path"dev.bpe.$tgt

! subword-nmt apply-bpe -c "$gdrive_path"bpe.codes.$vocab_size --vocabulary "$gdrive_path"vocab.$src < "$gdrive_path"test.$src > "$gdrive_path"test.bpe.$src
! subword-nmt apply-bpe -c "$gdrive_path"bpe.codes.$vocab_size --vocabulary "$gdrive_path"vocab.$tgt < "$gdrive_path"test.$tgt > "$gdrive_path"test.bpe.$tgt


CPU times: user 1.12 s, sys: 186 ms, total: 1.3 s
Wall time: 4min 57s


In [0]:
# Create that vocab using build_vocab
! sudo chmod 777 joeynmt/scripts/build_vocab.py
! joeynmt/scripts/build_vocab.py "$gdrive_path"train.bpe."$src" "$gdrive_path"train.bpe."$tgt" --output_path "$gdrive_path"vocab.txt

In [0]:
# Some output
! echo "BPE Xhosa Sentences"
! tail -n 5 "$gdrive_path"test.bpe.$tgt

! echo "Combined BPE Vocab"
! tail -n 10 "$gdrive_path"vocab.txt  # Herman


## Creating the JoeyNMT Config

JoeyNMT requires a yaml config. We provide a template below. We've also set a number of defaults with it, that you may play with!

- We used Transformer architecture 
- We set our dropout to reasonably high: 0.3 (recommended in  [(Sennrich, 2019)](https://www.aclweb.org/anthology/P19-1021))

Things worth playing with:
- The batch size (also recommended to change for low-resourced languages)
- The number of epochs (we've set it at 30 just so it runs in about an hour, for testing purposes)
- The decoder options (beam_size, alpha)
- Evaluation metrics (BLEU versus Crhf4)

In [8]:
name = '%s%s%s%s' % (source_language, target_language, str(vocab_size),tag)
name

'enxh4000baseline'

In [0]:
# Create this dir before we run for the first time so we store check points
# !mkdir -p "$gdrive_path/pretrained/$src$trg$vocab_size$tag/" # Herman

In [0]:
# This creates the config file for our JoeyNMT system. It might seem overwhelming so we've provided a couple of useful parameters you'll need to update
# (You can of course play with all the parameters if you'd like!)

name = '%s%s%s%s' % (source_language, target_language, str(vocab_size),tag)
gdrive_path = os.environ["gdrive_path"]

# Create the config
config = """
name: "{name}_transformer"

data:
    src: "{source_language}"
    trg: "{target_language}"
    train: "{gdrive_path}train.bpe"
    dev:   "{gdrive_path}dev.bpe"
    test:  "{gdrive_path}test.bpe"
    level: "bpe"
    lowercase: False
    max_sent_length: 100
    src_vocab: "{gdrive_path}vocab.txt"
    trg_vocab: "{gdrive_path}vocab.txt"

testing:
    beam_size: 5
    alpha: 1.0

training:
    load_model: "{gdrive_path}pretrained/{name}/50000.ckpt"
    random_seed: 42
    optimizer: "adam"
    normalization: "tokens"
    adam_betas: [0.9, 0.999] 
    scheduling: "plateau"           # TODO: try switching from plateau to Noam scheduling
    patience: 5                     # For plateau: decrease learning rate by decrease_factor if validation score has not improved for this many validation rounds.
    learning_rate_factor: 0.5       # factor for Noam scheduler (used with Transformer)
    learning_rate_warmup: 1000      # warmup steps for Noam scheduler (used with Transformer)
    decrease_factor: 0.7
    loss: "crossentropy"
    learning_rate: 0.0003
    learning_rate_min: 0.00000001
    weight_decay: 0.0
    label_smoothing: 0.1
    batch_size: 4096
    batch_type: "token"
    eval_batch_size: 3600
    eval_batch_type: "token"
    batch_multiplier: 1
    early_stopping_metric: "ppl"
    epochs: 20                     # TODO: Decrease for when playing around and checking of working. Around 30 is sufficient to check if its working at all
    validation_freq: 1000          # TODO: Set to at least once per epoch.
    logging_freq: 100
    eval_metric: "bleu"
    model_dir: "models/{name}_transformer"
    overwrite: True               # TODO: Set to True if you want to overwrite possibly existing models. 
    shuffle: True
    use_cuda: True
    max_output_length: 100
    print_valid_sents: [0, 1, 2, 3]
    keep_last_ckpts: 3

model:
    initializer: "xavier"
    bias_initializer: "zeros"
    init_gain: 1.0
    embed_initializer: "xavier"
    embed_init_gain: 1.0
    tied_embeddings: True
    tied_softmax: True
    encoder:
        type: "transformer"
        num_layers: 6
        num_heads: 4             # TODO: Increase to 8 for larger data.
        embeddings:
            embedding_dim: 256   # TODO: Increase to 512 for larger data.
            scale: True
            dropout: 0.3
        # typically ff_size = 4 x hidden_size
        hidden_size: 256         # TODO: Increase to 512 for larger data.
        ff_size: 1024            # TODO: Increase to 2048 for larger data.
        dropout: 0.4
    decoder:
        type: "transformer"
        num_layers: 6
        num_heads: 8              # TODO: Increase to 8 for larger data.
        embeddings:
            embedding_dim: 256    # TODO: Increase to 512 for larger data.
            scale: True
            dropout: 0.3
        # typically ff_size = 4 x hidden_size
        hidden_size: 256         # TODO: Increase to 512 for larger data.
        ff_size: 1024            # TODO: Increase to 2048 for larger data.
        dropout: 0.4
""".format(name=name,
           gdrive_path=os.environ["gdrive_path"],
           source_language=source_language,
           target_language=target_language
          )

with open("joeynmt/configs/transformer_{name}.yaml".format(name=name),'w') as f:
    f.write(config)

## Train the Model

In [10]:
! cd joeynmt/configs; ls
# copy config to gdrive
! cp joeynmt/configs/transformer_$src$tgt$vocab_size$tag.yaml "$gdrive_path/pretrained/$src$tgt$vocab_size$tag/"

iwslt14_deen_bpe.yaml		   transformer_reverse.yaml
iwslt_deen_bahdanau.yaml	   transformer_small.yaml
iwslt_envi_luong.yaml		   transformer_wmt17_ende.yaml
iwslt_envi_xnmt.yaml		   transformer_wmt17_lven.yaml
reverse.yaml			   wmt_ende_best.yaml
small.yaml			   wmt_ende_default.yaml
transformer_copy.yaml		   wmt_lven_best.yaml
transformer_enxh4000baseline.yaml  wmt_lven_default.yaml
transformer_iwslt14_deen_bpe.yaml


In [11]:
%%time
# Train the model
# You can press Ctrl-C to stop. And then run the next cell to save your checkpoints! 
! cd joeynmt; python3 -m joeynmt train configs/transformer_$src$tgt$vocab_size$tag.yaml

2019-11-13 06:03:38,559 Hello! This is Joey-NMT.
2019-11-13 06:03:40,110 Total params: 12169728
2019-11-13 06:03:40,112 Trainable parameters: ['decoder.layer_norm.bias', 'decoder.layer_norm.weight', 'decoder.layers.0.dec_layer_norm.bias', 'decoder.layers.0.dec_layer_norm.weight', 'decoder.layers.0.feed_forward.layer_norm.bias', 'decoder.layers.0.feed_forward.layer_norm.weight', 'decoder.layers.0.feed_forward.pwff_layer.0.bias', 'decoder.layers.0.feed_forward.pwff_layer.0.weight', 'decoder.layers.0.feed_forward.pwff_layer.3.bias', 'decoder.layers.0.feed_forward.pwff_layer.3.weight', 'decoder.layers.0.src_trg_att.k_layer.bias', 'decoder.layers.0.src_trg_att.k_layer.weight', 'decoder.layers.0.src_trg_att.output_layer.bias', 'decoder.layers.0.src_trg_att.output_layer.weight', 'decoder.layers.0.src_trg_att.q_layer.bias', 'decoder.layers.0.src_trg_att.q_layer.weight', 'decoder.layers.0.src_trg_att.v_layer.bias', 'decoder.layers.0.src_trg_att.v_layer.weight', 'decoder.layers.0.trg_trg_att.k_l

In [12]:
# Copy the created models from the notebook storage to google drive for persistant storage 
!cp  -r joeynmt/models/${src}${tgt}${vocab_size}${tag}_transformer/* "$gdrive_path""pretrained/$src$tgt$vocab_size$tag/"
!cp  joeynmt/models/${src}${tgt}${vocab_size}${tag}_transformer/best.ckpt "$gdrive_path""pretrained/$src$tgt$vocab_size$tag"

cp: cannot create symbolic link '/content/drive/My Drive/masakhane/en-xh-baseline/pretrained/enxh4000baseline/best.ckpt': Function not implemented


In [0]:
# copy across the config file
!cp  joeynmt/configs/transformer_${src}${tgt}${vocab_size}${tag}.yaml "$gdrive_path"

In [0]:
# Test our model
# ! cd joeynmt; python3 -m joeynmt test "$gdrive_path""pretrained/$src$tgt$vocab_size$tag/config.yaml"

In [14]:
# OR
! cd joeynmt; python3 -m joeynmt test "$gdrive_path""transformer_${src}${tgt}${vocab_size}${tag}.yaml"

2019-11-13 07:09:00,315 Hello! This is Joey-NMT.
2019-11-13 07:10:25,741  dev bleu:  19.92 [Beam search decoding with beam size = 5 and alpha = 1.0]
2019-11-13 07:12:59,403 test bleu:   6.00 [Beam search decoding with beam size = 5 and alpha = 1.0]


In [0]:
# Translate mode is mopre interactive but almsot the same as running in test mode
! cd joeynmt; python3 -m joeynmt translate "$gdrive_path""transformer_${src}${tgt}${vocab_size}${tag}.yaml"

In [15]:
# Output our validation accuracy
! cat "$gdrive_path/pretrained/${src}${tgt}${vocab_size}${tag}/validations.txt"

Steps: 51000	Loss: 44208.57031	PPL: 5.31123	bleu: 19.02948	LR: 0.00030000	*
Steps: 52000	Loss: 44154.86719	PPL: 5.30047	bleu: 19.11237	LR: 0.00030000	*
Steps: 53000	Loss: 43961.04297	PPL: 5.26180	bleu: 19.24662	LR: 0.00030000	*
Steps: 54000	Loss: 43777.07812	PPL: 5.22537	bleu: 19.41868	LR: 0.00030000	*
Steps: 55000	Loss: 43506.07031	PPL: 5.17215	bleu: 19.15417	LR: 0.00030000	*
Steps: 56000	Loss: 43403.72266	PPL: 5.15219	bleu: 19.36647	LR: 0.00030000	*
Steps: 57000	Loss: 43166.08203	PPL: 5.10616	bleu: 19.73976	LR: 0.00030000	*
Steps: 58000	Loss: 43249.83594	PPL: 5.12233	bleu: 19.38474	LR: 0.00030000	
