## Import data 

In [None]:
# Mount Google Drive at /content/drive so the corpus files stored there are visible to this Colab runtime.
# JoeyNMT module index, kept for reference: https://joeynmt.readthedocs.io/en/latest/py-modindex.html
from google.colab import drive
drive.mount('/content/drive')

## Pre-processing

In [None]:
import os
# Language pair handled by the notebook (source -> target).
source_langage, target_langage = 'fr', 'en'
lc = False  # when True, the train/dev/test sentences are lowercased before being written
seed = 42   # random_state used when the training pairs are shuffled

# Export the settings as environment variables so that `!` shell lines can
# refer to them as $src, $tgt and $gdrive_path.
os.environ.update({
    'src': source_langage,
    'tgt': target_langage,
    'gdrive_path': '/content/drive/MyDrive/Data',  # directory holding the corpus files
})

In [None]:
source_file =  '/content/drive/MyDrive/Data/file.src'
target_file = '/content/drive/MyDrive/Data/file.tgt'

! wc -l $source_file
! wc -l $target_file 

!grep '^$' $source_file | wc -l # blank line in source
!grep '^$' $target_file | wc -l # blank line in target

In [None]:
# remove blank lines
! mkdir  "$gdrive_path/clean"

f1 = open('/content/drive/MyDrive/Data/clean/file.src','w')
f2 = open('/content/drive/MyDrive/Data/clean/file.tgt','w')

with open(source_file) as fr, open(target_file) as en:
  for source, target in zip(fr.readlines(), en.readlines()):
    if source != '\n' and target != '\n':
      f1.write(f'{source}')
      f2.write(f'{target}')
f1.close()
f2.close()
source_file = "/content/drive/MyDrive/Data/clean/file.src"
target_file = "/content/drive/MyDrive/Data/clean/file.tgt"

! wc -l $source_file
! wc -l $target_file

!grep '^$' $source_file | wc -l # blank line in source
!grep '^$' $target_file | wc -l # blank line in target

## Tokenization

In [None]:
! mkdir "$gdrive_path/token"
token_directory = "/content/drive/MyDrive/Data/token/"

### Sacremoses

In [None]:
! pip install sacremoses

tok_source_file = token_directory+"file.src"
tok_target_file = token_directory+"file.tgt"

# Tokenize the source
! sacremoses -l $source_langage tokenize < $source_file > $tok_source_file
# Tokenize the target
! sacremoses -l $target_langage tokenize < $target_file > $tok_target_file

! head -n 3 $source_file*
! head -n 3 $target_file*

source_file = tok_source_file
target_file = tok_target_file

### nltk

In [None]:
import numpy as np
import nltk
# nltk.word_tokenize needs the Punkt tokenizer data. Recent NLTK releases load
# it from the 'punkt_tab' package rather than the pickled 'punkt' one, so both
# are requested; whichever the installed version does not need is simply unused.
nltk.download('punkt')
nltk.download('punkt_tab')

# Output paths of the tokenized source / target files
tok_source_file = token_directory+"file.src"
tok_target_file = token_directory+"file.tgt"


def tokenization(file_name, name):
    """Tokenize a text file line by line with NLTK and save the result.

    Every input line is passed to ``nltk.word_tokenize``. Its tokens are written
    to the output file, each followed by one space, and the line is closed with
    a newline. A line without tokens yields an empty output line, so the output
    has exactly as many lines as the input.

    Args:
        file_name: path of the text file to read.
        name: path of the tokenized file to create or overwrite.

    Raises:
        ValueError: if both paths designate the same file; opening it for
            writing would truncate the input before a single line is read.
    """
    import os  # local import keeps the function usable on its own

    if os.path.realpath(file_name) == os.path.realpath(name):
        raise ValueError(
            f'input and output are the same file ({file_name}); '
            'tokenizing in place would erase it'
        )

    # The input is opened first so that a missing input file does not leave an
    # empty output behind; the `with` closes both handles even on error.
    with open(file_name, encoding='utf-8') as f, \
         open(name, 'w', encoding='utf-8') as output:
        for line in f:  # stream instead of readlines()
            tokens = nltk.word_tokenize(line)
            output.write(''.join(f'{word} ' for word in tokens) + '\n')
	
tokenization(source_file, tok_source_file)
tokenization(target_file,tok_target_file)

! head -n 3 $tok_source_file*
! head -n 3 $tok_target_file*

source_file =  tok_source_file
target_file = tok_target_file

## Create train, dev and test 

In [None]:
# Create trainset and testset

import pandas as pd

source_file = '/content/drive/MyDrive/Data/token/file.src'
target_file = '/content/drive/MyDrive/Data/token/file.tgt'
# Named max_lines rather than `max` so the builtin max() stays usable in later cells.
max_lines = 500_000  # number of corpus lines to read (train and test together)
test_every = 5       # every 5th line (20 %) is held out for the test set

source, target = [], []
source_test, target_test = [], []
skip_lines = []  # line numbers that went to the test set

# Both files are read in lockstep, so a pair always lands in the same split and
# no per-line membership lookup in skip_lines is needed (that lookup was
# quadratic on a list of 100k entries).
with open(source_file, encoding='utf-8') as f_source, \
     open(target_file, encoding='utf-8') as f_target:
  for i, (source_line, target_line) in enumerate(zip(f_source, f_target)):
    if i == max_lines:
      break
    if i % test_every == 0:
      source_test.append(source_line.strip())
      target_test.append(target_line.strip())
      skip_lines.append(i)
    else:
      source.append(source_line.strip())
      target.append(target_line.strip())

# Report what was actually read: the corpus may be shorter than max_lines.
lines_read = len(source) + len(source_test)
print(f'Number of lines of the corpus {lines_read}, Number of lines of the test {len(skip_lines)}')

df = pd.DataFrame(zip(source, target), columns=['source_sentence', 'target_sentence'])
test = pd.DataFrame(zip(source_test, target_test), columns=['source_sentence', 'target_sentence'])

print('train...')
print(df.head(3))
print('test...')
print(test.head(3))

In [None]:
# Build the de-duplicated, shuffled training frame as one chain.
df_pp = (
    df
    # exact duplicate (source, target) pairs
    .drop_duplicates()
    # conflicting translations: keep the first pair seen for each source
    # sentence, then the first pair seen for each target sentence
    .drop_duplicates(subset='source_sentence')
    .drop_duplicates(subset='target_sentence')
    # shuffle so that the dev set, taken from the tail later on, is not biased
    # by the order of the corpus
    .sample(frac=1, random_state=seed)
    .reset_index(drop=True)
)

In [None]:
# split train and dev

import csv

! mkdir "$gdrive_path/train"
! mkdir "$gdrive_path/test"
! mkdir "$gdrive_path/dev"

train_src = "/content/drive/MyDrive/Data/train/train.src"
train_tgt = "/content/drive/MyDrive/Data/train/train.tgt"
test_src = "/content/drive/MyDrive/Data/test/test.src"
test_tgt = "/content/drive/MyDrive/Data/test/test.tgt"
dev_src = "/content/drive/MyDrive/Data/dev/dev.src"
dev_tgt = "/content/drive/MyDrive/Data/dev/dev.tgt"

num_dev_patterns = 50_000

if lc: # lowercase
  df_pp['source_sentence'] = df_pp['source_sentence'].str.lower()
  df_pp['target_sentence'] = df_pp['target_sentence'].str.lower()
  test['source_sentence'] = test['source_sentence'].str.lower()
  test['target_sentence'] = test['target_sentence'].str.lower()

dev = df_pp.tail(num_dev_patterns)
stripped = df_pp.drop(df_pp.tail(num_dev_patterns).index)

with open(train_src, 'w') as src_file, open(train_tgt,'w') as trg_file:
  for index, row in stripped.iterrows():
    src_file.write(row['source_sentence']+'\n')
    trg_file.write(row['target_sentence']+'\n')

with open(dev_src, 'w') as src_file, open(dev_tgt,'w') as trg_file:
  for index, row in dev.iterrows():
    src_file.write(row['source_sentence']+'\n')
    trg_file.write(row['target_sentence']+'\n')

with open(test_src, 'w') as src_file, open(test_tgt,'w') as trg_file:
  for index, row in test.iterrows():
    src_file.write(row['source_sentence']+'\n')
    trg_file.write(row['target_sentence']+'\n')
  
! head /content/drive/MyDrive/Data/train/*
! head /content/drive/MyDrive/Data/dev/*
! head /content/drive/MyDrive/Data/test/*

## Install joeynmt

In [None]:
# Install JoeyNMT from source: clone the repository into ./joeynmt, then pip-install that checkout.
# NOTE(review): no tag or commit is checked out, so this installs whatever the default branch
# currently contains; the YAML config written further down may not match that version's
# schema — TODO confirm and pin a release.
! git clone https://github.com/joeynmt/joeynmt.git
! cd joeynmt; pip3 install .

In [None]:
# Install the CUDA 11.1 builds of torch 1.8.0 / torchvision 0.9.0 (plus torchaudio 0.8.0)
# from the PyTorch wheel index.
# NOTE(review): this runs after the JoeyNMT install and may replace the torch version that
# install selected — confirm the two are compatible.
!pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html

## BPE

### subwordnmt

In [None]:
# Usually, NMT would tokenize by words. However, using BPE boosts the performance
# Byte pair encoding: decrease size of memory(large vocabulary to represent word pretty well)
# subword NMT
!pip install subword-nmt
from os import path
os.environ['src'] = source_langage
os.environ['tgt'] = target_langage
! mkdir "$gdrive_path/subwordnmt"
vocab_src = "/content/drive/MyDrive/Data/subwordnmt/vocab.src"
vocab_tgt = "/content/drive/MyDrive/Data/subwordnmt/vocab.tgt"

train_bpe_src = '/content/drive/MyDrive/Data/subwordnmt/train.src'
train_bpe_tgt = '/content/drive/MyDrive/Data/subwordnmt/train.tgt'
test_bpe_src = '/content/drive/MyDrive/Data/subwordnmt/test.src'
test_bpe_tgt = '/content/drive/MyDrive/Data/subwordnmt/test.tgt'
dev_bpe_src = '/content/drive/MyDrive/Data/subwordnmt/dev.src'
dev_bpe_tgt = '/content/drive/MyDrive/Data/subwordnmt/dev.tgt'
# Learn BPE on the training data
! subword-nmt learn-joint-bpe-and-vocab --input $train_src $train_tgt -s 32000 -o bpe.codes --write-vocabulary $vocab_src $vocab_tgt

# Apply bpe on train, dev and test
! subword-nmt apply-bpe -c bpe.codes --vocabulary $vocab_src < $train_src > $train_bpe_src
! subword-nmt apply-bpe -c bpe.codes --vocabulary $vocab_tgt < $train_tgt > $train_bpe_tgt

! subword-nmt apply-bpe -c bpe.codes --vocabulary $vocab_src < $dev_src > $dev_bpe_src
! subword-nmt apply-bpe -c bpe.codes --vocabulary $vocab_tgt < $dev_tgt > $dev_bpe_tgt

! subword-nmt apply-bpe -c bpe.codes --vocabulary $vocab_src < $test_src > $test_bpe_src
! subword-nmt apply-bpe -c bpe.codes --vocabulary $vocab_tgt < $test_tgt > $test_bpe_tgt


# somme output
! echo "BPE sentences source"
! head -n 5 $train_bpe_src
! echo "BPE sentences target"
! head -n 5 $train_bpe_tgt

### sentencepiece

In [None]:
! pip install sentencepiece
! sudo apt-get install cmake build-essential pkg-config libgoogle-perftools-dev

!git clone https://github.com/google/sentencepiece.git ; cd sentencepiece ; mkdir build ; cd build; cmake .. ; make -j $(nproc) ; sudo make install ; sudo ldconfig -v

In [None]:
from os import path
os.environ['src'] = source_langage
os.environ['tgt'] = target_langage
! mkdir "$gdrive_path/sentencepiece"
vocab_src = "/content/drive/MyDrive/Data/sentencepiece/vocab.src"
vocab_tgt = "/content/drive/MyDrive/Data/sentencepiece/vocab.tgt"

train_src = "/content/drive/MyDrive/Data/train/train.src"
train_tgt = "/content/drive/MyDrive/Data/train/train.tgt"
test_src = "/content/drive/MyDrive/Data/test/test.src"
test_tgt = "/content/drive/MyDrive/Data/test/test.tgt"
dev_src = "/content/drive/MyDrive/Data/dev/dev.src"
dev_tgt = "/content/drive/MyDrive/Data/dev/dev.tgt"


train_bpe_src = '/content/drive/MyDrive/Data/sentencepiece/train.src'
train_bpe_tgt = '/content/drive/MyDrive/Data/sentencepiece/train.tgt'
test_bpe_src = '/content/drive/MyDrive/Data/sentencepiece/test.src'
test_bpe_tgt = '/content/drive/MyDrive/Data/sentencepiece/test.tgt'
dev_bpe_src = '/content/drive/MyDrive/Data/sentencepiece/dev.src'
dev_bpe_tgt = '/content/drive/MyDrive/Data/sentencepiece/dev.tgt'
os.environ['data_path'] = path.join('joeynmt', 'data', source_langage + target_langage)

# Train sentencepiece model in source file (src.model and src.vocab are generated)
! spm_train --input=$train_src  --model_prefix=src --vocab_size=32000

# Train sentencepiece model in target file
! spm_train --input=$train_tgt --model_prefix=tgt --vocab_size=32000

# Train model in source files
! spm_encode --model=src.model --output_format=sample_piece < $train_src > $train_bpe_src
! spm_encode --model=src.model --output_format=sample_piece < $dev_src > $dev_bpe_src
! spm_encode --model=src.model --output_format=sample_piece < $test_src > $test_bpe_src

# Train model in target files
! spm_encode --model=src.model --output_format=sample_piece < $train_tgt > $train_bpe_tgt
! spm_encode --model=src.model --output_format=sample_piece < $dev_tgt > $dev_bpe_tgt
! spm_encode --model=src.model --output_format=sample_piece < $test_tgt > $test_bpe_tgt

! cp src.vocab /content/drive/MyDrive/Data/sentencepiece/vocab.src
! cp tgt.vocab /content/drive/MyDrive/Data/sentencepiece/vocab.tgt

# create the directory
! mkdir -p $data_path 
! cp /content/drive/MyDrive/Data/sentencepiece/* $data_path
! ls $data_path


# Create the vocab for joeynmt
! sudo chmod 777 joeynmt/scripts/build_vocab.py
! joeynmt/scripts/build_vocab.py joeynmt/data/$src$tgt/train.src joeynmt/data/$src$tgt/train.tgt --output_path joeynmt/data/$src$tgt/vocab.txt
! cp joeynmt/data/$src$tgt/vocab.txt drive/MyDrive/Data/sentencepiece/


## Data is already cleaned

In [None]:
# Run only if the data is already tokenized (BPE) and the model directory already exists.
# Copies the encoded corpus from Drive into JoeyNMT's data directory for this language pair.
! mkdir -p joeynmt/data/$src$tgt
! cp drive/MyDrive/Data/sentencepiece/* joeynmt/data/$src$tgt
# Alternative: use the subword-nmt output instead of the SentencePiece one
#! cp drive/MyDrive/Data/subwordnmt/* joeynmt/data/$src$tgt

# Make sure the model directory exists
!mkdir -p joeynmt/models/${src}${tgt}_transformer/
# Uncomment to restore previously saved model files from Drive
#!cp -r drive/MyDrive/joeynmt/models/${src}${tgt}_transformer/* joeynmt/models/${src}${tgt}_transformer

## Config yaml file

In [None]:
# Write the JoeyNMT configuration for the transformer model.

name = '%s%s' % (source_langage, target_langage)
gdrive_path = os.environ['gdrive_path']

# The config below gives the data files as a common prefix ("data/<pair>/train")
# plus the language codes, so rename *.src / *.tgt to *.<source> / *.<target>.
# Done in Python with an existence check: it follows the configured language
# pair instead of a hard-coded "fren", and re-running the cell is harmless.
data_dir = os.path.join('joeynmt', 'data', name)
for split_name in ('train', 'dev', 'test'):
  for generic_ext, language in (('src', source_langage), ('tgt', target_langage)):
    old_path = os.path.join(data_dir, f'{split_name}.{generic_ext}')
    if os.path.exists(old_path):
      os.replace(old_path, os.path.join(data_dir, f'{split_name}.{language}'))

# Key fixes relative to the previous version: `trg_vocab` (was `tgr_vocab`) and
# the encoder/decoder `dropout` (was `dopout`), so these settings are now read
# under the names JoeyNMT expects.
config = """
name : "{name}_transformer"

data: 
  src:  "{source_langage}"
  trg: "{target_langage}"
  train: "data/{name}/train"
  dev: "data/{name}/dev"
  test: "data/{name}/test"
  level: "bpe"
  lowercase: False
  max_sent_length: 100
  src_vocab: "data/{name}/vocab.txt"
  trg_vocab: "data/{name}/vocab.txt"

testing:
  beam_size: 5
  alpha: 1.0

training:
  #load_model: "{gdrive_path}/models/{name}_transformer/1.ckpt" # if uncommented, load pre-trained model from this checkpoint
  random_seed: 42
  optimizer: "adam"
  normalization: "tokens"
  adam_betas: [0.9, 0.999]
  scheduling: "plateau"
  patience: 5
  learning_rate_factor: 0.5
  learning_rate_warmup: 1000
  decrease_factor: 0.7
  loss: "crossentropy"
  learning_rate: 0.0003
  learning_rate_min: 0.00000001
  weight_decay: 0.0
  label_smoothing: 0.1
  batch_size: 4096
  batch_type: "token"
  eval_batch_size: 3600
  eval_batch_type: "token"
  batch_multiplier: 1
  early_stopping_metric: "ppl"
  epochs: 5
  validation_freq: 30
  logging_freq: 5
  eval_metric: "bleu"
  model_dir: "models/{name}_transformer"
  overwrite: True
  shuffle: True
  use_cuda: True # to use GPU
  max_output_length: 100
  print_valid_sents: [0,1,2,3]
  keep_last_ckpts: 3

model: 
  initializer: "xavier"
  bias_initializer: "zeros"
  init_gain: 1.0
  embed_initializer: "xavier"
  embed_init_gain: 1.0
  #tied_embeddings: True
  tied_softmax: True
  encoder:
    type: "transformer"
    num_layers: 6
    num_heads: 4
    embeddings:
      embedding_dim: 256
      scale: True
      dropout: 0.2
    hidden_size: 256
    ff_size: 1024
    dropout: 0.3
  decoder:
    type: "transformer"
    num_layers: 6
    num_heads: 4
    embeddings:
      embedding_dim: 256
      scale: True
      dropout: 0.2
    hidden_size: 256
    ff_size: 1024
    dropout: 0.3

""".format(name=name, gdrive_path=os.environ['gdrive_path'], source_langage=source_langage, target_langage=target_langage)
with open("joeynmt/configs/transformer_{name}.yaml".format(name=name),'w') as f:
  f.write(config)

In [None]:
# Copy the model directory of the current language pair from the runtime to Google Drive.
! mkdir -p drive/MyDrive/joeynmt/models/${src}${tgt}_transformer/ 
!cp -r joeynmt/models/${src}${tgt}_transformer/* drive/MyDrive/joeynmt/models/${src}${tgt}_transformer/

## Precision and validation

In [None]:
# Print the validations.txt file from the copy of the model directory on Drive.
! cat drive/MyDrive/joeynmt/models/${src}${tgt}_transformer/validations.txt

In [None]:
# Load the TensorBoard notebook extension and open it on the model's tensorboard log directory.
# NOTE(review): the path hard-codes the "fren" pair instead of using the configured languages.
%load_ext tensorboard
%tensorboard --logdir joeynmt/models/fren_transformer//tensorboard

## Translate

In [None]:
# Run JoeyNMT in translate mode with the config stored in the model directory.
# NOTE(review): no input is redirected here, so this presumably waits for sentences on
# stdin — confirm it works from a notebook cell. The "fren" pair is hard-coded in the path.
!cd joeynmt/ ; python3 -m joeynmt translate models/fren_transformer/config.yaml 

In [None]:
# Translate the encoded French test set by feeding it to JoeyNMT's translate mode on stdin.
# NOTE(review): the "fren" pair and the ".fr" extension are hard-coded in the paths.
!cd joeynmt/ ; python3 -m joeynmt translate models/fren_transformer/config.yaml < data/fren/test.fr