## Import data

In [None]:
# Mount Google Drive inside the Colab runtime; the rest of the notebook reads
# and writes its files under /content/drive/MyDrive/Data.
# Link left by the original author (JoeyNMT module index):
# https://joeynmt.readthedocs.io/en/latest/py-modindex.html
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Pre-processing

In [None]:
import os

# Settings shared by the preprocessing cells.
source_langage, target_langage = 'fr', 'en'
lc = False  # lowercase if True
seed = 42   # random seed for shuffling

# Export the settings as environment variables so that `!` shell lines can use
# them (e.g. "$gdrive_path" in the mkdir commands).
os.environ.update({
  'src': source_langage,
  'tgt': target_langage,
  'gdrive_path': '/content/drive/MyDrive/Data',  # path of files
})

In [None]:
# Paths of the raw parallel corpus: one file per language.
# The later cells pair the two files line by line.
source_file =  '/content/drive/MyDrive/Data/file.src'
target_file = '/content/drive/MyDrive/Data/file.tgt'

# Number of lines in each file (IPython expands $source_file / $target_file
# from the Python variables above).
! wc -l $source_file
! wc -l $target_file 

# Number of empty lines on each side.
!grep '^$' $source_file | wc -l # blank line in source
!grep '^$' $target_file | wc -l # blank line in target

2007723 /content/drive/MyDrive/Data/file.src
2007723 /content/drive/MyDrive/Data/file.tgt
2932
2035


In [None]:
# remove blank lines
! mkdir  "$gdrive_path/clean"

f1 = open('/content/drive/MyDrive/Data/clean/file.src','w')
f2 = open('/content/drive/MyDrive/Data/clean/file.tgt','w')

with open(source_file) as fr, open(target_file) as en:
  for source, target in zip(fr.readlines(), en.readlines()):
    if source != '\n' and target != '\n':
      f1.write(f'{source}')
      f2.write(f'{target}')
f1.close()
f2.close()
source_file = "/content/drive/MyDrive/Data/clean/file.src"
target_file = "/content/drive/MyDrive/Data/clean/file.tgt"

! wc -l $source_file
! wc -l $target_file

!grep '^$' $source_file | wc -l # blank line in source
!grep '^$' $target_file | wc -l # blank line in target

2002756 /content/drive/MyDrive/Data/clean/file.src
2002756 /content/drive/MyDrive/Data/clean/file.tgt
0
0


## Tokenization

In [None]:
! mkdir "$gdrive_path/token"
token_directory = "/content/drive/MyDrive/Data/token/"

### sacremoses

In [None]:
! pip install sacremoses

tok_source_file = token_directory+"file.src"
tok_target_file = token_directory+"file.tgt"

# Tokenize the source
! sacremoses -l $source_langage tokenize < $source_file > $tok_source_file
# Tokenize the target
! sacremoses -l $target_langage tokenize < $target_file > $tok_target_file

! head -n 3 $source_file*
! head -n 3 $target_file*

source_file = tok_source_file
target_file = tok_target_file

### nltk

In [None]:
import numpy as np
import nltk
# Download the 'punkt' tokenizer data (presumably required by
# nltk.word_tokenize, which is called below).
nltk.download('punkt')

# Output paths of the tokenized source / target files.
tok_source_file = token_directory+"file.src"
tok_target_file = token_directory+"file.tgt"


def tokenization(file_name, name, language='english'):
  """Tokenize a text file line by line with NLTK and write the result.

  Every input line produces exactly one output line, in which each token is
  followed by a single space.

  Args:
    file_name: path of the text file to read.
    name: path of the tokenized file to write (overwritten if it exists).
    language: language name forwarded to ``nltk.word_tokenize``. Defaults to
      'english', which is what the call used implicitly before.

  Raises:
    ValueError: if ``file_name`` and ``name`` resolve to the same file.
      Opening the output would truncate the input before it is read.
  """
  import os

  if os.path.realpath(file_name) == os.path.realpath(name):
    raise ValueError(
      f'input and output are the same file ({file_name}); '
      'tokenizing in place would erase the input')

  # Both handles are closed even if tokenization raises; the input is
  # streamed instead of being loaded in full with readlines().
  with open(file_name, encoding='utf-8') as f, \
       open(name, 'w', encoding='utf-8') as output:
    for line in f:
      tokens = nltk.word_tokenize(line, language=language)
      output.write(''.join(f'{word} ' for word in tokens))
      output.write('\n')
	
tokenization(source_file, tok_source_file)
tokenization(target_file,tok_target_file)

! head -n 3 $tok_source_file*
! head -n 3 $tok_target_file*

source_file =  tok_source_file
target_file = tok_target_file

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Reprise de la session 
Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances . 
Comme vous avez pu le constater , le grand `` bogue de l'an 2000 '' ne s'est pas produit . En revanche , les citoyens d'un certain nombre de nos pays ont été victimes de catastrophes naturelles qui ont vraiment été terribles . 
Resumption of the session 
I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999 , and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period . 
Although , as you will have seen , the dreaded 'millennium bug ' failed to materialise , still the people in a number of countries suffered a series of natural disasters that truly were dreadful . 


## Create train, dev and test sets

In [None]:
# Create trainset and testset

import pandas as pd

source_file = '/content/drive/MyDrive/Data/token/file.src'
target_file = '/content/drive/MyDrive/Data/token/file.tgt'
# Number of corpus lines to use (train + test). Named `max_lines` rather than
# `max` so the builtin max() is not shadowed for the rest of the notebook.
max_lines = 500_000
source = []
target = []
source_test = []
target_test = []
skip_lines = [] # 0-based line numbers held out of training (they form the test set)

# Read both files in lockstep so that a source line and its translation are
# always routed to the same split. This also avoids looking every target line
# number up in the `skip_lines` list, which was quadratic.
with open(source_file, encoding='utf-8') as src_f, \
     open(target_file, encoding='utf-8') as tgt_f:
  for i, (src_line, tgt_line) in enumerate(zip(src_f, tgt_f)):
    if i == max_lines:
      break
    # split 20% of the lines into test: every 5th line, starting with line 0
    if i % 5 == 0:
      source_test.append(src_line.strip())
      target_test.append(tgt_line.strip())
      skip_lines.append(i)
    else:
      source.append(src_line.strip())
      target.append(tgt_line.strip())

print(f'Number of lines of the corpus {max_lines}, Number of lines of the test {len(skip_lines)}')

df = pd.DataFrame(zip(source, target), columns= ['source_sentence', 'target_sentence'])
test = pd.DataFrame(zip(source_test, target_test), columns=['source_sentence', 'target_sentence'])

print('train...')
print(df.head(3))
print('test...')
print(test.head(3))

Number of lines of the corpus 500000, Number of lines of the test 100000
train...
                                     source_sentence                                    target_sentence
0  Je déclare reprise la session du Parlement eur...  I declare resumed the session of the European ...
1  Comme vous avez pu le constater , le grand `` ...  Although , as you will have seen , the dreaded...
2  Vous avez souhaité un débat à ce sujet dans le...  You have requested a debate on this subject in...
test...
                                     source_sentence                                    target_sentence
0                              Reprise de la session                          Resumption of the session
1  Je vous invite à vous lever pour cette minute ...  Please rise , then , for this minute ' s silen...
2  Ne pensez-vous pas , Madame la Présidente , qu...  Would it be appropriate for you , Madam Presid...


In [None]:
# Build the deduplicated, shuffled training frame as one chain of new frames.
# No `inplace=True` on a derived frame, so pandas has no reason to emit
# SettingWithCopyWarning and `df` itself is left untouched.
df_pp = (
  df
  # Drop duplicate translations (identical source/target pairs)
  .drop_duplicates()
  # Drop conflicting translations: keep only the first pair for each source
  # sentence, then only the first pair for each target sentence
  .drop_duplicates(subset='source_sentence')
  .drop_duplicates(subset='target_sentence')
  # Shuffle the data to remove bias in dev set selection
  .sample(frac=1, random_state=seed)
  .reset_index(drop=True)
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# split train and dev

import csv

! mkdir "$gdrive_path/train"
! mkdir "$gdrive_path/test"
! mkdir "$gdrive_path/dev"

train_src = "/content/drive/MyDrive/Data/train/train.src"
train_tgt = "/content/drive/MyDrive/Data/train/train.tgt"
test_src = "/content/drive/MyDrive/Data/test/test.src"
test_tgt = "/content/drive/MyDrive/Data/test/test.tgt"
dev_src = "/content/drive/MyDrive/Data/dev/dev.src"
dev_tgt = "/content/drive/MyDrive/Data/dev/dev.tgt"

num_dev_patterns = 50_000

if lc: # lowercase
  df_pp['source_sentence'] = df_pp['source_sentence'].str.lower()
  df_pp['target_sentence'] = df_pp['target_sentence'].str.lower()
  test['source_sentence'] = test['source_sentence'].str.lower()
  test['target_sentence'] = test['target_sentence'].str.lower()

dev = df_pp.tail(num_dev_patterns)
stripped = df_pp.drop(df_pp.tail(num_dev_patterns).index)

with open(train_src, 'w') as src_file, open(train_tgt,'w') as trg_file:
  for index, row in stripped.iterrows():
    src_file.write(row['source_sentence']+'\n')
    trg_file.write(row['target_sentence']+'\n')

with open(dev_src, 'w') as src_file, open(dev_tgt,'w') as trg_file:
  for index, row in dev.iterrows():
    src_file.write(row['source_sentence']+'\n')
    trg_file.write(row['target_sentence']+'\n')

with open(test_src, 'w') as src_file, open(test_tgt,'w') as trg_file:
  for index, row in test.iterrows():
    src_file.write(row['source_sentence']+'\n')
    trg_file.write(row['target_sentence']+'\n')
  
! head /content/drive/MyDrive/Data/train/*
! head /content/drive/MyDrive/Data/dev/*
! head /content/drive/MyDrive/Data/test/*

==> /content/drive/MyDrive/Data/train/train.src <==
L ' UE dispose d ' un tout autre moyen pour appliquer des sanctions , à savoir la Cour de justice européenne .
Pour ce qui est de la politique de sécurité , je me réjouis des progrès en cours de réalisation , mais assurons-nous de nous concentrer sur la capacité et non sur les structures institutionnelles , car nous ne serons jugés que sur notre capacité .
Je pense aussi au rapport entre les piliers .
Comme d ’ autres orateurs l ’ ont souligné , le thon est aussi important pour les régions du Sud de l ’ Europe que ne le sont le cabillaud et le merlu pour les régions du Nord de l ’ Europe . Ceux d ’ entre nous qui viennent du Nord peuvent dès lors comprendre les inquiétudes qui s ’ expriment .
Je voudrais rappeler que ce Parlement a adopté une résolution dans laquelle il déclarait que les clauses commerciales inscrites dans le traité d'association devaient être appliquées , le cas échéant , afin de pousser à la cessation de la politiqu

## Install OpenNMT

In [None]:
# Install OpenNMT-tf (presumably the package that provides the `onmt-main` and
# `onmt-build-vocab` commands used later in this notebook).
# NOTE(review): the version is not pinned; the recorded output of this cell
# shows OpenNMT-tf 2.20.0 being downloaded.
!pip3 install OpenNMT-tf

Collecting OpenNMT-tf
[?25l  Downloading https://files.pythonhosted.org/packages/70/e1/6fc1b73a32014593536e04e3ab787c6acff04bf5e5cc51eeed0e0b0b263a/OpenNMT_tf-2.20.0-py3-none-any.whl (154kB)
[K     |████████████████████████████████| 163kB 9.4MB/s 
[?25hCollecting sacrebleu<1.6,>=1.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/7e/57/0c7ca4e31a126189dab99c19951910bd081dea5bbd25f24b77107750eae7/sacrebleu-1.5.1-py3-none-any.whl (54kB)
[K     |████████████████████████████████| 61kB 9.8MB/s 
[?25hCollecting rouge<2,>=1.0
  Downloading https://files.pythonhosted.org/packages/43/cc/e18e33be20971ff73a056ebdb023476b5a545e744e3fc22acd8c758f1e0d/rouge-1.0.0-py3-none-any.whl
Collecting pyyaml<5.5,>=5.3
[?25l  Downloading https://files.pythonhosted.org/packages/7a/a5/393c087efdc78091afa2af9f1378762f9821c9c1d7a22c5753fb5ac5f97a/PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636kB)
[K     |████████████████████████████████| 645kB 36.4MB/s 
[?25hCollecting tensorflow-addons<0.1

In [None]:
# Change CUDA version: install torch 1.8.0 and torchvision 0.9.0 built for
# CUDA 11.1 (the "+cu111" builds), together with torchaudio 0.8.0.
# NOTE(review): no cell visible in this notebook imports torch — confirm that
# this install is still needed.
!pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.8.0+cu111
[?25l  Downloading https://download.pytorch.org/whl/cu111/torch-1.8.0%2Bcu111-cp37-cp37m-linux_x86_64.whl (1982.2MB)
[K     |█████████████▌                  | 834.1MB 1.4MB/s eta 0:13:46tcmalloc: large alloc 1147494400 bytes == 0x56536fac8000 @  0x7f307a8eb615 0x565336a5acdc 0x565336b3a52a 0x565336a5dafd 0x565336b4efed 0x565336ad1988 0x565336acc4ae 0x565336a5f3ea 0x565336ad17f0 0x565336acc4ae 0x565336a5f3ea 0x565336ace32a 0x565336b4fe36 0x565336acd853 0x565336b4fe36 0x565336acd853 0x565336b4fe36 0x565336acd853 0x565336b4fe36 0x565336bd23e1 0x565336b326a9 0x565336a9dcc4 0x565336a5e559 0x565336ad24f8 0x565336a5f30a 0x565336acd3b5 0x565336acc7ad 0x565336a5f3ea 0x565336acd3b5 0x565336a5f30a 0x565336acd3b5
[K     |█████████████████               | 1055.7MB 1.5MB/s eta 0:10:21tcmalloc: large alloc 1434370048 bytes == 0x5653b411e000 @  0x7f307a8eb615 0x565336a5acdc 0x565336b3a52a 0x565336a5da

## BPE

### subwordnmt

In [None]:
# Usually, NMT would tokenize by words. However, using BPE boosts the performance
# Byte pair encoding: decrease size of memory(large vocabulary to represent word pretty well)
# subword NMT
!pip install subword-nmt
from os import path
os.environ['src'] = source_langage
os.environ['tgt'] = target_langage
! mkdir "$gdrive_path/subwordnmt"
vocab_src = "/content/drive/MyDrive/Data/subwordnmt/vocab.src"
vocab_tgt = "/content/drive/MyDrive/Data/subwordnmt/vocab.tgt"

train_bpe_src = '/content/drive/MyDrive/Data/subwordnmt/train.src'
train_bpe_tgt = '/content/drive/MyDrive/Data/subwordnmt/train.tgt'
test_bpe_src = '/content/drive/MyDrive/Data/subwordnmt/test.src'
test_bpe_tgt = '/content/drive/MyDrive/Data/subwordnmt/test.tgt'
dev_bpe_src = '/content/drive/MyDrive/Data/subwordnmt/dev.src'
dev_bpe_tgt = '/content/drive/MyDrive/Data/subwordnmt/dev.tgt'
# Learn BPE on the training data
! subword-nmt learn-joint-bpe-and-vocab --input $train_src $train_tgt -s 32000 -o bpe.codes --write-vocabulary $vocab_src $vocab_tgt

# Apply bpe on train, dev and test
! subword-nmt apply-bpe -c bpe.codes --vocabulary $vocab_src < $train_src > $train_bpe_src
! subword-nmt apply-bpe -c bpe.codes --vocabulary $vocab_tgt < $train_tgt > $train_bpe_tgt

! subword-nmt apply-bpe -c bpe.codes --vocabulary $vocab_src < $dev_src > $dev_bpe_src
! subword-nmt apply-bpe -c bpe.codes --vocabulary $vocab_tgt < $dev_tgt > $dev_bpe_tgt

! subword-nmt apply-bpe -c bpe.codes --vocabulary $vocab_src < $test_src > $test_bpe_src
! subword-nmt apply-bpe -c bpe.codes --vocabulary $vocab_tgt < $test_tgt > $test_bpe_tgt

# somme output
! echo "BPE sentences source"
! head -n 5 $train_bpe_src
! echo "BPE sentences target"
! head -n 5 $train_bpe_tgt
! echo "VOCAB SOURCE"
! head $vocab_src
! echo "VOCAB TARGET"
! head $vocab_tgt

BPE sentences source
L ' UE dispose d ' un tout autre moyen pour appliquer des sanctions , à savoir la Cour de justice européenne .
Pour ce qui est de la politique de sécurité , je me réjouis des progrès en cours de réalisation , mais assur@@ ons-nous de nous concentrer sur la capacité et non sur les structures institutionnelles , car nous ne serons jugés que sur notre capacité .
Je pense aussi au rapport entre les piliers .
Comme d ’ autres orateurs l ’ ont souligné , le thon est aussi important pour les régions du Sud de l ’ Europe que ne le sont le cabillaud et le merlu pour les régions du Nord de l ’ Europe . Ceux d ’ entre nous qui viennent du Nord peuvent dès lors comprendre les inquiétudes qui s ’ expriment .
Je voudrais rappeler que ce Parlement a adopté une résolution dans laquelle il déclarait que les clauses commerciales inscrites dans le traité d'association devaient être appliquées , le cas échéant , afin de pousser à la cessation de la politique d'@@ implantation , qui de

### sentencepiece

In [None]:
# Python package for SentencePiece.
! pip install sentencepiece
# Packages needed to compile SentencePiece from source.
! sudo apt-get install cmake build-essential pkg-config libgoogle-perftools-dev

# Clone, build and install SentencePiece system-wide.
# NOTE(review): presumably this build is what provides the spm_train /
# spm_encode / spm_decode commands used later — confirm.
# The steps are chained with ';', so a failing step (e.g. `git clone` or
# `mkdir build` on a re-run) does not stop the following ones.
! git clone https://github.com/google/sentencepiece.git ; cd sentencepiece ; mkdir build ; cd build; cmake .. ; make -j $(nproc) ; sudo make install ; sudo ldconfig -v

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/ac/aa/1437691b0c7c83086ebb79ce2da16e00bef024f24fec2a5161c35476f499/sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 7.3MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96
Reading package lists... Done
Building dependency tree       
Reading state information... Done
build-essential is already the newest version (12.4ubuntu1).
pkg-config is already the newest version (0.29.1-0ubuntu2).
cmake is already the newest version (3.10.2-1ubuntu2.18.04.1).
The following additional packages will be installed:
  libunwind-dev
The following NEW packages will be installed:
  libgoogle-perftools-dev libunwind-dev
0 upgraded, 2 newly installed, 0 to remove and 39 not upgraded.
Need to get 627 kB of archives.
After this operation, 6,761 kB of additional disk space will be used.
Get:1 h

In [None]:
from os import path
os.environ['src'] = source_langage
os.environ['tgt'] = target_langage
! mkdir "$gdrive_path/sentencepiece"
vocab_src = "/content/drive/MyDrive/Data/sentencepiece/vocab.src" # lien
vocab_tgt = "/content/drive/MyDrive/Data/sentencepiece/vocab.tgt" # lien


train_src = "/content/drive/MyDrive/Data/train/train.src"
train_tgt = "/content/drive/MyDrive/Data/train/train.tgt"
test_src = "/content/drive/MyDrive/Data/test/test.src"
test_tgt = "/content/drive/MyDrive/Data/test/test.tgt"
dev_src = "/content/drive/MyDrive/Data/dev/dev.src"
dev_tgt = "/content/drive/MyDrive/Data/dev/dev.tgt"


train_bpe_src = '/content/drive/MyDrive/Data/sentencepiece/train.src'
train_bpe_tgt = '/content/drive/MyDrive/Data/sentencepiece/train.tgt'
test_bpe_src = '/content/drive/MyDrive/Data/sentencepiece/test.src'
test_bpe_tgt = '/content/drive/MyDrive/Data/sentencepiece/test.tgt'
dev_bpe_src = '/content/drive/MyDrive/Data/sentencepiece/dev.src'
dev_bpe_tgt = '/content/drive/MyDrive/Data/sentencepiece/dev.tgt'

# Train sentencepiece model in source file (src.model and src.vocab are generated)
! spm_train --input=$train_src  --model_prefix=src --vocab_size=32000

# Train sentencepiece model in target file
! spm_train --input=$train_tgt --model_prefix=tgt --vocab_size=32000

# Train model in source files
! spm_encode --model=src.model --output_format=sample_piece < $train_src > $train_bpe_src
! spm_encode --model=src.model --output_format=sample_piece < $dev_src > $dev_bpe_src
! spm_encode --model=src.model --output_format=sample_piece < $test_src > $test_bpe_src

# Train model in target files
! spm_encode --model=tgt.model --output_format=sample_piece < $train_tgt > $train_bpe_tgt
! spm_encode --model=tgt.model --output_format=sample_piece < $dev_tgt > $dev_bpe_tgt
! spm_encode --model=tgt.model --output_format=sample_piece < $test_tgt > $test_bpe_tgt

! cp src.vocab /content/drive/MyDrive/Data/sentencepiece/vocab.src
! cp tgt.vocab /content/drive/MyDrive/Data/sentencepiece/vocab.tgt
!cp src.model /content/drive/MyDrive/Data/sentencepiece/
!cp tgt.model /content/drive/MyDrive/Data/sentencepiece/

# Affichage
! echo "BPE sentence source"
! head -n 5 $train_bpe_src

! echo "BPE sentence target"
! head -n 5 $train_bpe_tgt

mkdir: cannot create directory ‘/content/drive/MyDrive/Data/sentencepiece’: File exists
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: /content/drive/MyDrive/Data/train/train.src
  input_format: 
  model_prefix: src
  model_type: UNIGRAM
  vocab_size: 32000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  u

## Create vocabulary for OpenNMT

[Generate vocabulary](https://opennmt.net/OpenNMT-tf/vocabulary.html)

In [None]:
! mkdir drive/MyDrive/opennmt
! onmt-build-vocab --save_vocab drive/MyDrive/opennmt/vocab.src --size 32000 drive/MyDrive/Data/sentencepiece/train.src # vocab for src file
! onmt-build-vocab --save_vocab drive/MyDrive/opennmt/vocab.tgt --size 32000 drive/MyDrive/Data/sentencepiece/train.tgt # vocab for target file

2021-06-17 13:07:39.559794: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-06-17 13:07:45.780422: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-06-17 13:07:45.874717: E tensorflow/stream_executor/cuda/cuda_driver.cc:328] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2021-06-17 13:07:45.874785: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (10d857964a67): /proc/driver/nvidia/version does not exist
2021-06-17 13:08:03.617319: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-06-17 13:08:05.147482: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-06-17 13:08:05.157485: E tensorflow/stream_executor/cuda

## Train

In [None]:
! mkdir run
config = '''
model_dir: /content/run

data:
  train_features_file: drive/MyDrive/Data/sentencepiece/train.src
  train_labels_file: drive/MyDrive/Data/sentencepiece/train.tgt
  eval_features_file: drive/MyDrive/Data/sentencepiece/dev.src
  eval_labels_file: drive/MyDrive/Data/sentencepiece/dev.tgt
  source_vocabulary: drive/MyDrive/opennmt/vocab.src
  target_vocabulary: drive/MyDrive/opennmt/vocab.tgt

train:
  batch_size: 1024
  save_checkpoints_steps: 1000
  maximum_features_length: 50
  maximum_labels_length: 50

eval:
    eval_delay: 180  # Every 30 mn
    external_evaluators: BLEU
    export_on_best: BLEU
infer:
    batch_size: 32
'''

f = open('data.yml','w')
f.write(config)
f.close()

! onmt-main --model_type Transformer --config data.yml --auto_config  train --with_eval --num_gpus 1 

mkdir: cannot create directory ‘run’: File exists
2021-06-21 18:30:42.832974: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-06-21 18:30:44.123195: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-06-21 18:30:44.153821: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-06-21 18:30:44.154409: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:00:04.0 name: Tesla T4 computeCapability: 7.5
coreClock: 1.59GHz coreCount: 40 deviceMemorySize: 14.75GiB deviceMemoryBandwidth: 298.08GiB/s
2021-06-21 18:30:44.154452: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-06-21 18:30:44.157

## Translate

In [None]:
# Earlier variants, kept disabled by the author (other input files / checkpoint):
#! onmt-main infer --auto_config  --config data.yml --features_file drive/MyDrive/joeynmt/fr-en-data/test.fr.bpe --predictions_file drive/MyDrive/opennmt/prediction.bpe --checkpoint_path drive/MyDrive/opennmt/run/model.1000
#onmt-main --config data.yml --auto_config infer --features_file src-test.txt
# Take the first 10 encoded test sentences as a small example...
!sed -n '1,10p' drive/MyDrive/Data/sentencepiece/test.src > example.bpe
# ...and translate them; the predictions are written to example.tr.bpe.
# No --checkpoint_path is given here, unlike in the disabled variants.
! onmt-main  --config data.yml  --auto_config infer --features_file example.bpe --predictions_file example.tr.bpe 
# Disabled variant with an explicit checkpoint:
# ! onmt-main  --config data.yml  --auto_config --checkpoint_path run/ckpt-10000.index infer --features_file example.bpe --predictions_file example.tr.bpe

2021-06-18 09:22:05.621644: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-06-18 09:22:06.870991: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-06-18 09:22:06.889322: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-06-18 09:22:06.889765: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:00:04.0 name: Tesla P4 computeCapability: 6.1
coreClock: 1.1135GHz coreCount: 20 deviceMemorySize: 7.43GiB deviceMemoryBandwidth: 178.99GiB/s
2021-06-18 09:22:06.889810: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-06-18 09:22:06.893020: I tensorflow/stream_executor/platform/defaul

In [None]:
# Detokenization
# NOTE(review): every command in this cell is commented out, so running it
# produces no detokenized file.
# Variant for subword-nmt output: strip the "@@ " continuation markers.
#! cat drive/MyDrive/opennmt/prediction.bpe | sed "s/@@ //g" > drive/MyDrive/opennmt/test_prediction.en

# Variant for SentencePiece output: decode example.tr.bpe with the target model.
# NOTE(review): `sample_piece` is used here as an *input* format — confirm
# spm_decode accepts it (it is the output format used with spm_encode).
#! spm_decode --model=drive/MyDrive/Data/sentencepiece/tgt.model --input_format=sample_piece < example.tr.bpe > example.tr.en
#! cat prediction.bpe | sed "s/@@ //g" > test_prediction.en

/bin/bash: spm_decode: command not found


In [None]:
## BLU score
! spm_decode
! perl  OpenNMT-tf/third_party/multi-bleu.perl drive/MyDrive/sentencepiece/test.tgt < test_prediction.en