https://docs.google.com/document/d/1QmoZJCeAjmaZQ2QQA3X2zqo9ET4PqSUT2frL4DX_Tdo/edit

In [1]:
# Install the third-party packages this notebook imports below.
# NOTE(review): versions are not pinned; the recorded output shows fasttext 0.9.2.
!pip install transformers
!pip install fasttext

  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp36-cp36m-linux_x86_64.whl size=3020481 sha256=31f886821f2c8f4d8659274cbae434eea28d153ea55e3c8cb73312ac5523b99f
  Stored in directory: /root/.cache/pip/wheels/98/ba/7f/b154944a1cf5a8cee91c154b75231136cc3a3321ab0e30f592
Successfully built fasttext
Installing collected packages: fasttext
Successfully installed fasttext-0.9.2


In [0]:
import numpy as np
import pandas as pd
import os, sys
import re, string, nltk, spacy, random, os, io
from collections import Counter
from pickle import dump, load
from unicodedata import normalize

In [3]:
import tensorflow as tf
import torch, keras
import tensorflow_datasets as tfds
import fasttext, fasttext.util
from transformers import BertTokenizer, BertModel, BertConfig # bert-base-uncased
from transformers import CamembertModel, CamembertTokenizer, CamembertConfig # camembert-base

Using TensorFlow backend.


In [0]:
# sys.path.insert(0, '/content/drive/My Drive/')
# from utils import clean_corpus, to_vocab, update_corpus, drop_nulls

# Data Preprocessing

### Functions

In [0]:
def clean_corpus(corpus):
    """Normalise every line of ``corpus`` to lowercase ASCII alphabetic words.

    Each line is NFD-normalised and encoded to ASCII with unencodable
    characters dropped (which removes accents), split on whitespace,
    lowercased, stripped of ``string.punctuation`` and of every character
    outside ``string.printable``. Tokens that are not purely alphabetic
    after these steps are discarded.

    Returns:
        A list with one space-joined string per input line; a line with no
        surviving token becomes ``''``.
    """
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    strip_punct = str.maketrans('', '', string.punctuation)

    def _clean_line(text):
        ascii_text = normalize('NFD', text).encode('ascii', 'ignore').decode('UTF-8')
        kept = []
        for raw in ascii_text.split():
            token = non_printable.sub('', raw.lower().translate(strip_punct))
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return [_clean_line(text) for text in corpus]

In [0]:
def to_vocab(corpus, min_occurance = 0):
  """Return the words of ``corpus`` that occur more than ``min_occurance`` times.

  Counting is delegated to a Keras ``Tokenizer`` created with ``filters=''``
  (no character filtering). A word is kept only when its count is strictly
  greater than ``min_occurance``; the result follows the iteration order of
  the tokenizer's ``word_counts``.
  """
  counter = keras.preprocessing.text.Tokenizer(filters='')
  counter.fit_on_texts(corpus)
  vocab = []
  for word, count in counter.word_counts.items():
    if count > min_occurance:
      vocab.append(word)
  return vocab

In [0]:
def update_corpus(corpus, vocab, unk_token='unk'):
    """Replace every out-of-vocabulary token in ``corpus`` with ``unk_token``.

    Args:
        corpus: iterable of strings; each one is split on whitespace.
        vocab: collection of known tokens (list, set, dict keys, ...).
        unk_token: replacement for unknown tokens; defaults to ``'unk'``.

    Returns:
        A list with one space-joined string per input line.
    """
    # Membership tests against a list scan the whole list for every token;
    # build a hash-based lookup once unless the caller already supplied one.
    known = vocab if isinstance(vocab, (set, frozenset, dict)) else set(vocab)
    return [
        ' '.join(token if token in known else unk_token for token in line.split())
        for line in corpus
    ]

In [0]:
def drop_nulls(corpus1, corpus2):
    """Drop the aligned pairs whose ``corpus1`` entry is empty.

    Only ``corpus1`` is inspected; call the function a second time with the
    arguments swapped to also drop pairs whose ``corpus2`` entry is empty.

    Args:
        corpus1: sequence of sized items (e.g. strings) used as the filter.
        corpus2: sequence indexed in parallel with ``corpus1``.

    Returns:
        ``(corpus1, corpus2)`` as two new lists holding the kept positions.
    """
    keep = [i for i, line in enumerate(corpus1) if len(line) > 0]
    return [corpus1[i] for i in keep], [corpus2[i] for i in keep]

In [0]:
def preprocess(corpus, min_occurance=5):
  """Clean ``corpus``, build its vocabulary and mask out-of-vocabulary words.

  Runs ``clean_corpus``, then ``to_vocab`` with ``min_occurance``, then
  ``update_corpus`` on the cleaned text.

  Returns:
    ``(corpus, vocab)``: the processed lines and the vocabulary list.
  """
  cleaned = clean_corpus(corpus)
  vocab = to_vocab(cleaned, min_occurance)
  return update_corpus(cleaned, vocab), vocab

## Small Vocab

In [0]:
# Folder holding the small-vocabulary parallel corpus; list it to see what is there.
# NOTE: os.listdir() returns entries in arbitrary order, so positions in `files` are not stable.
path = '/content/drive/My Drive/Datasets/NLP/MT/French-English/Small Vocab/'
files = os.listdir(path)
files

['small_vocab_En.txt',
 'small_vocab_Fr.txt',
 '.ipynb_checkpoints',
 'en.pkl',
 'fr.pkl',
 'en_vocab.pkl',
 'fr_vocab.pkl',
 'en_ft_vocab.pkl',
 'fr_ft_vocab.pkl',
 'en_bert_vocab.pkl',
 'fr_bert_vocab.pkl']

In [0]:
# Open the corpora by name rather than by position in `files`: os.listdir()
# gives no ordering guarantee, so files[0]/files[1] need not be the En/Fr texts.
# The context managers close the handles. The encoding is stated explicitly
# (assumed UTF-8 -- confirm) instead of relying on the platform default.
with open(path + 'small_vocab_En.txt', 'r', encoding='utf-8') as f:
  en = f.read().split('\n')
with open(path + 'small_vocab_Fr.txt', 'r', encoding='utf-8') as f:
  fr = f.read().split('\n')

print(len(en), len(fr))
print(en[:1])
print(fr[:1])

137860 137860
['new jersey is sometimes quiet during autumn , and it is snowy in april .']
["new jersey est parfois calme pendant l' automne , et il est neigeux en avril ."]


In [0]:
# Sentence lengths in whitespace-separated tokens, per language.
en_lengths = [len(tokens) for tokens in map(str.split, en)]
fr_lengths = [len(tokens) for tokens in map(str.split, fr)]

# Longest and shortest sentence of each corpus.
print('Eng:', max(en_lengths), min(en_lengths))
print('Fr:', max(fr_lengths), min(fr_lengths))

Eng: 17 3
Fr: 23 3


In [0]:
# Clean each corpus, build its vocabulary (default min_occurance=5) and replace
# out-of-vocabulary words with 'unk'.
# NOTE: `en`/`fr` are overwritten, so the raw sentences are gone after this cell.
en, en_vocab = preprocess(en)
fr, fr_vocab = preprocess(fr)

In [0]:
# Spot-check a few aligned sentence pairs.
# randrange(len(en)) never returns len(en); randint(0, len(en)) includes the
# upper bound and could raise IndexError.
for _ in range(3):
  i = random.randrange(len(en))
  print(i,en[i])
  print(i,fr[i])

In [0]:
# Vocabulary sizes (English, French).
len(en_vocab), len(fr_vocab)

In [0]:
# Persist the vocabularies; `with` makes sure each file is flushed and closed.
with open('/content/drive/My Drive/Datasets/NLP/MT/French-English/Small Vocab/en_vocab.pkl', 'wb') as f:
  dump(en_vocab, f)
with open('/content/drive/My Drive/Datasets/NLP/MT/French-English/Small Vocab/fr_vocab.pkl', 'wb') as f:
  dump(fr_vocab, f)

In [0]:
# Persist the processed corpora; `with` makes sure each file is flushed and closed.
with open('/content/drive/My Drive/Datasets/NLP/MT/French-English/Small Vocab/en.pkl', 'wb') as f:
  dump(en, f)
with open('/content/drive/My Drive/Datasets/NLP/MT/French-English/Small Vocab/fr.pkl', 'wb') as f:
  dump(fr, f)

## Europarl

In [0]:
# Folder holding the Europarl parallel corpus; list it to see what is there.
# NOTE: os.listdir() returns entries in arbitrary order, so positions in `files` are not stable.
path = '/content/drive/My Drive/Datasets/NLP/MT/French-English/Europarl/'
files = os.listdir(path)
files

['english-fr.txt',
 'french-en.txt',
 '.ipynb_checkpoints',
 'en_vocab.pkl',
 'fr_vocab.pkl',
 'en.pkl',
 'fr.pkl']

In [0]:
# Open the corpora by name rather than by position in `files`: os.listdir()
# gives no ordering guarantee, so files[0]/files[1] need not be the En/Fr texts.
# The context managers close the handles. The encoding is stated explicitly
# (assumed UTF-8 -- confirm) instead of relying on the platform default.
with open(path + 'english-fr.txt', 'r', encoding='utf-8') as f:
  en = f.read().strip().split('\n')
with open(path + 'french-en.txt', 'r', encoding='utf-8') as f:
  fr = f.read().strip().split('\n')

print(len(en), len(fr))
print(en[:1])
print(fr[:1])

2007723 2007723
['Resumption of the session']
['Reprise de la session']


In [0]:
# Sentence lengths in whitespace-separated tokens, per language (raw text).
en_lengths = [len(tokens) for tokens in map(str.split, en)]
fr_lengths = [len(tokens) for tokens in map(str.split, fr)]
# Longest and shortest sentence of each corpus.
print('Eng:', max(en_lengths), min(en_lengths))
print('Fr:', max(fr_lengths), min(fr_lengths))

Eng: 668 0
Fr: 693 0


In [0]:
# Keep a fixed-seed random subset of 32000 aligned sentence pairs.
df = pd.DataFrame(dict(en=en, fr=fr)).sample(n=32000, random_state=42)
en = df['en'].values
fr = df['fr'].values

In [0]:
# Spot-check a couple of aligned sentence pairs.
# randrange(len(en)) never returns len(en); randint(0, len(en)) includes the
# upper bound and could raise IndexError.
for _ in range(2):
  i = random.randrange(len(en))
  print(i,en[i])
  print(i,fr[i])

8999 They have suffered enough.
8999 Ce peuple a assez souffert.
26730 Mr President, Mr Sacrédeus has, I think, addressed something of essential importance.
26730 Monsieur le Président, je suis d'avis que notre collègue Sacrédeus vient d'aborder un point essentiel.


In [0]:
# Normalise both sides with clean_corpus; some lines may come back empty,
# which is why empty pairs are dropped in the next step.
en = clean_corpus(en)
fr = clean_corpus(fr)

In [0]:
# drop_nulls filters on its first argument only, so it is applied once per
# direction: first remove pairs with an empty English line, then pairs with an
# empty French line. Sizes are printed before and after each pass.
print(len(en), len(fr))
en, fr = drop_nulls(en, fr)
print(len(en), len(fr))
fr, en = drop_nulls(fr, en)
print(len(en), len(fr))

32000 32000
31874 31874
31832 31832


In [0]:
# Reduce the remaining pairs to a fixed-seed random subset of 30000.
df = pd.DataFrame(dict(en=en, fr=fr)).sample(n=30000, random_state=42)
en = df['en'].values
fr = df['fr'].values
print(len(en), len(fr))

30000 30000


In [0]:
# Build each vocabulary from words occurring more than 5 times, then replace
# every other word in that corpus with 'unk'.
en_vocab = to_vocab(en, min_occurance=5)
en = update_corpus(en, en_vocab)
fr_vocab = to_vocab(fr, min_occurance=5)
fr = update_corpus(fr, fr_vocab)

In [0]:
# Compare each saved vocabulary size with the number of distinct words now in
# the corpus; the corpus presumably has one more entry because of the 'unk'
# token inserted by update_corpus.
len(en_vocab), len(to_vocab(en)), len(fr_vocab), len(to_vocab(fr))

(6199, 6200, 7813, 7814)

In [0]:
# Sentence lengths in whitespace-separated tokens, per language (processed text).
en_lengths = [len(tokens) for tokens in map(str.split, en)]
fr_lengths = [len(tokens) for tokens in map(str.split, fr)]

# Longest and shortest sentence of each corpus.
print('Eng:', max(en_lengths), min(en_lengths))
print('Fr:', max(fr_lengths), min(fr_lengths))

Eng: 202 1
Fr: 208 1


In [0]:
# Spot-check a couple of processed sentence pairs.
# randrange(len(en)) never returns len(en); randint(0, len(en)) includes the
# upper bound and could raise IndexError.
for _ in range(2):
  i = random.randrange(len(en))
  print(i,en[i])
  print(i,fr[i])

25748 what is the marking exactly
25748 questce que le unk exactement
713 honourable members in the last hour reports have reached me of what now appears to be a coordinated series of attacks on unk transport system
713 honorables deputes au cours de lheure qui vient de unk jai recu des informations concernant ce qui apparait maintenant comme une serie unk coordonnes dans le systeme de transport unk


In [0]:
# Persist the vocabularies; `with` makes sure each file is flushed and closed.
with open('/content/drive/My Drive/Datasets/NLP/MT/French-English/Europarl/en_vocab.pkl', 'wb') as f:
  dump(en_vocab, f)
with open('/content/drive/My Drive/Datasets/NLP/MT/French-English/Europarl/fr_vocab.pkl', 'wb') as f:
  dump(fr_vocab, f)

In [0]:
# Persist the processed corpora; `with` makes sure each file is flushed and closed.
with open('/content/drive/My Drive/Datasets/NLP/MT/French-English/Europarl/en.pkl', 'wb') as f:
  dump(en, f)
with open('/content/drive/My Drive/Datasets/NLP/MT/French-English/Europarl/fr.pkl', 'wb') as f:
  dump(fr, f)

# Embeddings

### Functions

In [0]:
def unk_vector(vocab):
  """Return the element-wise mean of all vectors in ``vocab`` as a list.

  Args:
    vocab: mapping from word to an equal-length numeric vector.

  Returns:
    A list with one mean value per vector component.

  Raises:
    ValueError: if ``vocab`` is empty, since no mean can be computed.
  """
  if len(vocab) == 0:
    raise ValueError('unk_vector() needs at least one embedding vector')
  vectors = np.array(list(vocab.values()))
  # One vectorised reduction over the word axis instead of a Python loop per column.
  return list(np.mean(vectors, axis=0))

In [0]:
def bert_embeddings(vocab, model, tokenizer, special_chars):
  """Encode the whole vocabulary as one sequence and return its token embeddings.

  Args:
    vocab: iterable of words; they are joined with single spaces.
    model: callable invoked as ``model(input_ids, mask)``; the first element
      of its output is used.
    tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
    special_chars: pair of strings placed before and after the joined text.

  Returns:
    ``(tokens, embeddings)``: the token list and the model's first output
    with dimension 0 squeezed.

  NOTE(review): all words go through the model as a single sequence, so a
  model with a maximum sequence length will presumably fail on a large
  vocabulary -- confirm before using this on the Europarl vocabularies.
  """
  text = ' '.join(list(vocab))
  marked_text = special_chars[0] + text + special_chars[1]
  tokens = tokenizer.tokenize(marked_text)
  token_ids = tokenizer.convert_tokens_to_ids(tokens)
  input_ids = torch.tensor([token_ids])
  # All-ones tensor, one entry per token. It is passed as the second positional
  # argument, which the Hugging Face `transformers` models imported by this
  # notebook receive as the attention mask (not as segment ids).
  attention_mask = torch.tensor([[1] * len(tokens)])

  with torch.no_grad():
    outputs = model(input_ids, attention_mask)
  # Index instead of unpacking into exactly two names, so this also works when
  # the model returns more than two items or an indexable output object.
  embeddings = torch.squeeze(outputs[0], dim=0)

  return tokens, embeddings

In [0]:
def combine_embs(vocab, words, embeddings, max_length):
  """Average sub-token embeddings into one vector per word, then zero-pad.

  Args:
    vocab: sequence of words; only its length and its pairing with ``words``
      are used.
    words: for each word, the list of positions of its pieces in ``embeddings``.
    embeddings: indexable whose items expose ``.numpy()`` (e.g. a 2-D tensor).
    max_length: target number of vectors; ``max_length - len(vocab)`` zero
      vectors are appended.

  Returns:
    A list of 1-D numpy arrays.
  """
  bert_vocab = [
    np.mean(np.array([embeddings[i].numpy() for i in idx]), axis=0)
    for _, idx in zip(vocab, words)
  ]
  # Pad with the width of the computed vectors instead of a hard-coded 768;
  # 768 is kept only as the fallback when there is nothing to infer it from.
  dim = bert_vocab[0].shape[-1] if bert_vocab else 768
  bert_vocab += [np.zeros((dim,)) for _ in range(max_length - len(vocab))]
  return bert_vocab

In [0]:
def combine_tokens_en(tokens):
  """Group token positions into words using the ``'##'`` continuation prefix.

  The first and last entries of ``tokens`` are skipped. A token starting with
  ``'##'`` is attached to the current word; any other token starts a new word.
  A ``'##'`` token with no word before it starts a word of its own instead of
  failing.

  Returns:
    A list of index lists (positions in ``tokens``), one per word; empty when
    there is nothing between the first and last token.
  """
  words = []
  for i, t in enumerate(tokens[:-1]):
    if i == 0:
      continue
    if t[:2] == '##' and words:
      words[-1].append(i)
    else:
      words.append([i])
  return words

In [0]:
def combine_tokens_fr(tokens):
  """Group token positions into words using the ``'▁'`` word-start prefix.

  The first and last entries of ``tokens`` are skipped. A token starting with
  ``'▁'`` opens a new word; any other token is attached to the current word.
  A continuation token with no word before it opens a word of its own instead
  of failing, and empty-string tokens are treated as continuations.

  Returns:
    A list of index lists (positions in ``tokens``), one per word; empty when
    there is nothing between the first and last token.
  """
  words = []
  for i, t in enumerate(tokens[:-1]):
    if i == 0:
      continue
    if t.startswith('▁') or not words:
      words.append([i])
    else:
      words[-1].append(i)
  return words

## Small Vocab

In [4]:
# Reload the processed corpora; `with` closes each file once it has been read.
with open('/content/drive/My Drive/Datasets/NLP/MT/French-English/Small Vocab/en.pkl', 'rb') as f:
  en = load(f)
with open('/content/drive/My Drive/Datasets/NLP/MT/French-English/Small Vocab/fr.pkl', 'rb') as f:
  fr = load(f)
len(en), len(fr)

(137860, 137860)

In [5]:
# Reload the vocabularies; `with` closes each file once it has been read.
with open('/content/drive/My Drive/Datasets/NLP/MT/French-English/Small Vocab/en_vocab.pkl', 'rb') as f:
  en_vocab = load(f)
with open('/content/drive/My Drive/Datasets/NLP/MT/French-English/Small Vocab/fr_vocab.pkl', 'rb') as f:
  fr_vocab = load(f)
# NOTE(review): 'geles' is appended by hand; presumably a word missing from the
# saved French vocabulary -- confirm why it is needed.
fr_vocab += ['geles']
len(en_vocab), len(fr_vocab)

(199, 322)

### FastText

In [0]:
# English fastText vector for every vocabulary word, plus the special tokens.
ft = fasttext.load_model('/content/drive/My Drive/Datasets/NLP/Models/FastText/cc.en.300.bin')
en_ft_vocab = {word: np.array(ft.get_word_vector(word)) for word in en_vocab}
# 'unk' gets the mean of all word vectors; it is computed before the markers are added.
en_ft_vocab['unk'] = unk_vector(en_ft_vocab)
# The sequence markers get random 300-dimensional vectors.
# NOTE(review): no seed is set in the visible cells, so these differ between runs.
en_ft_vocab.update({
  '<start>': np.random.rand(300).tolist(),
  '<end>': np.random.rand(300).tolist(),
})

In [9]:
print(len(en_ft_vocab))
# `with` makes sure the pickle file is flushed and closed.
with open('/content/drive/My Drive/Datasets/NLP/MT/French-English/Small Vocab/en_ft_vocab.pkl', 'wb') as f:
  dump(en_ft_vocab, f)

202


In [0]:
# Drop the references to the English table and model before the French model is loaded.
del en_ft_vocab, ft

In [11]:
# French fastText vector for every vocabulary word, plus the special tokens.
ft = fasttext.load_model('/content/drive/My Drive/Datasets/NLP/Models/FastText/cc.fr.300.bin')
fr_ft_vocab = {word: ft.get_word_vector(word) for word in fr_vocab}
# 'unk' gets the mean of all word vectors; it is computed before the markers are added.
fr_ft_vocab['unk'] = unk_vector(fr_ft_vocab)
# The sequence markers get random 300-dimensional vectors.
# NOTE(review): no seed is set in the visible cells, so these differ between runs.
fr_ft_vocab.update({
  '<start>': np.random.rand(300).tolist(),
  '<end>': np.random.rand(300).tolist(),
})



In [0]:
print(len(fr_ft_vocab))
# `with` makes sure the pickle file is flushed and closed.
with open('/content/drive/My Drive/Datasets/NLP/MT/French-English/Small Vocab/fr_ft_vocab.pkl', 'wb') as f:
  dump(fr_ft_vocab, f)

In [0]:
# Drop the references to the French table and model now that the table is saved.
del fr_ft_vocab, ft

## Europarl

In [0]:
# Reload the processed corpora; `with` closes each file once it has been read.
with open('/content/drive/My Drive/Datasets/NLP/MT/French-English/Europarl/en.pkl', 'rb') as f:
  en = load(f)
with open('/content/drive/My Drive/Datasets/NLP/MT/French-English/Europarl/fr.pkl', 'rb') as f:
  fr = load(f)
len(en), len(fr)

(30000, 30000)

In [0]:
# Reload the vocabularies; `with` closes each file once it has been read.
with open('/content/drive/My Drive/Datasets/NLP/MT/French-English/Europarl/en_vocab.pkl', 'rb') as f:
  en_vocab = load(f)
with open('/content/drive/My Drive/Datasets/NLP/MT/French-English/Europarl/fr_vocab.pkl', 'rb') as f:
  fr_vocab = load(f)
len(en_vocab), len(fr_vocab)

(6199, 7813)

### FastText

In [0]:
# English fastText vector for every vocabulary word, plus the special tokens.
ft = fasttext.load_model('/content/drive/My Drive/Datasets/NLP/Models/FastText/cc.en.300.bin')
en_ft_vocab = {word: ft.get_word_vector(word) for word in en_vocab}
# 'unk' gets the mean of all word vectors; it is computed before the markers are added.
en_ft_vocab['unk'] = unk_vector(en_ft_vocab)
# The sequence markers get random 300-dimensional vectors.
# NOTE(review): no seed is set in the visible cells, so these differ between runs.
en_ft_vocab.update({
  '<start>': np.random.rand(300).tolist(),
  '<end>': np.random.rand(300).tolist(),
})



In [0]:
print(len(en_vocab),len(en_ft_vocab))
# `with` makes sure the pickle file is flushed and closed.
with open('/content/drive/My Drive/Datasets/NLP/MT/French-English/Europarl/en_ft_vocab.pkl', 'wb') as f:
  dump(en_ft_vocab, f)

6199 6202


In [0]:
# French fastText vector for every vocabulary word, plus the special tokens.
ft = fasttext.load_model('/content/drive/My Drive/Datasets/NLP/Models/FastText/cc.fr.300.bin')
fr_ft_vocab = {word: ft.get_word_vector(word) for word in fr_vocab}
# 'unk' gets the mean of all word vectors; it is computed before the markers are added.
fr_ft_vocab['unk'] = unk_vector(fr_ft_vocab)
# The sequence markers get random 300-dimensional vectors.
# NOTE(review): no seed is set in the visible cells, so these differ between runs.
fr_ft_vocab.update({
  '<start>': np.random.rand(300).tolist(),
  '<end>': np.random.rand(300).tolist(),
})



In [0]:
print(len(fr_vocab), len(fr_ft_vocab))
# `with` makes sure the pickle file is flushed and closed.
with open('/content/drive/My Drive/Datasets/NLP/MT/French-English/Europarl/fr_ft_vocab.pkl', 'wb') as f:
  dump(fr_ft_vocab, f)

7813 7816
