In [1]:
import os
import pandas as pd
import fasttext
import numpy as np
import time

### Training
- Orthogonal transformation acts on source matrix (or the transpose acts on the target matrix).
- Currently `en` is treated as source and `es` is treated as target. 

In [2]:
def apply_transform(vec, transform):
    """
    Apply the given transformation to the vector space

    Right-multiplies given transform with embeddings E:
        E = E * transform

    Transform can either be a string with a filename to a
    text file containing a ndarray (compat. with np.loadtxt)
    or a numpy ndarray.
    """
    transmat = np.loadtxt(transform)# if isinstance(transform, str) else transform
    return np.matmul(vec, transmat)

In [3]:
def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    """
    Source and target dictionaries are the FastVector objects of
    source/target languages. bilingual_dictionary is a list of 
    translation pair tuples [(source_word, target_word), ...].
    """
    source_matrix = []
    target_matrix = []
    
    len_bd = len(bilingual_dictionary)

    for i, (source, target) in enumerate(bilingual_dictionary):
        print(f'\r{i + 1}/{len_bd} | {100 * (i + 1) / len_bd:.3f} %', end = '', flush = True)
        sourceVector = source_dictionary.get_sentence_vector(source.lower().strip().replace('_',' '))
        targetVector = target_dictionary.get_sentence_vector(target.lower().strip().replace('_',' '))
        source_matrix.append(sourceVector)
        target_matrix.append(targetVector)
        
    # return training matrices
    return np.array(source_matrix), np.array(target_matrix)

In [4]:
def normalized(a, axis=-1, order=2):
    """Utility function to normalize the rows of a numpy array."""
    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
    l2[l2 == 0] = 1
    return a / np.expand_dims(l2, axis)

In [5]:
def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Source and target matrices are numpy arrays, shape
    (dictionary_length, embedding_dimension). These contain paired
    word vectors from the bilingual dictionary.
    """
    # optionally normalize the training vectors
    if normalize_vectors:
        source_matrix = normalized(source_matrix)
        target_matrix = normalized(target_matrix)

    # perform the SVD
    product = np.matmul(target_matrix.transpose(), source_matrix)
    U, s, V = np.linalg.svd(product)

    # return orthogonal transformation which aligns source language to the target
    return np.matmul(U, V)

In [6]:
def apply_transform(vec, transform):
    """
    Apply the given transformation to the vector space

    Right-multiplies given transform with embeddings E:
        E = E * transform

    Transform can either be a string with a filename to a
    text file containing a ndarray (compat. with np.loadtxt)
    or a numpy ndarray.
    """
    transmat = np.loadtxt(transform)# if isinstance(transform, str) else transform
    return np.matmul(vec, transmat)

## Modify depending on your directory structure

In [7]:
model_path = os.path.join('embeddings', 'fasttext')
sentences_path = os.path.join('sentences', 'wikimatrix')

In [8]:
model = {}

model['en'] = fasttext.load_model(os.path.join(model_path, 'wiki.en.bin'))
model['es'] = fasttext.load_model(os.path.join(model_path, 'wiki.es.bin'))





In [9]:
with open(os.path.join(sentences_path, 'WikiMatrix.en-es.txt.en'), 'r') as f:
    en = f.read()
    
with open(os.path.join(sentences_path, 'WikiMatrix.en-es.txt.es'), 'r') as f:
    es = f.read()
    
df = pd.DataFrame(zip(en.split('\n')[:-1], es.split('\n')[:-1]), columns = ['en', 'es'])
df.head()

Unnamed: 0,en,es
0,He aquí el relato de la derrota del Markawiz ¡...,He aquí el relato de la derrota del Markawiz ¡...
1,He never fights alongside or against Gordon Fr...,Nunca combate al lado o en contra de Gordon Fr...
2,Nada más que la verdad: el juicio a las juntas.,Nada más que la verdad: el juicio a las juntas.
3,We live among them and rarely notice this big ...,Vivimos entre ellos y notamos raramente esa gr...
4,"He said: ""I am four years old.",Dijo: «Serán mis últimos cuatro años».


In [10]:
bilingual_dictionary = df.values

In [11]:
source_matrix, target_matrix = make_training_matrices(model['en'], model['es'], bilingual_dictionary)

3373227/3377911 | 99.861 %

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



3377911/3377911 | 100.000 %

In [13]:
if False:
    np.savetxt(os.path.join(sentences_path, 'source.en.txt'), source_matrix)
    np.savetxt(os.path.join(sentences_path, 'target.es.txt'), target_matrix)

In [None]:
if False:
    source_matrix = np.loadtxt(os.path.join(sentences_path, 'source.en.txt'))
    target_matrix = np.loadtxt(os.path.join(sentences_path, 'target.es.txt'))

In [15]:
transform = learn_transformation(source_matrix, target_matrix)

In [16]:
np.savetxt(os.path.join(sentences_path, 'transform.en-es.txt'), transform)

In [20]:
target_matrix.shape

(3377911, 300)

In [None]:
similarity = np.matmul(target_matrix[:100000, :100000], np.matmul(transform, source_matrix.T)[:100000, :100000])

In [None]:
similarity.shape

In [None]:
#similarity = np.matmul(target_matrix, np.matmul(transform, source_matrix.T))
print(f'Similarity: {np.mean(similarity)}')