In [13]:
# # !gunzip cx-corpora.en2es.text.json.gz
# !pip install --user pybind11
# !pip install fasttext
#!conda install -c conda-forge fasttext

In [51]:
import json
import fasttext
import pandas as pd
import numpy as np
import re
import os
import unicodedata
from sklearn.model_selection import train_test_split

In [52]:
def apply_transform(vec, transform):
    """
    Apply the given transformation to the vector space

    Right-multiplies given transform with embeddings E:
        E = E * transform

    Parameters
    ----------
    vec : array_like
        Embedding vector(s) to transform.
    transform : numpy.ndarray, or anything np.loadtxt accepts
        Either the transformation matrix itself, or a filename / path /
        file-like object pointing at a text file containing the matrix
        (compat. with np.loadtxt).

    Returns
    -------
    numpy.ndarray
        The result of np.matmul(vec, <transform matrix>).

    NOTE(review): learn_transformation in this notebook returns a matrix M
    chosen so that M @ s approximates t for column vectors (rows map as
    S @ M.T). Right-multiplying by that M here applies M rather than M.T --
    confirm which convention is intended before combining the two.
    """
    # An in-memory matrix is used as is; every other argument is treated as
    # a text file and parsed with np.loadtxt (the previous behaviour).
    transmat = transform if isinstance(transform, np.ndarray) else np.loadtxt(transform)
    return np.matmul(vec, transmat)

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    """
    Build the paired embedding matrices for a list of translation pairs.

    source_dictionary and target_dictionary are the embedding models of the
    source/target languages; each must expose get_sentence_vector(text).
    bilingual_dictionary is a list of translation pair tuples
    [(source_text, target_text), ...].

    Every text is lower-cased, stripped of surrounding whitespace and has its
    underscores replaced by spaces before it is embedded. Progress is written
    to stdout on a single, continuously overwritten line.

    Returns a tuple (source_matrix, target_matrix) of numpy arrays in which
    row k holds the vectors of the k-th pair.
    """
    def _clean(text):
        # Shared preprocessing applied to both sides of a pair.
        return text.lower().strip().replace('_',' ')

    len_bd = len(bilingual_dictionary)
    source_rows, target_rows = [], []

    for i, pair in enumerate(bilingual_dictionary):
        source_text, target_text = pair
        print(f'\r{i + 1}/{len_bd} | {100 * (i + 1) / len_bd:.3f} %', end = '', flush = True)
        source_rows.append(source_dictionary.get_sentence_vector(_clean(source_text)))
        target_rows.append(target_dictionary.get_sentence_vector(_clean(target_text)))

    return np.array(source_rows), np.array(target_rows)

def normalized(a, axis=-1, order=2):
    """
    Divide `a` by its vector norm taken along `axis`.

    `order` is the norm order handed to np.linalg.norm (2 by default).
    Slices whose norm is exactly zero are divided by 1 instead, so they come
    back unchanged rather than producing a division by zero.
    """
    norms = np.atleast_1d(np.linalg.norm(a, order, axis))
    # Swap zero norms for 1 so the division below is a no-op for those slices.
    safe_norms = np.where(norms == 0, 1, norms)
    return a / np.expand_dims(safe_norms, axis)


def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Learn an orthogonal matrix aligning the source vectors to the target ones.

    source_matrix and target_matrix are numpy arrays of shape
    (dictionary_length, embedding_dimension) whose rows are the paired
    vectors from the bilingual dictionary. With normalize_vectors=True
    (the default) both matrices are first passed through normalized().

    The result is U @ Vh, where U, s, Vh is the SVD of
    target_matrix.T @ source_matrix. This matrix M maps a source column
    vector s to M @ s; for row-stacked vectors that is source_matrix @ M.T.
    """
    # optionally bring both sides to unit length first
    if normalize_vectors:
        source_matrix, target_matrix = normalized(source_matrix), normalized(target_matrix)

    # cross-covariance between the two embedding spaces, target side first
    cross = target_matrix.T @ source_matrix
    left, _, right_h = np.linalg.svd(cross)

    # dropping the singular values leaves the orthogonal factor
    return left @ right_h

In [39]:
# file = ('cx-corpora.en2es.text.json')

# with open(file, encoding='utf-8') as f:
#     d = json.load(f)

In [40]:
# model = {}
# model['en'] = fasttext.load_model('wiki.en.bin')
# model['es'] = fasttext.load_model('wiki.es.bin')





In [41]:
# data=[]
# for line in d:
#     try:
#         source = unicodedata.normalize('NFKC',line['source']['content'])
#         target = unicodedata.normalize('NFKC',line['target']['content'])
#         if not source == '' and not target == '':
#             data.append([source,target])
#     except TypeError:
#         pass
        

In [42]:
## Code to split train and test
# df = pd.DataFrame(data,columns = ['source','target'])
# train, test = train_test_split(df, test_size=0.2, random_state=42)

In [53]:
# test.to_csv('content_trans_test.csv',index=False)
# train.to_csv('content_trans_train.csv',index=False)
# test = pd.read_csv('content_test.csv') #same for everyone
# train = pd.read_csv('content_train.csv') #replace with your respective dataset

In [21]:
#Train
#bilingual_dictionary = list(zip(train['source'],train['target']))
#source_matrix, target_matrix = make_training_matrices(model['en'], model['es'], bilingual_dictionary)

In [22]:
#np.save('source_matrix',source_matrix)
#np.save('target_matrix',target_matrix)

In [54]:
# Reload the training matrices from their .npy files in the working directory.
source_matrix, target_matrix = (
    np.load(path) for path in ('source_matrix.npy', 'target_matrix.npy')
)

In [55]:
# Computing transform matrix
# (the compute-and-save lines below are commented out; presumably they were run
# once and this cell now only reloads the saved matrix -- confirm the file is current)
#transform = learn_transformation(source_matrix, target_matrix)
# Save transform as .txt file
#np.savetxt('content_trans_transform_en_es.txt', transform)
transform = np.loadtxt('content_trans_transform_en_es.txt')

In [27]:
# Mean dot product over every (target, source) pair among the first 50000 rows.
# By linearity, the mean over all pairs (i, j) of t_i . s_j equals
# mean(t) . mean(s), so the 50000 x 50000 product matrix is never built.
# Means are accumulated in float64 to limit rounding error; the printed values
# can differ from the all-pairs computation in the trailing digits only.
# The column bound is kept from the original slice (presumably a no-op).
target_mean = target_matrix[:50000, :50000].mean(axis=0, dtype=np.float64)
source_mean = source_matrix[:50000, :50000].mean(axis=0, dtype=np.float64)
#before
print("Before trans:", np.matmul(target_mean, source_mean))
#after
print("After trans:", np.matmul(target_mean, np.matmul(transform, source_mean)))

Before trans: 0.0008891855
After trans: 0.19440520936342


In [56]:
# #Test
# bilingual_dictionary = list(zip(test['source'],test['target']))
# source_matrix_test, target_matrix_test = make_training_matrices(model['en'], model['es'], bilingual_dictionary)

72972/72972 | 100.000 %

In [57]:
# The save calls are kept for reference; the matrices are read back from disk.
# np.save('source_matrix_test',source_matrix_test)
# np.save('target_matrix_test',target_matrix_test)
source_matrix_test, target_matrix_test = (
    np.load(path) for path in ('source_matrix_test.npy', 'target_matrix_test.npy')
)

In [59]:
# Mean dot product over every (target, source) pair of the test set.
# By linearity, the mean over all pairs (i, j) of t_i . s_j equals
# mean(t) . mean(s), so the full n_test x n_test product matrix is never built.
# Means are accumulated in float64 to limit rounding error; the printed values
# can differ from the all-pairs computation in the trailing digits only.
target_mean_test = target_matrix_test.mean(axis=0, dtype=np.float64)
source_mean_test = source_matrix_test.mean(axis=0, dtype=np.float64)
#before
print("Before trans:", np.matmul(target_mean_test, source_mean_test))
#after
print("After trans:", np.matmul(target_mean_test, np.matmul(transform, source_mean_test)))

Before trans: 0.0009806056
After trans: 0.19406951518768453
