In [1]:
# # !gunzip cx-corpora.en2es.text.json.gz
# !pip install --user pybind11
# !pip install fasttext
#!conda install -c conda-forge fasttext

In [1]:
import json
import fasttext
import pandas as pd
import numpy as np
import re
import os
import unicodedata
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from numpy.core.umath_tests import inner1d
from os import listdir

  # Remove the CWD from sys.path while we load stuff.


In [2]:
def apply_transform(vec, transform):
    """
    Apply the given transformation to the vector space

    Right-multiplies given transform with embeddings E:
        E = E * transform

    Transform can either be a string with a filename to a
    text file containing a ndarray (compat. with np.loadtxt)
    or a numpy ndarray.
    """
    transmat = np.loadtxt(transform)# if isinstance(transform, str) else transform
    return np.matmul(vec, transmat)

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    """
    Source and target dictionaries are the FastVector objects of
    source/target languages. bilingual_dictionary is a list of 
    translation pair tuples [(source_word, target_word), ...].
    """
    source_matrix = []
    target_matrix = []
    
    len_bd = len(bilingual_dictionary)

    for i, (source, target) in tqdm(enumerate(bilingual_dictionary)):
#         print(f'\r{i + 1}/{len_bd} | {100 * (i + 1) / len_bd:.3f} %', end = '', flush = True)
        sourceVector = source_dictionary.get_sentence_vector(source.lower().strip().replace('_',' '))
        targetVector = target_dictionary.get_sentence_vector(target.lower().strip().replace('_',' '))
        source_matrix.append(sourceVector)
        target_matrix.append(targetVector)
        
    # return training matrices
    return np.array(source_matrix), np.array(target_matrix)

def normalized(a, axis=-1, order=2):
    """Utility function to normalize the rows of a numpy array."""
    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
    l2[l2 == 0] = 1
    return a / np.expand_dims(l2, axis)


def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Source and target matrices are numpy arrays, shape
    (dictionary_length, embedding_dimension). These contain paired
    word vectors from the bilingual dictionary.
    """
    # optionally normalize the training vectors
    if normalize_vectors:
        source_matrix = normalized(source_matrix)
        target_matrix = normalized(target_matrix)

    # perform the SVD
    product = np.matmul(target_matrix.transpose(), source_matrix)
    U, s, V = np.linalg.svd(product)

    # return orthogonal transformation which aligns source language to the target
    return np.matmul(U, V)

def cleanData(data):
    data_cleaned = []
    for source, target in zip(data['source_value'], data['target_value']):
        try:
            source_cleaned = unicodedata.normalize('NFD', source)
            target_cleaned = unicodedata.normalize('NFD', target)
            if not source == ''  and not target == '':
                data_cleaned.append([source_cleaned, target_cleaned])
        except:
            pass
    return data_cleaned


def check(source, target):
    val = [source, target]
    return val != sorted(val)

def loadData(source, target):
    if check(source, target):
        return loadData(target, source)
    
    file = '{}2{}.csv'.format(source, target)
    path = '/scratch/mje341/capstoneWikimedia/Training_description/files/'
    files = listdir(path)
    retVal = 'description_train_{}'.format(file)
    if retVal in files:
        return pd.read_csv(os.path.join(path, retVal))
    elif file in files:
        data = pd.read_csv(os.path.join(path, file), error_bad_lines = False)
        data = data.drop(['ID'], axis = 1)
        data = data.drop_duplicates()
        df = pd.DataFrame(cleanData(data), columns = ['source', 'target'])
        df.to_csv(os.path.join(path, retVal))
        return pd.read_csv(os.path.join(path, retVal))
    else:
        raise Exception(("{0} not in directory {1}".format(file, path)))

In [10]:
model = {}
source_language = 'en'
target_language = 'es'
model['en'] = fasttext.load_model('/scratch/dev241/capstone/fast/wiki.en.bin')
model['es'] = fasttext.load_model('/scratch/dev241/capstone/fast/wiki.es.bin')





In [20]:
test = pd.read_csv('/scratch/ah3243/content_test.csv') #trans_test_en_es.csv') #same for everyone
train = loadData('en', 'es')

In [12]:
train.head()

Unnamed: 0.1,Unnamed: 0,source,target
0,0,how the player was acquired; qualifier for P54...,forma en que fue adquirido el jugador; calific...
1,1,rivers and other outflows waterway names. If e...,río que drena el lago
2,2,amount of goods and services bought from other...,cantidad de bienes y servicios comprados a otr...
3,3,Wikimedia list related to this subject,lista de Wikimedia para el elemento
4,4,identifier for a unique bibliographic record i...,número de control del Online Computer Library...


In [13]:
# train.columns = ['ID','source_lang', 'source', 'target_lang', 'target']

In [14]:
#Train
bilingual_dictionary = list(zip(train['source'],train['target']))
source_matrix, target_matrix = make_training_matrices(model['en'], model['es'], bilingual_dictionary)

666867it [01:12, 9194.10it/s]


In [15]:
transform = learn_transformation(source_matrix, target_matrix)

In [16]:
# np.savetxt('description_trans_transform_en_es.txt', transform)

In [17]:
print("Before trans:", np.mean(inner1d(target_matrix, source_matrix)))

Before trans: 0.01143723951168304


In [18]:
print("After trans:", np.mean(inner1d(normalized(target_matrix), np.matmul(transform, normalized(source_matrix).T).T)))

After trans: 0.7768910814759667


In [21]:
bilingual_dictionary = list(zip(test['source'],test['target']))

source_matrix_test, target_matrix_test = make_training_matrices(model['en'], model['es'], bilingual_dictionary)


0it [00:00, ?it/s][A
83it [00:00, 822.90it/s][A
184it [00:00, 870.45it/s][A
317it [00:00, 970.48it/s][A
411it [00:00, 955.42it/s][A
513it [00:00, 971.82it/s][A
622it [00:00, 1002.71it/s][A
731it [00:00, 1026.74it/s][A
829it [00:00, 993.56it/s] [A
927it [00:00, 962.71it/s][A
1021it [00:01, 913.95it/s][A
1141it [00:01, 984.10it/s][A
1247it [00:01, 1005.70it/s][A
1354it [00:01, 990.34it/s] [A
1454it [00:01, 934.78it/s][A
1549it [00:01, 932.64it/s][A
1643it [00:01, 923.85it/s][A
1736it [00:01, 903.71it/s][A
1831it [00:01, 914.34it/s][A
1923it [00:02, 394.47it/s][A
2049it [00:02, 496.55it/s][A
2136it [00:02, 516.20it/s][A
2232it [00:02, 598.34it/s][A
2339it [00:02, 667.06it/s][A
2443it [00:03, 747.13it/s][A
2534it [00:03, 701.86it/s][A
2646it [00:03, 787.92it/s][A
2775it [00:03, 890.91it/s][A
2899it [00:03, 972.29it/s][A
3009it [00:03, 1006.96it/s][A
3118it [00:03, 1015.45it/s][A
3227it [00:03, 1036.71it/s][A
3342it [00:03, 1065.95it/s][A
3453it [00:03, 105

KeyboardInterrupt: 

In [None]:
#before
target_matrix_test = normalized(target_matrix_test)
source_matrix_test = normalized(source_matrix_test)

print("Before trans:",np.mean(inner1d(target_matrix_test, source_matrix_test)))
#after
print("After trans:", np.mean(inner1d(target_matrix_test, np.matmul(transform, source_matrix_test.T).T)))