In [1]:
# # !gunzip cx-corpora.en2es.text.json.gz
# !pip install --user pybind11
# !pip install fasttext
#!conda install -c conda-forge fasttext

In [35]:
import json
import fasttext
import pandas as pd
import numpy as np
import re
import os
import unicodedata
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from numpy.core.umath_tests import inner1d
from os import listdir

In [74]:
def apply_transform(vec, transform):
    """
    Apply the given transformation to the vector space

    Right-multiplies given transform with embeddings E:
        E = E * transform

    Transform can either be a string with a filename to a
    text file containing a ndarray (compat. with np.loadtxt)
    or a numpy ndarray.
    """
    transmat = np.loadtxt(transform)# if isinstance(transform, str) else transform
    return np.matmul(vec, transmat)

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    """
    Source and target dictionaries are the FastVector objects of
    source/target languages. bilingual_dictionary is a list of 
    translation pair tuples [(source_word, target_word), ...].
    """
    source_matrix = []
    target_matrix = []
    
    len_bd = len(bilingual_dictionary)

    for i, (source, target) in tqdm(enumerate(bilingual_dictionary)):
#         print(f'\r{i + 1}/{len_bd} | {100 * (i + 1) / len_bd:.3f} %', end = '', flush = True)
        sourceVector = source_dictionary.get_sentence_vector(source.lower().strip().replace('_',' '))
        targetVector = target_dictionary.get_sentence_vector(target.lower().strip().replace('_',' '))
        source_matrix.append(sourceVector)
        target_matrix.append(targetVector)
        
    # return training matrices
    return np.array(source_matrix), np.array(target_matrix)

def normalized(a, axis=-1, order=2):
    """Utility function to normalize the rows of a numpy array."""
    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
    l2[l2 == 0] = 1
    return a / np.expand_dims(l2, axis)


def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Source and target matrices are numpy arrays, shape
    (dictionary_length, embedding_dimension). These contain paired
    word vectors from the bilingual dictionary.
    """
    # optionally normalize the training vectors
    if normalize_vectors:
        source_matrix = normalized(source_matrix)
        target_matrix = normalized(target_matrix)

    # perform the SVD
    product = np.matmul(target_matrix.transpose(), source_matrix)
    U, s, V = np.linalg.svd(product)

    # return orthogonal transformation which aligns source language to the target
    return np.matmul(U, V)

def cleanData(data):
    data_cleaned = []
    for source, target in zip(data['source_value'], data['target_value']):
        try:
            source_cleaned = unicodedata.normalize('NFD', source)
            target_cleaned = unicodedata.normalize('NFD', target)
            if not source == ''  and not target == '':
                data_cleaned.append([source_cleaned, target_cleaned])
        except:
            pass
    return data_cleaned


def check(source, target):
    val = [source, target]
    return val != sorted(val)

def loadData(source, target):
    if check(source, target):
        return loadData(target, source)
    
    file = '{}2{}.csv'.format(source, target)
    path = '/scratch/mje341/capstoneWikimedia/Training_description/files/'
    files = listdir(path)
    retVal = 'description_train_{}'.format(file)
    if retVal in files:
        return pd.read_csv(retVal)
    elif file in files:
        data = pd.read_csv(os.path.join(path, file), error_bad_lines = False)
        data = data.drop(['ID'], axis = 1)
        data = data.drop_duplicates()
        df = pd.DataFrame(cleanData(data), columns = ['source', 'target'])
        df.to_csv(os.path.join(path, retVal))
        return pd.read_csv(retVal)
    else:
        raise Exception(("{0} not in directory {1}".format(file, path)))

In [75]:
model = {}
source_language = 'es'
target_language = 'ja'
model['es'] = fasttext.load_model('/scratch/dev241/capstone/fast/wiki.es.bin')
model['ja'] = fasttext.load_model('/scratch/dev241/capstone/fast/wiki.ja.bin')





In [None]:
test = pd.read_csv('../ContentTranslation/content_test.csv') #same for everyone
train = loadData('en', 'es')

b'Skipping line 157010: expected 5 fields, saw 7\nSkipping line 161301: expected 5 fields, saw 6\n'
b'Skipping line 365761: expected 5 fields, saw 8\n'
b'Skipping line 393673: expected 5 fields, saw 6\nSkipping line 437600: expected 5 fields, saw 6\nSkipping line 507623: expected 5 fields, saw 6\n'
b'Skipping line 592281: expected 5 fields, saw 7\nSkipping line 649990: expected 5 fields, saw 6\n'
b'Skipping line 683775: expected 5 fields, saw 7\nSkipping line 697663: expected 5 fields, saw 7\nSkipping line 781863: expected 5 fields, saw 6\n'
b'Skipping line 807673: expected 5 fields, saw 6\n'
b'Skipping line 1008985: expected 5 fields, saw 6\n'
b'Skipping line 1099470: expected 5 fields, saw 9\nSkipping line 1109922: expected 5 fields, saw 6\nSkipping line 1139936: expected 5 fields, saw 6\n'
b'Skipping line 1216755: expected 5 fields, saw 6\nSkipping line 1231100: expected 5 fields, saw 7\nSkipping line 1294823: expected 5 fields, saw 6\n'
b'Skipping line 1552721: expected 5 fields, s

b'Skipping line 18247673: expected 5 fields, saw 6\n'
b'Skipping line 18545841: expected 5 fields, saw 6\n'
b'Skipping line 18726668: expected 5 fields, saw 6\n'
b'Skipping line 19123139: expected 5 fields, saw 6\n'
b'Skipping line 19173222: expected 5 fields, saw 8\nSkipping line 19177692: expected 5 fields, saw 6\nSkipping line 19202962: expected 5 fields, saw 9\n'
b'Skipping line 19467872: expected 5 fields, saw 9\nSkipping line 19487511: expected 5 fields, saw 6\n'
b'Skipping line 19939346: expected 5 fields, saw 10\nSkipping line 19984722: expected 5 fields, saw 6\n'
b'Skipping line 20183441: expected 5 fields, saw 7\n'
b'Skipping line 20298942: expected 5 fields, saw 6\n'
b'Skipping line 20481289: expected 5 fields, saw 6\n'
b'Skipping line 20763338: expected 5 fields, saw 6\nSkipping line 20767188: expected 5 fields, saw 6\nSkipping line 20767189: expected 5 fields, saw 6\n'
b'Skipping line 21122470: expected 5 fields, saw 6\nSkipping line 21183488: expected 5 fields, saw 7\nSki

In [None]:
train.head()

In [23]:
# train.columns = ['ID','source_lang', 'source', 'target_lang', 'target']

In [21]:
#Train
bilingual_dictionary = list(zip(train['source'],train['target']))
source_matrix, target_matrix = make_training_matrices(model['es'], model['ja'], bilingual_dictionary)


0it [00:00, ?it/s][A
101it [00:00, 998.53it/s][A
217it [00:00, 1040.67it/s][A
438it [00:00, 1236.69it/s][A
839it [00:00, 1560.25it/s][A
1382it [00:00, 1983.79it/s][A
1680it [00:00, 2187.32it/s][A
1975it [00:00, 2296.23it/s][A
2319it [00:00, 2549.21it/s][A
2908it [00:00, 3071.68it/s][A
3296it [00:01, 3008.88it/s][A
3654it [00:01, 2860.78it/s][A
3982it [00:01, 2683.21it/s][A
4292it [00:01, 2795.27it/s][A
4642it [00:01, 2974.74it/s][A
4959it [00:01, 2919.34it/s][A
5413it [00:01, 3267.04it/s][A
5980it [00:01, 3742.84it/s][A
6395it [00:01, 3606.53it/s][A
6786it [00:02, 3608.40it/s][A
7421it [00:02, 4145.15it/s][A
7879it [00:02, 3866.89it/s][A
8300it [00:02, 2789.20it/s][A
8646it [00:02, 2806.64it/s][A
9058it [00:02, 3102.19it/s][A
9411it [00:02, 2982.17it/s][A
9910it [00:03, 3391.02it/s][A
10438it [00:03, 3797.69it/s][A
10862it [00:03, 3533.97it/s][A
11316it [00:03, 3784.60it/s][A
11860it [00:03, 4162.32it/s][A
12309it [00:03, 4006.52it/s][A
12734it [00:03, 

122612it [00:31, 4095.25it/s][A
123066it [00:31, 3945.82it/s][A
123680it [00:31, 4416.79it/s][A
124163it [00:31, 4260.95it/s][A
124716it [00:32, 4574.55it/s][A
125250it [00:32, 4779.76it/s][A
125750it [00:32, 4444.64it/s][A
126282it [00:32, 4672.06it/s][A
126767it [00:32, 4261.42it/s][A
127212it [00:32, 3929.07it/s][A
127624it [00:32, 3460.36it/s][A
127994it [00:33, 2637.91it/s][A
128329it [00:33, 2815.83it/s][A
128823it [00:33, 3231.06it/s][A
129193it [00:33, 3294.68it/s][A
129702it [00:33, 3684.07it/s][A
130211it [00:33, 4010.92it/s][A
130648it [00:33, 3860.65it/s][A
131262it [00:33, 4339.34it/s][A
131734it [00:33, 3639.55it/s][A
132370it [00:34, 4174.06it/s][A
132848it [00:34, 2839.94it/s][A
133286it [00:34, 3174.53it/s][A
133787it [00:34, 3565.06it/s][A
134217it [00:34, 3550.41it/s][A
134743it [00:34, 3932.84it/s][A
135185it [00:34, 3866.43it/s][A
135762it [00:34, 4290.42it/s][A
136309it [00:35, 4579.61it/s][A
136800it [00:35, 3235.76it/s][A
137372it [

247156it [01:02, 3492.39it/s][A
247767it [01:02, 4007.24it/s][A
248255it [01:02, 3968.50it/s][A
248820it [01:02, 4356.53it/s][A
249325it [01:02, 4542.56it/s][A
249820it [01:02, 4506.27it/s][A
250499it [01:02, 3979.62it/s][A


In [24]:
transform = learn_transformation(source_matrix, target_matrix)

In [25]:
# np.savetxt('description_trans_transform_en_es.txt', transform)

In [26]:
print("Before trans:", np.mean(inner1d(target_matrix, source_matrix)))

Before trans: 0.0055835745616057905


In [27]:
print("After trans:", np.mean(inner1d(normalized(target_matrix), np.matmul(transform, normalized(source_matrix).T).T)))

After trans: 0.8276396193498955


In [28]:
bilingual_dictionary = list(zip(test['source'],test['target']))
source_matrix_test, target_matrix_test = make_training_matrices(model['es'], model['ja'], bilingual_dictionary)


0it [00:00, ?it/s][A
26it [00:00, 254.20it/s][A
34it [00:00, 135.79it/s][A
42it [00:00, 85.31it/s] [A
63it [00:00, 103.23it/s][A
78it [00:00, 98.67it/s] [A
98it [00:00, 115.39it/s][A
119it [00:00, 131.78it/s][A
137it [00:01, 131.88it/s][A
167it [00:01, 151.44it/s][A
185it [00:01, 116.94it/s][A
210it [00:01, 138.55it/s][A
229it [00:01, 150.71it/s][A
255it [00:01, 170.80it/s][A
285it [00:01, 195.08it/s][A
317it [00:01, 220.53it/s][A
343it [00:02, 186.51it/s][A
365it [00:02, 166.95it/s][A
385it [00:02, 152.84it/s][A
409it [00:02, 170.34it/s][A
429it [00:02, 173.65it/s][A
448it [00:02, 169.14it/s][A
466it [00:02, 149.22it/s][A
490it [00:03, 166.84it/s][A
524it [00:03, 194.11it/s][A
547it [00:03, 161.18it/s][A
566it [00:03, 135.06it/s][A
583it [00:03, 136.83it/s][A
606it [00:03, 155.49it/s][A
635it [00:03, 180.61it/s][A
663it [00:03, 200.41it/s][A
692it [00:04, 220.19it/s][A
717it [00:04, 205.93it/s][A
740it [00:04, 196.70it/s][A
764it [00:04, 207.91it/s]

6518it [00:38, 161.66it/s][A
6539it [00:38, 155.15it/s][A
6569it [00:38, 179.11it/s][A
6600it [00:38, 202.57it/s][A
6626it [00:38, 216.85it/s][A
6651it [00:38, 215.80it/s][A
6675it [00:38, 220.82it/s][A
6701it [00:39, 230.46it/s][A
6731it [00:39, 245.45it/s][A
6757it [00:39, 179.74it/s][A
6779it [00:39, 134.97it/s][A
6797it [00:39, 145.69it/s][A
6815it [00:39, 149.56it/s][A
6839it [00:39, 166.31it/s][A
6860it [00:40, 172.89it/s][A
6879it [00:40, 161.51it/s][A
6898it [00:40, 166.24it/s][A
6916it [00:40, 112.76it/s][A
6947it [00:40, 139.29it/s][A
6984it [00:40, 170.26it/s][A
7009it [00:40, 186.17it/s][A
7040it [00:40, 210.32it/s][A
7074it [00:41, 231.08it/s][A
7102it [00:41, 151.06it/s][A
7124it [00:41, 166.68it/s][A
7146it [00:41, 162.72it/s][A
7166it [00:41, 166.23it/s][A
7186it [00:41, 154.86it/s][A
7205it [00:42, 157.93it/s][A
7236it [00:42, 182.36it/s][A
7257it [00:42, 132.39it/s][A
7274it [00:42, 129.32it/s][A
7290it [00:42, 121.29it/s][A
7317it [00

KeyboardInterrupt: 

In [None]:
#before
target_matrix_test = normalized(target_matrix_test)
source_matrix_test = normalized(source_matrix_test)

print("Before trans:",np.mean(inner1d(target_matrix_test, source_matrix_test)))
#after
print("After trans:", np.mean(inner1d(target_matrix_test, np.matmul(transform, source_matrix_test.T).T)))