In [1]:
import json
import fasttext
import pandas as pd
import numpy as np
import re
import os
import unicodedata
from sklearn.model_selection import train_test_split

In [82]:
def apply_transform(vec, transform):
    """
    Apply the given transformation to the vector space.

    Right-multiplies the embeddings E with the given transform:
        E = E * transform

    Parameters
    ----------
    vec : numpy.ndarray
        Embedding vector(s) to transform.
    transform : numpy.ndarray, str or os.PathLike
        Either the transformation matrix itself, or the name of a file
        holding it. A file name ending in ``.npy`` is read with
        ``np.load`` (the format written by ``np.save``); anything else
        is handed to ``np.loadtxt``.

    Returns
    -------
    numpy.ndarray
        The result of ``np.matmul(vec, transform_matrix)``.
    """
    if isinstance(transform, np.ndarray):
        # Already an in-memory matrix: use it directly instead of trying
        # to parse it as a file.
        transmat = transform
    elif (isinstance(transform, (str, os.PathLike))
          and os.fsdecode(transform).lower().endswith('.npy')):
        # Binary NumPy file, which np.loadtxt cannot parse.
        transmat = np.load(transform)
    else:
        # Text matrix (or any other source np.loadtxt understands).
        transmat = np.loadtxt(transform)
    return np.matmul(vec, transmat)

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    """
    Build paired embedding matrices from a list of translation pairs.

    source_dictionary and target_dictionary are the embedding models of
    the source/target languages; each must provide
    get_sentence_vector(text). bilingual_dictionary is a list of
    translation pair tuples [(source_text, target_text), ...].

    Every text is lower-cased, stripped and has its underscores replaced
    by spaces before being embedded. Progress is printed every 10000
    pairs.

    Returns (source_matrix, target_matrix): two numpy arrays whose i-th
    rows hold the vectors of the i-th pair.
    """
    def _clean(text):
        # Same clean-up for both languages.
        return text.lower().strip().replace('_', ' ')

    total = len(bilingual_dictionary)
    source_rows, target_rows = [], []

    for index, pair in enumerate(bilingual_dictionary):
        source_text, target_text = pair
        done = index + 1
        if not index % 10000:
            print(f'\r{done}/{total} | {100 * done / total:.3f} %', end = '', flush = True)
        source_rows.append(source_dictionary.get_sentence_vector(_clean(source_text)))
        target_rows.append(target_dictionary.get_sentence_vector(_clean(target_text)))

    return np.array(source_rows), np.array(target_rows)

def normalized(a, axis=-1, order=2):
    """
    Divide `a` by its `order`-norm taken along `axis`.

    Norms equal to zero are replaced by 1 before dividing, so all-zero
    vectors come back unchanged instead of producing NaNs.
    """
    norms = np.atleast_1d(np.linalg.norm(a, ord=order, axis=axis))
    # Guard against division by zero.
    np.putmask(norms, norms == 0, 1)
    divisor = np.expand_dims(norms, axis)
    return a / divisor


def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Learn an orthogonal map between two paired sets of vectors.

    source_matrix and target_matrix are numpy arrays of shape
    (dictionary_length, embedding_dimension) whose rows are the paired
    vectors from the bilingual dictionary. With normalize_vectors=True
    both matrices are passed through normalized() first.

    Returns U @ V, where U and V are the outer factors of the SVD of
    target_matrix.T @ source_matrix.
    """
    src, tgt = source_matrix, target_matrix
    if normalize_vectors:
        src, tgt = normalized(src), normalized(tgt)

    # Only the outer factors of the SVD are needed; the singular values
    # are discarded.
    left, _singular_values, right_h = np.linalg.svd(tgt.T @ src)

    return left @ right_h

In [75]:
# df = pd.read_csv('sitelinks/wikidata_smaller.csv')

In [76]:
# df.head()

In [77]:
# df_en = df[df['wiki'].str.contains('enwiki')]

In [78]:
# df_es = df[df['wiki'].str.contains('eswiki')]
# df_es = df_es[~df_es['wiki'].isin(['sourceswiki', 'specieswiki'])]

In [7]:
# df_en.to_csv('sitelinks/en_sitelinks.csv',index=False)
# df_es.to_csv('sitelinks/es_sitelinks.csv',index=False)

In [8]:
# df_en = pd.read_csv('sitelinks/en_sitelinks.csv')
# df_es = pd.read_csv('sitelinks/es_sitelinks.csv')

In [9]:
# df_en_grouped = df_en.groupby(['wikidata_item'], sort=False)['content'].first()
# df_en_grouped = df_en_grouped.to_frame()

In [10]:
# df_es_grouped = df_es.groupby(['wikidata_item'], sort=False)['content'].first()
# df_es_grouped = df_es_grouped.to_frame()

In [12]:
# merged_wiki = pd.merge(left=df_en_grouped,right=df_es_grouped, left_on='wikidata_item', right_on='wikidata_item')
# merged_wiki.to_csv('sitelinks/en_es_sitelinks.csv')
# merged_wiki = pd.read_csv('sitelinks/en_es_sitelinks.csv')

In [17]:
# # merged_wiki = pd.read_csv('sitelinks/en_es_sitelinks.csv')
# merged_wiki['wiki_x']='en'
# merged_wiki['wiki_y']='es'

In [14]:
# fastText models keyed by language code.
model = {
    'en': fasttext.load_model('fast/wiki.en.bin'),
    'es': fasttext.load_model('fast/wiki.es.bin'),
}





In [42]:
# df = pd.concat([merged_wiki['wiki_x'], merged_wiki['content_x'],merged_wiki['wiki_y'], merged_wiki['content_y']],
#                axis=1, keys=['source_lang','source_content','target_lang','target_content'])

In [15]:
# # # df.to_csv('sitelinks/en_es_sitelinks.csv',index=False)
# df = pd.read_csv('sitelinks/en_es_sitelinks.csv')

In [18]:
# df.head()

In [19]:
# df['source_content'] = df['source_content'].str.replace('_',' ')
# df['target_content'] = df['target_content'].str.replace('_',' ')

In [20]:
# print(len(df))
# df = df.dropna()
# print(len(df))

In [65]:
def unicoder(value):
    """Return `value` normalized to Unicode NFKC form."""
    return unicodedata.normalize('NFKC', value)

In [66]:
# df.to_csv('sitelinks/en_es_sitelinks.csv',index=False)
# Load the sitelink pairs, drop the language columns and shorten the
# content column names to "source" / "target".
train = (
    pd.read_csv('sitelinks/en_es_sitelinks.csv')
    .drop(columns=['source_lang', 'target_lang'])
    .rename(columns={"source_content": "source", "target_content": "target"})
)

In [93]:
# NFKC-normalize both text columns.
train = train.assign(
    source=train['source'].apply(unicoder),
    target=train['target'].apply(unicoder),
)

In [94]:
# Test pairs, read from a CSV file.
test = pd.read_csv('content_test.csv') #same for everyone

In [95]:
# Preview the first training pairs.
train.head()

Unnamed: 0,source,target
0,Asynchronous transfer mode,Modo de transferencia asíncrona
1,Aphelion,Afelio
2,Austria-Hungary,Imperio austrohúngaro
3,Al Capone,Al Capone
4,Cue sports,Billar


In [96]:
# Preview the first test pairs.
test.head()

Unnamed: 0,source,target
0,"Hopkins died without leaving a will, though hi...",Hopkins murió sin dejar testamento. Su fortuna...
1,"Grossman was born in Concord, Massachusetts, t...","Grossman Nació en Concord, Massachusetts, es h..."
2,The song was written in producer Finneas O'Con...,La canción fue escrita por el productor Finnea...
3,Shadow Warriors (Hattori Hanzō: Kage no Gundan...,Shadow Warriors (serie Hattori Hanzō: Kage no ...
4,ZF + ACω suffices to prove that the union of c...,ZF + ACω es suficiente para probar que la unió...


In [90]:
# Show training rows whose source text contains a newline character.
train[train['source'].str.contains("\n")]

Unnamed: 0,source,target
921408,"Royce da 5'9\""\nenwiki,1273903,Q267816,2004-12...","Royce da 5'9\""\neswiki,1020658,Q7104966,2007-0..."


In [84]:
# Show training rows whose target text contains a newline character.
train[train['target'].str.contains("\n")]

Unnamed: 0,source,target
457020,4 Segundos,"4\""\neswiki,2032120,Q1936108,2008-11-06 08:10:..."
697750,4.5-inch Mark 8 naval gun,"Cañón Mark 8 de 4,5\""\neswiki,4743588,Q6029481..."
921408,"Royce da 5'9\""\nenwiki,1273903,Q267816,2004-12...","Royce da 5'9\""\neswiki,1020658,Q7104966,2007-0..."


Dropping the malformed rows: their text contains an embedded newline followed by raw CSV data from another record, which the unicoder does not fix.

In [97]:
# Remove every row whose source or target text contains a newline.
# Filtering on the condition itself (rather than on hard-coded index
# labels) keeps the cell safe to re-run and correct if the CSV changes.
has_newline = (
    train['source'].str.contains('\n', regex=False, na=False)
    | train['target'].str.contains('\n', regex=False, na=False)
)
train = train[~has_newline]

In [99]:
train[train['target'].str.contains("\n")]

Unnamed: 0,source,target


In [100]:
# Train: list of (source, target) tuples, one per row of the training frame.
bilingual_dictionary = list(
    train[['source', 'target']].itertuples(index=False, name=None)
)

In [101]:
# Embed every training pair: sources with model['en'], targets with model['es'].
source_matrix, target_matrix = make_training_matrices(model['en'], model['es'], bilingual_dictionary)

990001/995640 | 99.434 %

In [102]:
# Cache the training matrices on disk (np.save appends the .npy extension).
np.save('source_matrix_sitelinks',source_matrix)
np.save('target_matrix_sitelinks',target_matrix)

In [110]:
# Reload the training matrices from their .npy files.
source_matrix = np.load('source_matrix_sitelinks.npy')
target_matrix = np.load('target_matrix_sitelinks.npy')

In [104]:
# Compute the transform matrix from the paired training matrices
# (rows are normalized first, since normalize_vectors defaults to True).
transform = learn_transformation(source_matrix, target_matrix)

In [105]:
# Save the transform as a binary .npy file (np.save appends the extension).
np.save('sitelinks_transform_en_es', transform)

In [106]:
# Reload the transform from its .npy file.
transform = np.load('sitelinks_transform_en_es.npy')

In [None]:
# Mean dot product over ALL (target_i, source_j) combinations, before and
# after applying the transform. The mean of every entry of T @ W @ S.T
# equals mean_rows(T) @ W @ mean_rows(S), so the N x N product (far too
# large to hold in memory for the training set) is never built.
# Means are accumulated in float64 to limit rounding error.
source_mean = source_matrix.mean(axis=0, dtype=np.float64)
target_mean = target_matrix.mean(axis=0, dtype=np.float64)
#before
print("Before trans:", target_mean @ source_mean)
#after
print("After trans:", target_mean @ transform @ source_mean)

In [107]:
#Test
# Note: bilingual_dictionary is rebound here to the test pairs.
bilingual_dictionary = list(zip(test['source'],test['target']))
# Embed every test pair: sources with model['en'], targets with model['es'].
source_matrix_test, target_matrix_test = make_training_matrices(model['en'], model['es'], bilingual_dictionary)

70001/72972 | 95.929 %

In [108]:
# Cache the test matrices on disk (np.save appends the .npy extension).
np.save('source_matrix_test',source_matrix_test)
np.save('target_matrix_test',target_matrix_test)

In [111]:
# Reload the test matrices from their .npy files.
source_matrix_test = np.load('source_matrix_test.npy')
target_matrix_test = np.load('target_matrix_test.npy')

In [112]:
# Mean dot product over ALL (target_i, source_j) combinations, before and
# after applying the transform. The mean of every entry of T @ W @ S.T
# equals mean_rows(T) @ W @ mean_rows(S), so the N x N product is never
# materialised. Means are accumulated in float64 to limit rounding error.
source_mean_test = source_matrix_test.mean(axis=0, dtype=np.float64)
target_mean_test = target_matrix_test.mean(axis=0, dtype=np.float64)
#before
print("Before trans:", target_mean_test @ source_mean_test)
#after
print("After trans:", target_mean_test @ transform @ source_mean_test)

Before trans: 0.0009806056
After trans: 0.18403813
