# Generate words to watch multilingual features



First, let's define a few simple functions... (from https://github.com/Babylonpartners/fastText_multilingual)

In [1]:
import numpy as np
from fasttext import FastVector

# Language codes (used in the vector / alignment-matrix / output file names)
# and the matching long names (used in the clean-data file names).
# Both lists must be in the same order: they are zipped in load_filenames().
languages=['en','fr','it']
language_extended=['english','french','italian']

# To use this notebook, you will need:
# 1) alignment matrices from https://github.com/Babylonpartners/fastText_multilingual - place in alignment_matrices/
# 2) vectors from https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md - place in vectors/

# prefix of the per-language alignment matrices: matrix_dir + <code> + '.txt'
matrix_dir='alignment_matrices/'
# prefix of the per-language word-vector files: dic_dir + <code> + '.vec'
dic_dir='vectors/wiki.'
# prefix of the clean input files: rawdir + <long name> + '.tsv'
rawdir='../data_clean/all_'
# prefix of the generated feature files: feadir + <code> + '.tsv'
feadir='features/all_ww_'
# tab-separated file whose first column holds the English words to watch
infile='translations.tsv'

# Module-level state filled in by the load_* functions, keyed by language code.
dictionary={}   # code -> FastVector (non-English ones aligned to English)
filenames={}    # code -> path of the clean input file
outfiles={}     # code -> path of the feature file to write
words={}        # code -> list of watch-word vectors
# from https://stackoverflow.com/questions/21030391/how-to-normalize-array-numpy
def normalized(a, axis=-1, order=2):
    """Scale `a` so that every vector along `axis` has unit `order`-norm.

    Vectors whose norm is zero are divided by 1 instead, so they come back
    unchanged rather than as NaNs.
    """
    norms = np.linalg.norm(a, ord=order, axis=axis)
    # a 1-D input yields a scalar norm; promote it so it can be masked below
    norms = np.atleast_1d(norms)
    zero_mask = (norms == 0)
    norms[zero_mask] = 1
    # re-insert the reduced axis so the division broadcasts against `a`
    divisor = np.expand_dims(norms, axis)
    return a / divisor

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    """
    Build paired training matrices from a bilingual word list.

    source_dictionary / target_dictionary map words to vectors (the
    FastVector objects of the two languages). bilingual_dictionary is a
    list of translation pair tuples [(source_word, target_word), ...].
    Pairs with a word missing from either dictionary are skipped.

    Returns (source_matrix, target_matrix) as numpy arrays whose rows
    correspond to each other.
    """
    # keep only the pairs for which both sides have a vector
    paired_vectors = [
        (source_dictionary[src_word], target_dictionary[tgt_word])
        for src_word, tgt_word in bilingual_dictionary
        if src_word in source_dictionary and tgt_word in target_dictionary
    ]

    source_rows = [pair[0] for pair in paired_vectors]
    target_rows = [pair[1] for pair in paired_vectors]

    return np.array(source_rows), np.array(target_rows)

def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Learn the orthogonal matrix that maps source vectors onto target vectors.

    Source and target matrices are numpy arrays, shape
    (dictionary_length, embedding_dimension), holding the paired word
    vectors from the bilingual dictionary. When normalize_vectors is true
    the rows are normalised with normalized() first.

    Returns the product of the two orthogonal SVD factors of
    source_matrix^T * target_matrix.
    """
    src, tgt = source_matrix, target_matrix
    if normalize_vectors:
        src, tgt = normalized(src), normalized(tgt)

    # SVD of the cross-covariance; the singular values are not needed
    cross = np.matmul(src.T, tgt)
    left, _singular_values, right = np.linalg.svd(cross)

    # orthogonal transformation which aligns source language to the target
    return np.matmul(left, right)

Functions to load the file names and word vectors. Non-English vectors are aligned to English.

In [3]:
def load_filenames():
    """Fill `filenames` and `outfiles` with the per-language input/output paths.

    Input paths use the long language name (rawdir + name + '.tsv');
    output paths use the language code (feadir + code + '.tsv').
    Both dicts are keyed by language code.
    """
    for code, long_name in zip(languages, language_extended):
        # clean data file to read
        clean_path = rawdir + long_name + '.tsv'
        # feature file to write
        feature_path = feadir + code + '.tsv'
        filenames[code] = clean_path
        outfiles[code] = feature_path

def load_dictionaries():
    """Load one FastVector per language into `dictionary`, keyed by language code.

    Every non-English dictionary has the transform stored at
    matrix_dir + code + '.txt' applied to it, aligning it to English.
    """
    for code in languages:
        # load the word vector dictionary for this language
        vectors = FastVector(vector_file=dic_dir + code + '.vec')
        dictionary[code] = vectors
        if code == 'en':
            # English is the reference space; nothing to align
            continue
        # align all other languages to English
        vectors.apply_transform(matrix_dir + code + '.txt')
            
def load_words():
    """Fill `words` with the watch-word vectors of every language.

    Reads the first tab-separated column of `infile` as the list of English
    words. For each word, `words['en']` receives its English vector; every
    other language receives the vector of the word returned by that
    language's translate_nearest_neighbour() for the English vector.
    Blank lines are ignored.

    Requires load_dictionaries() to have populated `dictionary` first.
    """
    raw_words=[]
    # Plain text mode: the 'U' flag is deprecated and rejected by Python 3.11+.
    with open(infile) as f:
        for line in f:
            # Strip only the line terminator. The previous line[:-1] also
            # removed the last character of a final line without a newline.
            word=line.rstrip('\r\n').split('\t')[0]
            if word:
                raw_words.append(word)
    for lan in languages:
        words[lan]=[]
    for w in raw_words:
        # the English vector is shared by every language; look it up once
        en_vec=dictionary['en'][w]
        for lan in languages:
            if lan=='en':
                words[lan].append(en_vec)
            else:
                nearest=dictionary[lan].translate_nearest_neighbour(en_vec)
                words[lan].append(dictionary[lan][nearest])


Finally, we represent sentences with the aligned vectors, and save the resulting representations to the output files.

In [4]:
#first load variables and dictionaries
load_filenames()
load_dictionaries()
load_words()
# number of watch-words, i.e. the length of every sentence feature vector
l=len(words['en'])

# print() with a single argument behaves the same on Python 2 and Python 3;
# the bare `print l` statement is a SyntaxError on Python 3
print(l)

reading word vectors from vectors/wiki.en.vec
reading word vectors from vectors/wiki.fr.vec
reading word vectors from vectors/wiki.it.vec
7225
9948
5288
154
28464
59987
10358
89121
31870
45996
3051
6576
10661
357622
10607
7502
16846
5207
33203
13212
7043
422539
8567
5256
6793
569057
1172
1587
14091
18829
4045
8080
7816
9443
12368
15399
14593
3283
24264
862502
11722
6576
20773
837565
13194
13406
1960
1931
20200
28605
128944
82366
11535
11756
69902
58994
31529
28408
43530
48850
51448
113235
9555
24597
467808
111692
6183
4350
4728
11495
15125
13956
9508
9904
5547
9220
9508
9904
3336
5035
1137
996
529
714520
5575
5493
26103
23027
76874
15797
4173
15797
12652
17585
8362
8340
13690
13926
16140
20690
22622
17380
25208
732
52


In [5]:
#for every language, generate aligned vectors for clean sentences and write to file
def _lookup_vector(vectors, token):
    """Return the vector of `token`, falling back to its lowercase form.

    Returns None when neither spelling is known.
    """
    for candidate in (token, token.lower()):
        try:
            return vectors[candidate]
        except KeyError:
            # NOTE(review): assumes an unknown word raises KeyError, as a
            # dict-backed lookup does - confirm against FastVector.__getitem__
            continue
    return None


for lan in languages:
    lang_vectors=dictionary[lan]
    lang_words=words[lan]
    n_features=len(lang_words)
    # both files are closed even if a line fails to process
    with open(outfiles[lan],'w') as fo, open(filenames[lan]) as f:
        #for every sentence in the clean file
        for line in f:
            # Strip only the line terminator. line[:-1] also removed the last
            # character of a final line without a trailing newline.
            row=line.rstrip('\r\n').split('\t')
            #isolate the text (second-to-last column) and split into words
            rowwords=row[-2].split()
            #populate vector with the sum of similarities to every watch-word
            outvec=np.zeros(n_features)
            count=0
            for w in rowwords:
                vec=_lookup_vector(lang_vectors,w)
                if vec is None:
                    # word unknown in both spellings: contributes nothing
                    continue
                for i,watch_vec in enumerate(lang_words):
                    outvec[i]+=FastVector.cosine_similarity(vec,watch_vec)
                # NOTE(review): count grows by one per (word, watch-word) pair,
                # so the mean below is scaled by 1/n_features on top of the
                # number of matched words. Kept as-is so existing feature files
                # stay comparable - confirm whether per-word counting was meant.
                count+=n_features

            #divide by the accumulated count
            if count>0:
                outvec/=count
            else:
                # sentences without any known word get an all-ones vector
                outvec=np.ones(n_features)
            # NaN never compares equal to itself, so `outvec == np.nan` could
            # not match anything; isfinite covers NaN, +inf and -inf at once
            outvec[~np.isfinite(outvec)]=0
            #build a comma-separated string for the sentence vector
            out=','.join([str(c) for c in outvec])
            #rebuild output row: original columns minus the text, plus the vector
            outstring='\t'.join(row[:-2])+'\t'+row[-1]+'\t'+out+'\n'
            #write to file
            fo.write(outstring)
            