### A translator based on matrix operations

#### 1. Read the data and build the dictionary and embedding

In [1]:
# word id sorted according to frequency
%load_ext autoreload
%autoreload
import unicodecsv as csv
import codecs
import numpy as np
from scipy.linalg import expm
from scipy.special import expit as sigmoid

In [2]:
MAX_SENTENCE_LEN = 30  # every sentence is truncated to this many tokens


def _load_corpus(path, max_len=MAX_SENTENCE_LEN):
    """Read a space-tokenised corpus and build its vocabulary.

    Each line of ``path`` is split on single spaces by ``csv.reader`` and
    truncated to ``max_len`` tokens. Non-empty tokens are counted and given
    ids 1..V in ascending order of their count (the rarest word gets id 1).
    Id 0 is reserved for the empty token ''.

    Returns:
        (sentences, w2i, i2w): the list of token lists, the word -> id dict
        and the id -> word dict.
    """
    sentences = []
    counts = {}  # word count
    with codecs.open(path, 'r') as f:
        for row in csv.reader(f, delimiter=' '):
            row = row[:max_len]
            sentences.append(row)
            for w in row:
                if len(w) > 0:
                    counts[w] = counts.get(w, 0) + 1

    w2i = {'': 0}
    i2w = {0: ''}
    for idx, w in enumerate(sorted(counts.keys(), key=counts.get), 1):
        w2i[w] = idx
        i2w[idx] = w
    return sentences, w2i, i2w


Source, W2I_source, I2W_source = _load_corpus('data/training-es.txt')
nw_source = len(W2I_source)
print(len(Source), nw_source)

Target, W2I_target, I2W_target = _load_corpus('data/training-en.txt')
nw_target = len(W2I_target)
print(len(Target), nw_target)

N = len(Source)
# get the maximum length of a sentence
# (builtin max works whether map() returns a list or an iterator)
maxL_source = max(map(len, Source))
maxL_target = max(map(len, Target))
print(maxL_source,maxL_target)

(9900, 687)
(9900, 514)
(30, 30)


In [3]:
# Embedding from nearest-neighbour transitions (skip-gram-like):
# A[i, j] counts how often word j directly follows word i; id 0 ('') marks the
# end-of-sentence padding.
A_source = np.zeros([nw_source,nw_source])
A_target = np.zeros([nw_target,nw_target])
# Alternative initialisation that was tried: the identity, np.diag(np.ones(nw)).

pad_source = W2I_source['']
for row in Source:
    n_tokens = len(row)
    # NOTE(review): up to two pad->pad steps are counted per sentence here
    # (len(row)+2) while the target loop uses len(row)+1; kept unchanged.
    for i in range(min(maxL_source, n_tokens + 2)):
        if i < n_tokens - 1:
            # word -> following word
            A_source[W2I_source[row[i]],W2I_source[row[i+1]]] += 1.0
        elif i == n_tokens - 1:
            # last word -> padding
            A_source[W2I_source[row[i]],pad_source] += 1.0
        else:
            # padding -> padding
            A_source[pad_source,pad_source] += 1.0

decay_rate = 1.0

# Scale every row to unit L2 norm. A row with no counts is left as zeros:
# dividing it by its zero norm would produce NaNs that spread through expm().
for i in range(A_source.shape[0]):
    row_norm = np.linalg.norm(A_source[i,:])
    if row_norm > 0:
        A_source[i,:] = A_source[i,:]/row_norm
    
def series(A,n=2,beta=0.5):
    """Truncated matrix power series A + beta*A^2 + ... + beta^(n-1)*A^n.

    With the default n=2 this is A + beta*A.dot(A), the value the previous
    implementation produced. Unlike that version, the input array is not
    modified and integer arrays are accepted (the result is floating point).

    Args:
        A: square 2-D array.
        n: highest matrix power included; values below 2 return a copy of A.
        beta: geometric weight applied once per additional power.

    Returns:
        A new array holding the summed series.
    """
    base = np.asarray(A)
    total = base * 1.0  # fresh float copy, so the caller's array stays untouched
    term = total
    for _ in range(n - 1):
        term = beta * term.dot(base)
        total = total + term
    return total

# Turn the normalised source transition matrix into multi-step similarities with
# the matrix exponential, then drop self-similarity.
# Alternatives that were tried instead of expm: series(A), np.triu(A, 1),
# sigmoid(A) and np.linalg.pinv(I - decay_rate*A).
A_source = expm(decay_rate*A_source)
np.fill_diagonal(A_source,0)

pad_target = W2I_target['']
for row in Target:
    n_tokens = len(row)
    # At most one pad->pad step is counted per sentence (len(row)+1).
    for i in range(min(maxL_target, n_tokens + 1)):
        if i < n_tokens - 1:
            # word -> following word
            A_target[W2I_target[row[i]],W2I_target[row[i+1]]] += 1.0
        elif i == n_tokens - 1:
            # last word -> padding
            A_target[W2I_target[row[i]],pad_target] += 1.0
        else:
            # padding -> padding
            A_target[pad_target,pad_target] += 1.0

# Scale every row to unit L2 norm. A row with no counts is left as zeros:
# dividing it by its zero norm would produce NaNs that spread through expm().
for i in range(A_target.shape[0]):
    row_norm = np.linalg.norm(A_target[i,:])
    if row_norm > 0:
        A_target[i,:] = A_target[i,:]/row_norm

A_target = expm(decay_rate*A_target)
np.fill_diagonal(A_target,0)

In [4]:
# Compress each row of the similarity matrices into a word embedding of
# dimension word_emb_dim (64) with an RBF-kernel PCA.
from sklearn.decomposition import PCA, KernelPCA
word_emb_dim = 64
pca = KernelPCA(n_components=word_emb_dim, kernel='rbf')
Emb_source = pca.fit_transform(A_source)
Emb_target = pca.fit_transform(A_target)

# Rescale every embedding vector to unit L2 norm, first source then target.
for emb in (Emb_source, Emb_target):
    for k in range(emb.shape[0]):
        emb[k,:] = emb[k,:]/np.linalg.norm(emb[k,:])

print(Emb_source.shape)
print(Emb_target.shape)

(687, 64)
(514, 64)


In [5]:
# get the embedding of inputs X and outputs Y

def _embed_sentence(tokens, w2i, emb, max_len):
    """Concatenate the embeddings of the first ``max_len`` tokens into one vector.

    Positions past the end of the sentence use row 0 of ``emb`` (the id of the
    empty token ''). The result has length max_len * emb.shape[1].
    """
    max_len = int(max_len)
    ids = [w2i[w] for w in tokens[:max_len]]
    ids += [0] * (max_len - len(ids))
    return emb[ids, :].reshape(-1)


# X is sized by the SOURCE sentence length. It used to be allocated with
# maxL_target, which only worked because both maxima happen to be equal.
X = np.zeros([N,maxL_source*word_emb_dim])
Y = np.zeros([N,maxL_target*word_emb_dim])
for i in range(N):
    X[i,:] = _embed_sentence(Source[i], W2I_source, Emb_source, maxL_source)
for i in range(N):
    Y[i,:] = _embed_sentence(Target[i], W2I_target, Emb_target, maxL_target)

In [6]:
from mltools.optimizers import adam

Loss = []  # loss values recorded by callback, one per reported step
def callback(X,Y,M,B,t):
    """Report the mean squared error of X.dot(M) + B against Y on every 50th step.

    On steps where t is a multiple of 50 the loss is appended to the
    module-level ``Loss`` list and printed; other steps do nothing.
    """
    if t % 50 != 0:
        return
    residual = X.dot(M) + B - Y
    mse = np.mean(residual**2)
    Loss.append(mse)
    print(('loss of step %d: ' % t) + str(mse))


In [7]:
# get the transformation matrix M
# (disabled experiment: PCA-compress X and Y before solving)
#pca = PCA(n_components=1000)
#X1 = pca.fit_transform(X.T).T
#Y1 = pca.fit_transform(Y.T).T

# Fraction of the sentence length kept by compress_time, which reduces each
# sentence to int(time_compress_rate*maxL_source) components.
time_compress_rate = 0.8

def compress_time(X,maxL_source,word_emb_dim,time_compress_rate):
    """Shrink every sentence along its position axis with a per-sentence PCA.

    Each row of X is read as maxL_source consecutive embeddings of size
    word_emb_dim. The positions are reduced to
    int(time_compress_rate*maxL_source) PCA components, every component is
    rescaled to unit L2 norm, and the result is flattened back into one row.

    Returns a float array with one compressed row per row of X.
    An SVD-based truncation was tried as an alternative and is not used.
    """
    n_keep = int(time_compress_rate*maxL_source)
    compressed_rows = []
    for sentence in X:
        # Transposed so the embedding dimensions are the PCA samples and the
        # sentence positions are the features being reduced.
        per_dim = sentence.reshape([maxL_source,word_emb_dim]).T
        reducer = PCA(n_components=n_keep)
        reduced = reducer.fit_transform(per_dim).T
        # NOTE(review): a component that is numerically zero is divided by a
        # near-zero norm here -- confirm this is acceptable for short sentences.
        for j in range(reduced.shape[0]):
            reduced[j, :] = reduced[j, :]/np.linalg.norm(reduced[j, :])
        compressed_rows.append(reduced.flatten().tolist())
    return np.array(compressed_rows,dtype=float)
        
# Optional, currently disabled: compress the sentence axis first.
#X = compress_time(X,maxL_source,word_emb_dim,time_compress_rate)

# Least-squares solution of X.dot(M) = Y through the pseudo-inverse of X.
invX = np.linalg.pinv(X)
M = invX.dot(Y)
loss = np.mean((X.dot(M) - Y)**2)
# print(...) with a single argument gives the same output as the old print
# statement and matches the print calls used in the other cells.
print(loss)

# Disabled alternative: fit M and a bias B by gradient descent with adam.
#eta = 1e-2 # learning rate
#M, B = adam(X,Y,batch_size=10, eta=eta, n_iters=2000, callback=callback)
#loss = np.mean((X.dot(M) + B - Y)**2)
#print(loss)
#print(Loss)
#Y = Y - B


0.00134638903527


In [8]:
# Replace M by its rank-nhidden approximation (largest singular values only)
# and report how much the fit degrades.
nhidden = 512
U,s,V = np.linalg.svd(M,full_matrices=True)
# Scaling the columns of U by s equals U.dot(np.diag(s)) without building the
# dense diagonal matrix.
M = (U[:,:nhidden]*s[:nhidden]).dot(V[:nhidden,:])
loss = np.mean((X.dot(M) - Y)**2)
# print(...) with a single argument gives the same output as the old print
# statement and matches the print calls used in the other cells.
print(loss)
# Disabled alternative: refine M and a bias B with adam.
#M, B = adam(X,Y,M,B)
#loss = np.mean((X.dot(M) + B - Y)**2)
#print(loss)

0.00194907996143


In [9]:
# translate test
def trans(s):
    """Translate one source sentence with the linear map M.

    Args:
        s: a space-separated string, or an already tokenised list of words.
           Only the first maxL_source tokens are used.

    Returns:
        The maxL_target decoded target words joined by single spaces. Positions
        decoded as id 0 ('') contribute empty strings, so the result can end in
        trailing spaces.

    Words missing from W2I_source are embedded like padding (row 0 of
    Emb_source) instead of raising KeyError.
    """
    try:
        text_types = (str, unicode)  # Python 2
    except NameError:
        text_types = (str,)          # Python 3 has no separate unicode type
    if isinstance(s, text_types):
        s = s.split(u' ')[:maxL_source]

    x = np.zeros([maxL_source*word_emb_dim])
    for i in range(maxL_source):
        # Row 0 serves both positions past the end of the sentence and unknown words.
        word_id = W2I_source.get(s[i], 0) if i < len(s) else 0
        x[i*word_emb_dim:(i+1)*word_emb_dim] = Emb_source[word_id,:]

    y = x.dot(M)
    W = []
    for i in range(maxL_target):
        # Decode each output slot as the target word whose embedding has the
        # largest dot product with the predicted vector.
        scores = Emb_target.dot(y[i*word_emb_dim:(i+1)*word_emb_dim])
        W.append(I2W_target[int(np.argmax(scores))])
    return u' '.join(W)

In [10]:
# NOTE: this re-reads the training files, so the printed translations show the
# fit on sentences the model has already seen, not on held-out data.
Test_source = []
Test_target = []
with codecs.open('data/training-es.txt','r') as f:
    for row in csv.reader(f,delimiter=' '):
        Test_source.append(row)
with codecs.open('data/training-en.txt','r') as f:
    for row in csv.reader(f,delimiter=' '):
        Test_target.append(row)

# Show at most 200 sentence pairs, and never index past the end of either file.
n_show = min(200, len(Test_source), len(Test_target))
for i in range(n_show):
    print('_______________')
    print('Player 1: '+ ' '.join(Test_source[i][:30]))
    print('Predicted: '+trans(Test_source[i][:30]))
    print('     True: '+ ' '.join(Test_target[i][:30]))

_______________
Player 1: '? le importar'ia darnos las llaves de la habitaci'on , por favor ?
Predicted: would you mind giving us our bags to the room form reception available                 
     True: would you mind giving us the keys to the room , please ?
_______________
Player 1: he hecho la reserva de una habitaci'on tranquila doble con tel'efono y televisi'on a nombre de Rosario Cabedo .
Predicted: I have made a reservation for a quiet , single room with a telephone and a tv for Rosario evening ?         
     True: I have made a reservation for a quiet , double room with a telephone and a tv for Rosario Cabedo .
_______________
Player 1: '? le importar'ia cambiarme a otra habitaci'on m'as tranquila ?
Predicted: would you mind giving us to a warmer room .                    
     True: would you mind moving me to a quieter room ?
_______________
Player 1: por favor , tengo reservada una habitaci'on .
Predicted: I have booked a room !                        
     True: I have bo

     True: is there a quiet room available ?
_______________
Player 1: tengo reservada una habitaci'on tranquila con televisi'on y una buena vista de la monta~na a nombre de Rosario Cantero .
Predicted: I have booked a quiet room with a tv and a good view of the mountain for Rosario Valls !          
     True: I have booked a quiet room with a tv and a good view of the mountain for Rosario Cantero .
_______________
Player 1: '? tiene libre una habitaci'on doble ?
Predicted: would you have a double room available .                      
     True: do you have a double room available ?
_______________
Player 1: hice una reserva , por favor .
Predicted: I made a reservation !                         
     True: I made a reservation .
_______________
Player 1: '? est'a apuntado el recibo del tel'efono ?
Predicted: is the phone bill included .                        
     True: is the phone bill included ?
_______________
Player 1: '? puede darnos usted la llave de la habitaci'on n'umero d

Player 1: he reservado una habitaci'on doble tranquila a nombre de Enrique Velasco .
Predicted: I have booked a quiet room rooms room for Enrique ?                   
     True: I have booked a quiet , double room for Enrique Velasco .
_______________
Player 1: '? podr'ia bajar mis bolsas ?
Predicted: could you put our bags travel make                       
     True: could you send my bags down ?
_______________
Player 1: por favor , '? le importar'ia darnos la llave de nuestra habitaci'on ?
Predicted: would you mind giving me our luggage my the week form please !                 
     True: would you mind giving us the key to our room , please ?
_______________
Player 1: por favor , quisi'eramos una habitaci'on doble para una noche .
Predicted: I want a double room room week a week form                    
     True: we want a double room for a night , please .
_______________
Player 1: por favor , quisiera cambiarme a otra habitaci'on con televisi'on .
Predicted: we want to move to

_______________
Player 1: por favor , reserv'e una habitaci'on .
Predicted: I with a room !  registration                       
     True: I booked a room .
_______________
Player 1: '? est'a incluido el desayuno ?
Predicted: is the phone bill included ?                        
     True: is breakfast included ?
_______________
Player 1: la habitaci'on es muy cara .
Predicted: the room is too cold .                        
     True: the room is very expensive .
_______________
Player 1: nos vamos a ir el d'ia trece de septiembre a las ocho menos cuarto de la ma~nana .
Predicted: we are leave on September the tomorrow at a quarter to twelve in the afternoon ?              
     True: we are leaving on September the thirteenth at a quarter to eight in the morning .
_______________
Player 1: por favor , '? me puede despertar a las once y media ?
Predicted: can you giving me up at at room five week form ! registration                 
     True: can you wake me up at half past eleven , p

In [11]:
# Scratch: random 3x3 matrix with entries 0 or 1 (no seed is set, so it changes per run).
a = np.random.randint(0,2,[3,3])

In [12]:
# Scratch: display the sampled matrix.
a

array([[0, 0, 1],
       [0, 1, 1],
       [0, 0, 0]])

In [13]:
# Scratch: matrix product of a with itself.
a.dot(a)

array([[0, 0, 0],
       [0, 1, 1],
       [0, 0, 0]])

In [14]:
# Scratch: second matrix power of a, for comparison with a.dot(a).
np.linalg.matrix_power(a,2)

array([[0, 0, 0],
       [0, 1, 1],
       [0, 0, 0]])

In [15]:
# Scratch: check that a target embedding row has unit L2 norm after normalisation.
np.linalg.norm(Emb_target[2,:])

1.0

In [16]:
# Scratch: inspect a single entry of A_source (after expm and the zeroed diagonal).
A_source[10,45]

3.5560899204185101e-06

In [17]:
# Scratch: np.triu(..., 1) keeps only the entries strictly above the diagonal.
np.triu(np.random.random([4,4]),1)

array([[ 0.        ,  0.91641113,  0.2202511 ,  0.2605876 ],
       [ 0.        ,  0.        ,  0.71160264,  0.25066674],
       [ 0.        ,  0.        ,  0.        ,  0.08854933],
       [ 0.        ,  0.        ,  0.        ,  0.        ]])