In [1]:
import numpy as np
import pandas as pd
from pickle import load
import os
from sklearn.model_selection import train_test_split



In [2]:
def get_dict(file_name1,file_name2):
    """
    Build one English -> French dictionary from two space-delimited word-pair files.

    Each file is read with ``pd.read_csv(..., delimiter=' ')``. The first column
    supplies the dictionary keys (English words) and the second column supplies
    the values (French words).

    Parameters
    ----------
    file_name1, file_name2 : str or path-like
        Files to read. Rows of the second file are stacked below the rows of
        the first one.

    Returns
    -------
    dict
        Maps first-column values to second-column values. When a key occurs
        more than once, the last occurrence wins.

    Notes
    -----
    ``read_csv`` is called with its default header handling, so the first line
    of each file is consumed as column labels and never reaches the dictionary.
    NOTE(review): confirm the files really start with a header line; if they do
    not, ``header=None`` is needed to keep the first word pair of each file.
    """
    my_file1 = pd.read_csv(file_name1, delimiter=' ')
    my_file2 = pd.read_csv(file_name2, delimiter=' ')

    # Stack the raw values positionally. The two frames get their column labels
    # from their own first line, so a label-aligned concat could misalign them.
    pairs = np.concatenate((my_file1, my_file2))
    if len(pairs) == 0:
        return {}

    # One pass over the two columns instead of a per-row `.loc[i][0]` lookup,
    # which builds a throw-away Series for every row.
    return dict(zip(pairs[:, 0], pairs[:, 1]))

In [3]:
# Work from the Kaggle dataset directory so the bare file names below resolve.
# NOTE: this changes the process-wide working directory for every later cell.
os.chdir(r'/kaggle/input/embedding-and-data')
train_data=get_dict('en-fr.train.txt','en-fr.test.txt')

# SECURITY: pickle.load can execute arbitrary code while deserialising; only
# load embedding files that come from a trusted source.
# The `with` blocks close each file handle as soon as it has been read
# (the bare `load(open(...))` form left both handles open).
with open('en_embeddings.p','rb') as en_file:
    embedding_en=load(en_file)
with open('fr_embeddings.p','rb') as fr_file:
    embedding_fr=load(fr_file)

In [4]:
# Sanity check: number of distinct English keys in the merged dictionary.
# NOTE(review): the two commented-out lines below are a stale draft; `get_matrices`
# is not defined in this notebook (`create_matric` is the function used later).
#X_train, Y_train = get_matrices(train_data, embedding_en, embedding_fr)
#X_train.shape
len(train_data)

6500

In [5]:
def create_matric(data,embedding_en,embedding_fr):
    """
    Turn a word-pair dictionary into two aligned embedding matrices.

    A pair is kept only when its key is a key of ``embedding_en`` and its value
    is a key of ``embedding_fr``; every other pair is skipped.

    Parameters
    ----------
    data : dict
        Maps an English word to its French translation.
    embedding_en, embedding_fr : mapping
        Word -> vector lookups for the two languages.

    Returns
    -------
    X : np.ndarray
        English vectors of the kept pairs, one per row.
    Y : np.ndarray
        French vectors of the kept pairs, row-aligned with ``X``.
    index : list of int
        For each kept row, its position in the iteration order of ``data``.
    """
    known_en = set(embedding_en.keys())
    known_fr = set(embedding_fr.keys())

    # (position in data, english word, french word) for every usable pair
    kept = [
        (position, en_word, fr_word)
        for position, (en_word, fr_word) in enumerate(data.items())
        if en_word in known_en and fr_word in known_fr
    ]

    X = np.array([embedding_en[en_word] for _, en_word, _ in kept])
    Y = np.array([embedding_fr[fr_word] for _, _, fr_word in kept])
    index = [position for position, _, _ in kept]

    return X, Y, index
            

In [6]:
# Keep only the pairs whose two words both have an embedding; `index` maps each
# kept row back to its position in train_data.
data,label,index=create_matric(train_data,embedding_en,embedding_fr)
# Fixed random_state so the 80/20 split (and therefore the training log and the
# reported accuracy) is the same on every Restart & Run All.
X_train,X_test,y_train,y_test=train_test_split(data,label,test_size=.2,random_state=129)

In [7]:
# Sanity check: (number of training pairs, length of each English vector).
print(X_train.shape)

(5096, 300)


In [8]:
def pred(X,R):
    """Apply the transformation matrix ``R`` to ``X`` and return ``np.dot(X, R)``."""
    projected = np.dot(X, R)
    return projected
    

In [9]:
def loss(X,Y,R):
    """
    Sum of squared errors between ``pred(X, R)`` and ``Y``, divided by ``len(Y)``.

    Parameters
    ----------
    X : array
        Input vectors, one per row.
    Y : array
        Target vectors, row-aligned with ``X``.
    R : array
        Transformation matrix applied to ``X``.

    Returns
    -------
    float
        Total squared error over all entries, averaged over the rows of ``Y``.
    """
    residual = pred(X, R) - Y
    return np.sum(residual ** 2) / len(Y)
    

In [10]:
def gredient(X,R,Y):
    """
    Gradient with respect to ``R`` of ``sum((X.R - Y)**2) / len(Y)``.

    Parameters
    ----------
    X : np.ndarray
        Input vectors, one per row.
    R : array
        Current transformation matrix.
    Y : array
        Target vectors, row-aligned with ``X``.

    Returns
    -------
    np.ndarray
        ``X.T . (X.R - Y) * 2 / len(Y)``.
    """
    n_rows = len(Y)
    residual = np.dot(X, R) - Y
    return np.dot(X.T, residual) * 2 / n_rows

In [11]:
def training(X,Y,num_iteration,alpha,seed=129,log_every=25):
    """
    Fit a transformation matrix ``R`` by plain gradient descent on ``loss``.

    Parameters
    ----------
    X : np.ndarray
        Input vectors, one per row.
    Y : array
        Target vectors, row-aligned with ``X``.
    num_iteration : int
        Number of gradient steps.
    alpha : float
        Learning rate; each step does ``R -= alpha * gradient``.
    seed : int, optional
        Seed passed to ``np.random.seed`` before ``R`` is initialised. Note that
        this reseeds NumPy's global random state.
    log_every : int, optional
        Print the loss every ``log_every`` iterations (0 or None disables logging).

    Returns
    -------
    np.ndarray
        The fitted matrix, of shape (width of X, width of Y).
    """
    np.random.seed(seed)
    # Size R from both operands so source and target vectors may have different
    # widths; when the widths are equal this draws exactly the same values as
    # the former (X.shape[1], X.shape[1]) initialisation.
    R=np.random.rand(X.shape[1],np.shape(Y)[1])
    for i in range(num_iteration):
        if log_every and i%log_every==0:
            print(f'loss in iteration number {i} is {loss(X,Y,R)}')
        gred=gredient(X,R,Y)
        R-=alpha*gred
    return R

In [12]:
def cosine_similarity (X,Y):
    """
    Cosine similarity ``X.Y / (|X| * |Y|)`` of two vectors.

    Parameters
    ----------
    X, Y : array
        Vectors to compare.

    Returns
    -------
    float or np.ndarray
        The similarity. When either input has zero norm the result is 0
        instead of NaN.
    """
    dot=np.dot(X,Y)

    norm_X,norm_Y=np.linalg.norm(X),np.linalg.norm(Y)
    denominator=norm_X*norm_Y

    # A zero vector would give 0/0 = NaN. A caller that sorts with np.argsort
    # and reverses the result would then rank that NaN first, i.e. treat the
    # zero vector as the best match. Report "no similarity" instead.
    if denominator==0:
        return np.zeros(np.shape(dot)) if np.ndim(dot) else 0.0

    return dot/denominator


In [13]:
def get_words(index,Y):
    """
    Select ``Y[[index]]``.

    The index is wrapped in a list before indexing, so for an integer ``index``
    and a NumPy array ``Y`` the result is a one-row array rather than a bare row.
    """
    selector = [index]
    return Y[selector]

In [14]:
def KNN(x,y,k=1):
    """
    Indices of the ``k`` rows of ``x`` most similar to the vector ``y``.

    Parameters
    ----------
    x : iterable of vectors
        Candidate vectors.
    y : vector
        Query vector.
    k : int, optional
        Number of neighbours to return.

    Returns
    -------
    np.ndarray
        Positions in ``x``, ordered from highest to lowest cosine similarity.
    """
    scores = [cosine_similarity(candidate, y) for candidate in x]

    # argsort is ascending, so reverse it to put the best match first
    ranked = np.argsort(scores)[::-1]
    return ranked[:k]

In [15]:
# Fit the transformation matrix R: 400 gradient-descent steps, learning rate 0.8.
R=training(X_train,y_train,400,0.8)

loss in iteration number 0 is 958.599928187228
loss in iteration number 25 is 97.27860340539138
loss in iteration number 50 is 26.264494442948283
loss in iteration number 75 is 9.517427492847206
loss in iteration number 100 is 4.231978909159563
loss in iteration number 125 is 2.2378542822680556
loss in iteration number 150 is 1.388812657802699
loss in iteration number 175 is 0.9948144228812664
loss in iteration number 200 is 0.7998853683323187
loss in iteration number 225 is 0.6985197560624948
loss in iteration number 250 is 0.6436374151948009
loss in iteration number 275 is 0.6129000447998622
loss in iteration number 300 is 0.5951775844531132
loss in iteration number 325 is 0.5846966772878475
loss in iteration number 350 is 0.5783584314043365
loss in iteration number 375 is 0.5744491795272226


In [16]:
def accuracy(X,Y):
    """
    Fraction of rows of ``X`` whose projection lands nearest to its own target.

    Each row of ``X`` is projected with the notebook-level matrix ``R``; the
    row counts as correct when the single nearest neighbour of that projection
    among the rows of ``Y`` (by ``KNN``) sits at the same position.

    Parameters
    ----------
    X : array
        Input vectors, one per row.
    Y : array
        Target vectors, row-aligned with ``X``.

    Returns
    -------
    float
        Number of correct rows divided by ``len(X)``.
    """
    hits = sum(
        1
        for position, vector in enumerate(X)
        if KNN(Y, pred(vector, R), k=1) == position
    )
    return hits / len(X)

In [17]:
accuracy=accuracy(X_test,y_test)

In [18]:
accuracy

0.5934065934065934

In [19]:
def get_translate(word,all_data,index):
    """
    Translate an English word through the learned matrix ``R``.

    The word's vector (from the notebook-level ``embedding_en``) is projected
    with ``R``, the nearest row of the notebook-level ``label`` matrix is found
    with ``KNN``, and that row is mapped back to a dictionary entry.

    Parameters
    ----------
    word : str
        English word; must be a key of ``embedding_en``.
    all_data : dict
        The English -> French dictionary the matrices were built from.
    index : list of int
        For each row of ``label``, its position in ``all_data``.

    Returns
    -------
    The value stored in ``all_data`` for the matched entry.
    """
    english_keys = list(all_data)
    projected = pred(embedding_en[word], R)
    nearest_row = KNN(label, projected)[0]
    # translate the matrix row back to a position in the dictionary's key order
    dict_position = index[nearest_row]
    return all_data[english_keys[dict_position]]
    
    

In [20]:
# Example lookup: translate the English word 'but' using the full dictionary.
get_translate('but',train_data,index)

'mais'