In [1]:
from gensim.models import KeyedVectors
import numpy as np


# Load the English and French word vectors stored in word2vec format.
# Every later cell reads these two module-level objects.
en_emb = KeyedVectors.load_word2vec_format("cc.en.300.vec")
fr_emb = KeyedVectors.load_word2vec_format("cc.fr.300.vec")

In [2]:
# Look up the vector of a single word; the cell displays its shape and first five components.
august_embedding = en_emb["august"]
august_embedding.shape, august_embedding[:5]

((300,), array([-0.0522,  0.0364, -0.1252,  0.0053,  0.0382], dtype=float32))

In [3]:
# Nearest English neighbours of the vector; when querying by vector the word itself comes back first.
en_emb.most_similar([august_embedding])

[('august', 1.0000001192092896),
 ('september', 0.8252838253974915),
 ('october', 0.8111194372177124),
 ('june', 0.8050148487091064),
 ('july', 0.7970553636550903),
 ('november', 0.7883636355400085),
 ('february', 0.7831972241401672),
 ('december', 0.7824541330337524),
 ('january', 0.774315595626831),
 ('april', 0.7621644139289856)]

In [4]:
# Same query, restricted to the three closest entries via `topn`.
en_emb.most_similar([august_embedding], topn=3)

[('august', 1.0000001192092896),
 ('september', 0.8252838253974915),
 ('october', 0.8111194372177124)]

In [5]:
# Indexing with a list of words returns one row per word.
en_emb[["august", "september"]].shape

(2, 300)

In [6]:
# Nearest French neighbours of the French word "aout".
fr_emb.most_similar([fr_emb["aout"]])

[('aout', 1.0),
 ('Aout', 0.8249963521957397),
 ('juillet', 0.8109882473945618),
 ('fevrier', 0.8072444200515747),
 ('septembre', 0.7838519811630249),
 ('août', 0.779176652431488),
 ('juin', 0.7692081332206726),
 ('octobre', 0.7597455382347107),
 ('decembre', 0.7595792412757874),
 ('avril', 0.7390779256820679)]

In [7]:
# Query the French space with an *unmapped* English vector: the neighbours are unrelated and the
# similarities are low, which is why a mapping between the two spaces has to be learned.
fr_emb.most_similar([en_emb["august"]])

[('2003Pays', 0.23082853853702545),
 ('Montsoriu', 0.22505582869052887),
 ('2015Pays', 0.22218404710292816),
 ('2013Genre', 0.2095685452222824),
 ('AdiCloud', 0.20186512172222137),
 ('Bagua', 0.20061466097831726),
 ('2003Paysans', 0.2001495510339737),
 ('ValenceLa', 0.2001475840806961),
 ('Luddites', 0.19998176395893097),
 ('Guadalquivir', 0.1987551599740982)]

In [8]:
def load_word_pairs(filename):
    """Read a bilingual dictionary and look up embeddings for each pair.

    :args:
        filename - path to a text file with one "<english> <french>" pair per
            line, separated by whitespace
    :returns:
        en_fr_pairs - list of (english, french) tuples kept from the file
        en_vectors - np.array with the English embedding of every kept pair
        fr_vectors - np.array with the French embedding of every kept pair
    :raises:
        ValueError - if a non-blank line does not contain exactly two fields

    Pairs whose English word is missing from ``en_emb`` or whose French word
    is missing from ``fr_emb`` are skipped. Blank lines are ignored.
    """
    en_fr_pairs = []
    en_vectors = []
    fr_vectors = []
    # Decode explicitly instead of relying on the platform default encoding.
    # NOTE(review): assumes the dictionary files are UTF-8 - confirm.
    with open(filename, "r", encoding="utf-8") as inpf:
        for line_number, line in enumerate(inpf, start=1):
            # split() without arguments tolerates tabs and repeated spaces.
            fields = line.split()
            if not fields:
                # Blank line (e.g. a trailing newline at the end of the file).
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{filename}:{line_number}: expected 2 fields, got {len(fields)}"
                )
            en, fr = fields
            if en not in en_emb or fr not in fr_emb:
                continue
            en_fr_pairs.append((en, fr))
            en_vectors.append(en_emb[en])
            fr_vectors.append(fr_emb[fr])
    return en_fr_pairs, np.array(en_vectors), np.array(fr_vectors)

In [9]:
# Build the train/test word pairs and their embedding matrices (X: English rows, Y: French rows).
en_fr_train, X_train, Y_train = load_word_pairs("en-fr.train.txt")
en_fr_test, X_test, Y_test = load_word_pairs("en-fr.test.txt")

In [13]:
# Peek at the first test pairs; one English word may appear with several French translations.
en_fr_test[:5]

[('torpedo', 'torpille'),
 ('torpedo', 'torpilles'),
 ('giovanni', 'giovanni'),
 ('chat', 'discuter'),
 ('chat', 'discussion')]

In [14]:
# Peek at a slice of the training pairs.
en_fr_train[33:44]

[('which', 'laquelle'),
 ('which', 'lequel'),
 ('also', 'aussi'),
 ('also', 'egalement'),
 ('but', 'mais'),
 ('have', 'avoir'),
 ('have', 'ont'),
 ('one', 'un'),
 ('one', 'une'),
 ('one', 'one'),
 ('new', 'nouveau')]

Let $x_i \in \mathbb{R}^d$ be the distributed representation of word $i$ in the source language, and let $y_i \in \mathbb{R}^d$ be the vector representation of its translation. Our goal is to learn a linear transform $W$ that minimizes the Euclidean distance between $Wx_i$ and $y_i$ for some subset of word embeddings. Thus we can formulate the so-called [Procrustes problem](https://en.wikipedia.org/wiki/Orthogonal_Procrustes_problem):

$$W^*= \arg\min_W \sum_{i=1}^n\|Wx_i - y_i\|_2^2$$

or

$$W^*= \arg\min_W \|XW^T - Y\|_F$$

where $\|\cdot\|_F$ denotes Frobenius norm.

> **Note:** in the second formula, $W$ and $x$ seem to have switched places. This happens because the matrix $X$ stores the objects $x_i$ as *rows*, not *columns*, i.e. its rows are $x_i^T$. This means that $X \in \mathbb{R}^{N \times D}$, where $N$ is the number of items and $D$ is the embedding dimensionality. The same is true for $Y$.

$W^*= \arg\min_W \sum_{i=1}^n\|Wx_i - y_i\|_2^2$ is an ordinary multi-output linear regression without a bias term. `sklearn` lets you turn off the bias in `LinearRegression` via the `fit_intercept` argument (sklearn simply calls the bias the intercept). So let's code.

In [25]:
# NOTE(review): `mapping` is the LinearRegression fitted in a later cell (In [19]);
# on a fresh kernel that cell has to run before this one.
# Map the English vector into the French space and list its nearest French words.
august = mapping.predict(en_emb["august"].reshape(1, -1))
fr_emb.most_similar(august)

[('aout', 0.7475447654724121),
 ('juin', 0.7295001149177551),
 ('juillet', 0.7226635813713074),
 ('septembre', 0.722636342048645),
 ('mars', 0.7154141068458557),
 ('octobre', 0.7128994464874268),
 ('novembre', 0.7042980194091797),
 ('février', 0.7007734775543213),
 ('avril', 0.699772298336029),
 ('janvier', 0.6992713809013367)]

In [19]:
from sklearn.linear_model import LinearRegression

# Least-squares linear map from English to French vectors, fitted without an intercept term.
mapping = LinearRegression(fit_intercept=False)

mapping.fit(X_train, Y_train)
# Project the held-out English vectors into the French space.
Y_pred = mapping.predict(X_test)


def precision(pairs, mapped_vectors, topn=1):
    """Compute precision@topn of mapped vectors against reference translations.

    :args:
        pairs - sequence of (source_word, target_word) tuples
        mapped_vectors - one vector per pair, to be queried against ``fr_emb``
        topn - number of nearest French neighbours inspected per vector
    :returns:
        fraction of pairs whose target word is among the ``topn`` nearest
        neighbours of the corresponding vector; 0.0 when ``pairs`` is empty
    """
    assert len(pairs) == len(mapped_vectors)
    total = len(pairs)
    if total == 0:
        # Nothing to score; avoids a ZeroDivisionError below.
        return 0.0

    correct = 0
    for pair, predicted_vector in zip(pairs, mapped_vectors):
        neighbors = fr_emb.most_similar([predicted_vector], topn=topn)
        target_word = pair[1]  # the French word
        if any(word == target_word for word, _ in neighbors):
            correct += 1
    return correct / total



In [18]:
# Test data: precision@k of the linear-regression mapping.
precision_top1 = precision(en_fr_test, Y_pred, 1)
precision_top5 = precision(en_fr_test, Y_pred, 5)
precision_top10 = precision(en_fr_test, Y_pred, 10)

print(f"Precision@1: {precision_top1:.4f}")
print(f"Precision@5: {precision_top5:.4f}")
print(f"Precision@10: {precision_top10:.4f}")

# SVD: mapping built as U @ Vt from the SVD of X_train^T Y_train.
import numpy as np


cross_covariance = np.dot(X_train.T, Y_train)
U, _, Vt = np.linalg.svd(cross_covariance, full_matrices=False)
mapping_svd = np.dot(U, Vt)

# Row vectors are mapped by right-multiplication with mapping_svd.
Y_pred_svd = np.dot(X_test, mapping_svd)
precision_svd_top1 = precision(en_fr_test, Y_pred_svd, 1)
precision_svd_top5 = precision(en_fr_test, Y_pred_svd, 5)
precision_svd_top10 = precision(en_fr_test, Y_pred_svd, 10)

print("SVD: ")
print(f"Precision@1: {precision_svd_top1:.4f}")
print(f"Precision@5: {precision_svd_top5:.4f}")
print(f"Precision@10: {precision_svd_top10:.4f}")

Precision@1: 0.3403
Precision@5: 0.5994
Precision@10: 0.6830
SVD: 
Precision@1: 0.3467
Precision@5: 0.6454
Precision@10: 0.7245


In [26]:
# `august` is the regression-mapped vector computed earlier; "aout" must be among its top 5/9/10 neighbours.
assert precision([("august", "aout")], august, topn=5) == 1.0
assert precision([("august", "aout")], august, topn=9) == 1.0
assert precision([("august", "aout")], august, topn=10) == 1.0

In [27]:
# Sanity checks on the first 100 test pairs: unmapped English vectors never retrieve the French
# target, while the true French vectors always retrieve themselves.
assert precision(en_fr_test[:100], X_test[:100]) == 0.0
assert precision(en_fr_test[:100], Y_test[:100]) == 1.0

It can be shown that a self-consistent linear mapping between semantic spaces should be orthogonal. 
We can therefore restrict the transform $W$ to be orthogonal and solve the following problem:

$$(W^T)^*= \arg\min_{W^T} \|XW^T - Y\|_F \text{, where: } W^TW = I$$

$$I \text{- identity matrix}$$

Instead of fitting yet another regression, we can find the optimal orthogonal transformation using singular value decomposition. It turns out that the optimal transformation $W^*$ can be expressed via the SVD components:
$$X^TY=U\Sigma V^T\text{ (singular value decomposition)}$$
$$(W^T)^*=UV^T$$

In [28]:
# Cross-covariance X^T Y between the English and French training matrices.
cross_covariance = np.dot(X_train.T, Y_train)

In [29]:
# SVD of the cross-covariance: X^T Y = U diag(S) Vt.
U, S, Vt = np.linalg.svd(cross_covariance, full_matrices=False)

In [30]:
# Orthogonal mapping (W^T)* = U V^T; row vectors are mapped by right-multiplication.
mapping_svd = np.dot(U, Vt)

In [31]:
# Largest absolute deviation of mapping_svd @ mapping_svd.T from the identity matrix.
# The value is stored in `orthogonality_error`; this cell does not display it.
orthogonality_check = np.dot(mapping_svd, mapping_svd.T)
identity_diff = orthogonality_check - np.eye(orthogonality_check.shape[0])
orthogonality_error = np.max(np.abs(identity_diff))

In [32]:
# Map the held-out English vectors into the French space with the orthogonal mapping.
Y_pred_svd = np.dot(X_test, mapping_svd)

In [33]:
# Precision@1/5/10 of the SVD mapping on the test pairs (stored, not displayed by this cell).
precision_svd_top1 = precision(en_fr_test, Y_pred_svd, 1)
precision_svd_top5 = precision(en_fr_test, Y_pred_svd, 5)
precision_svd_top10 = precision(en_fr_test, Y_pred_svd, 10)

In [34]:
# Guarded lookup: map "august" with the SVD mapping and fetch its five nearest French words.
# The result is stored in `august_neighbors`; this cell does not display it.
if "august" in en_emb:
    august_emb = en_emb["august"]
    august_fr_emb = np.dot(august_emb, mapping_svd)
    august_neighbors = fr_emb.most_similar([august_fr_emb], topn=5)

In [36]:
# Nearest French words to "august" after applying the orthogonal (SVD) mapping.
fr_emb.most_similar([np.matmul(en_emb['august'], mapping_svd)])

[('aout', 0.6530280709266663),
 ('juin', 0.6380628943443298),
 ('juillet', 0.631451427936554),
 ('septembre', 0.6301831603050232),
 ('octobre', 0.6239124536514282),
 ('mars', 0.6188206672668457),
 ('août', 0.6144081354141235),
 ('novembre', 0.6125038862228394),
 ('fevrier', 0.6092208027839661),
 ('février', 0.6086474061012268)]

In [37]:
# Precision@1 and precision@5 of the SVD mapping on the first 100 test pairs.
print(precision(en_fr_test[:100], np.matmul(X_test[:100], mapping_svd)))
print(precision(en_fr_test[:100], np.matmul(X_test[:100], mapping_svd), 5))


0.36
0.71


Now, let's build our word-embedding-based translator!

Now let's translate these sentences word by word. Before that, however, don't forget to tokenize your sentences. For that you may (or may not) find `nltk.tokenize.WordPunctTokenizer` very useful.

In [None]:
import numpy as np
from nltk.tokenize import WordPunctTokenizer

# Characters that are copied to the output untouched when a token consists only of them.
_PUNCTUATION = frozenset(".,!?;:\"'()[]{}")


def _nearest_french_word(en_word):
    """Map the embedding of ``en_word`` with ``mapping_svd`` and return the closest French word."""
    fr_embedding = np.dot(en_emb[en_word], mapping_svd)
    return fr_emb.most_similar([fr_embedding], topn=1)[0][0]


def translate(sentence):
    """
    :args:
        sentence - sentence in English (str)
    :returns:
        translation - sentence in French (str), tokens joined by single spaces

    * find english embedding for each word in sentence
    * transform english embedding vector
    * find nearest french word and replace

    Punctuation tokens and words missing from ``en_emb`` are copied unchanged.
    A translated word is capitalised when the source token starts with an
    uppercase letter.
    """
    tokens = WordPunctTokenizer().tokenize(sentence)

    translated = []
    for token in tokens:
        # The tokenizer may emit runs such as "..." or "!!"; test every
        # character instead of relying on a substring match.
        if all(char in _PUNCTUATION for char in token):
            translated.append(token)
            continue

        # Prefer the lowercase form, then fall back to the other casings.
        candidates = (token.lower(), token, token.capitalize(), token.upper())
        source_word = next((word for word in candidates if word in en_emb), None)
        if source_word is None:
            # Out-of-vocabulary word: keep the original token.
            translated.append(token)
            continue

        fr_word = _nearest_french_word(source_word)
        if token[0].isupper() and fr_word:
            # Carry the capitalisation of the source token over to the translation.
            fr_word = fr_word[0].upper() + fr_word[1:]
        translated.append(fr_word)

    return " ".join(translated)


# Sample sentences used to demonstrate both translation functions.
english_sentences = [
    "Hello, how are you today?",
    "I like to read books about history.",
    "The weather is beautiful in Paris.",
    "Can you translate this document for me?",
    "She works as a software engineer."
]

# The printed heading is Russian for "Translation examples:".
print("Примеры переводов:")
print("-" * 50)

# Print each English sentence next to its word-by-word translation.
for sentence in english_sentences:
    translation = translate(sentence)
    print(f"EN: {sentence}")
    print(f"FR: {translation}")
    print("-" * 50)


def translate_improved(sentence):
    """
    Improved version of the translation function with extra preprocessing:
    URLs and @-mentions are removed before delegating to ``translate``.
    """
    import re

    cleaned = sentence
    # Strip "http..." links first, then "@..." mentions.
    for pattern in (r'http\S+', r'@\S+'):
        cleaned = re.sub(pattern, '', cleaned)
    return translate(cleaned)

# The printed heading is Russian for "Improved translation examples:".
print("\nПримеры улучшенных переводов:")
print("-" * 50)

# Same demo loop as before, using the preprocessing wrapper.
for sentence in english_sentences:
    translation = translate_improved(sentence)
    print(f"EN: {sentence}")
    print(f"FR: {translation}")
    print("-" * 50)

Примеры переводов:
--------------------------------------------------
EN: Hello, how are you today?
FR: Bonjour , comment sont vous hui ?
--------------------------------------------------
EN: I like to read books about history.
FR: Sje veux amener lire livres racontant histoire .
--------------------------------------------------
EN: The weather is beautiful in Paris.
FR: Dans météo est magnifique dans Londre .
--------------------------------------------------
EN: Can you translate this document for me?
FR: Peut vous traduire cette document pour me ?
--------------------------------------------------
EN: She works as a software engineer.
FR: Elle œuvres comme un logiciels ingénieur .
--------------------------------------------------

Примеры улучшенных переводов:
--------------------------------------------------
EN: Hello, how are you today?
FR: Bonjour , comment sont vous hui ?
--------------------------------------------------
EN: I like to read books about history.
FR: Sje veux 