In [None]:
from gensim.models import KeyedVectors
from gensim.test.utils import datapath

import numpy as np

import os

from scipy.stats import spearmanr 
from scipy.spatial.distance import cosine
from scipy.linalg import orthogonal_procrustes

import pandas as pd

# Load embeddings and a testset

In [None]:
# The embedding files live under ./embeddings in the working directory, so the
# path is built directly. gensim's `datapath` helper is meant for files inside
# gensim's own bundled test-data folder and is therefore not used here.
# Both files are in the plain-text word2vec format (binary=False).
coha1 = KeyedVectors.load_word2vec_format(
    os.getcwd() + '/embeddings/coha1_win10-k5-dim100-ep30-iter1.sgns',
    binary=False,
)
coha2 = KeyedVectors.load_word2vec_format(
    os.getcwd() + '/embeddings/coha2_win10-k5-dim100-ep30-iter1.sgns',
    binary=False,
)

We loaded embeddings trained on the __[English Semeval dataset](https://www.ims.uni-stuttgart.de/en/research/resources/corpora/sem-eval-ulscd-eng/)__, i.e. on two subsets of the Corpus of Historical American English (COHA)

`coha1` - embeddings trained on subset from 1810 to 1860

`coha2` - embeddings trained on subset from 1960 to 2010

The difference between the corpora is reflected in the difference between the embeddings. Let's see, for example, how the nearest neighbours of the word `pilot` changed over time. We can see that in the earlier corpus the word is associated with sea navigation, while in the later corpus its meaning has shifted towards aircraft navigation.

In [None]:
# ten nearest neighbours of the word in the earlier (1810-1860) model
coha1.similar_by_word("pilot", topn=10)

In [None]:
# ten nearest neighbours of the word in the later (1960-2010) model
coha2.similar_by_word("pilot", topn=10)

**Your turn:** think about other English words that radically changed their meaning between the first half of the 19th century and the second half of the 20th century. Insert them into the cells above to test your hypothesis.

You probably noticed that the word `plane` appears in the embedding vocabulary together with a part-of-speech tag, as `plane_nn`. This is because this word belongs to the SemEval test set. The corpus was preprocessed so that only the required word forms are used.

Let's now load the whole test set, together with the manually annotated change scores.

In [None]:
# Tab-separated file without a header row; the two columns are named here:
# the target word and its manually annotated change score.
graded = pd.read_csv(
    os.getcwd() + '/targets/english/graded.txt',
    sep="\t",
    header=None,
    names=['word', 'truth'],
)
graded

# Jaccard distance
This method compares the sets of the 10 nearest neighbors (by cosine distance) of a target word x
in the two embedding spaces A and B. The Jaccard similarity is the size of the intersection of the two sets divided by the size of their union; the Jaccard distance is one minus this value.

In [None]:
# Function definition

def jaccard(word, emb1 = coha1, emb2 = coha2, nn = 10):
    """Jaccard distance between the nearest-neighbour sets of `word` in two models.

    Parameters
    ----------
    word : str
        Target word; it is looked up in both `emb1` and `emb2`.
    emb1, emb2 : embedding models exposing `similar_by_word(word, topn)`
        The two embedding spaces to compare (defaults: `coha1` and `coha2`).
    nn : int
        Number of nearest neighbours retrieved from each model.

    Returns
    -------
    float
        1 - |N1 & N2| / |N1 | N2|, where N1 and N2 are the neighbour sets.
        0 means identical neighbourhoods (no change), 1 means no overlap at
        all (the strongest change). This matches the orientation of the SemEval
        change scores, where 0 is no change and 1 is the highest change.
    """
    # only the neighbour words matter here, the similarity scores are dropped
    neighbours1 = {key for key, _ in emb1.similar_by_word(word, nn)}
    neighbours2 = {key for key, _ in emb2.similar_by_word(word, nn)}

    union_size = len(neighbours1 | neighbours2)
    if union_size == 0:
        # Both neighbour lists are empty (e.g. nn < 1): two empty sets are
        # identical, so report no change instead of dividing by zero.
        return 0.0

    # Jaccard similarity: share of neighbours common to both models
    overlap = len(neighbours1 & neighbours2) / union_size

    # the overlap is high when nothing changed, so it is reversed to obtain
    # a change score
    return 1 - overlap
    

In [None]:
# Jaccard-based change score for every target word in the test set
graded["jaccard"] = graded["word"].map(lambda target: jaccard(target))
graded

In [None]:
# Spearman rank correlation between the gold scores and the Jaccard-based scores
spearmanr(graded["truth"], graded["jaccard"])


**Your turn:** Explore whether other numbers of nearest neighbors (smaller or greater than 10) would improve the results of the method.

# Global Anchors
Here, the intersection of the A and B
vocabularies (‘global anchors’, or VAB) is used. The degree of semantic
change is defined as the cosine distance between the vector of the cosine
similarities of the embedding of x in A to all words in VAB and the vector of the
cosine similarities of the embedding of x in B to all words in VAB.

In [None]:
# function definition
def glob_a(word, emb1 = coha1, emb2 = coha2):
    """Global-anchors change score of `word` between two embedding models.

    The word is described in each model by its vector of cosine similarities
    to every word of the shared vocabulary (the 'global anchors'); the score
    is the cosine distance between these two similarity vectors.

    Parameters
    ----------
    word : str
        Target word; it is looked up in both `emb1` and `emb2`.
    emb1, emb2 : embedding models exposing `index_to_key` and `distances`
        The two embedding spaces to compare (defaults: `coha1` and `coha2`).

    Returns
    -------
    float
        Second-order cosine distance between the two similarity vectors.
    """
    # intersection of the two vocabularies; the same list (hence the same
    # anchor order) is used for both models
    VAB = list(set(emb1.index_to_key).intersection(emb2.index_to_key))

    # `distances` returns cosine *distances* (1 - similarity), whereas the
    # method is defined on cosine *similarities*, so they are converted back.
    # The two are not interchangeable here: the second-order cosine of
    # (1 - s) vectors differs from the one of the s vectors.
    v1 = 1 - emb1.distances(word, VAB)
    v2 = 1 - emb2.distances(word, VAB)

    # second-order cosine distance
    return float(cosine(v1, v2))
                       

In [None]:
# global-anchors change score for every target word in the test set
graded["glob_a"] = graded["word"].map(lambda target: glob_a(target))
graded

In [None]:
# Spearman rank correlation between the gold scores and the global-anchors scores
spearmanr(graded["truth"], graded["glob_a"])

**Your turn:** Why, do you think, the correlation is so low in this case? Would it be possible to improve the method by curating VAB?



# Orthogonal alignment




In the methods above we used word embeddings only indirectly, by computing distances to other words within the same embedding space. This is because the embeddings are trained independently and, due to the stochastic nature of the training process, are not aligned.

For example, the nearest neighbors of the word 'cloud' are rather similar in `coha1` and `coha2`.

In [None]:
# ten nearest neighbours of the word in the earlier model
coha1.similar_by_word('cloud', topn=10)

In [None]:
# ten nearest neighbours of the word in the later model
coha2.similar_by_word('cloud', topn=10)

However, if we take a *vector* for this word from the first embedding space and try to find where it is located in the second embedding space, the nearest words look completely irrelevant.

In [None]:
# query the later model with the vector the earlier model assigns to the word
coha2.similar_by_vector(coha1['cloud'], topn=10)

Thus, we first need to *align* the embedding spaces so that semantically similar words end up close to each other across the two spaces.


In [None]:
# The alignment is estimated on the vocabulary shared by both models.
VAB = list(set(coha1.index_to_key).intersection(coha2.index_to_key))
# Vectors of the shared words from each model, requested with the same key list.
# NOTE(review): the alignment below presumes that row i of both matrices belongs
# to the same word, i.e. that vectors_for_all keeps the order of VAB — confirm.
vectors1=coha1.vectors_for_all(VAB).vectors
vectors2=coha2.vectors_for_all(VAB).vectors

In [None]:
# cross-product of the two sets of shared-word vectors (second space first)
m = vectors2.T @ vectors1
# singular value decomposition; the singular values themselves are not needed
u, _, v = np.linalg.svd(m)
# product of the two singular-vector factors: the orthogonal transformation of
# the second matrix that makes it most similar to the first matrix
ortho = u @ v

In [None]:
# Map every coha2 vector into the coha1 space with the orthogonal matrix.
# Caution: this overwrites coha2.vectors, so running this cell a second time
# applies the transformation again on top of the already transformed vectors.
coha2.vectors = coha2.vectors @ ortho

In [None]:
# sanity check: after the transformation, coha2 can be queried with a vector
# taken from the coha1 embedding space
coha2.similar_by_vector(coha1['cloud'], topn=10)

Now we can measure the degree of semantic change directly, using the cosine distance between the vectors of the same word in `coha1` and `coha2`.

In [None]:
# cosine distance between the coha1 vector and the (aligned) coha2 vector of
# every target word
graded["align_cos"] = graded["word"].map(
    lambda target: cosine(coha1[target], coha2[target])
)
graded

In [None]:
# Spearman rank correlation between the gold scores and the aligned-cosine scores
spearmanr(graded["truth"], graded["align_cos"])


# Incremental training



Instead of aligning the models post hoc, we can train them *incrementally*, i.e. by initializing the model for a later time period with the weights of the model trained on an earlier time period.

In [None]:
# The incrementally trained embeddings also live under ./embeddings in the
# working directory, so the path is built directly. gensim's `datapath` helper
# is meant for files inside gensim's own bundled test-data folder and is
# therefore not used here. Both files are plain-text word2vec (binary=False).
coha1i = KeyedVectors.load_word2vec_format(
    os.getcwd() + '/embeddings/coha1init_win10-k5-dim100-ep30-iter1.sgns',
    binary=False,
)
coha2i = KeyedVectors.load_word2vec_format(
    os.getcwd() + '/embeddings/coha2init_win10-k5-dim100-ep30-iter1.sgns',
    binary=False,
)

In this case models are already aligned

In [None]:
# query the later model with the vector the earlier model assigns to the word
coha2i.similar_by_vector(coha1i['cloud'], topn=10)

and cosine distances between the two models can be computed directly

In [None]:
# cosine distance between the two incrementally trained vectors of every
# target word
graded["init_cos"] = graded["word"].map(
    lambda target: cosine(coha1i[target], coha2i[target])
)
graded

In [None]:
# Spearman rank correlation between the gold scores and the incremental-training scores
spearmanr(graded["truth"], graded["init_cos"])

**Your turn**: now we have results from 4 different methods. Can you check which of these results are most similar to each other, and why?