In [14]:
import gensim
import pandas as pd
from gensim.models.word2vec import PathLineSentences
from collections import defaultdict

# read in two corpus c1=old_corpus, c2=modern_corpus by using PathLineSentences
c1 = PathLineSentences('/Users/nicolechen/UMSI/FALL23/SI671/final_project/trail/trial_data_public/corpora/german/corpus1/corpus1.txt')
c2 = PathLineSentences('/Users/nicolechen/UMSI/FALL23/SI671/final_project/trail/trial_data_public/corpora/german/corpus2/corpus2.txt')

In [15]:
# analyize frequent words in c1 and save word frequency above 40 as old_freq(list format) 
freqs_w1 = defaultdict(int)
for sentence in c1:
    for word in sentence:
        freqs_w1[word] = freqs_w1[word] + 1
freqs_w1_old = pd.Series(freqs_w1).sort_values(ascending=False)
old_freq = freqs_w1_old[freqs_w1_old >= 40]

In [16]:
# analyize frequent words in c2 and save word frequency above 40 as old_freq(list format) 
freqs_w2 = defaultdict(int)
for sentence in c2:
    for word in sentence:
        freqs_w2[word] = freqs_w2[word] + 1
freqs_w2_modern = pd.Series(freqs_w2).sort_values(ascending=False)
modern_freq = freqs_w2_modern[freqs_w2_modern >= 73]

In [17]:
# train c1 into Word2Vec model with presetting parameters
w1 = gensim.models.Word2Vec(c1,
                            sg=1,
                            hs=0,
                            negative=5,
                            sample=0.001,
                            vector_size=100, window=10, min_count=20, epochs=5, workers=40)

In [18]:
# train c2 into Word2Vec model with presetting parameters
w2 = gensim.models.Word2Vec(c2,
                            sg=1,
                            hs=0,
                            negative=5,
                            sample=0.001,
                            vector_size=100, window=10, min_count=0, epochs=5, workers=40)

In [19]:
# extract common word from modern and old corpus frequent words
common_word = sorted(list(set(modern_freq.keys()).intersection(old_freq.keys())))

In [20]:
# unzip common_word and save as w1_words and w2_words
w1_words, w1_matrix = zip(*[(word, w1.wv[word]) for word in common_word])
w2_words, w2_martix = zip(*[(word, w2.wv[word]) for word in common_word])

In [21]:
import numpy as np
import gensim.models as intersected_word2vec
from gensim.models import KeyedVectors

# get the common word list from the two models (I did this in the script above)
common_words = sorted(list(set(modern_freq.keys()).intersection(old_freq.keys())))

# get key, value pairs of index to key for each model to prepare for reverse lookup later
old_idx2key = {num: word for num, word in enumerate(w1.wv.index_to_key)}
modern_idx2key = {num: word for num, word in enumerate(w2.wv.index_to_key)}

# reverse the key, value pairs to get key to index pairs dictionaries for each model
old_key2item = {word: num for num, word in old_idx2key.items()}
modern_key2item = {word: num for num, word in modern_idx2key.items()}

# spot the common words in each model's vocabulary and get their indices
old_shared_indices = [old_key2item[word] for word in common_words]
modern_shared_indices = [modern_key2item[word] for word in common_words]

# turn original Word2Vec models into KeyedVectors objects to access the underlying vectors
old_vecs = w1.wv
modern_vecs = w2.wv

# extract the common word list vectors from each model
old_shared_vecs = old_vecs[old_shared_indices]
modern_shared_vecs = modern_vecs[modern_shared_indices]

# calculate the transformation matrix for the aligned space using the SVD method based on the formula of othorgonal procrustes problem(the W.T*q-W^(t+1))
# refer from LSCD paper
m = old_shared_vecs.T @ modern_shared_vecs
u, _, v = np.linalg.svd(m)
ortho = u @ v

# apply otherthogonal procrustes problem to the old model(Q^(t))
old_aligened = old_vecs.vectors.dot(ortho)
modern_aligened = modern_vecs.vectors.dot(ortho)

$$
\mathbf{Q}^{\text{opt}} = \arg \underbrace{\min}_{\left\{\mathbf{Q}^{\dagger} \left|{\mathbf{Q} = {\mathbf{I}}} \right. \right\}} \|\mathbf{W}^{(t)}\mathbf{Q} - \mathbf{W}^{(t+1)}\|_{F}
$$

In [22]:
# after alignment, we can get the aligned word list and aligned word vector of old and modern corpus
old_ind, old_vec = zip(*[(word, old_aligened[word]) for word in old_shared_indices])
modern_ind, modern_vec = zip(*[(word, modern_aligened[word]) for word in modern_shared_indices])

old_wordlist = [old_idx2key[idx] for idx in old_ind]
modern_wordlist = [modern_idx2key[idx] for idx in modern_ind]

# combine the aligned word list and aligned word vector of old and modern corpus
old_wordnvec = list(zip(old_wordlist, old_vec))
modern_wordnvec = list(zip(modern_wordlist, modern_vec))

In [23]:
# input the target words, seperated by '\t'
target_input = open('/Users/nicolechen/SemEval/LSCDiscovery/starting_kit/testsets/targets_input.tsv', 'r').readlines()
targets = [(line.strip().split('\t')[0], line.strip().split('\t')[1]) for line in target_input]

# get the target word list
target_list = []
for line in targets:
    target_list.append(line[0])

# check the workd list len is the same as the target word list
len(target_list)

4385

In [24]:
# manipulate the target word list to get the aligned word list and aligned word vector of old and modern corpus
old_cd_vec = [i for i in old_wordnvec if i[0] in target_list]
modern_cd_vec = [i for i in modern_wordnvec if i[0] in target_list]

# get the aligned word vector of old and modern corpus
final_old_vec = [i[1] for i in old_cd_vec]
final_modern_vec = [i[1] for i in modern_cd_vec]

In [25]:
from scipy.spatial.distance import cosine as cosine_distance
from scipy.stats import spearmanr

# calculate the cosine distance between the aligned word vector of old and modern corpus
scores = {}
for i in range(len(final_old_vec)):
    distance = cosine_distance(final_old_vec[i], final_modern_vec[i])
    scores[old_cd_vec[i][0]] = distance
  
# read the target_scores.txt for spearman correlation
with open('/Users/nicolechen/SemEval/LSCDiscovery/dwug_es/target_scores.txt', 'r') as f:
    lines = f.readlines()
    golden_scores = [line.strip().split('\t') for line in lines]

# extract scores from target_scores.txt
golden = {}
for i in golden_scores:
    if i:
        golden[i[0]] = i[1]

# calculate the spearman correlation
corpus = []
gold = []
for i in golden.keys():
    if i in scores.keys():
        corpus.append(scores[i])
        gold.append(golden[i])
spearc, p = spearmanr(corpus, gold, nan_policy='omit')
spearc, p

(nan, nan)

In [26]:
scores

{'d': 0.3907788395881653, 'es': 0.5391152799129486}