In [11]:
# Load packages
import gensim
from gensim.models import KeyedVectors
from scipy.spatial.distance import cosine
from gensim.models.word2vec import PathLineSentences
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

In [12]:
# Load word2vec model
c1 = PathLineSentences('/Users/nicolechen/SemEval/LSCDiscovery/starting_kit/corpora/old_corpus/dataset_XIX_lemmatized.txt')
c2 = PathLineSentences('/Users/nicolechen/SemEval/LSCDiscovery/starting_kit/corpora/modern_corpus/modern_corpus_lemmatized.txt')

In [13]:
# train c1 into Word2Vec model with presetting parameters
w1_w2v = gensim.models.Word2Vec(c1,
                            sg=1,
                            hs=0,
                            negative=5,
                            sample=0.001,
                            vector_size=100, window=10, min_count=10, epochs=5, workers=40)

In [14]:
# train c2 into Word2Vec model with presetting parameters
w2_w2v = gensim.models.Word2Vec(c2,
                            sg=1,
                            hs=0,
                            negative=5,
                            sample=0.001,
                            vector_size=100, window=10, min_count=10, epochs=5, workers=40)

In [15]:
# get word fequency of old corpus and filter out words that appear less than 40 times
freqs_w1 = defaultdict(int)
for sentence in c1:
    for word in sentence:
        freqs_w1[word] = freqs_w1[word] + 1
freqs_w1_old = pd.Series(freqs_w1).sort_values(ascending=False)
old_freq = freqs_w1_old[freqs_w1_old >= 40]

In [16]:
# get word fequency of modern corpus and filter out words that appear less than 73 times
freqs_w2 = defaultdict(int)
for sentence in c2:
    for word in sentence:
        freqs_w2[word] = freqs_w2[word] + 1
freqs_w2_modern = pd.Series(freqs_w2).sort_values(ascending=False)
modern_freq = freqs_w2_modern[freqs_w2_modern >= 73]

In [17]:
# get the common words between old and modern corpus
common_word = sorted(list(set(modern_freq.keys()).intersection(old_freq.keys())))
common_word

['!',
 '"',
 "'",
 '(',
 ')',
 '):',
 '*',
 ',',
 '-',
 '-en',
 '-no',
 '-y',
 '.',
 '..',
 '...',
 '/',
 '1',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '1920',
 '1º',
 '2',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '3',
 '30',
 '31',
 '34',
 '35',
 '36',
 '37',
 '4',
 '40',
 '5',
 '50',
 '54',
 '6',
 '60',
 '68',
 '7',
 '70',
 '74',
 '8',
 '80',
 '9',
 ':',
 ';',
 '?',
 'A',
 'A.',
 'Academia',
 'Administración',
 'Aduanas',
 'Ah',
 'Aires',
 'Al',
 'Alemania',
 'Alto',
 'América',
 'Américas',
 'Argentina',
 'Arias',
 'Asamblea',
 'Asia',
 'Audiencia',
 'Aunque',
 'Austria',
 'Año',
 'B',
 'B.',
 'Banco',
 'Barcelona',
 'Biblia',
 'Biblioteca',
 'Blanca',
 'Blanco',
 'Bolsa',
 'Brasil',
 'Bretaña',
 'Bueno',
 'Buenos',
 'C.',
 'Cabo',
 'Calderón',
 'Canal',
 'Capítulo',
 'Carlos',
 'Carta',
 'Casa',
 'Castro',
 'Cataluña',
 'Católica',
 'Centro',
 'Chile',
 'China',
 'Ciudad',
 'Civil',
 'Colegio',
 'Comandante',
 'Com

In [18]:
# import target word list and tag the _NOUN on each word
word_list = open('/Users/nicolechen/SemEval/LSCDiscovery/gold/target_words_evaluation_phase1.txt', 'r').readlines()

def get_word_list():
    for i in range(len(word_list)):
        word_list[i] = word_list[i].strip().split('\t')[0]
    return word_list

target_words = get_word_list()

In [19]:
# turn word2vec into KeyedVectors for later use
w1 = w1_w2v.wv
w2 = w2_w2v.wv

In [20]:
# get the similarity score between each word and its neighbor words
with open('/Users/nicolechen/SemEval/RNC/res/LSCD_baseline_answer.tsv', 'w') as out:
    for word in target_words:
        word = word.lower()

        n1 = [n[0] for n in w1.most_similar(word)]
        n2 = [n[0] for n in w2.most_similar(word)]

        same_words = set(n1).union(set(n2))#.union(set(n3))
        neighbor_words = [w for w in same_words if w in common_word]
    
        v1 = [w1.similarity(word, n) for n in neighbor_words]
        v2 = [w2.similarity(word, n) for n in neighbor_words]
    
        out.write(
            f"{word}\t{1 - cosine(v1, v2)}\n"
            )

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


In [21]:
# input model answer and true answer
model_answer = open('/Users/nicolechen/SemEval/RNC/res/LSCD_baseline_answer.tsv', 'r').readlines()
true_answer = open('/Users/nicolechen/SemEval/LSCDiscovery/dwug_es/target_scores.txt', 'r').readlines()

In [22]:
# get the lemma of each word in model answer and true answer
model_lemma = [i.strip().split('\t')[0] for i in model_answer]
true_lemma = [i.strip().split('\t')[0] for i in true_answer]

# check if the lemma of each word in model answer is in true answer
model_same_lemmas = [i for i in model_lemma if i in true_lemma]

In [23]:
# get the score of each word in model answer and true answer
def get_score(m_answer, t_answer):
    
    temp = {}
    for line in m_answer:
        new_line = line.strip().split('\t')
        lemma = new_line[0]
        temp[lemma] = []
        for i in range(1, len(new_line)):
            score = new_line[i]
            temp[lemma].append(float(score))
    
    m_predict, t_gold = [], []
    for line in t_answer:
        new_line = line.strip().split('\t')
        lemma = new_line[0]
        gold = []
        for i in range(1, len(new_line)):
            score = new_line[i]
            gold.append(float(score))
        try:
            predict = temp[lemma]
        except KeyError:
            continue
        assert len(predict) == len(gold)
        t_gold.append(gold)
        m_predict.append(predict)
    
    return (m_predict, t_gold)

model_predict, true_golden = get_score(model_answer, true_answer)

In [25]:
model_predict, true_golden

([[1.0],
  [0.9929425120353699],
  [0.9836294651031494],
  [0.9970173239707947],
  [0.9873230457305908],
  [0.9701247811317444],
  [0.8759374022483826],
  [0.9467539191246033],
  [0.945402204990387],
  [0.9929360747337341],
  [0.968864917755127],
  [0.9302493929862976],
  [0.9536077380180359],
  [0.9777503609657288],
  [0.9180338978767395],
  [0.9611276388168335],
  [0.8678371906280518],
  [0.8754040002822876],
  [0.8943954110145569],
  [0.9906224012374878],
  [1.0],
  [0.8531131148338318],
  [0.9214668869972229],
  [0.9262320399284363],
  [0.951206386089325],
  [0.9744316339492798],
  [0.8214532136917114],
  [0.8602867722511292],
  [0.9166483283042908],
  [1.0],
  [0.8444989323616028],
  [1.0],
  [0.9953421950340271],
  [0.7263227701187134],
  [0.9612362384796143],
  [0.9550046324729919],
  [0.9372791051864624],
  [0.9671924114227295],
  [0.8871167302131653],
  [0.8160391449928284],
  [0.8946050405502319],
  [1.0],
  [0.9536485075950623],
  [0.9108259081840515],
  [0.9615675806999207]

In [24]:
from scipy.stats import spearmanr

# calculate the correlation between model answer and true answer
def evaluation (model, true):
    # assert len(model[0]) == len(true[0])
    correlation = []
    for i in range(1):
        m_score = [element[i] for element in model]
        t_score = [element[i] for element in true]
        cor, p_val = spearmanr(m_score, t_score, nan_policy="omit")
        correlation.append((cor, p_val))
    return correlation

analysis = evaluation(model_predict, true_golden)
analysis

[(-0.007020738351393207, 0.9482381244770981)]