In [1]:
import nltk
import re
import numpy as np
from nltk.corpus import reuters
from nltk.tokenize import word_tokenize
from collections import Counter


In [2]:
# nltk.data.find() looks resources up by "<category>/<name>" path, while
# nltk.download() takes the bare package id. Looking up the bare id (e.g.
# "reuters") never matches, so every run fell through to nltk.download().
NLTK_RESOURCES = {
    # NOTE(review): assumes the Reuters corpus is stored as a zip archive
    # under corpora/ — confirm. A miss only triggers the download below.
    "reuters": "corpora/reuters.zip",
    "punkt": "tokenizers/punkt",
    "punkt_tab": "tokenizers/punkt_tab",
}

for res, resource_path in NLTK_RESOURCES.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # Only reach for the network when the resource is not found locally.
        nltk.download(res)

[nltk_data] Downloading package reuters to
[nltk_data]     /Users/michaellacar/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/michaellacar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/michaellacar/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
def load_reuters(max_docs=None):
    """Tokenize the NLTK Reuters corpus into lowercase, letters-only documents.

    Each file is lowercased, every character other than ``a-z`` and
    whitespace is deleted, and the remainder is split with ``word_tokenize``.
    Documents with two or fewer tokens are dropped.

    Args:
        max_docs: Optional cap on the number of documents returned. When set
            to a non-negative integer, loading stops as soon as that many
            documents have been kept, so the result equals
            ``load_reuters()[:max_docs]`` without tokenizing the remaining
            files. Values ``<= 0`` return an empty list. ``None`` (the
            default) processes every file.

    Returns:
        A list of token lists, one per kept document, in
        ``reuters.fileids()`` order.
    """
    corpus = []
    if max_docs is not None and max_docs <= 0:
        return corpus

    for fid in reuters.fileids():
        text = reuters.raw(fid).lower()
        # Characters are deleted, not replaced by a space, so "u.s." -> "us".
        text = re.sub(r"[^a-z\s]", "", text)
        tokens = word_tokenize(text)
        if len(tokens) > 2:
            corpus.append(tokens)
            # Stop early once the requested number of documents is reached.
            if max_docs is not None and len(corpus) >= max_docs:
                break
    return corpus

In [4]:
# Minimum number of occurrences for a word to enter the vocabulary
# (same MIN_COUNT).
MIN_COUNT = 5

# Keep only the first 300 loaded documents (same MAX_DOCS).
corpus = load_reuters()[:300]

# Flatten the documents into one token stream and count each word.
all_tokens = [token for document in corpus for token in document]
word_counts = Counter(all_tokens)

# Vocabulary: words seen at least MIN_COUNT times, in first-seen order.
vocabs = [word for word, freq in word_counts.items() if freq >= MIN_COUNT]

# Position of each word in `vocabs`, used later as the embedding row index.
word2index = dict(zip(vocabs, range(len(vocabs))))

In [8]:
import numpy as np

# Embedding arrays stored as .npy files next to the notebook.
# NOTE(review): rows are presumably ordered like `word2index` — confirm
# against the notebook that saved these files.
emb_skipgram, emb_neg, emb_glove, emb_glove_g = (
    np.load("emb_skipgram.npy"),
    np.load("emb_neg.npy"),
    np.load("emb_glove.npy"),
    np.load("emb_glove_g.npy"),
)

In [10]:
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error

def load_wordsim353(path):
    """Read a comma-separated word-similarity file into pairs and scores.

    The first line is treated as a header and skipped. Every remaining
    non-blank line must contain exactly three comma-separated fields,
    ``word1,word2,score``. Lines are lowercased and each field is stripped
    of surrounding whitespace. Blank lines (for example a trailing newline
    at the end of the file) are ignored.

    Args:
        path: Location of the CSV file.

    Returns:
        A tuple ``(pairs, scores)``: ``pairs`` is a list of ``(word1, word2)``
        tuples and ``scores`` is a NumPy array holding the matching float
        scores in the same order. Both are empty when the file has no data
        rows.

    Raises:
        ValueError: If a non-blank data line does not have exactly three
            fields, or its score cannot be parsed as a float.
    """
    pairs, scores = [], []
    with open(path, "r", encoding="utf-8") as f:
        # Skip the header; the default keeps an empty file from raising.
        next(f, None)
        for line in f:
            row = line.strip()
            if not row:
                continue
            w1, w2, score = (field.strip() for field in row.lower().split(","))
            pairs.append((w1, w2))
            scores.append(float(score))
    return pairs, np.array(scores)

# Word pairs and their scores from wordsim353crowd.csv, loaded once here.
# evaluate_similarity() reads `pairs` and `y_true` from the notebook namespace.
pairs, y_true = load_wordsim353("wordsim353crowd.csv")


In [11]:
def evaluate_similarity(embeddings, word_pairs=None, human_scores=None, vocab=None):
    """Score an embedding table against word-pair similarity judgements.

    For every pair whose two words are both in the vocabulary, the predicted
    similarity is the dot product of the two embedding rows. Pairs with an
    out-of-vocabulary word are skipped.

    Args:
        embeddings: Indexable by the integer ids stored in ``vocab``; each
            looked-up row is passed to ``np.dot``.
        word_pairs: Iterable of ``(word1, word2)`` tuples. Defaults to the
            notebook-level ``pairs``.
        human_scores: Reference scores aligned with ``word_pairs``. Defaults
            to the notebook-level ``y_true``.
        vocab: Mapping from word to embedding row index. Defaults to the
            notebook-level ``word2index``.

    Returns:
        A tuple ``(corr, mse)``: the Spearman rank correlation and the mean
        squared error between the dot products and the reference scores of
        the covered pairs.

    Raises:
        ValueError: If no pair has both words in the vocabulary, since
            neither metric is defined for an empty sample.
    """
    # Fall back to the notebook globals so one-argument calls keep working.
    if word_pairs is None:
        word_pairs = pairs
    if human_scores is None:
        human_scores = y_true
    if vocab is None:
        vocab = word2index

    y_pred, y_true_f = [], []

    for (w1, w2), score in zip(word_pairs, human_scores):
        if w1 in vocab and w2 in vocab:
            v1 = embeddings[vocab[w1]]
            v2 = embeddings[vocab[w2]]
            y_pred.append(np.dot(v1, v2))
            y_true_f.append(score)

    if not y_pred:
        raise ValueError(
            "evaluate_similarity: none of the word pairs are covered by the vocabulary"
        )

    corr, _ = spearmanr(y_pred, y_true_f)
    # NOTE(review): dot products and reference scores are compared directly
    # here; if they are on different scales the MSE is only meaningful for
    # comparing models with each other — confirm this is intended.
    mse = mean_squared_error(y_true_f, y_pred)
    return corr, mse


In [12]:
# Evaluate the four embedding tables in a fixed order and unpack each
# (correlation, mse) result into its own pair of names.
(
    (corr_sg, mse_sg),
    (corr_neg, mse_neg),
    (corr_glove, mse_glove),
    (corr_glove_g, mse_glove_g),
) = (
    evaluate_similarity(model_embeddings)
    for model_embeddings in (emb_skipgram, emb_neg, emb_glove, emb_glove_g)
)


In [13]:
import pandas as pd

# One row per model: (name, Spearman correlation, MSE). The final "Y_true"
# row is the reference line: correlation 1.0 and error 0.0.
table2 = pd.DataFrame(
    [
        ("Skipgram", corr_sg, mse_sg),
        ("NEG", corr_neg, mse_neg),
        ("GloVe", corr_glove, mse_glove),
        ("GloVe (gensim)", corr_glove_g, mse_glove_g),
        ("Y_true", 1.0, 0.0),
    ],
    columns=["Model", "Spearman Correlation", "MSE"],
)

table2


Unnamed: 0,Model,Spearman Correlation,MSE
0,Skipgram,-0.258402,24.469366
1,NEG,-0.08146,27.762825
2,GloVe,-0.010461,15.921308
3,GloVe (gensim),0.186735,26.344998
4,Y_true,1.0,0.0
