In [20]:
from nltk.corpus import brown
from nltk import FreqDist
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr

### Step 1: Initializing W

In [21]:
# words from RG65
words = [
    "cord", "rooster", "noon", "fruit", "autograph", "automobile", "mound", "grin", "asylum",
    "asylum", "graveyard", "glass", "boy", "cushion", "monk", "asylum", "coast", "grin", 
    "shore", "monk", "boy", "automobile", "mound", "lad", "forest", "food", "cemetery", 
    "shore", "bird", "coast", "furnace", "crane", "smile", "voyage", "string", "furnace", 
    "shore", "wizard", "stove", "implement", "fruit", "monk", "madhouse", "magician", 
    "rooster", "jewel", "slave", "cemetery", "forest", "lad", "woodland", "oracle", "sage", 
    "cushion", "shore", "wizard", "graveyard", "rooster", "woodland", "voyage", "woodland", 
    "hill", "implement", "hill", "car", "cemetery", "glass", "magician", "crane", "brother", 
    "sage", "oracle", "bird", "bird", "food", "brother", "asylum", "furnace", "magician", 
    "hill", "cord", "glass", "grin", "serf", "journey", "autograph", "coast", "forest", 
    "implement", "cock", "boy", "cushion", "cemetery", "automobile", "midday", "gem", 
    "woodland", "journey", "mound", "jewel", "oracle", "implement", "lad", "wizard", "sage", 
    "crane", "cock", "fruit", "monk", "madhouse", "stove", "wizard", "mound", "string", 
    "tumbler", "smile", "slave", "voyage", "signature", "shore", "woodland", "tool", 
    "rooster", "lad", "pillow", "graveyard", "car", "noon", "jewel"]

In [22]:
words_lowercased = [word.lower() for word in brown.words()]
fdist = FreqDist(words_lowercased)
W = [word for word in fdist.most_common(5000)]


fdist = FreqDist(words)
words_freq = list(fdist.items())
existing_words = set([word[0] for word in W])

# trying not to add duplicates
for word in words_freq:
    if word[0] not in existing_words:
        W.append(word)
W = sorted(W, key=lambda x: x[1])
W_words = [word[0] for word in W]


pairs_in_W = [('cord', 'smile'), ('rooster', 'voyage'), ('noon', 'string'), ('fruit', 'furnace'), ('autograph', 'shore'), ('automobile', 'wizard'), ('mound', 'stove'), ('grin', 'implement'), ('asylum', 'fruit'), ('asylum', 'monk'), ('graveyard', 'madhouse'), ('glass', 'magician'), ('boy', 'rooster'), ('cushion', 'jewel'), ('monk', 'slave'), ('asylum', 'cemetery'), ('coast', 'forest'), ('grin', 'lad'), ('shore', 'woodland'), ('monk', 'oracle'), ('boy', 'sage'), ('automobile', 'cushion'), ('mound', 'shore'), ('lad', 'wizard'), ('forest', 'graveyard'), ('food', 'rooster'), ('cemetery', 'woodland'), ('shore', 'voyage'), ('bird', 'woodland'), ('coast', 'hill'), ('furnace', 'implement'), ('crane', 'rooster'), ('hill', 'woodland'), ('car', 'journey'), ('cemetery', 'mound'), ('glass', 'jewel'), ('magician', 'oracle'), ('crane', 'implement'), ('brother', 'lad'), ('sage', 'wizard'), ('oracle', 'sage'), ('bird', 'crane'), ('bird', 'cock'), ('food', 'fruit'), ('brother', 'monk'), ('asylum', 'madhouse'), ('furnace', 'stove'), ('magician', 'wizard'), ('hill', 'mound'), ('cord', 'string'), ('glass', 'tumbler'), ('grin', 'smile'), ('serf', 'slave'), ('journey', 'voyage'), ('autograph', 'signature'), ('coast', 'shore'), ('forest', 'woodland'), ('implement', 'tool'), ('cock', 'rooster'), ('boy', 'lad'), ('cushion', 'pillow'), ('automobile', 'car'), ('midday', 'noon'), ('gem', 'jewel')]
relevant_words = set(word for pair in pairs_in_W for word in pair)

Using Gensim to extract embeddings

In [23]:
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [24]:
# extracting pairs
word_pairs_with_scores = {
    ("cord", "smile"): 0.02,
    ("rooster", "voyage"): 0.04,
    ("noon", "string"): 0.04,
    ("fruit", "furnace"): 0.05,
    ("autograph", "shore"): 0.06,
    ("automobile", "wizard"): 0.11,
    ("mound", "stove"): 0.14,
    ("grin", "implement"): 0.18,
    ("asylum", "fruit"): 0.19,
    ("asylum", "monk"): 0.39,
    ("graveyard", "madhouse"): 0.42,
    ("glass", "magician"): 0.44,
    ("boy", "rooster"): 0.44,
    ("cushion", "jewel"): 0.45,
    ("monk", "slave"): 0.57,
    ("asylum", "cemetery"): 0.79,
    ("coast", "forest"): 0.88,
    ("grin", "lad"): 0.88,
    ("shore", "woodland"): 0.90,
    ("monk", "oracle"): 0.91,
    ("boy", "sage"): 0.96,
    ("automobile", "cushion"): 0.97,
    ("mound", "shore"): 0.97,
    ("lad", "wizard"): 0.99,
    ("forest", "graveyard"): 1.00,
    ("food", "rooster"): 1.09,
    ("cemetery", "woodland"): 1.18,
    ("shore", "voyage"): 1.22,
    ("bird", "woodland"): 1.24,
    ("coast", "hill"): 1.26,
    ("furnace", "implement"): 1.37,
    ("crane", "rooster"): 1.41,
    ("hill", "woodland"): 1.48,
    ("car", "journey"): 1.55,
    ("cemetery", "mound"): 1.69,
    ("glass", "jewel"): 1.78,
    ("magician", "oracle"): 1.82,
    ("crane", "implement"): 2.37,
    ("brother", "lad"): 2.41,
    ("sage", "wizard"): 2.46,
    ("oracle", "sage"): 2.61,
    ("bird", "crane"): 2.63,
    ("bird", "cock"): 2.63,
    ("food", "fruit"): 2.69,
    ("brother", "monk"): 2.74,
    ("asylum", "madhouse"): 3.04,
    ("furnace", "stove"): 3.11,
    ("magician", "wizard"): 3.21,
    ("hill", "mound"): 3.29,
    ("cord", "string"): 3.41,
    ("glass", "tumbler"): 3.45,
    ("grin", "smile"): 3.46,
    ("serf", "slave"): 3.46,
    ("journey", "voyage"): 3.58,
    ("autograph", "signature"): 3.59,
    ("coast", "shore"): 3.60,
    ("forest", "woodland"): 3.65,
    ("implement", "tool"): 3.66,
    ("cock", "rooster"): 3.68,
    ("boy", "lad"): 3.82,
    ("cushion", "pillow"): 3.84,
    ("cemetry", "graveyard"): 3.88,
    ("automobile", "car"): 3.92,
    ("midday", "noon"): 3.94,
    ("gem", "jewel"): 3.94}

updated_words_pairs_with_scores = {}
for words, score in word_pairs_with_scores.items():
    if words[0] in relevant_words and words[1] in relevant_words:
        updated_words_pairs_with_scores[words] = score

word_pairs_with_scores = updated_words_pairs_with_scores

### Step 3: Calculate cosine distance between each pair of word embeddings you have extracted, and report the Pearson correlation between word2vec-based and human similarities.

In [25]:
word2vec_similarities = []

for pair, _ in word_pairs_with_scores.items():
    word1, word2 = pair
    embedding1 = model[word1]
    embedding2 = model[word2]
    similarity = cosine_similarity([embedding1], [embedding2])[0][0]
    word2vec_similarities.append(similarity)

word2vec_distances = [1 - s for s in word2vec_similarities]

In [26]:
human_similarities = list(word_pairs_with_scores.values())
correlation, _ = pearsonr(human_similarities, word2vec_similarities)
print(f"Pearson correlation between word2vec-based and human similarities: {correlation:.4f}")

Pearson correlation between word2vec-based and human similarities: 0.7627


### Step 4: Evaluate Analogies

In [32]:
# For semantic analogies
accuracy_semantic = model.evaluate_word_analogies('semantic-test.txt')
print('Word2Vec accuracy on semantic analogies:', accuracy_semantic[0])

# For syntactic analogies
accuracy_syntactic = model.evaluate_word_analogies('syntactic-test.txt')
print('Word2Vec accuracy on syntactic analogies:', accuracy_syntactic[0])

Word2Vec accuracy on semantic analogies: 0.7415366839976892
Word2Vec accuracy on syntactic analogies: 0.7390163934426229


#### Step 4.2: Evaluating Analogies for filtered tests (fairness)

In [34]:
# for semantic analogies
accuracy_semantic = model.evaluate_word_analogies('filtered-semantic-test.txt')
print('Word2Vec accuracy on semantic analogies:', accuracy_semantic[0])

# for syntactic analogies
accuracy_syntactic = model.evaluate_word_analogies('filtered-syntactic-test.txt')
print('Word2Vec accuracy on syntactic analogies:', accuracy_syntactic[0])

Word2Vec accuracy on semantic analogies: 0.8888888888888888
Word2Vec accuracy on syntactic analogies: 0.6787286063569682


#### NOTE: Analogy tests for LSA in the basic exercise file.