In [1]:
import nltk
import numpy as np

### Step 1.
Download the Brown corpus.

In [2]:
nltk.download('brown')
from nltk.corpus import brown
from nltk.probability import FreqDist

[nltk_data] Downloading package brown to /Users/naazsibia/nltk_data...
[nltk_data]   Package brown is already up-to-date!


### Step 2.
Extract the 5000 most common English words (denoted by W ) based on unigram
frequencies in the Brown corpus. Report the 5 most and least common words you have found
in the 5000 words. Update W by adding n words where n is the set of words in Table 1
of RG65 that were not included in the top 5000 words from the Brown corpus. Denote the
total number of words in W as |W |.

In [3]:
words_lower = [word.lower() for word in brown.words()] # ignore case
fdist = FreqDist(words_lower)
top5000 = fdist.most_common(5000)
top5 = [word[0] for word in top5000[:5]]
least5 = [word[0] for word in top5000[-5:]]
print("The 5 most common words are", top5)
print("The 5 least common words are", least5)


The 5 most common words are ['the', ',', '.', 'of', 'and']
The 5 least common words are ['cheek', 'awake', 'pursue', 'peered', 'crawled']


In [4]:
W = top5000.copy()

# Words from the Brown corpus
words = [
    "cord", "rooster", "noon", "fruit", "autograph", "automobile", "mound", "grin", "asylum",
    "asylum", "graveyard", "glass", "boy", "cushion", "monk", "asylum", "coast", "grin", 
    "shore", "monk", "boy", "automobile", "mound", "lad", "forest", "food", "cemetery", 
    "shore", "bird", "coast", "furnace", "crane", "smile", "voyage", "string", "furnace", 
    "shore", "wizard", "stove", "implement", "fruit", "monk", "madhouse", "magician", 
    "rooster", "jewel", "slave", "cemetery", "forest", "lad", "woodland", "oracle", "sage", 
    "cushion", "shore", "wizard", "graveyard", "rooster", "woodland", "voyage", "woodland", 
    "hill", "implement", "hill", "car", "cemetery", "glass", "magician", "crane", "brother", 
    "sage", "oracle", "bird", "bird", "food", "brother", "asylum", "furnace", "magician", 
    "hill", "cord", "glass", "grin", "serf", "journey", "autograph", "coast", "forest", 
    "implement", "cock", "boy", "cushion", "cemetery", "automobile", "midday", "gem", 
    "woodland", "journey", "mound", "jewel", "oracle", "implement", "lad", "wizard", "sage", 
    "crane", "cock", "fruit", "monk", "madhouse", "stove", "wizard", "mound", "string", 
    "tumbler", "smile", "slave", "voyage", "signature", "shore", "woodland", "tool", 
    "rooster", "lad", "pillow", "graveyard", "car", "noon", "jewel"]

fdist = FreqDist(words)
words_freq = list(fdist.items())
existing_words = set([word[0] for word in W])

# Trying not to add duplicates
for word in words_freq:
    if word[0] not in existing_words:
        W.append(word)
W = sorted(W, key=lambda x: x[1])

|W| = 5031 words

### Part 3
Construct a word-context vector model (denoted by M1) by collecting bigram counts
for words in W . The output should be a |W|×|W | matrix (consider using sparse matrices
for better efficiency), where each row is a word in W , and each column is a context in W
that precedes row words in sentences. For example, if the phrase taxi driver appears 5 times
in the entire corpus, then row taxi and column driver should have a value of 5 in the matrix

In [5]:
from scipy.sparse import dok_matrix

In [6]:
M1 = dok_matrix((len(W), len(W)), dtype=int) # sparce matrix
W_words = [word[0] for word in W]
word_to_index = {word: index for index, word in enumerate(W_words)}
prev_word = None
for word in brown.words():
    if prev_word in word_to_index and word in word_to_index:
        i = word_to_index[prev_word]
        j = word_to_index[word]
        M1[i, j] += 1
    prev_word = word

### Part 3
Compute positive pointwise mutual information on M1. Denote this model as M1+

In [7]:
from scipy.sparse import csr_matrix # compressed sparse row - faster operations

In [8]:
M1 = M1.tocsr()
word_probs = np.array(M1.sum(axis=1)).flatten() / M1.sum()
bigram_probs = M1 / M1.sum()

M1_coo = M1.tocoo() # coordinate format fast format for constructing sparse matrices

pmi_values = []
for i, j, val in zip(M1_coo.row, M1_coo.col, M1_coo.data):
    joint_prob = val / M1.sum()
    # 1e-10 added to prevent division by zero
    pmi = np.log2(joint_prob / (word_probs[i] * word_probs[j] + 1e-10) + 1e-10)
    pmi_values.append(pmi)

In [9]:
pmi_matrix = csr_matrix((pmi_values, (M1_coo.row, M1_coo.col)), shape = M1_coo.shape)

# convert to dense format for the next operation
pmi_dense = pmi_matrix.toarray()

# replace negative values because positive pmi
ppmi_matrix = np.maximum(pmi_dense, 0)

Construct a latent semantic model (denoted by M 2) by applying principal components analysis to M1+. The output should return 3 matrices, with different truncated dimenions at 10 (or a |W |×10 matrix, denoted by M 210), 100 (M 2100), and 300 (M 2300).

In [10]:
from sklearn.decomposition import PCA

# create PCA instances for 10, 100, and 300 dimensions
pca_10 = PCA(n_components=10)
pca_100 = PCA(n_components=100)
pca_300 = PCA(n_components=300)

# Apply PCA to the PPMI matrix
M_210 = pca_10.fit_transform(ppmi_matrix)
M_2100 = pca_100.fit_transform(ppmi_matrix)
M_2300 = pca_300.fit_transform(ppmi_matrix)

### Part 6.
Find all pairs of words in Table 1 of RG65 that are also available in W . Denote these pairs as P . Record the human-judged similarities of these word pairs from the table
and denote similarity values as S.

In [11]:
word_pairs_with_scores = {
    ("cord", "smile"): 0.02,
    ("rooster", "voyage"): 0.04,
    ("noon", "string"): 0.04,
    ("fruit", "furnace"): 0.05,
    ("autograph", "shore"): 0.06,
    ("automobile", "wizard"): 0.11,
    ("mound", "stove"): 0.14,
    ("grin", "implement"): 0.18,
    ("asylum", "fruit"): 0.19,
    ("asylum", "monk"): 0.39,
    ("graveyard", "madhouse"): 0.42,
    ("glass", "magician"): 0.44,
    ("boy", "rooster"): 0.44,
    ("cushion", "jewel"): 0.45,
    ("monk", "slave"): 0.57,
    ("asylum", "cemetery"): 0.79,
    ("coast", "forest"): 0.88,
    ("grin", "lad"): 0.88,
    ("shore", "woodland"): 0.90,
    ("monk", "oracle"): 0.91,
    ("boy", "sage"): 0.96,
    ("automobile", "cushion"): 0.97,
    ("mound", "shore"): 0.97,
    ("lad", "wizard"): 0.99,
    ("forest", "graveyard"): 1.00,
    ("food", "rooster"): 1.09,
    ("cemetery", "woodland"): 1.18,
    ("shore", "voyage"): 1.22,
    ("bird", "woodland"): 1.24,
    ("coast", "hill"): 1.26,
    ("furnace", "implement"): 1.37,
    ("crane", "rooster"): 1.41,
    ("hill", "woodland"): 1.48,
    ("car", "journey"): 1.55,
    ("cemetery", "mound"): 1.69,
    ("glass", "jewel"): 1.78,
    ("magician", "oracle"): 1.82,
    ("crane", "implement"): 2.37,
    ("brother", "lad"): 2.41,
    ("sage", "wizard"): 2.46,
    ("oracle", "sage"): 2.61,
    ("bird", "crane"): 2.63,
    ("bird", "cock"): 2.63,
    ("food", "fruit"): 2.69,
    ("brother", "monk"): 2.74,
    ("asylum", "madhouse"): 3.04,
    ("furnace", "stove"): 3.11,
    ("magician", "wizard"): 3.21,
    ("hill", "mound"): 3.29,
    ("cord", "string"): 3.41,
    ("glass", "tumbler"): 3.45,
    ("grin", "smile"): 3.46,
    ("serf", "slave"): 3.46,
    ("journey", "voyage"): 3.58,
    ("autograph", "signature"): 3.59,
    ("coast", "shore"): 3.60,
    ("forest", "woodland"): 3.65,
    ("implement", "tool"): 3.66,
    ("cock", "rooster"): 3.68,
    ("boy", "lad"): 3.82,
    ("cushion", "pillow"): 3.84,
    ("cemetry", "graveyard"): 3.88,
    ("automobile", "car"): 3.92,
    ("midday", "noon"): 3.94,
    ("gem", "jewel"): 3.94}

In [12]:
P = []  # list to store pairs of words that are in W
S = []  # list to store the human-judged similarity values of the word pairs

for pair, similarity in word_pairs_with_scores.items():
    word1, word2 = pair
    if word1 in W_words and word2 in W_words:
        P.append(pair)
        S.append(similarity)

print(P)
print(S)

[('cord', 'smile'), ('rooster', 'voyage'), ('noon', 'string'), ('fruit', 'furnace'), ('autograph', 'shore'), ('automobile', 'wizard'), ('mound', 'stove'), ('grin', 'implement'), ('asylum', 'fruit'), ('asylum', 'monk'), ('graveyard', 'madhouse'), ('glass', 'magician'), ('boy', 'rooster'), ('cushion', 'jewel'), ('monk', 'slave'), ('asylum', 'cemetery'), ('coast', 'forest'), ('grin', 'lad'), ('shore', 'woodland'), ('monk', 'oracle'), ('boy', 'sage'), ('automobile', 'cushion'), ('mound', 'shore'), ('lad', 'wizard'), ('forest', 'graveyard'), ('food', 'rooster'), ('cemetery', 'woodland'), ('shore', 'voyage'), ('bird', 'woodland'), ('coast', 'hill'), ('furnace', 'implement'), ('crane', 'rooster'), ('hill', 'woodland'), ('car', 'journey'), ('cemetery', 'mound'), ('glass', 'jewel'), ('magician', 'oracle'), ('crane', 'implement'), ('brother', 'lad'), ('sage', 'wizard'), ('oracle', 'sage'), ('bird', 'crane'), ('bird', 'cock'), ('food', 'fruit'), ('brother', 'monk'), ('asylum', 'madhouse'), ('furn

### Step 7.
Perform the following calculations on each of these models M 1, M1+, M_210, M_2100, M_2300, separately: Calculate cosine similarity between each pair of words in P , based on the
constructed word vectors. Record model-predicted similarities: SM_1, SM_210 , SM_2100 , SM_2300 

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarities(model, pairs, word_list):
    similarities = []
    for pair in pairs:
        idx1 = word_list.index(pair[0])
        idx2 = word_list.index(pair[1])
        
        vec1 = model[idx1].reshape(1, -1)  # reshape to make it 2D for cosine_similarity
        vec2 = model[idx2].reshape(1, -1)
        
        similarity = cosine_similarity(vec1, vec2)[0][0]
        similarities.append(similarity)
    return similarities

# For each model, compute similarities
SM_1 = calculate_similarities(M1, P, W_words)
SM_1_plus = calculate_similarities(ppmi_matrix, P, W_words)
SM_210 = calculate_similarities(M_210, P, W_words)
SM_2100 = calculate_similarities(M_2100, P, W_words)
SM_2300 = calculate_similarities(M_2300, P, W_words)

### Step 8.
Report Pearson correlation between S and each of the model-predicted similarities. Create a GitHub repository that implements all of your analyses; you will need this repo for
the next lab.

In [14]:
from scipy.stats import pearsonr

correlation_M_1 = pearsonr(S, SM_1)[0]
correlation_M_1_plus = pearsonr(S, SM_1_plus)[0]
correlation_M_210 = pearsonr(S, SM_210)[0]
correlation_M_2100 = pearsonr(S, SM_2100)[0]
correlation_M_2300 = pearsonr(S, SM_2300)[0]

print("Correlation with M_1:", correlation_M_1)
print("Correlation with M1+:", correlation_M_1_plus)
print("Correlation with M_210:", correlation_M_210)
print("Correlation with M_2100:", correlation_M_2100)
print("Correlation with M_2300:", correlation_M_2300)

Correlation with M_1: 0.14360014298240253
Correlation with M1+: 0.2421342509950241
Correlation with M_210: 0.167807592945941
Correlation with M_2100: 0.3328363250230704
Correlation with M_2300: 0.3515146719359085
