In [201]:
import numpy as np
import pandas as pd
import nltk.corpus
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from collections import Counter
import math
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import random
from scipy.cluster.hierarchy import fcluster, linkage

In [135]:
# Tokens to discard: English stop words plus every single punctuation character.
stop = set(stopwords.words('english')) | set(string.punctuation)

In [136]:
# Token sequence returned by NLTK's Brown corpus reader (raw, unfiltered).
words = nltk.corpus.brown.words()

In [128]:
# Number of tokens before any filtering is applied.
len(words)

1161192

In [138]:
# Lower-case every token first: membership in `stop` is case sensitive.
words = [token.lower() for token in words]

In [139]:
# Keep purely alphabetic tokens longer than one character that are not stop words.
words = [
    token
    for token in words
    if len(token) > 1 and token.isalpha() and token not in stop
]

In [140]:
# Number of tokens that survive lower-casing and filtering.
len(words)

508631

#### Words and frequency (Same as before just another method)

In [142]:
# Frequency of every token in the filtered corpus.
# Counter visits each position exactly once (including the first and the last
# token) and keeps keys in first-seen order, so ties in a later stable sort are
# still broken by order of first appearance.
N = len(words)
total = dict(Counter(words))
# Distinct words in order of first appearance.
words1 = list(total)

In [143]:
# Rank (word, count) pairs from most to least frequent. The sort is stable, so
# words with equal counts keep the order they have in `total`.
sorted_counts = sorted(total.items(), key=lambda pair: -pair[1])
sorted_dict = {word: count for word, count in sorted_counts}

In [147]:
# Twenty most frequent (word, count) pairs.
sorted_counts[:20]

[('one', 3292),
 ('would', 2714),
 ('said', 1961),
 ('new', 1635),
 ('could', 1601),
 ('time', 1598),
 ('two', 1412),
 ('may', 1402),
 ('first', 1361),
 ('like', 1292),
 ('man', 1207),
 ('even', 1170),
 ('made', 1125),
 ('also', 1069),
 ('many', 1030),
 ('must', 1013),
 ('af', 996),
 ('back', 966),
 ('years', 950),
 ('much', 937)]

#### Top Words in vocab and context

In [149]:
# The 5000 most frequent words form the vocabulary; the 1000 most frequent of
# them are also used as context words.
vocabulary = list(sorted_dict)[:5000]
contxt_words = vocabulary[:1000]

#### Window of four words

In [158]:
def get_counts(window_size=2, tokens=None, targets=None, contexts=None):
    """Count how often each context word appears near each target word.

    Parameters
    ----------
    window_size : int
        Number of positions inspected on each side of a target token.
    tokens : sequence of str, optional
        Token stream to scan. Defaults to the module-level ``words``.
    targets : iterable of str, optional
        Words that get a row in the result. Defaults to ``vocabulary``.
    contexts : iterable of str, optional
        Words that are counted as neighbours. Defaults to ``contxt_words``.

    Returns
    -------
    dict[str, dict[str, int]]
        ``counts[w0][w]`` is the number of times ``w`` occurs within
        ``window_size`` positions of ``w0``. Every target has an entry, which
        is empty when no context word was ever seen next to it.
    """
    tokens = words if tokens is None else tokens
    targets = vocabulary if targets is None else targets
    contexts = contxt_words if contexts is None else contexts

    # Set/dict membership is O(1); scanning the lists for every token is not.
    context_set = set(contexts)
    counts = {w0: {} for w0 in targets}

    n_tokens = len(tokens)
    for i, w0 in enumerate(tokens):
        row = counts.get(w0)
        if row is None:  # not a target word
            continue
        # Clamp the window at the corpus boundaries so tokens near either end
        # still contribute the neighbours they do have.
        lo = max(0, i - window_size)
        hi = min(n_tokens, i + window_size + 1)
        for k in range(lo, hi):
            if k == i:
                continue
            w = tokens[k]
            if w in context_set:
                row[w] = row.get(w, 0) + 1
    return counts

In [155]:
def get_co_occurrence_dict(counts):
    """Turn each row of co-occurrence counts into a probability distribution.

    For every target word ``w0`` the result holds
    ``counts[w0][w] / sum(counts[w0].values())`` for each context word ``w``.
    Targets whose counts sum to zero (or that have no counts) are left out.
    """
    probs = {}
    for target, row in counts.items():
        row_total = 0
        for value in row.values():
            row_total = row_total + value
        if row_total > 0:
            probs[target] = {
                ctx: float(value) / float(row_total) for ctx, value in row.items()
            }
    return probs

In [156]:
def get_contxt_distr(counts, contexts=None):
    """Relative frequency of every context word over all co-occurrence counts.

    Parameters
    ----------
    counts : dict[str, dict[str, int]]
        Co-occurrence counts, ``counts[w0][w]``.
    contexts : iterable of str, optional
        Context words to report. Defaults to the module-level ``contxt_words``.

    Returns
    -------
    dict[str, float]
        ``sum_w0 counts[w0][w] / sum_{w0, w'} counts[w0][w']`` for each context
        word ``w``. When there are no counts at all, every frequency is 0.0
        instead of raising ZeroDivisionError.

    Raises
    ------
    KeyError
        If ``counts`` contains a context word that is not in ``contexts``.
    """
    contexts = contxt_words if contexts is None else contexts

    counts_contxt = {w: 0 for w in contexts}
    sum_contxt = 0
    for row in counts.values():
        for w, c in row.items():
            counts_contxt[w] = counts_contxt[w] + c
            sum_contxt = sum_contxt + c

    if sum_contxt == 0:
        # Nothing was counted: avoid dividing by zero.
        return {w: 0.0 for w in contexts}
    return {w: float(counts_contxt[w]) / float(sum_contxt) for w in contexts}

In [161]:
# Build co-occurrence counts with a window of 2 tokens on each side, then derive
# the per-word context probabilities and the overall context-word distribution.
counts = get_counts(2)
probability = get_co_occurrence_dict(counts)
contxt_frequency = get_contxt_distr(counts)

### Pointwise Mutual Information

In [163]:
# Positive pointwise mutual information (PPMI):
#   pmi[i, j] = max(0, log P(w_j | w0_i) - log P(w_j))
# Rows follow `vocabulary`, columns follow `contxt_words`.
n_vocab = len(vocabulary)
n_contxt = len(contxt_words)
# Column lookup table; avoids a linear `list.index` scan for every entry.
contxt_index = {w: j for j, w in enumerate(contxt_words)}
pmi = np.zeros((n_vocab, n_contxt))
for i, w0 in enumerate(vocabulary):
    # `probability` has no entry for a word that never co-occurred with any
    # context word; such a row simply stays all zeros.
    for w, p in probability.get(w0, {}).items():
        pmi[i, contxt_index[w]] = max(0.0, np.log(p) - np.log(contxt_frequency[w]))

### a)

In [170]:
# Project the PPMI matrix onto 100 principal components. The seed is fixed so
# that a randomized SVD solver yields the same embedding on every run.
pca = PCA(n_components=100, random_state=0)
vecs = pca.fit_transform(pmi)
# Scale every row to unit length; a row with zero norm is left as zeros instead
# of being divided by zero.
norms = np.linalg.norm(vecs, axis=1, keepdims=True)
norms[norms == 0] = 1.0
vecs = vecs / norms

I implemented a word embedding by constructing a co-occurrence matrix: I selected the most frequent words as the vocabulary (5,000) and as context words (1,000), counted co-occurrences within a window of two words on each side, and converted the counts into conditional probabilities and a context-word distribution. Using PCA, I reduced the dimensionality of the positive Pointwise Mutual Information matrix to obtain a 100-dimensional word embedding, with each vector normalized to unit length, capturing semantic relationships between words. The resulting vectors represent contextual similarities in a more compact form, offering a meaningful representation of word semantics in the given corpus.

### b)

In [182]:
def word_NN(w, vocab=None, vectors=None):
    """Return the vocabulary word closest to ``w`` in cosine distance.

    Parameters
    ----------
    w : str
        Query word.
    vocab : list of str, optional
        Words, aligned with the rows of ``vectors``. Defaults to ``vocabulary``.
    vectors : array-like of shape (len(vocab), dim), optional
        Embedding matrix. Defaults to the module-level ``vecs``.

    Returns
    -------
    str or None
        The nearest word other than ``w`` itself. ``None`` (after printing a
        message) when ``w`` is unknown, and ``None`` when no other word has a
        well-defined distance to ``w``.
    """
    vocab = vocabulary if vocab is None else vocab
    vectors = vecs if vectors is None else vectors
    if w not in vocab:
        print("Uknown Words")
        return

    idx = vocab.index(w)
    mat = np.asarray(vectors, dtype=float)
    norms = np.linalg.norm(mat, axis=1)
    with np.errstate(divide='ignore', invalid='ignore'):
        dists = 1.0 - (mat @ mat[idx]) / (norms * norms[idx])

    # Exclude the query by position: its own distance is only approximately
    # zero in floating point, so a `dist > 0` test cannot be relied on.
    dists[idx] = np.inf
    # Zero-norm rows produce NaN/inf distances; never pick them.
    dists[~np.isfinite(dists)] = np.inf
    if not np.isfinite(dists).any():
        return None
    return vocab[int(np.argmin(dists))]

In [189]:
# Draw 25 vocabulary words (fewer if the vocabulary is smaller) from a private,
# seeded generator so the sample is the same on every run and the global
# `random` state is left untouched.
selected_vocab = random.Random(0).sample(vocabulary, min(25, len(vocabulary)))
print(selected_vocab)

['milligrams', 'blonde', 'teach', 'art', 'study', 'roof', 'cents', 'pilot', 'baker', 'cady', 'extend', 'gross', 'used', 'seemed', 'glad', 'agreed', 'fair', 'category', 'classes', 'editor', 'mechanisms', 'specialists', 'heard', 'remainder', 'grant']


In [191]:
# Print every sampled word next to its nearest neighbour in the embedding.
for word in selected_vocab:
    print(word, word_NN(word))

milligrams dairy
blonde hair
teach learn
art modern
study present
roof corner
cents pound
pilot expanded
baker quiney
cady companion
extend scope
gross returns
used use
seemed felt
glad see
agreed obliged
fair care
category occurrence
classes families
editor signed
mechanisms phases
specialists surplus
heard hear
remainder crops
grant state


The provided word pairs and their nearest neighbors suggest that the word embedding captures meaningful semantic relationships for some terms, but the quality of associations varies, indicating potential areas for improvement in the model or parameters. Further evaluation and adjustment may enhance the embedding's ability to represent coherent and contextually relevant word similarities.

Some pairs appear clearly related:

        - "blonde hair" and "milligrams dairy" may represent associations between related concepts.
        - "study present" and "teach learn" suggest a connection between learning-related terms.

Some pairs appear less related or could be ambiguous:

        - "roof corner" and "cents pound" may lack clear semantic connections.
        - "agreed obliged" could be contextually dependent, and the association may vary.

### c)

In [204]:
# Partition the embedding into 100 clusters. `random_state` and `n_init` are
# pinned so the cluster numbering is reproducible across runs.
num_clusters = 100
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
cluster_assignments = kmeans.fit_predict(vecs)

# Word -> cluster label, in vocabulary (frequency) order.
clustered_vocab = {w: cluster_assignments[i] for i, w in enumerate(vocabulary)}

# Group the words once instead of rescanning the vocabulary for every cluster.
words_by_cluster = {}
for w, cluster in clustered_vocab.items():
    words_by_cluster.setdefault(int(cluster), []).append(w)

# Show the five most frequent words of each cluster.
for cluster_num in range(num_clusters):
    cluster_words = words_by_cluster.get(cluster_num, [])
    print(f"Cluster {cluster_num + 1}: {', '.join(cluster_words[:5])}")

Cluster 1: closely, apply, experienced, advanced, agree
Cluster 2: movement, firm, attitude, race, attempt
Cluster 3: like, man, way, little, still
Cluster 4: side, line, feet, center, outside
Cluster 5: killed, hole, birds, putting, engine
Cluster 6: described, test, names, procedure, treated
Cluster 7: af, points, image, plane, fixed
Cluster 8: jobs, joined, automobile, congregation, includes
Cluster 9: money, tax, pay, amount, paid
Cluster 10: believed, die, grow, baby, mine
Cluster 11: politics, speaking, minds, version, centuries
Cluster 12: may, made, must, however, part
Cluster 13: silent, luck, lucy, angry, lovely
Cluster 14: public, government, social, national, local
Cluster 15: post, bank, forest, located, roads
Cluster 16: pressure, range, showed, normal, volume
Cluster 17: mere, dangerous, destroyed, inevitably, fail
Cluster 18: systems, designed, techniques, materials, permit
Cluster 19: time, last, year, week, month
Cluster 20: along, car, office, town, road
Cluster 21: 

I decided to use K-means clustering on word embeddings as it organizes words into clusters based on semantic similarity, simplifying the representation of high-dimensional spaces. The resulting clusters provide an interpretable structure, aiding in the understanding of semantic relationships among words. K-means is computationally efficient and scalable, making it suitable for large datasets and facilitating quantitative evaluation of the clustering results.

    The best clusters are:
     - Cluster 14: public, government, social, national, local
     - Cluster 40: history, word, music, art, english
     - Cluster 43: school, college, university, students, class