# Lesson 8 - Exercise 3

In [24]:
# Load corpus
corpus = open('ex_8_3_corpus.txt').read()

In [32]:
import nltk
from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords

nltk.download('punkt')

# Text preprocessing
def preprocess_text(text):
    # stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    # tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return tokens


[nltk_data] Downloading package punkt to /Users/maohieng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [33]:
# Preprocess the text and create a single list of tokens
tokens = []
for text in corpus:
    tokens.extend(preprocess_text(text))

In [34]:
from collections import Counter, defaultdict

# Get the top 50 most frequent words
word_counts = Counter(tokens)
top_words = [word for word, _ in word_counts.most_common(50)]

In [35]:
print(top_words)

['e', 'a', 't', 'n', 'i', 'o', 'r', 's', 'h', 'd', 'c', 'm', 'l', 'u', 'p', 'g', 'f', 'b', 'w', 'k', ',', 'y', '.', 'v', '1', '[', ']', '9', '5', '0', '3', '2', '4', '8', '(', '7', ')', '6', 'j', '``', '-', "'", 'q', 'é', '្', 'ា', 'â', 'ក', 'រ', 'x']


In [17]:
import numpy as np
import pandas as pd

# Build the term-term co-occurrence matrix
context_window = 4
co_occurrence = defaultdict(lambda: defaultdict(int))

for i, word in enumerate(tokens):
    if word in top_words:
        for j in range(max(0, i - context_window), min(len(tokens), i + context_window + 1)):
            if i != j and tokens[j] in top_words:
                co_occurrence[word][tokens[j]] += 1

# Convert the co-occurrence matrix into a DataFrame
co_occurrence_matrix = pd.DataFrame.from_dict(co_occurrence, orient='index', columns=top_words).fillna(0)

In [23]:
print(co_occurrence_matrix)

           insert  content  wikipedia  article  first  second  third
content       5.0      0.0        5.0      5.0    2.0     2.0    1.0
first         2.0      2.0        1.0      1.0    0.0     0.0    0.0
wikipedia     5.0      5.0        0.0      5.0    1.0     2.0    2.0
article       5.0      5.0        5.0      0.0    1.0     2.0    2.0
second        2.0      2.0        2.0      2.0    0.0     0.0    0.0
third         1.0      1.0        2.0      2.0    0.0     0.0    0.0
insert        0.0      5.0        5.0      5.0    2.0     2.0    1.0


In [18]:
# Compute the PPMI matrix
def compute_ppmi(matrix):
    total_sum = matrix.values.sum()
    word_sums = matrix.sum(axis=1).values
    context_sums = matrix.sum(axis=0).values
    ppmi_matrix = matrix.copy()

    for i, word_sum in enumerate(word_sums):
        for j, context_sum in enumerate(context_sums):
            joint_prob = matrix.iat[i, j] / total_sum
            word_prob = word_sum / total_sum
            context_prob = context_sum / total_sum

            if joint_prob > 0:
                ppmi = max(np.log2(joint_prob / (word_prob * context_prob)), 0)
                ppmi_matrix.iat[i, j] = ppmi
            else:
                ppmi_matrix.iat[i, j] = 0

    return ppmi_matrix

In [19]:
ppmi_matrix = compute_ppmi(co_occurrence_matrix)

In [20]:
# Display the PPMI matrix
print("PPMI Matrix:")
print(ppmi_matrix)

PPMI Matrix:
             insert   content  wikipedia   article     first    second  \
content    0.321928  0.000000   0.321928  0.321928  0.736966  0.321928   
first      0.736966  0.736966   0.000000  0.000000  0.000000  0.000000   
wikipedia  0.321928  0.321928   0.000000  0.321928  0.000000  0.321928   
article    0.321928  0.321928   0.321928  0.000000  0.000000  0.321928   
second     0.321928  0.321928   0.321928  0.321928  0.000000  0.000000   
third      0.000000  0.000000   0.736966  0.736966  0.000000  0.000000   
insert     0.000000  0.321928   0.321928  0.321928  0.736966  0.321928   

              third  
content    0.000000  
first      0.000000  
wikipedia  0.736966  
article    0.736966  
second     0.000000  
third      0.000000  
insert     0.000000  


In [21]:
# Save the matrices as CSV files
co_occurrence_matrix.to_csv("co_occurrence_matrix.csv", index=True)
ppmi_matrix.to_csv("ppmi_matrix.csv", index=True)

print("Co-occurrence and PPMI matrices saved as CSV files.")

Co-occurrence and PPMI matrices saved as CSV files.


## Word2Vec Skip-gram with Negative Sampling (SGNS)

In [None]:
# Word2Vec model using SGNS
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
