### Task 0 Before you go

> 1. Rename Assignment-02-###.ipynb where ### is your student ID.
> 2. The deadline of Assignment-02 is 23:59 on 04-21-2024.
> 3. In this assignment, you will use word embeddings to explore our Wikipedia dataset.

### Task 1 Train word embeddings using SGNS 
> Use our enwiki-train.json as training data. You can use the [Gensim tool](https://radimrehurek.com/gensim/models/word2vec.html), but implementing SGNS yourself is recommended. You should explain how hyper-parameters such as the dimensionality of the embeddings, the window size, the parameters of the negative sampling strategy, and the initial learning rate were chosen.

In [1]:
# import some necessary libraries
from typing import List, Dict, Callable
from collections import defaultdict
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim import utils
import json
import random
import math
import numpy as np
import matplotlib.pyplot as plt
import heapq

In [2]:
# load the train and test data from the json file

# NOTE: The function is inherited from my solution of assignment 1
def load_json(file_path: str) -> List:
    """
    Read a JSON-lines file and collect its records into a list.

    Every non-blank line of the file is parsed as one JSON document.

    Input:
    - file_path: The relative file path of the `.json` file

    Returns:
    - join_data_list: A list containing the data, with the format of [{'title':<>, 'label':<>, 'text':<>}, {}, ...]

    Raises:
    - OSError: if the file cannot be opened
    - json.JSONDecodeError: if a non-blank line is not valid JSON
    """
    # Decode explicitly as UTF-8 so the result does not depend on the platform's default locale encoding
    with open(file_path, "r", encoding="utf-8") as json_file:
        stripped_lines = (line.strip() for line in json_file)
        # guarantee the line is not empty before parsing it
        join_data_list = [json.loads(line) for line in stripped_lines if line]
    return join_data_list

# Paths of the two dataset files; unpacking the `map` parses both files into in-memory lists right here.
train_file_path, test_file_path = "enwiki-train.json", "enwiki-test.json"
train_data_list, test_data_list = map(load_json, [train_file_path, test_file_path])

class Corpus:
    """
    Re-iterable stream of tokenised training documents.

    Each iteration pass walks `train_data_list` from the beginning and yields, for every
    document, the token list that `utils.simple_preprocess` produces from its "text" field.
    """

    def __iter__(self):
        yield from (utils.simple_preprocess(document["text"]) for document in train_data_list)


In [3]:
class MyWord2Vec:
    """
    A minimal skip-gram-with-negative-sampling (SGNS) trainer written with NumPy.

    The constructor scans the corpus to build the vocabulary, enumerates every
    (current word, neighbouring word) pair inside the context window and randomly
    initialises one embedding row per vocabulary word. `train` then runs plain SGD
    over those pairs. One embedding matrix is shared by current and neighbouring words,
    and negative words are drawn uniformly at random from the vocabulary.
    """

    def __init__(self, text: List[dict], dimensionality: int=100, window_size: int=5, negative_samples: int=5, lr: float=0.001) -> None:
        """
        Args:
        - text: The training data, a list of dicts whose "text" entry holds the document string
        - dimensionality: The dimension of the word embeddings
        - window_size: The size of the context window (words taken on each side of the current word)
        - negative_samples: The number of negative samples drawn per training pair
        - lr: Initial learning rate of the algorithm
        """
        self.dim = dimensionality
        self.window = window_size
        self.neg = negative_samples
        self.lr = lr
        self.__vocab = set()
        self.__word_frq = defaultdict(int)
        self.__word2idx = {}
        self.__idx2word = {}
        self.__embedding = None
        # Parallel lists: entry k of both lists together forms the k-th (current word, neighbour) index pair
        self.__context_words = []
        self.__context_targets = []
        self.__build(text)

    def __preprocess(self, text: List[dict]) -> None:
        """
        Calculate the vocabulary and the frequency of each word in the training data, while maintaining the (idx, word) map.
        """
        for sample in text:
            # documents are tokenised by plain whitespace splitting
            for word in sample["text"].split():
                self.__vocab.add(word)
                self.__word_frq[word] += 1
        for idx, word in enumerate(self.__vocab):
            self.__word2idx[word] = idx
            self.__idx2word[idx] = word

    def __generate_training_data(self, text: List[dict]) -> None:
        """
        Generate training data from each window and save them in `self.__context_words` and `self.__context_targets`
        """
        for sample in text:
            words = sample["text"].split()
            for i, curr_word in enumerate(words):
                # the "window" around the current word, clipped at the document boundaries
                window_start = max(0, i - self.window)
                window_stop = min(i + self.window + 1, len(words))
                for j in range(window_start, window_stop):
                    if i != j:
                        self.__context_words.append(self.__word2idx[curr_word])
                        self.__context_targets.append(self.__word2idx[words[j]])

    def __initialize_embedding(self) -> None:
        """
        Initialize the embedding matrix with random values
        """
        self.__embedding = np.random.uniform(-0.5 / self.dim, 0.5 / self.dim, size=(len(self.__vocab), self.dim))

    def __build(self, text: List[dict]) -> None:
        """
        Compute and store the relevant information of the training data in the class
        """
        self.__preprocess(text)
        self.__generate_training_data(text)
        self.__initialize_embedding()

    @staticmethod
    def __sigmoid(x: float) -> float:
        """
        Logistic function evaluated without overflow: `math.exp` is only ever called on a non-positive argument.
        """
        if x >= 0:
            return 1.0 / (1.0 + math.exp(-x))
        exp_x = math.exp(x)
        return exp_x / (1.0 + exp_x)

    def train(self, epochs: int=5) -> None:
        """
        Run SGD over all stored training pairs for `epochs` passes.

        For every pair the positive update pulls the two word vectors together; afterwards
        `self.neg` uniformly drawn negative words are pushed away from the current word.
        The learning rate decays linearly from `self.lr` with the epoch index.

        Args:
        - epochs: Number of passes over the training pairs
        """
        vocab_size = len(self.__vocab)
        for epoch in range(epochs):
            # learning rate decay
            learning_rate = self.lr * (1 - epoch / epochs)

            print("Training Epoch: %d" % (epoch + 1))

            for context_word, target_word in zip(self.__context_words, self.__context_targets):
                # both are views into the embedding matrix, so they reflect the in-place updates below
                context_vector = self.__embedding[context_word]
                target_vector = self.__embedding[target_word]

                # positive sample update: gradient of -log(sigmoid(score))
                coeff = self.__sigmoid(np.dot(target_vector, context_vector)) - 1
                grad_context = coeff * target_vector
                grad_target = coeff * context_vector
                self.__embedding[context_word] -= learning_rate * grad_context
                self.__embedding[target_word] -= learning_rate * grad_target

                # negative sample update: gradient of -log(sigmoid(-score))
                for _ in range(self.neg):
                    negative_word = random.randint(0, vocab_size - 1)
                    if negative_word == target_word:
                        continue
                    negative_vector = self.__embedding[negative_word]
                    coeff = self.__sigmoid(np.dot(negative_vector, context_vector))
                    grad_context = coeff * negative_vector
                    grad_negative = coeff * context_vector
                    self.__embedding[context_word] -= learning_rate * grad_context
                    # the gradient belongs to the sampled negative word, not to the positive target
                    self.__embedding[negative_word] -= learning_rate * grad_negative


In [4]:
# Train Gensim's skip-gram model (sg=1) with negative sampling on the tokenised training corpus.
# Meaning of the hyper-parameters, per the Gensim Word2Vec documentation:
# - vector_size=100: dimensionality of the word vectors
# - window=5: maximum distance between the current and the predicted word within a sentence
# - negative=5: number of "noise" words drawn per positive example
# - ns_exponent=0.75: exponent shaping the negative-sampling distribution (1.0 samples proportionally
#   to word frequency, 0.0 samples uniformly)
# - alpha=0.025, min_alpha=0.0001: initial learning rate and the value it drops to linearly during training
# - min_count=5: words with a total frequency below this are ignored
# - sample=0.001: threshold for randomly down-sampling high-frequency words
# - epochs=5: number of passes over the corpus
# - seed=1, workers=3: RNG seed and number of worker threads
# NOTE(review): apart from sg=1 these appear to be Gensim's defaults, and with workers > 1 the seed
# alone presumably does not make training fully reproducible - confirm against the Gensim docs.
sentence = Corpus()
model = Word2Vec(
    sentences=sentence, vector_size=100, alpha=0.025, window=5, min_count=5, sample=0.001, 
    seed=1, workers=3, min_alpha=0.0001, sg=1, negative=5, ns_exponent=0.75, epochs=5, 
    sorted_vocab=1
)

### Task 2 Find similar/dissimilar word pairs

> Randomly generate sets of 100, 1000, and 10000 word pairs from the vocabulary. For each set, print the 5 closest word pairs and the 5 furthest word pairs (you can use cosine similarity to compare two words). Explain your results.

In [5]:
def generate_random_paris(samples: int):
    """
    Draw `2 * samples` distinct vocabulary indices and pair them up into words.

    The first half of the drawn indices becomes the first word list and the second
    half the second word list; the i-th words of both lists form the i-th pair.

    Returns:
    - Two lists of `samples` words each, looked up in `model.wv.index_to_key`
    """
    vocabulary = model.wv.index_to_key
    drawn = random.sample(range(len(model.wv)), 2 * samples)
    first_words = [vocabulary[idx] for idx in drawn[:samples]]
    second_words = [vocabulary[idx] for idx in drawn[samples:]]
    return first_words, second_words

def find_closest_furthest(num: int=5, words1: List[str]=None, words2: List[str]=None) -> tuple:
    """
    Find the closest/furthest word pairs using `model.wv.similarity`.

    The i-th word of `words1` is paired with the i-th word of `words2`; if the lists
    differ in length, the surplus words of the longer one are ignored.

    `heapq.nlargest` / `heapq.nsmallest` select the results in O(n log k) time, where n is
    the number of pairs and k denotes `num`, so no full sort (or explicit heap) is needed.

    Args:
    - num: How many pairs to return at each end of the similarity range
    - words1: First word of every pair
    - words2: Second word of every pair

    Returns:
    - (closest, furthest): two lists of (similarity, word1, word2) tuples, both ordered by
      descending similarity, so the furthest pair of all is the last element of `furthest`.
      Both lists are empty when no pairs are given.
    """
    if not words1 or not words2:
        return [], []
    scored_pairs = [
        (model.wv.similarity(first, second), first, second)
        for first, second in zip(words1, words2)
    ]
    return heapq.nlargest(num, scored_pairs), heapq.nsmallest(num, scored_pairs)[::-1]

def print_word_pairs(results: List[tuple], flag: str) -> None:
    """
    Print the word pairs in `results` as an aligned table.

    Args:
    - results: (similarity, word1, word2) tuples, printed in the given order
    - flag: Label inserted into the header line, e.g. "closest" or "furthest"
    """
    # report the number of pairs actually printed instead of a hard-coded 5
    print("The {} {:>8} word pairs:".format(len(results), flag))
    for similarity, first_word, second_word in results:
        print("Word pairs: ({:>15}, {:>15}) --> Similarity: {:>8.6f}".format(first_word, second_word, similarity))


# Seed Python's `random` module so the index sampling below is repeatable.
random.seed(408)
# Number of random word pairs drawn in each experiment.
pairs = [100, 1000, 10000]
for pair in pairs:
    print("For {:>5} random pairs from the vocabularies:".format(pair))
    # Sample `pair` random word pairs, then keep the 5 most and the 5 least similar of them.
    cloest, furthest = find_closest_furthest(5, *generate_random_paris(pair))
    print_word_pairs(cloest, "closest")
    print_word_pairs(furthest, "furthest")
    print("-" * 70)

For   100 random pairs from the vocabularies:
The 5  closest word pairs:
Word pairs: (      herodotus,       sorcerers) --> Similarity: 0.862411
Word pairs: (        bruxing,        plumbers) --> Similarity: 0.843897
Word pairs: (         shaded,           booby) --> Similarity: 0.818646
Word pairs: (        deflect,      comforting) --> Similarity: 0.815063
Word pairs: (           sown,     prohibitive) --> Similarity: 0.810534
The 5 furthest word pairs:
Word pairs: (  manifestation,        homeless) --> Similarity: 0.218703
Word pairs: (          harry,           kuala) --> Similarity: 0.197687
Word pairs: (     violations,         yevgeny) --> Similarity: 0.184321
Word pairs: (        trapped,          arabia) --> Similarity: 0.147944
Word pairs: (     separately,       communism) --> Similarity: 0.145576
----------------------------------------------------------------------
For  1000 random pairs from the vocabularies:
The 5  closest word pairs:
Word pairs: (        itching,       

### Task 3 Present a document as an embedding

> For each document, you have several choices for generating a document embedding: 1. Use the average of the embeddings of all words in the document; 2. Use the words of the first paragraph and average their embeddings; 3. Use the doc2vec algorithm to represent each document. Do the above for both the training and testing datasets.

In [84]:
####################################################################################
###        1. Use the average of embeddings of all words in each document        ###
####################################################################################
def print_document_embeddings(embeddings: List[tuple], display: int=3) -> None:
    """
    Print the first `display` embeddings, default value is 3

    Args:
    - embeddings: (embedding, title) tuples; the second element is printed after "Title:"
    - display: Maximum number of entries to print; nothing is printed when it is <= 0
    """
    for shown, (embedding, title) in enumerate(embeddings):
        if shown >= display:
            break
        print("Title: {}\nEmbedding: {}".format(title, embedding))
        print("-" * 70)

def print_training_testing(method: Callable):
    """
    Show sample document embeddings produced by `method` for both datasets.

    `method` is first called without arguments (so it falls back to its own default
    data) and then called with `test_data_list`; each result is handed to
    `print_document_embeddings`.
    """
    print("For the training dataset:")
    training_embeddings = method()
    print_document_embeddings(training_embeddings)
    print("-" * 70)
    print("For the  testing dataset:")
    testing_embeddings = method(test_data_list)
    print_document_embeddings(testing_embeddings)

def average_all_words(data_list: List[Dict]=train_data_list, mode: str="title") -> List[tuple]:
    """
    Represent every document by the average of the embeddings of all its in-vocabulary words.

    Args:
    - data_list: Documents as dicts with a "text" entry and a `mode` entry
    - mode: Which field accompanies each embedding in the result: "title" or "label"

    Returns:
    - A list of (doc_embedding, doc_info) tuples in the order of `data_list`. A document
      without any word known to `model.wv` gets an all-zero vector of length `model.vector_size`.

    Raises:
    - ValueError: if `mode` is neither "title" nor "label"
    """
    if mode not in ["title", "label"]:
        raise ValueError("Please input a valid mode: ['title', 'label']")
    doc_embeddings = []
    for line in data_list:
        line_text = utils.simple_preprocess(line["text"]) # preprocess the text as in class `Corpus`
        # doc title/label and doc embedding computed by averaging all words embeddings
        doc_info = line[mode]
        valid_word_embeddings = [model.wv[word] for word in line_text if word in model.wv]
        # Test for the empty case explicitly instead of catching every exception with a bare `except`
        if valid_word_embeddings:
            doc_embedding = sum(valid_word_embeddings) / len(valid_word_embeddings)
        else:
            doc_embedding = np.zeros(model.vector_size)
        # store the information as an (embedding, info) tuple
        doc_embeddings.append((doc_embedding, doc_info))

    return doc_embeddings

# Show sample embeddings obtained by averaging all word vectors, for the training and testing datasets.
print_training_testing(average_all_words)


For the training dataset:
Title: Citizen_Kane
Embedding: [-0.2960049  -0.01423333  0.14721517  0.12139755 -0.17462535 -0.31337783
  0.18368073  0.34699985 -0.17591807 -0.19838323  0.02308599 -0.2998711
  0.01666955  0.13167198 -0.06327387 -0.08708493  0.14249018  0.01380194
 -0.21497476 -0.36701983 -0.06805244  0.14445285  0.08747505  0.04904256
  0.14309205  0.23049948 -0.23031184  0.10671046 -0.19837528 -0.1316407
 -0.1189249   0.16955267 -0.01272503 -0.04033708  0.02200293  0.13950647
 -0.17399263 -0.2843225   0.06627771 -0.166714    0.02661788 -0.25058183
 -0.40715316 -0.18351702  0.20407248 -0.10050888 -0.21705762  0.04414554
  0.09181677  0.04567541  0.03726172 -0.05376059 -0.31678638 -0.10022476
  0.01748321  0.04129237 -0.06482305  0.07935403  0.04977779  0.10498507
  0.22192508  0.15182945 -0.00224699 -0.06766742  0.07389529  0.03351407
 -0.22441798  0.21850444 -0.24207619  0.22126253  0.15975542  0.24951118
  0.06868735 -0.03211427  0.0378467   0.16712835  0.04484984  0.11626

In [79]:
####################################################################################
###  2. Use the first paragraph’s words and take an average on these embeddings  ###
####################################################################################
def find_first_paragraph(text: str):
    """
    Return the part of `text` that precedes its first newline character.

    When `text` contains no newline, the whole string is returned.
    """
    first_paragraph, _, _ = text.partition("\n")
    return first_paragraph

def average_first_para_words(data_list: List[Dict]=train_data_list, mode: str="title") -> List[tuple]:
    """
    Represent every document by the average embedding of the in-vocabulary words of its first paragraph.

    Args:
    - data_list: Documents as dicts with a "text" entry and a `mode` entry
    - mode: Which field accompanies each embedding in the result: "title" or "label"

    Returns:
    - A list of (doc_embedding, doc_info) tuples in the order of `data_list`. A document whose
      first paragraph has no word known to `model.wv` gets an all-zero vector of length `model.vector_size`.

    Raises:
    - ValueError: if `mode` is neither "title" nor "label"
    """
    if mode not in ["title", "label"]:
        raise ValueError("Please input a valid mode: ['title', 'label']")
    doc_embeddings = []
    for line in data_list:
        doc_info = line[mode]
        first_para_words = utils.simple_preprocess(find_first_paragraph(line["text"]))
        valid_word_embeddings = [model.wv[word] for word in first_para_words if word in model.wv]
        # The first paragraph can be so short that none of its words is in the vocabulary. Check for
        # that case explicitly rather than relying on a bare `except` around the division.
        if valid_word_embeddings:
            doc_embedding = sum(valid_word_embeddings) / len(valid_word_embeddings)
        else:
            doc_embedding = np.zeros(model.vector_size)
        doc_embeddings.append((doc_embedding, doc_info))

    return doc_embeddings

# Show sample embeddings obtained by averaging the first paragraph's word vectors, for both datasets.
print_training_testing(average_first_para_words)

For the training dataset:
Title: Citizen_Kane
Embedding: [-0.27791345  0.03093471  0.25582522  0.10712704 -0.14603665 -0.39557108
  0.21222307  0.33513114 -0.27648464 -0.21422291 -0.04528995 -0.29881924
  0.027381    0.04872361 -0.08348218 -0.14888479  0.14539245 -0.05736789
 -0.12656057 -0.33338633 -0.1022215   0.22359993  0.07359979 -0.02213276
  0.12973529  0.27343455 -0.28661847  0.0712615  -0.20643789 -0.12265062
 -0.09869216  0.1762715  -0.04076541 -0.00666856  0.05650603  0.15743554
 -0.21651553 -0.27838734  0.02791196 -0.12518217  0.00946915 -0.2921295
 -0.46128526 -0.2091034   0.21861024 -0.04594442 -0.17977004  0.07432243
  0.00299822  0.04505717  0.09264806  0.01829637 -0.264414   -0.07577453
  0.02178697 -0.02288042 -0.06831323  0.09111134  0.03942756  0.14259186
  0.28405476  0.20258082  0.0321538  -0.17838289  0.09601185 -0.05103689
 -0.22543323  0.2711815  -0.2431821   0.28218347  0.14013624  0.22906084
  0.08563164 -0.0424281  -0.00476583  0.08102161  0.00983071  0.0745

In [28]:
####################################################################################
###             3. Use the doc2vec algorithm to present each document            ###
####################################################################################
def build_doc2vec() -> Doc2Vec:
    """
    Train a `Doc2Vec` model on the documents of `train_data_list`.

    Each document's "text" is split on whitespace and tagged with its position
    in `train_data_list` (as a string). The trained model is returned.
    """
    tagged_data = []
    for position, record in enumerate(train_data_list):
        tagged_data.append(TaggedDocument(words=record["text"].split(), tags=[str(position)]))

    d2v = Doc2Vec(vector_size=100, window=5, min_count=5, epochs=10)
    d2v.build_vocab(tagged_data)
    d2v.train(tagged_data, total_examples=d2v.corpus_count, epochs=d2v.epochs)
    return d2v

# Train the Doc2Vec model once at module level; `doc2vec_model_embeddings` below reads it as a global.
doc2vec_model = build_doc2vec()

def doc2vec_model_embeddings(data_list: List[Dict]=train_data_list, mode: str="title") -> List[tuple]:
    """
    Infer a doc2vec vector for every document in `data_list`.

    Args:
    - data_list: Documents as dicts with a "text" entry and a `mode` entry
    - mode: Which field accompanies each vector in the result: "title" or "label"

    Returns:
    - A list of (vector, doc_info) tuples, where the vector comes from
      `doc2vec_model.infer_vector` applied to the whitespace-split text

    Raises:
    - ValueError: if `mode` is neither "title" nor "label"
    """
    if mode not in ["title", "label"]:
        raise ValueError("Please input a valid mode: ['title', 'label']")
    return [
        (doc2vec_model.infer_vector(record["text"].split()), record[mode])
        for record in data_list
    ]

# Show sample doc2vec embeddings for the training and testing datasets.
print_training_testing(doc2vec_model_embeddings)

For the training dataset:
Title: Citizen_Kane
Embedding: [-0.91197944  1.0177789  -0.87328035 -2.8970726  -0.22374134 -1.6525056
 -0.09971668  2.0014598   1.2450341  -1.5300171   3.6276844   1.0656625
  0.5074591  -3.4257743  -3.2505605  -2.8916092   2.9884274   4.4198318
  0.44310835  1.0066502  -0.46507087 -2.0297766   0.33820763  1.1429517
 -1.2871732  -1.0288019   2.1324449  -3.1070325   3.8972461  -0.584727
 -0.8729446  -0.89607465  2.514835    2.4750972  -0.9747412   2.2987094
 -0.68717337 -3.8264184  -3.9325871   3.0186417  -0.10682293 -2.2484853
  1.5026642  -6.136729    2.8145642  -3.7217798   2.4966438   0.64525604
  0.4302792  -3.3386102   2.6108866   0.08630164 -0.8840496  -2.3507173
  1.5288154   2.5773897   0.6166947   1.647873   -3.0836458  -1.5615094
 -2.6482725   1.2739259   2.9393506  -0.6705638  -0.39135727 -1.0394808
  0.9505518  -3.24473    -0.87178475 -2.5605524  -4.557449    2.2995675
 -4.329714   -4.0590553  -3.206194    3.2260227   5.4857607  -2.2458603
  1.481

### Task 4 Build classifier to test docs
> Build a softmax regression model to classify the testing documents based on these training document embeddings. Does it perform better than Naive Bayes? (You have 3 models.)

In [86]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

class softmaxModel:
    """
    Softmax (multinomial logistic) regression classifier on top of document embeddings.

    The embedding function `method` is called as `method(data, "label")` and must return
    (embedding, label) tuples. The classifier is fitted in the constructor; call `predict`
    on test data and then `accuracy` to score the predictions.
    """

    def __init__(self, data: List[Dict], method: Callable) -> None:
        """
        Args:
        - data: Training documents, passed through to `method`
        - method: Embedding function, one of the three models of task 3
        """
        self.X = None                        # training embeddings, set by `__train`
        self.y = None                        # training labels, set by `__train`
        self.__model = None
        self.__ytest = None                  # true labels of the last `predict` call
        self.__ypred = None                  # predicted labels of the last `predict` call
        self.__method = method               # the three different models in task 3
        self.__train(method(data, "label"))  # use `method(data)` to get the embedding

    def __train(self, embedding: List[tuple]):
        """
        Train a softmax classifier based on the embedding in train data
        """
        self.X, self.y = self.__decode(embedding)
        self.__model = LogisticRegression(multi_class="multinomial", solver="lbfgs").fit(self.X, self.y)

    def __decode(self, embedding: List[tuple]) -> tuple:
        """
        Split a list of (embedding, label) tuples into a feature array and a label array
        """
        vectors, labels = zip(*embedding)
        return np.array(list(vectors)), np.array(list(labels))

    def predict(self, test_data: List[Dict]) -> List:
        """
        Return the pred value of y based on the trained model and given test data

        The true labels and the predictions are kept so that `accuracy` can score them.
        """
        test_embedding = self.__method(test_data, "label")
        X_test, y_test = self.__decode(test_embedding)
        self.__ytest = y_test
        self.__ypred = self.__model.predict(X_test)
        return self.__ypred

    def accuracy(self) -> float:
        """
        Return the accuracy of the most recent `predict` call.

        Raises:
        - RuntimeError: if `predict` has not been called yet
        """
        # Fail with a clear message instead of handing `None` to `accuracy_score`
        if self.__ytest is None or self.__ypred is None:
            raise RuntimeError("Call `predict` before `accuracy`.")
        return accuracy_score(self.__ytest, self.__ypred)
    
# Human-readable description of each document-embedding approach -> the function implementing it.
methods = {
    "Use the average of embeddings of all words in each document": average_all_words, 
    "Use the first paragraph’s words and take an average on these embeddings": average_first_para_words, 
    "Use the doc2vec algorithm to present each document": doc2vec_model_embeddings
}

# For every embedding approach: fit a classifier on the training data, predict the test data, report accuracy.
for text, method in methods.items():
    softmax_model = softmaxModel(train_data_list, method)
    softmax_model.predict(test_data_list)
    print("Model: {:>71}, accuracy: {:>.6f}".format(text, softmax_model.accuracy()))
    # drop the reference to the fitted classifier before the next iteration builds a new one
    del softmax_model


Model:             Use the average of embeddings of all words in each document, accuracy: 0.930000
Model: Use the first paragraph’s words and take an average on these embeddings, accuracy: 0.940000
Model:                      Use the doc2vec algorithm to present each document, accuracy: 0.920000


### Task 5 Use t-SNE to project doc vectors

> Use t-SNE to project the training document embeddings into 2D and plot them for each of the above choices. Each point should have a specific color (representing a particular cluster). You may need to try different parameters of t-SNE. One can find more details about t-SNE in this [excellent article](https://distill.pub/2016/misread-tsne/).

In [None]:
# Your code


