In [1]:
"""A tiny word2vec model for learning."""

'A tiny word2vec model for learning.'

In [2]:
import collections 
import math 
import random 
from nlp import Vocab

In [3]:
# Read the whole training file into memory. The encoding is given explicitly
# so decoding does not depend on the platform's locale default.
with open("../data/ptb/ptb.train.txt", encoding="utf-8") as f:
    raw_text = f.read()
# One list of whitespace-separated tokens per line of the file
# (blank lines, including a trailing one, become empty lists).
sentences = [line.split() for line in raw_text.split('\n')]

In [4]:
# Build the vocabulary from the tokenized sentences.
# NOTE(review): presumably min_freq=10 excludes tokens seen fewer than 10 times;
# confirm against nlp.Vocab.
vocab = Vocab(sentences, min_freq=10)
# Last expression of the cell, so the vocabulary size is displayed as output.
len(vocab)

6719

## Subsampling

In [5]:
# Subsampling threshold; `subsample` passes it as `t` to `discard_probability`.
T = 1e-4

In [6]:
def discard_probability(t: float, freq: int, num_tokens: int) -> float:
    """
    Calculates the probability for this word to be discarded.

    The value is ``max(1 - sqrt(t / (freq / num_tokens)), 0)``: the larger the
    word's relative frequency compared to ``t``, the closer the result is to 1.

    Parameters
    ----------
    t : float
        Hyperparameter to adjust for subsampling.
    freq : int
        Frequency of the word in the corpus.
    num_tokens : int
        Total number of tokens in the corpus. Must be non-zero.

    Returns
    -------
    float
        The probability for discarding this word, never below 0.0.
    """
    if freq == 0:
        # Relative frequency 0 would divide by zero; as freq approaches 0 the
        # unclipped formula goes to -inf, which clips to a probability of 0.
        return 0.0
    relative_freq = freq / num_tokens
    # Clip with a float literal so the return type is always float.
    return max(1 - math.sqrt(t / relative_freq), 0.0)

def keep(prob: float) -> bool:
    """
    Randomly decides whether a word is kept.

    Draws one number uniformly from the interval between 0 and 1 and keeps the
    word when the draw falls strictly below ``prob``.

    Parameters
    ----------
    prob : float
        Probability of keeping the word.

    Returns
    -------
    bool
        True if the word is kept, False otherwise.
    """
    return random.uniform(0, 1) < prob

def subsample(sentences: list[list[str]], unk: str) -> tuple[list[list[str]], collections.Counter]:
    """
    Filters out unknown tokens, then randomly subsamples the rest by frequency.

    Relies on the notebook-level names ``vocab`` (used to look up each token)
    and ``T`` (the subsampling threshold), plus the helpers
    ``discard_probability`` and ``keep``.

    Parameters
    ----------
    sentences : list[list[str]]
        Tokenized corpus, one list of tokens per sentence.
    unk : str
        Value that ``vocab[token]`` is compared against; tokens whose lookup
        equals it are removed before counting.

    Returns
    -------
    tuple[list[list[str]], collections.Counter]
        The subsampled sentences, and the token counter computed after the
        unknown-token filter but before subsampling.
    """
    # NOTE(review): `vocab` here is the notebook-level vocabulary, not a parameter,
    # and its lookup result is compared with `unk` as-is. Confirm that the caller
    # passes a value of the same type that `vocab[token]` returns.
    known_sentences = [[token for token in line if vocab[token] != unk] for line in sentences]
    counter = collections.Counter(token for line in known_sentences for token in line)
    num_tokens = sum(counter.values())
    # `keep` expects the probability of KEEPING a token, so use the complement of
    # the discard probability. It is computed once per distinct token because it
    # depends only on the token's count.
    keep_probability = {
        token: 1 - discard_probability(T, freq, num_tokens)
        for token, freq in counter.items()
    }
    subsampled_sentences = [
        [token for token in line if keep(keep_probability[token])]
        for line in known_sentences
    ]
    return subsampled_sentences, counter

In [7]:
# Subsample the corpus; `counter` holds the token counts returned by `subsample`.
# NOTE(review): the second parameter of `subsample` is `unk` (documented as the
# <unk> token), but the whole `vocab` object is passed here and then compared with
# `vocab[token]` inside the function. Confirm which value should be passed.
subsampled, counter = subsample(sentences, vocab)