# 0. Setup

In [1]:
import re
import math
import numpy as np
from collections import Counter, defaultdict

In [2]:
# Toy sentiment corpus of ten one-sentence movie reviews.
# The first five read as favourable, the last five as unfavourable.
corpus = [
    # favourable reviews
    "this movie is great and the acting is excellent",
    "what a fantastic film with wonderful direction",
    "the plot is good and the soundtrack is amazing",
    "the story is touching and performances are strong",
    "a brilliant and engaging narrative overall",

    # unfavourable reviews
    "this movie is bad and the pacing is awful",
    "the film is boring with dull characters",
    "terrible editing and horrible dialogue",
    "a predictable script and poor scenes",
    "unwatchable messy scenes and weak plot",
]

In [3]:
# One label per corpus entry: five 1s followed by five 0s
# (presumably 1 = favourable review, 0 = unfavourable, in corpus order).
labels = np.array([1] * 5 + [0] * 5, dtype=np.int64)

In [4]:
def tokenize(s):
    """Lowercase ``s`` and return, in order, every maximal run of the letters a-z.

    Anything that is not an ASCII letter after lowercasing (digits, punctuation,
    whitespace, accented characters) acts as a separator and is dropped.
    """
    lowered = s.lower()
    return re.findall(r"[a-z]+", lowered)

In [5]:
# Tokenize every corpus line once, producing a list of token lists.
sentences = list(map(tokenize, corpus))
print("[Info] Example tokenized sentence:", sentences[0])

[Info] Example tokenized sentence: ['this', 'movie', 'is', 'great', 'and', 'the', 'acting', 'is', 'excellent']


# 1. NumPy: PPMI (Positive Pointwise Mutual Information) + SVD for word vectors

In [6]:
def build_vocab_from_sentences(sentences, min_count=1):
    """Build an alphabetically ordered vocabulary from tokenized sentences.

    sentences: iterable of token sequences
    min_count: minimum corpus frequency a token needs to enter the vocabulary

    Returns (vocab, ivocab, freq):
      vocab:  token -> index, indices follow the sorted token order
      ivocab: index -> token (inverse of vocab)
      freq:   Counter over ALL tokens, including those filtered out by min_count
    """
    freq = Counter()
    for sent in sentences:
        for word in sent:
            freq[word] += 1

    kept = sorted(word for word, count in freq.items() if count >= min_count)

    vocab = {}
    ivocab = {}
    for idx, word in enumerate(kept):
        vocab[word] = idx
        ivocab[idx] = word
    return vocab, ivocab, freq

In [13]:
def build_cooc_matrix(sentences, vocab, window=5):
    """
    Build the word-word co-occurrence count matrix.

    sentences: iterable of token sequences
    vocab: token -> row/column index
    window: number of positions considered on each side of the center token

    Returns C with shape (V, V) and dtype float64, where C[i, j] is the number of
    times word j appears within `window` positions of word i. Tokens missing from
    `vocab` are removed before windows are measured, and the center position
    itself is never counted as its own context.
    """
    size = len(vocab)
    counts = np.zeros((size, size), dtype=np.float64)

    for sent in sentences:
        ids = [vocab[tok] for tok in sent if tok in vocab]
        n_ids = len(ids)
        for pos, center in enumerate(ids):
            # fixed-size symmetric window, clipped at the sentence boundaries
            lo = max(0, pos - window)
            hi = min(n_ids, pos + window + 1)
            for ctx_pos in range(lo, hi):
                if ctx_pos != pos:
                    counts[center, ids[ctx_pos]] += 1.0

    return counts

In [8]:
def compute_ppmi(C):
    """Turn a 2-D count matrix into Positive PMI.

    PPMI_{ij} = max(0, log( (C_ij * N) / (C_i* * C_*j) ))

    where N is the sum of all counts, C_i* the i-th row sum and C_*j the j-th
    column sum. A 1e-12 term is added to numerator and denominator so that zero
    counts yield a very negative (or zero) PMI that is then clipped to 0, rather
    than -inf / NaN.
    """
    grand_total = C.sum()
    row_sums = C.sum(axis=1, keepdims=True)
    col_sums = C.sum(axis=0, keepdims=True)

    # outer product of the marginals: entry (i, j) is C_i* * C_*j
    marginal_product = row_sums @ col_sums

    with np.errstate(divide="ignore"):
        ratio = (C * grand_total + 1e-12) / (marginal_product + 1e-12)
        pmi = np.log(ratio)

    return np.maximum(pmi, 0.0)

In [9]:
def svd_embeddings(PPMI, dim=100, use_sqrt=True):
    """
    Derive word vectors from the SVD of the PPMI matrix: PPMI = U S V.T

    PPMI: (V, V) matrix
    dim: number of leading singular directions to keep; when dim is at least the
         number of available components, all of them are kept
    use_sqrt: if True return U * S^{1/2}, otherwise U * S

    Returns W with shape (V, min(dim, V)) for a square PPMI; row i is the vector
    of word i. The right singular vectors are computed but not used.
    """
    left_vecs, sing_vals, _ = np.linalg.svd(PPMI, full_matrices=False)

    keep = min(dim, left_vecs.shape[1])
    left_vecs = left_vecs[:, :keep]
    sing_vals = sing_vals[:keep]

    scale = np.sqrt(sing_vals) if use_sqrt else sing_vals
    return left_vecs * scale[None, :]

In [10]:
def cosine_sim(a, b, eps=1e-9):
    """Cosine similarity of vectors ``a`` and ``b`` as a Python float.

    ``eps`` is added to each norm so an all-zero vector gives 0.0 instead of a
    division by zero.
    """
    norm_a = np.linalg.norm(a) + eps
    norm_b = np.linalg.norm(b) + eps
    similarity = np.dot(a, b) / norm_a / norm_b
    return float(similarity)

In [11]:
def most_similar_from_matrix(W, ivocab, query, vocab, topn=5):
    """Return the ``topn`` words whose rows in ``W`` are most cosine-similar to ``query``.

    W: (V, d) embedding matrix, one row per vocabulary word
    ivocab: row index -> word
    query: word to look up
    vocab: word -> row index
    topn: maximum number of neighbours to return; values <= 0 yield an empty list

    Returns a list of (word, similarity) tuples in descending similarity order,
    excluding the query word itself. If ``query`` is not in ``vocab`` the string
    f"{query} OOV" is returned instead of a list (kept for backward compatibility).
    """
    if query not in vocab:
        return f"{query} OOV"
    # Guard up front: an append-then-check loop would still emit one neighbour
    # when topn is 0 or negative.
    if topn <= 0:
        return []

    qv = W[vocab[query]]
    # 1e-9 keeps the division finite when a row (or the query) is all zeros
    sims = (W @ qv) / (np.linalg.norm(W, axis=1) * np.linalg.norm(qv) + 1e-9)

    out = []
    for i in np.argsort(-sims):
        w = ivocab[int(i)]
        if w == query:
            continue
        out.append((w, float(sims[i])))
        if len(out) == topn:
            break
    return out

In [14]:
# PPMI + SVD pipeline: vocabulary -> co-occurrence counts -> PPMI -> SVD word vectors.
vocab, ivocab, freq = build_vocab_from_sentences(sentences=sentences, min_count=1)
C = build_cooc_matrix(sentences=sentences, vocab=vocab, window=5)
PPMI = compute_ppmi(C=C)
W_ppmi = svd_embeddings(PPMI=PPMI, dim=100, use_sqrt=True)

In [15]:
# Report the vocabulary size / embedding shape and the nearest neighbours of one probe word.
print(f"[PPMI+SVD] Vocab={len(vocab)}, Embedding shape={W_ppmi.shape}")
print(
    "[PPMI+SVD] most similar to 'great':",
    most_similar_from_matrix(W=W_ppmi, ivocab=ivocab, query="great", vocab=vocab, topn=5),
)

[PPMI+SVD] Vocab=45, Embedding shape=(45, 45)
[PPMI+SVD] most similar to 'great': [('acting', 0.34596382230724165), ('the', 0.30619822201217717), ('movie', 0.25211923185218205), ('and', 0.2197068643204385), ('is', 0.20462559347433978)]


# 2. Gensim: Word2Vec (CBOW/Skip-gram + Negative Sampling/Hierarchical Softmax)

In [17]:
# Train a small Word2Vec model. gensim is optional here: on any failure
# (including a missing install) the cell reports it and leaves w2v / wv as None
# so later cells can skip the Word2Vec branch.
try:
    from gensim.models import Word2Vec
    w2v = Word2Vec(
        sentences=sentences,
        vector_size=100,
        window=5,
        min_count=1,
        sg=1,  # 1 = skip-gram, 0 = CBOW
        negative=5,  # number of negative samples
        hs=0,  # hierarchical softmax off
        sample=1e-3,
        # A fixed seed only gives repeatable vectors with a single worker thread;
        # with several workers the thread scheduling reorders the updates.
        workers=1,
        epochs=10,
        seed=42
    )
    wv = w2v.wv  # KeyedVectors: contains the token -> vector mapping
    print("[Gensim] Vocab size =", len(wv))
    if "excellent" in wv:
        print("[Gensim] most similar to 'excellent':", wv.most_similar("excellent", topn=5))
except Exception as e:
    print("[Gensim] Not installed or error:", repr(e))
    w2v, wv = None, None

[Gensim] Vocab size = 45
[Gensim] most similar to 'excellent': [('predictable', 0.14087845385074615), ('wonderful', 0.11117531359195709), ('a', 0.1084885522723198), ('messy', 0.08125098049640656), ('script', 0.08116475492715836)]


# 3. DocEmb = TF-IDF × Embedding + Logistic Regression

In [22]:
def doc_emb_by_tfidf_times_embedding(texts, labels, embed_tokens, embed_matrix, topk_sim_demo=True):
    """
    Build document embeddings as (TF-IDF weights) x (word embeddings) and compare a
    logistic regression trained on them with one trained on the raw TF-IDF features.

    texts: List[str], the raw documents
    labels: array-like with one class label per document
    embed_tokens: List[str], strictly aligned with the rows of embed_matrix
    embed_matrix: np.ndarray, shape [V_e, d]
    topk_sim_demo: currently unused; kept so existing callers keep working

    Prints one summary line and returns None. Both accuracies are computed on the
    same documents the classifiers were fitted on, i.e. they are training
    accuracies, not held-out scores.

    scikit-learn is treated as optional: a missing install and any other failure
    are each reported with their own "[DocEmb] ..." message instead of raising.
    """
    # Import lazily so the rest of the notebook still runs without scikit-learn.
    try:
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.linear_model import LogisticRegression
        from sklearn.metrics import accuracy_score
    except ImportError as e:
        print("[DocEmb] Need scikit-learn installed:", repr(e))
        return

    try:
        # 1) TF-IDF features
        tfv = TfidfVectorizer(lowercase=True, token_pattern=r"[A-Za-z]+", stop_words="english", ngram_range=(1,1), sublinear_tf=True)
        X = tfv.fit_transform(texts)  # [N, V_tfv]
        vocab_tfv = tfv.vocabulary_   # str -> column index

        # 2) keep only the tokens known to both the TF-IDF vocab and the embedding vocab
        embed_index = {w: i for i, w in enumerate(embed_tokens)}
        common = sorted(set(vocab_tfv) & set(embed_index))
        if not common:
            print("[DocEmb] No common tokens between TF-IDF vocab and embedding vocab.")
            return

        cols = np.array([vocab_tfv[w] for w in common], dtype=int)
        rows = np.array([embed_index[w] for w in common], dtype=int)
        X_sub = X[:, cols]                         # [N, |C|]
        W_sub = np.asarray(embed_matrix)[rows, :]  # [|C|, d]

        # 3) DocEmb = TF-IDF x Embedding; asarray normalises a possible np.matrix result
        DocEmb = np.asarray(X_sub @ W_sub)  # [N, d]

        # 4) fit and score on the same data
        # TF-IDF x Embedding
        clf_lr = LogisticRegression(max_iter=1000)
        clf_lr.fit(DocEmb, labels)
        acc_lr = accuracy_score(labels, clf_lr.predict(DocEmb))

        # TF-IDF only
        clf_lr2 = LogisticRegression(max_iter=1000)
        clf_lr2.fit(X, labels)
        acc_lr2 = accuracy_score(labels, clf_lr2.predict(X))

        print(f"[DocEmb] LogisticRegression on DocEmb(TF-IDF×Embedding) acc={acc_lr:.3f}  vs  pure TF-IDF acc={acc_lr2:.3f}")

    except Exception as e:
        # Stay best-effort, but do not blame a missing install for a runtime failure.
        print("[DocEmb] Failed:", repr(e))

In [23]:
# —— PPMI+SVD -> DocEmb ——
# ivocab maps row index -> token, so walking 0..V-1 lists tokens in the row order of W_ppmi.
embed_tokens_ppmi = [ivocab[i] for i in range(len(ivocab))]
embed_matrix_ppmi = W_ppmi
doc_emb_by_tfidf_times_embedding(corpus, labels, embed_tokens_ppmi, embed_matrix_ppmi)

# —— Gensim Word2Vec -> DocEmb ——
if wv is not None:
    # index_to_key[i] is the token stored in row i of wv.vectors, so this list is
    # row-aligned by construction rather than by dict insertion order.
    embed_tokens_w2v = list(wv.index_to_key)
    embed_matrix_w2v = wv.vectors
    doc_emb_by_tfidf_times_embedding(corpus, labels, embed_tokens_w2v, embed_matrix_w2v)

[DocEmb] LogisticRegression on DocEmb(TF-IDF×Embedding) acc=1.000  vs  pure TF-IDF acc=1.000
[DocEmb] LogisticRegression on DocEmb(TF-IDF×Embedding) acc=1.000  vs  pure TF-IDF acc=1.000
