# 0. toy dataset

In [2]:
import re
import math
import random
from collections import Counter, defaultdict
import numpy as np

In [3]:
texts = [
    "I absolutely love this movie, the acting is great and the story is touching.",
    "Terrible plot and boring characters, I regret watching this film.",
    "What a fantastic experience! Beautiful soundtrack and strong performances.",
    "This is the worst movie ever; bad editing and a predictable script.",
    "Wonderful direction and engaging narrative. I would recommend it to everyone.",
    "Awful pacing. The film drags on and I nearly fell asleep.",
    "Heartwarming and inspiring – a must watch.", 
    "Unwatchable. Poor dialogue and messy scenes."
]
labels = np.array([1, 0, 1, 0, 1, 0, 1, 0], dtype=np.int64)

In [5]:
rng = np.random.default_rng(42)
idx = np.arange(len(texts))
rng.shuffle(idx)
split = int(0.7 * len(texts))
train_idx, test_idx = idx[:split], idx[split:]
#texts_train = texts[train_idx]  TypeError: only integer scalar arrays can be converted to a scalar index
texts_train = [texts[i] for i in train_idx]
y_train = labels[train_idx]
texts_test = [texts[i] for i in test_idx]
y_test = labels[test_idx]

In [6]:
print(f"[Data] Train={len(texts_train)}, Test={len(texts_test)}")

[Data] Train=5, Test=3


# 1. Numpy: Tokenizer -> vocab -> BOW -> TF-IDF

In [21]:
def simple_tokenize(s, lower=True):
    """
    simple regex tokenizer (letters only)
    """
    if lower:
        s = s.lower()
    tokens = re.findall(r"[a-z]+", s)
    return tokens

In [10]:
def build_vocab(docs, min_df=1, max_vocab_size=None):
    """
    build vocabulary: {token -> index}
    """
    df_counter = Counter()
    for doc in docs: #doc is a sring, first tokenize it, and words in one sentence will only count once in df, will increase if in multiple docs
        toks = set(simple_tokenize(doc))
        for t in toks:
            df_counter[t] += 1
    items = [(tok, df) for tok, df in df_counter.items() if df >= min_df]
    items.sort(key=lambda x: (-x[1], x[0])) #df decreasing order, dictionary increase order (lexicographic)
    if max_vocab_size is not None:
        items = items[:max_vocab_size]
    vocab = {tok: i for i, (tok, _) in enumerate(items)}
    return vocab, df_counter

In [12]:
def bow_count_matrix(docs, vocab):
    """
    return shape: [n_docs, |V|]
    document-term counts: every token frequency in each docs for this vocab
    """
    n_docs = len(docs)
    V = len(vocab)
    X = np.zeros((n_docs, V), dtype=np.float32)
    for i, doc in enumerate(docs):
        toks = simple_tokenize(doc)
        cnt = Counter(toks)
        for tok, c in cnt.items():
            j = vocab.get(tok, None)
            if j is not None:
                X[i, j] = c
    return X

In [14]:
def tf_transform(counts):
    """
    intput counts is the X returned by bow_count_matrix(docs, vocab)
    TF = count / sum(counts_in_doc)
    """
    row_sums = counts.sum(axis=1, keepdims=True) + 1e12
    return counts / row_sums

In [16]:
def idf_vector(docs, vocab, smooth=True):
    """
    calculate IDF:
    if not smooth: idf = log(N / df)
    if smooth: idf = log((1 + N) / (1 + df)) + 1 sklearn style
    """
    N = len(docs)
    df = np.zeros(len(vocab), dtype=np.int32)
    inv_vocab = {j : t for t, j in vocab.items()}
    for j in range(len(vocab)):
        tok = inv_vocab[j]
        c = 0
        for doc in docs:
            if tok in set(simple_tokenize(doc)):
                c += 1
        df[j] = c
    if smooth:
        idf = np.log((1 + N) / (1 + df)) + 1.0
    else:
        idf = np.log(N / (df + 1e-12))
    return idf.astype(np.float32)

In [19]:
def tfidf_matrix(docs, vocab, smooth=True):
    counts = bow_count_matrix(docs, vocab) #shape: (N_docs, V)
    TF = tf_transform(counts) #shape: (N_docs, V)
    IDF = idf_vector(docs, vocab, smooth=smooth) #shape: (V,)
    X = TF * IDF[None, :] #shape: (N_docs, V), IDF[None,:] will make (V,) to (1, V)
    return X.astype(np.float32)

In [22]:
vocab, df_counter = build_vocab(texts_train, min_df=1, max_vocab_size=None)

In [23]:
X_train_bow = bow_count_matrix(texts_train, vocab)
X_test_bow = bow_count_matrix(texts_test, vocab)
X_train_tfidf = tfidf_matrix(texts_train, vocab, smooth=True)
X_test_tfidf = tfidf_matrix(texts_test, vocab, smooth=True)

In [25]:
print(f"[Scratch] Vocab size = {len(vocab)}")
print(f"[Scratch] BOW train shape = {X_train_bow.shape}, TF-IDF train shape = {X_train_tfidf.shape}")

[Scratch] Vocab size = 38
[Scratch] BOW train shape = (5, 38), TF-IDF train shape = (5, 38)


In [28]:
#show highest weighted TF-IDF words
mean_tfidf = X_train_tfidf.mean(axis=0)
topk = np.argsort(-mean_tfidf)[:10]
inv_vocab = {j: t for t, j in vocab.items()}
top_words = [(inv_vocab[j], float(mean_tfidf[j])) for j in topk]
print("[Scratch] Top words by mean TF-IDF:")
for w, s in top_words:
    print(f"  {w:15s}  {s:.4f}")

[Scratch] Top words by mean TF-IDF:
  and              0.0000
  a                0.0000
  performances     0.0000
  poor             0.0000
  predictable      0.0000
  recommend        0.0000
  scenes           0.0000
  script           0.0000
  soundtrack       0.0000
  strong           0.0000


In [29]:
mean_tfidf

array([1.0000000e-12, 8.4327911e-13, 4.1972247e-13, 4.1972247e-13,
       4.1972247e-13, 4.1972247e-13, 4.1972247e-13, 4.1972247e-13,
       4.1972247e-13, 4.1972247e-13, 4.1972247e-13, 4.1972247e-13,
       4.1972247e-13, 4.1972247e-13, 4.1972247e-13, 4.1972247e-13,
       4.1972247e-13, 4.1972247e-13, 4.1972247e-13, 4.1972247e-13,
       4.1972247e-13, 4.1972247e-13, 4.1972247e-13, 4.1972247e-13,
       4.1972247e-13, 4.1972247e-13, 4.1972247e-13, 4.1972247e-13,
       4.1972247e-13, 4.1972247e-13, 4.1972247e-13, 4.1972247e-13,
       4.1972247e-13, 4.1972247e-13, 4.1972247e-13, 4.1972247e-13,
       4.1972247e-13, 4.1972247e-13], dtype=float32)

# 2. Sklearn: Count & TF-IDF

In [30]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [34]:
cv = CountVectorizer(lowercase=True, token_pattern=r"[A-Za-z]+")
X_train_bow_sk = cv.fit_transform(texts_train)
X_test_bow_sk = cv.transform(texts_test)

In [35]:
tv = TfidfVectorizer(lowercase=True, token_pattern=r"[A-Za-z]+", smooth_idf=True)
X_train_tfidf_sk = tv.fit_transform(texts_train)
X_test_tfidf_sk = tv.transform(texts_test)

In [37]:
print(f"[sklearn] BOW train shape = {X_train_bow_sk.shape}, TF-IDF train shape = {X_train_tfidf_sk.shape}")

[sklearn] BOW train shape = (5, 38), TF-IDF train shape = (5, 38)


In [38]:
#simple BSL
clf = LogisticRegression(max_iter=200)
clf.fit(X_train_tfidf_sk, y_train)
y_pred = clf.predict(X_test_tfidf_sk)
print(f"[sklearn] LogReg on TF-IDF  Test Acc = {accuracy_score(y_test, y_pred):.3f}")

[sklearn] LogReg on TF-IDF  Test Acc = 0.333


# 3. PyTorch & TensorFlow on TF-IDF Features
using the from scratch version TF-IDF for comparison

## 3.1 PyTorch

In [39]:
import torch, torch.nn as nn, torch.optim as optim

In [40]:
Xtr = X_train_tfidf.astype(np.float32)
Xte = X_test_tfidf.astype(np.float32)

In [41]:
class TorchLogReg(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.linear = nn.Linear(in_dim, 2)

    def forward(self, x):
        return self.linear(x)

device = torch.device("mps")
model = TorchLogReg(in_dim=Xtr.shape[1]).to(device)
opt = optim.Adam(model.parameters(), lr=1e-2)
loss_fn = nn.CrossEntropyLoss()

Xtr_t = torch.tensor(Xtr, dtype=torch.float32, device=device)
ytr_t = torch.tensor(y_train, dtype=torch.long, device=device)
Xte_t = torch.tensor(Xte, dtype=torch.float32, device=device)
yte_t = torch.tensor(y_test, dtype=torch.long, device=device)


epochs = 60
for ep in range(1, epochs+1):
    model.train()
    opt.zero_grad()
    logits = model(Xtr_t)
    loss = loss_fn(logits, ytr_t)
    loss.backward()
    opt.step()

    if ep % 15 == 0 or ep == 1:
        model.eval()
        with torch.no_grad():
            te_logits = model(Xte_t)
            te_pred = te_logits.argmax(dim=1)
            te_acc = (te_pred == yte_t).float().mean().item()
        print(f"[PyTorch] ep={ep:3d}  train_loss={loss.item():.4f}  test_acc={te_acc:.3f}")


[PyTorch] ep=  1  train_loss=0.6849  test_acc=0.333
[PyTorch] ep= 15  train_loss=0.6734  test_acc=0.333
[PyTorch] ep= 30  train_loss=0.6734  test_acc=0.333
[PyTorch] ep= 45  train_loss=0.6730  test_acc=0.333
[PyTorch] ep= 60  train_loss=0.6730  test_acc=0.333


## 3.2 TensorFlow

In [43]:
import tensorflow as tf
tf.random.set_seed(42)

In [45]:
tf_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(Xtr.shape[1],)),
    tf.keras.layers.Dense(2)
])
tf_model.compile(optimizer=tf.keras.optimizers.Adam(1e-2), loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=["accuracy"])
hist = tf_model.fit(Xtr, y_train, validation_data=(Xte, y_test), batch_size=len(Xtr), epochs=60, verbose=0)
te_loss, te_acc = tf_model.evaluate(Xte, y_test, verbose=0)
print(f"[TensorFlow] Test Acc = {te_acc:.3f}")

2025-09-02 22:08:16.587706: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


[TensorFlow] Test Acc = 0.333
