# 1. TF-IDF + Logistic Regression

In [1]:
from typing import List, Tuple                  # type hints for clarity when reading function signatures
import re                                       # regular expressions for custom tokenization
import numpy as np                               # numerical arrays and vectorized operations

# scikit-learn: feature extraction and modeling pieces
from sklearn.feature_extraction.text import TfidfVectorizer  # builds sparse TF-IDF features [N, V]
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS  # built-in English stopword set
from sklearn.linear_model import LogisticRegression           # linear classifier baseline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  # evaluation metrics
from sklearn.model_selection import train_test_split          # stratified train/valid split

In [2]:
def tokenize(text: str) -> List[str]:
    """
    Split one string into lowercase a-z tokens and discard English stopwords.

    Inputs
    ------
    text : str
        Raw input string.

    Outputs
    -------
    tokens : List[str]
        Maximal runs of the letters a-z found in the lowercased text, in order of
        appearance, minus every token that is a member of ENGLISH_STOP_WORDS.

    Purpose
    -------
    Tokenizer callable for TF-IDF so that normalization and stopword filtering
    are controlled in one place.
    """
    lowered = text.lower()
    # Digits, punctuation and non-ASCII letters act as separators for this pattern.
    words = (match.group(0) for match in re.finditer(r"[a-z]+", lowered))
    return [word for word in words if word not in ENGLISH_STOP_WORDS]

In [8]:
def build_tfidf(texts: List[str]) -> Tuple[TfidfVectorizer, "scipy.sparse.csr_matrix"]:
    """
    Fit a TF-IDF (Term Frequency–Inverse Document Frequency) vectorizer on `texts`
    and return it together with the transformed matrix.

    Inputs
    ------
    texts : List[str]
        The N documents to fit on.

    Outputs
    -------
    vectorizer : TfidfVectorizer
        The fitted vectorizer (vocabulary and IDF weights learned from `texts`).
    X : scipy.sparse.csr_matrix
        Sparse TF-IDF matrix for `texts`, shape [N, V].

    Purpose
    -------
    Single place where the vectorizer configuration lives, wired to the custom
    tokenize() function of this section.
    """
    settings = dict(
        tokenizer=tokenize,     # custom tokenizer (lowercasing + stopword removal)
        preprocessor=None,      # no custom preprocessor callable
        token_pattern=None,     # the regex pattern is unused when a tokenizer is given
        lowercase=True,         # tokenize() lowercases as well, so this is redundant
        ngram_range=(1, 1),     # unigram features only
        sublinear_tf=True,      # log-scaled term frequencies
    )
    vectorizer = TfidfVectorizer(**settings)

    # fit_transform() fits the vectorizer in place and returns the matrix.
    return vectorizer, vectorizer.fit_transform(texts)

In [11]:
def train_logreg(X, y) -> LogisticRegression:
    """
    Fit a Logistic Regression classifier and return it.

    Inputs
    ------
    X : scipy.sparse.csr_matrix
        Feature matrix with shape [N, V].
    y : np.ndarray
        Labels with shape [N].

    Outputs
    -------
    model : LogisticRegression
        The fitted scikit-learn estimator.

    Purpose
    -------
    Linear baseline classifier on top of the sparse TF-IDF features.
    """
    # max_iter is raised to 2000 to give the solver room to converge.
    classifier = LogisticRegression(max_iter=2000)
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return classifier.fit(X, y)

In [13]:
def evaluate(model, X, y) -> Tuple[float, str, np.ndarray]:
    """
    Evaluate classifier: accuracy, per-class report, and confusion matrix.

    Inputs
    ------
    model : LogisticRegression
        Trained classifier. Only `predict` is required; if the model exposes
        `classes_`, it fixes the row/column order of the confusion matrix.
    X : scipy.sparse.csr_matrix
        Feature matrix with shape [N, V].
    y : np.ndarray
        True labels with shape [N].

    Outputs
    -------
    acc : float
        Accuracy in [0, 1].
    report : str
        Text classification report (precision/recall/F1 per class).
    cm : np.ndarray
        Confusion matrix with shape [C, C], rows = true class, columns =
        predicted class, ordered like `model.classes_` when available.

    Purpose
    -------
    Standard validation routine for classification.
    """
    y_pred = model.predict(X)
    acc = accuracy_score(y, y_pred)
    report = classification_report(y, y_pred)

    # Without explicit labels, confusion_matrix() only covers classes that occur
    # in y or y_pred, so a small validation split can silently yield a matrix
    # smaller than [C, C]. Pinning the labels to the classes the model was
    # trained on keeps the documented shape. labels=None (model without
    # `classes_`) falls back to the inferred labels.
    class_labels = getattr(model, "classes_", None)
    cm = confusion_matrix(y, y_pred, labels=class_labels)
    return acc, report, cm

In [18]:
def predict_texts(model, vectorizer, texts) -> Tuple[np.ndarray, np.ndarray]:
    """
    Predict class IDs and probabilities for raw texts.

    Inputs
    ------
    model : LogisticRegression
        Trained classifier exposing `predict_proba` (and normally `classes_`).
    vectorizer : TfidfVectorizer
        Fitted vectorizer to transform raw inputs the same way as training.
    texts : List[str]
        M new documents.

    Outputs
    -------
    pred : np.ndarray
        Predicted class IDs, shape [M]. Values are taken from `model.classes_`,
        i.e. they are the actual labels the model was trained with.
    proba : np.ndarray
        Predicted probabilities, shape [M, C]; column j belongs to
        `model.classes_[j]`.

    Purpose
    -------
    One-stop prediction helper for new raw texts.
    """
    X_new = vectorizer.transform(texts)
    proba = model.predict_proba(X_new)

    # argmax yields a COLUMN index into proba, not a label. The two coincide
    # only when the training labels are exactly 0..C-1, so map the index
    # through classes_ to return real class IDs for any label set.
    best_col = np.argmax(proba, axis=1)
    classes = getattr(model, "classes_", None)
    if classes is None:
        # Model without classes_: keep the previous behavior (column indices).
        pred = best_col
    else:
        pred = np.asarray(classes)[best_col]
    return pred, proba

In [16]:
# Toy sentiment corpus: five positive reviews followed by five negative ones.
positive_docs = [
    "This movie is great and excellent",
    "Fantastic film with wonderful direction",
    "Good plot and amazing soundtrack",
    "Touching story with strong performances",
    "Brilliant engaging narrative overall",
]
negative_docs = [
    "This movie is bad and the pacing is awful",
    "The film is boring with dull characters",
    "Terrible editing and horrible dialogue",
    "A predictable script with poor scenes",
    "Unwatchable messy scenes and weak plot",
]
corpus = positive_docs + negative_docs
labels = np.array(
    [1] * len(positive_docs) + [0] * len(negative_docs),
    dtype=np.int64,
)  # 1=positive, 0=negative

# Stratified 70/30 split with a fixed random_state for a repeatable partition.
X_train_txt, X_valid_txt, y_train, y_valid = train_test_split(
    corpus,
    labels,
    test_size=0.3,
    random_state=42,
    stratify=labels,
)

# Fit the vectorizer on the training texts only; the validation texts are
# merely transformed with the already-fitted vocabulary.
tfv, X_tr = build_tfidf(X_train_txt)  # shape [N_train, V]
X_va = tfv.transform(X_valid_txt)     # shape [N_valid, V]

clf = train_logreg(X_tr, y_train)
acc, rep, cm = evaluate(clf, X_va, y_valid)

print(f"[Practice1] Valid accuracy = {acc:.3f}")
print("[Practice1] Classification report:\n", rep)
print("[Practice1] Confusion matrix:\n", cm)

[Practice1] Valid accuracy = 0.333
[Practice1] Classification report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.33      1.00      0.50         1

    accuracy                           0.33         3
   macro avg       0.17      0.50      0.25         3
weighted avg       0.11      0.33      0.17         3

[Practice1] Confusion matrix:
 [[0 2]
 [0 1]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [19]:
# Two unseen sentences: the first worded positively, the second negatively.
test_texts = [
    "this film is wonderful and touching",
    "awful boring movie with dull characters",
]

# Reuse the classifier and vectorizer fitted in the training cell.
pred, proba = predict_texts(model=clf, vectorizer=tfv, texts=test_texts)

print("[Practice1] Predictions:", pred)
print("[Practice1] Probabilities:\n", np.round(proba, decimals=3))

[Practice1] Predictions: [1 1]
[Practice1] Probabilities:
 [[0.378 0.622]
 [0.492 0.508]]


# 2. Fusion: DocEmb = TF-IDF × Word2Vec (Gensim)

In [20]:
from typing import List, Tuple                       # type hints
import re                                            # regular expressions for tokenization
import numpy as np                                   # numerical arrays and linear algebra

from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF features (sparse)
from sklearn.metrics import accuracy_score                    # basic evaluation metric
from sklearn.linear_model import LogisticRegression           # linear classifier baseline

try:
    from gensim.models import Word2Vec             # Gensim's Word2Vec trainer (CBOW/Skip-gram with Negative Sampling)
except Exception as e:
    # Keep this section importable without gensim: Word2Vec = None is the
    # sentinel that train_w2v() checks before training. The catch stays broad
    # because the import may fail for reasons other than a missing package,
    # which is why the actual error is reported instead of being discarded.
    Word2Vec = None
    print("[Practice2] gensim not available; please `pip install gensim` to run this section.")
    print(f"[Practice2] import failed with {type(e).__name__}: {e}")


In [21]:
def tokenize(text: str) -> List[str]:
    """
    Split one string into lowercase a-z tokens (no stopword filtering).

    Inputs
    ------
    text : str
        Raw input string.

    Outputs
    -------
    tokens : List[str]
        Maximal runs of the letters a-z found in the lowercased text, in order
        of appearance. Empty list when nothing matches.

    Purpose
    -------
    One shared tokenizer so TF-IDF and Word2Vec training see the same tokens.
    """
    lowered = text.lower()
    # Anything outside a-z (digits, punctuation, accented letters) splits tokens.
    return [match.group(0) for match in re.finditer(r"[a-z]+", lowered)]

In [22]:
def train_w2v(sentences: List[List[str]],
              vector_size: int = 100,
              window: int = 5,
              min_count: int = 1,
              sg: int = 1,
              negative: int = 5,
              epochs: int = 20,
              workers: int = 1,
              seed: int = 42):
    """
    Train a Word2Vec (word-to-vector) model with Gensim.

    Inputs
    ------
    sentences : List[List[str]]
        Tokenized corpus (outer length = N sentences; inner lists have variable lengths).
    vector_size : int
        Embedding (vector) dimension d.
    window : int
        Context window size for co-occurrence.
    min_count : int
        Low-frequency cutoff; words with freq < min_count are dropped.
    sg : int
        1 = Skip-gram (center predicts context), 0 = CBOW (context predicts center).
    negative : int
        Number of negative samples per positive pair (Negative Sampling).
    epochs : int
        Training epochs over the corpus.
    workers : int
        Number of training threads. Defaults to 1 because a fixed `seed` only
        gives repeatable vectors with a single worker; raise it to trade
        reproducibility for speed on larger corpora.
    seed : int
        Seed passed to Gensim's random number generator.

    Outputs
    -------
    model : gensim.models.Word2Vec
        Trained Word2Vec model (vectors accessible by model.wv).

    Raises
    ------
    RuntimeError
        If gensim could not be imported (Word2Vec is None).

    Purpose
    -------
    Learn dense word embeddings that capture distributional semantics.
    """

    if Word2Vec is None:
        raise RuntimeError("gensim is not installed.")

    model = Word2Vec(
        sentences=sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=sg,
        negative=negative,
        hs=0,             # use negative sampling (hierarchical softmax off)
        sample=1e-3,      # downsampling threshold for very frequent words
        workers=workers,  # >1 introduces thread-scheduling jitter despite the seed
        epochs=epochs,
        seed=seed,
    )
    return model

In [23]:
def docemb_from_tfidf_w2v(tfv: TfidfVectorizer, X: "scipy.sparse.csr_matrix", wv_tokens: List[str], wv_matrix: np.ndarray) -> np.ndarray:
    """
    Project TF-IDF rows into the word-embedding space: DocEmb = TF-IDF × Embedding.

    Inputs
    ------
    tfv : TfidfVectorizer
        Fitted vectorizer; only its `vocabulary_` (token -> column) is read.
    X : scipy.sparse.csr_matrix
        TF-IDF matrix with shape [N, V_tfv], columns indexed by `tfv.vocabulary_`.
    wv_tokens : List[str]
        Word2Vec tokens; position i corresponds to row i of `wv_matrix`.
    wv_matrix : np.ndarray
        Embedding matrix with shape [V_w2v, d].

    Outputs
    -------
    DocEmb : np.ndarray
        Dense document embeddings with shape [N, d]. Each row is the
        TF-IDF-weighted sum of the vectors of the shared tokens, divided by its
        L2 norm (plus 1e-9). A document with no shared tokens stays all zeros.

    Raises
    ------
    ValueError
        If the two vocabularies share no token.

    Purpose
    -------
    Restrict both vocabularies to their intersection and combine them with a
    single matrix product.
    """
    column_of = tfv.vocabulary_                                    # token -> TF-IDF column
    row_of = {token: idx for idx, token in enumerate(wv_tokens)}   # token -> embedding row

    # Sorted intersection gives one deterministic ordering used for both sides.
    shared = sorted(token for token in column_of if token in row_of)
    if len(shared) == 0:
        raise ValueError("No overlapping tokens between TF-IDF vocab and Word2Vec vocab")

    col_idx = np.array([column_of[token] for token in shared], dtype=int)
    row_idx = np.array([row_of[token] for token in shared], dtype=int)

    # [N, C] @ [C, d] -> [N, d], where C is the number of shared tokens.
    weighted_sum = np.asarray(X[:, col_idx] @ wv_matrix[row_idx, :])

    # The small epsilon keeps all-zero rows from dividing by zero.
    row_norms = np.linalg.norm(weighted_sum, axis=1, keepdims=True) + 1e-9
    return weighted_sum / row_norms

In [24]:
# Same toy sentiment corpus as in section 1: five positive, then five negative reviews.
positive_docs = [
    "This movie is great and excellent",
    "Fantastic film with wonderful direction",
    "Good plot and amazing soundtrack",
    "Touching story with strong performances",
    "Brilliant engaging narrative overall",
]
negative_docs = [
    "This movie is bad and the pacing is awful",
    "The film is boring with dull characters",
    "Terrible editing and horrible dialogue",
    "A predictable script with poor scenes",
    "Unwatchable messy scenes and weak plot",
]
corpus = positive_docs + negative_docs
labels = np.array(
    [1] * len(positive_docs) + [0] * len(negative_docs),
    dtype=np.int64,
)

# Word2Vec is trained on the tokenized corpus itself.
sentences = [tokenize(doc) for doc in corpus]
w2v = train_w2v(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    sg=1,
    negative=5,
    epochs=20,
)

# Pull out the token list and the embedding matrix for the fusion step below.
wv = w2v.wv
wv_tokens = list(wv.key_to_index)   # tokens in key_to_index iteration order
wv_matrix = wv.vectors              # embedding matrix, one row per token

In [26]:
# TF-IDF over the full corpus, using the same tokenizer that fed Word2Vec so
# both vocabularies are built from identical tokens.
tfv = TfidfVectorizer(
    tokenizer=tokenize,
    preprocessor=None,
    token_pattern=None,
    lowercase=True,
    ngram_range=(1, 1),
    sublinear_tf=True,
)
X = tfv.fit_transform(corpus)

# Dense document embeddings, one row per corpus document.
DocEmb = docemb_from_tfidf_w2v(tfv=tfv, X=X, wv_tokens=wv_tokens, wv_matrix=wv_matrix)

In [27]:
# Fit on the dense document embeddings; fit() returns the estimator itself.
clf = LogisticRegression(max_iter=2000).fit(DocEmb, labels)

# NOTE: predictions are made on the same rows the model was fitted on, so the
# number printed below is training accuracy, not a held-out estimate.
pred = clf.predict(DocEmb)
acc = accuracy_score(y_true=labels, y_pred=pred)
print(f"[Practice2] Accuracy on DocEmb (TF-IDF × Word2Vec) = {acc:.3f}")

[Practice2] Accuracy on DocEmb (TF-IDF × Word2Vec) = 1.000


In [28]:
# Two unseen sentences: the first worded positively, the second negatively.
test_texts = [
    "this film is wonderful and touching",
    "awful boring movie with dull characters",
]

# Same pipeline as for the corpus: TF-IDF with the fitted vectorizer, then
# projection into the embedding space, then the fitted classifier.
X_test = tfv.transform(test_texts)  # [M, V_tfv]
DocEmb_test = docemb_from_tfidf_w2v(
    tfv=tfv,
    X=X_test,
    wv_tokens=wv_tokens,
    wv_matrix=wv_matrix,
)  # [M, d]
pred_test = clf.predict(DocEmb_test)
print("[Practice2] Predictions:", pred_test)

[Practice2] Predictions: [1 0]
