In [25]:
"""Compare token/document vectors for classification.
Group members: Michael (Zeyu) Li (zl310) and Chengyang Zhou (cz169)"""
import random
from typing import List, Mapping, Optional, Sequence, Tuple
# import matplotlib.pyplot as plt

import nltk
nltk.download('punkt')
nltk.download('gutenberg')
import numpy as np
from numpy.typing import NDArray
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import TruncatedSVD

FloatArray = NDArray[np.float64]

# Un-comment this to fix the random seed
random.seed(31)

austen = nltk.corpus.gutenberg.sents("austen-sense.txt")
carroll = nltk.corpus.gutenberg.sents("carroll-alice.txt")

vocabulary = sorted(
    set(token for sentence in austen + carroll for token in sentence)
) + [None]
vocabulary_map = {token: idx for idx, token in enumerate(vocabulary)}


def onehot(
    vocab_map: Mapping[Optional[str], int], token: Optional[str]
) -> FloatArray:
    """Generate the one-hot encoding for the provided token in the provided vocabulary."""
    embedding = np.zeros((len(vocab_map),))
    idx = vocab_map.get(token, len(vocab_map) - 1)
    embedding[idx] = 1
    return embedding


def sum_token_embeddings(
    token_embeddings: Sequence[FloatArray],
) -> FloatArray:
    """Sum the token embeddings."""
    assert len(token_embeddings) > 0, "len(input) == 0 in sum_token_embeddings"
    total: FloatArray = np.array(token_embeddings).sum(axis=0)
    return total


def split_train_test(
    X: FloatArray, y: FloatArray, test_percent: float = 10
) -> Tuple[FloatArray, FloatArray, FloatArray, FloatArray]:
    """Split data into training and testing sets."""
    N = len(y)
    data_idx = list(range(N))
    random.shuffle(data_idx)
    break_idx = round(test_percent / 100 * N)
    training_idx = data_idx[break_idx:]
    testing_idx = data_idx[:break_idx]
    X_train = X[training_idx, :]
    y_train = y[training_idx]
    X_test = X[testing_idx, :]
    y_test = y[testing_idx]
    return X_train, y_train, X_test, y_test


def generate_data_token_counts(
    h0_documents: List[List[str]], h1_documents: List[List[str]]
) -> Tuple[FloatArray, FloatArray, FloatArray, FloatArray]:
    """Generate training and testing data with raw token counts."""
    X: FloatArray = np.array(
        [
            sum_token_embeddings(
                [onehot(vocabulary_map, token) for token in sentence]
            )
            for sentence in h0_documents
        ]
        + [
            sum_token_embeddings(
                [onehot(vocabulary_map, token) for token in sentence]
            )
            for sentence in h1_documents
        ]
    )
    y: FloatArray = np.array(
        [0 for sentence in h0_documents] + [1 for sentence in h1_documents]
    )
    # print(X.shape, y.shape, len(h1_documents) + len(h0_documents))
    return split_train_test(X, y)


def generate_data_tfidf(
    h0_documents: List[List[str]], h1_documents: List[List[str]]
) -> Tuple[FloatArray, FloatArray, FloatArray, FloatArray]:
    """Generate training and testing data with TF-IDF scaling."""
    X_train, y_train, X_test, y_test = generate_data_token_counts(
        h0_documents, h1_documents
    )
    tfidf = TfidfTransformer(norm=None).fit(X_train)
    X_train = tfidf.transform(X_train)
    X_test = tfidf.transform(X_test)
    return X_train, y_train, X_test, y_test


def apply_svd(xtrain, xtest, dimens=500):
    t_svd = TruncatedSVD(dimens)  # plot the singular values - how much info do we lose?
    t_svd.fit(xtrain)
    xtrain = t_svd.transform(xtrain)
    xtest = t_svd.transform(xtest)
    return xtrain, xtest


[nltk_data] Downloading package punkt to /home/zl310/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to /home/zl310/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [22]:
X_train, y_train, X_test, y_test = generate_data_tfidf(austen, carroll)

In [26]:
# print(X_train.shape)
X_train_down, X_test_down = apply_svd(X_train, X_test, dimens=450)

clf = GaussianNB().fit(X_train, y_train)
print("tfidf (train, GaussianNB):", clf.score(X_train_down, y_train))
print("tfidf (test, GaussianNB):", clf.score(X_test_down, y_test))

TypeError: fit() got an unexpected keyword argument 'dimens'