In [1]:
from sklearn.datasets import fetch_openml
from sklearn.utils import Bunch
import random

# Choose a dataset. Only AG News is wired up so far: later cells read its
# `Title` and `Description` columns. Register IMDB / SST-2 in
# SUPPORTED_DATASETS once their OpenML names and column mappings are known.
dataset_name = "AG_News"

# Maps each accepted `dataset_name` to the keyword arguments for `fetch_openml`.
SUPPORTED_DATASETS = {
    "AG_News": {"name": "AG_News", "version": 1},
}

# Validate the name explicitly. A bare truthiness check would accept any
# non-empty string and silently load AG News regardless of what was requested.
if dataset_name in SUPPORTED_DATASETS:
    dataset = fetch_openml(**SUPPORTED_DATASETS[dataset_name])
else:
    raise ValueError(
        f"Unsupported dataset {dataset_name!r}. "
        f"Choose one of: {', '.join(sorted(SUPPORTED_DATASETS))}."
    )

# Fraction of rows (taken from the start of the table) treated as training data.
train_split = 0.8

# Create a small dev subset for quick iteration
def create_dev_subset(dataset: 'Bunch', subset_size=100, seed=None, split=None):
    """Sample a small random subset of the training portion of ``dataset``.

    The training portion is the first ``split`` fraction of ``dataset.data``;
    rows are taken in their stored order, not shuffled.

    Args:
        dataset: Object whose ``data`` attribute is a pandas DataFrame
            (positional ``.iloc`` indexing is applied to it).
        subset_size: Number of rows to sample. Capped at the size of the
            training portion so that small datasets do not raise ``ValueError``.
        seed: Optional seed for a private ``random.Random`` instance. When
            ``None`` the module-level ``random`` state is used.
        split: Fraction of rows treated as training data. When ``None`` the
            module-level ``train_split`` value is used.

    Returns:
        A DataFrame copy holding the sampled rows, safe to modify without
        pandas chained-assignment warnings.
    """
    rows_df = dataset.data
    fraction = train_split if split is None else split

    train_df = rows_df.iloc[:int(len(rows_df) * fraction)]
    # random.sample raises if asked for more items than the population holds.
    sample_size = min(subset_size, len(train_df))
    rng = random if seed is None else random.Random(seed)
    indices = rng.sample(range(len(train_df)), sample_size)
    # Explicit copy: callers add and overwrite columns on the result.
    return train_df.iloc[indices].copy()

# Seed the global RNG before sampling so the dev subset is identical on every
# Restart & Run All (create_dev_subset draws its indices via `random`).
SEED = 42
random.seed(SEED)
dev_subset = create_dev_subset(dataset)

In [2]:
import re

def normalize_text(text):
    """Return a lowercased, punctuation-free, whitespace-collapsed copy of ``text``.

    Steps, in order: lowercase; delete every character that is neither a word
    character nor whitespace; squeeze whitespace runs to one space and trim
    the ends; round-trip through UTF-8, dropping anything that cannot be encoded.
    """
    cleaned = text.lower()
    # (pattern, replacement) pairs applied in sequence:
    # first strip punctuation, then collapse whitespace runs.
    for pattern, replacement in ((r'[^\w\s]', ''), (r'\s+', ' ')):
        cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = cleaned.strip()
    # Basic unicode cleanup: characters that fail to encode are ignored.
    return cleaned.encode('utf-8', 'ignore').decode('utf-8')

# Normalise both text columns. `assign` returns a new frame rather than writing
# into `dev_subset` through attribute access. This avoids pandas'
# SettingWithCopyWarning on a frame derived from a slice, and it fails loudly
# if a column is missing instead of silently setting a plain attribute.
dev_subset = dev_subset.assign(
    Description=dev_subset["Description"].apply(normalize_text),
    Title=dev_subset["Title"].apply(normalize_text),
)

In [8]:
import nltk

# Download NLTK tokenizer resources
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Smoke-test the tokenizer on a sample sentence.
tokens = nltk.word_tokenize("This is a sample sentence for tokenization.")

[nltk_data] Downloading package punkt to /home/manish/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/manish/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [5]:
import spacy

# Initialize spaCy model. The `nlp` pipeline object is reused by later cells
# (lemmatization in `process_tokens`).
# NOTE(review): assumes the `en_core_web_sm` package is already installed in
# this environment — confirm in the setup instructions.
nlp = spacy.load("en_core_web_sm")

In [14]:
# Tokenize: compare whitespace split, NLTK tokenizers, and spaCy tokenization

def tokenize_text(text, method="nltk"):
    """Split ``text`` into a list of token strings.

    Args:
        text: The string to tokenize.
        method: Which tokenizer to use:
            ``"nltk"`` (default) uses ``nltk.word_tokenize``;
            ``"whitespace"`` uses plain ``str.split()``;
            ``"spacy"`` uses the tokenizer of the module-level ``nlp`` pipeline.

    Returns:
        list[str]: The tokens in order of appearance.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method == "nltk":
        return nltk.word_tokenize(text)
    if method == "whitespace":
        return text.split()
    if method == "spacy":
        # Run only the tokenizer; the rest of the pipeline is not needed here.
        return [token.text for token in nlp.tokenizer(text)]
    raise ValueError(
        f"Unknown tokenization method {method!r}. "
        "Choose 'nltk', 'whitespace', or 'spacy'."
    )

# Add one token-list column (`<column>_Tokens`) per text column.
for text_column in ("Title", "Description"):
    dev_subset[f"{text_column}_Tokens"] = dev_subset[text_column].apply(tokenize_text)


In [None]:
from nltk.stem import PorterStemmer
from spacy.lang.en.stop_words import STOP_WORDS

# Stemming (Porter) vs. lemmatization (spaCy), plus stopword removal.

# Initialize the Porter stemmer once at module level; `process_tokens` reuses
# this single instance for every row.
stemmer = PorterStemmer()

# Apply stemming and lemmatization, and remove stopwords
def process_tokens(tokens):
    """Remove stopwords, then return stemmed and lemmatized versions of the rest.

    Args:
        tokens: Sequence of token strings (e.g. one row of a ``*_Tokens`` column).

    Returns:
        tuple[list[str], list[str]]: ``(stemmed_tokens, lemmatized_tokens)``.
        Both lists have one entry per non-stopword input token, in the same
        order, so they can be compared position by position.
    """
    # Local import: only this function needs Doc, and spaCy is already a
    # dependency of this notebook.
    from spacy.tokens import Doc

    # Remove stopwords (case-insensitive match against spaCy's English list).
    kept_tokens = [token for token in tokens if token.lower() not in STOP_WORDS]

    # Apply stemming
    stemmed_tokens = [stemmer.stem(token) for token in kept_tokens]

    # Apply lemmatization using spaCy. Build the Doc from the existing tokens
    # rather than re-joining them into a string. Re-tokenizing the joined text
    # lets spaCy split tokens differently (e.g. "dont" -> "do" + "nt"), which
    # puts the lemmas out of step with the stems and can reintroduce stopwords.
    doc = Doc(nlp.vocab, words=kept_tokens)
    for _name, component in nlp.pipeline:
        doc = component(doc)
    lemmatized_tokens = [token.lemma_ for token in doc]

    return stemmed_tokens, lemmatized_tokens

# Store the (stemmed, lemmatized) result of `process_tokens` for each token column.
for source_column, target_column in (
    ("Title_Tokens", "Title_Processed"),
    ("Description_Tokens", "Description_Processed"),
):
    dev_subset[target_column] = dev_subset[source_column].apply(process_tokens)

# Feature Extraction

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#    - Build `CountVectorizer` and `TfidfVectorizer` features (unigram and unigram+bigram).
#    - Use `max_features` to cap the vocabulary size for faster PyTorch models (e.g., 10k–50k).
#    - TF-IDF output is typically a sparse matrix (CSR). For small dev experiments you can convert it to dense with `.toarray()`; for larger data, keep it sparse and sample mini-batches.

# Cap vocabulary for faster experiments. `build_vectorizers` reads this
# module-level value when constructing each vectorizer.
max_features = 10_000

def build_vectorizers(text_series, prefix):
    """Fit count and TF-IDF vectorizers (unigram and unigram+bigram) on ``text_series``.

    Args:
        text_series: Collection of documents passed to ``fit_transform``; must
            support ``len()``.
        prefix: String prepended to every result key.

    Returns:
        dict: Keys look like ``"<prefix>_<kind>_<min_n>_<max_n>"``. Each value
        is ``{"vectorizer": fitted_vectorizer, "matrix": features}``, where
        ``features`` is dense for inputs of at most 1000 documents and the raw
        ``fit_transform`` output otherwise.
    """
    vectorizer_classes = {"count": CountVectorizer, "tfidf": TfidfVectorizer}
    results = {}
    for kind, vectorizer_cls in vectorizer_classes.items():
        for min_n, max_n in ((1, 1), (1, 2)):
            # Vocabulary size is capped by the module-level `max_features`.
            vectorizer = vectorizer_cls(ngram_range=(min_n, max_n), max_features=max_features)
            features = vectorizer.fit_transform(text_series)
            # For small dev experiments convert to dense for easier inspection
            results[f"{prefix}_{kind}_{min_n}_{max_n}"] = {
                "vectorizer": vectorizer,
                "matrix": features.toarray() if len(text_series) <= 1000 else features,
            }
    return results

# Build vectorizers for Title and Description
title_features = build_vectorizers(dev_subset["Title"], "title")
description_features = build_vectorizers(dev_subset["Description"], "description")

# Quick summary of resulting matrices / vocab sizes
for feature_group in (title_features, description_features):
    for name, entry in feature_group.items():
        matrix_shape = entry["matrix"].shape
        vocab_size = len(entry["vectorizer"].vocabulary_)
        print(f"{name}: matrix shape = {matrix_shape}, vocab_size = {vocab_size}")