# Load Data

In [1]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
# Newsgroups to keep; only posts from these categories are loaded.
categories = [
    "alt.atheism",
    "comp.graphics",
    "comp.sys.ibm.pc.hardware",
    "misc.forsale",
    "rec.autos",
    "sci.space",
    "talk.religion.misc",
]
print("Loading 20 newsgroups training data")
# return_X_y=True yields a (data, target) pair; the targets are discarded
# because only the raw document texts are used below.
raw_data, _ = fetch_20newsgroups(
    subset="train",
    categories=categories,
    return_X_y=True,
)
# Corpus size in megabytes (1e6 bytes), measured on the UTF-8 encoded text.
data_size_mb = sum(len(document.encode("utf-8")) for document in raw_data) / 1e6
print(f"{len(raw_data)} documents - {data_size_mb:.3f}MB")

Loading 20 newsgroups training data
3803 documents - 6.245MB


In [13]:
# Peek at the first five raw documents; each one is the full post text,
# including its header lines (Subject, From, ...) and quoted replies.
raw_data[:5]

['Subject: Re: Christian Daemons? [Biblical Demons, the u\nFrom: stigaard@mhd.moorhead.msus.edu\nReply-To: stigaard@mhd.moorhead.msus.edu\nOrganization: Moorhead State University, Moorhead, MN\nNntp-Posting-Host: 134.29.97.2\nLines: 23\n\n>>>667\n>>>the neighbor of the beast\n>>\n>>No, 667 is across the street from the beast.  664 and 668 are the\n>>neighbors of the beast.\n>\n>I think some people are still not clear on this:\n>667 is *not* the neighbor of the beast, but, rather, across the\n>street. It is, in fact, 668 which is the neighbor of the beast.\n\nno, sheesh, didn\'t you know 666 is the beast\'s apartment?  667 is across the\nhall from the beast, and is his neighbor along with the rest of the 6th floor.\n\n>Justin (still trying to figure out what this has to do with alt.discordia)\n\nThis doesn\'t seem discordant to you?\n\n-----------------------     ----------------------     -----------------------\n\t-Paul W. Stigaard, Lokean Discordian Libertarian\n  !XOA!\t\tinternet: 

# Define preprocessing functions

In [15]:
import re

In [16]:
def tokenize(doc):
    """Split doc into a list of lowercase word tokens.

    A token is any maximal run of regex word characters, so punctuation and
    whitespace act as separators (an apostrophe splits "isn't" into "isn"
    and "t"). This is deliberately simplistic; CountVectorizer or
    TfidfVectorizer offer a more principled approach.
    """
    words = re.findall(r"\w+", doc)
    return list(map(str.lower, words))

# tokenize() already returns a list, so the result can be displayed directly.
tokenize("This is a simple example, isn't it?")

['this', 'is', 'a', 'simple', 'example', 'isn', 't', 'it']

In [17]:
from collections import defaultdict

In [19]:
def token_freqs(doc):
    """Count how many times each token occurs in doc.

    The text is split with tokenize(); the result is a defaultdict(int)
    keyed by token, with keys in order of first appearance.
    """
    counts = defaultdict(int)
    for token in tokenize(doc):
        counts[token] = counts[token] + 1
    return counts

# Sanity check: "is" occurs twice in this sentence, every other token once.
token_freqs("That is one example, but this is another")

defaultdict(int,
            {'that': 1,
             'is': 2,
             'one': 1,
             'example': 1,
             'but': 1,
             'this': 1,
             'another': 1})

# DictVectorizer

In [20]:
from time import time
from sklearn.feature_extraction import DictVectorizer

In [23]:
# Accumulates the label and throughput (MB/s) of each benchmarked vectorizer.
dict_count_vectorizers = defaultdict(list)

# Time building the vectorizer plus a full fit/transform over the
# per-document token-frequency dicts (fed lazily through a generator).
t0 = time()
vectorizer = DictVectorizer()
vectorizer.fit_transform(token_freqs(doc) for doc in raw_data)
duration = time() - t0

dict_count_vectorizers["speed"].append(data_size_mb / duration)
dict_count_vectorizers["vectorizer"].append(
    f"{type(vectorizer).__name__}\non freq dicts"
)
print(f"done in {duration:.3f}s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {len(vectorizer.get_feature_names_out())} unique terms")

done in 1.141s at 5.5 MB/s
Found 47928 unique terms


In [27]:
# The fitted vocabulary is exposed as a plain dict.
type(vectorizer.vocabulary_)

dict

In [42]:
# Peek at the last (term, feature index) entry in the dict's iteration order.
list(vectorizer.vocabulary_.items())[-1]

('appease', 8590)

In [26]:
# Number of distinct terms in the fitted vocabulary.
len(vectorizer.vocabulary_)

47928