In [1]:
import os
labels = os.listdir("./dataset")    # get all labels
l2i = {labels[i]: i for i in range(len(labels))}    # label to index
texts, y = [], []
print("labels:")
print(labels)
print("labels to index")
print(l2i)

labels:
['alt.atheism', 'comp.graphics', 'rec.motorcycles', 'soc.religion.christian', 'talk.politics.misc']
labels to index
{'alt.atheism': 0, 'comp.graphics': 1, 'rec.motorcycles': 2, 'soc.religion.christian': 3, 'talk.politics.misc': 4}


In [2]:
import codecs

for l in labels:
    files = os.listdir(f"dataset/{l}")
    for f in files:
        path = f"dataset/{l}/{f}"
        # read txt file
        file = codecs.open(path, 'r', 'Latin1') # use Latin1 encoding
        content = file.read()
        # split into words
        content = content.lower().split()
        texts.append(content)
        y.append(l2i[l])    # numerical label
print("number of documents", len(texts))
print(texts[0])
print(y[0])

number of documents 2726
['from:', 'mathew', '<mathew@mantis.co.uk>', 'subject:', 'alt.atheism', 'faq:', 'atheist', 'resources', 'summary:', 'books,', 'addresses,', 'music', '--', 'anything', 'related', 'to', 'atheism', 'keywords:', 'faq,', 'atheism,', 'books,', 'music,', 'fiction,', 'addresses,', 'contacts', 'expires:', 'thu,', '29', 'apr', '1993', '11:57:19', 'gmt', 'distribution:', 'world', 'organization:', 'mantis', 'consultants,', 'cambridge.', 'uk.', 'supersedes:', '<19930301143317@mantis.co.uk>', 'lines:', '290', 'archive-name:', 'atheism/resources', 'alt-atheism-archive-name:', 'resources', 'last-modified:', '11', 'december', '1992', 'version:', '1.0', 'atheist', 'resources', 'addresses', 'of', 'atheist', 'organizations', 'usa', 'freedom', 'from', 'religion', 'foundation', 'darwin', 'fish', 'bumper', 'stickers', 'and', 'assorted', 'other', 'atheist', 'paraphernalia', 'are', 'available', 'from', 'the', 'freedom', 'from', 'religion', 'foundation', 'in', 'the', 'us.', 'write', 'to

In [3]:
import re
# load stopwords from txt file
with open(r"stopwords.txt", "r", encoding="utf-8") as fp:
    stopwords = fp.read().splitlines()
stopwords = set(stopwords)
print("number of stopwords", len(stopwords))
print("before removing stopwords:", len(texts[0]))
texts_rs = []
for text in texts:
    # remove non-alphabetic characters
    text_rs = []
    for word in text:
        word = re.sub(r"[^a-z]", " ", word).strip()
        if word != "":
            text_rs.extend(word.split())
    text_rs = [w for w in text_rs if w not in stopwords]
    texts_rs.append(text_rs)
print("after removing stopwords:", len(texts_rs[0]))
print(texts_rs[0])

number of stopwords 851
before removing stopwords: 1704
after removing stopwords: 941
['mathew', 'mathew', 'mantis', 'uk', 'subject', 'alt', 'atheism', 'faq', 'atheist', 'resources', 'summary', 'books', 'addresses', 'music', 'atheism', 'keywords', 'faq', 'atheism', 'books', 'music', 'fiction', 'addresses', 'contacts', 'expires', 'thu', 'apr', 'gmt', 'distribution', 'organization', 'mantis', 'consultants', 'cambridge', 'uk', 'supersedes', 'mantis', 'uk', 'lines', 'archive', 'atheism', 'resources', 'alt', 'atheism', 'archive', 'resources', 'modified', 'december', 'version', 'atheist', 'resources', 'addresses', 'atheist', 'organizations', 'usa', 'freedom', 'religion', 'foundation', 'darwin', 'fish', 'bumper', 'stickers', 'assorted', 'atheist', 'paraphernalia', 'freedom', 'religion', 'foundation', 'write', 'ffrf', 'box', 'madison', 'wi', 'telephone', 'evolution', 'designs', 'evolution', 'designs', 'sell', 'darwin', 'fish', 'fish', 'symbol', 'christians', 'stick', 'cars', 'feet', 'word', 'd

In [4]:
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer("english")    # Snowball stemmer

texts_stem = []
for t in texts_rs:
    texts_stem.append([stemmer.stem(w) for w in t])

t1, t2 = texts_rs[0], texts_stem[0]
print("length of text:", len(t1))
print("change of stemming:", len([i for i in range(len(t1)) if t1[i] != t2[i]]))

length of text: 941
change of stemming: 456


In [5]:
import numpy as np
class TfIdfTokenizer:
    def __init__(self) -> None:
        self.document_freq = {}  # word to document frequency
        self.vocab_table = {}   # word to idx table
        self.num_docs = 0   # number of documents
        self.dims = 0   # total word num

    def update_vocab(self, docs):
        for doc in docs:
            self.num_docs += 1
            words = set(doc)    # keep unique words
            for w in words:
                # for a word, count the num of documents that contain it
                self.document_freq[w] = self.document_freq.get(w, 0) + 1
        self.vocab_table = {w: i for i, w in enumerate(
            self.document_freq.keys())}  # word to index table
        self.dims = len(self.document_freq)  # number of unique words

    def _idfs(self):
        # convert dict value to array
        doc_freq = np.array(list(self.document_freq.values()))
        # calculate idf
        return np.log(self.num_docs / (doc_freq))

    def fit(self, docs):
        self.update_vocab(docs)  # update documents info
        idfs = self._idfs()  # calculate idf
        word_freq = np.zeros((self.num_docs, self.dims)
                             )    # word frequency array

        for i, doc in enumerate(docs):
            for word in doc:
                word_freq[i, self.vocab_table[word]] += 1   # count
        # calculate frequency
        tf = word_freq / word_freq.sum(axis=1, keepdims=True)

        tf_idf = np.log(tf + 1) * idfs  # calculate tf-idf
        # normalize the representation
        tf_idf = tf_idf / np.sum(tf_idf**2, axis=1, keepdims=True)**0.5
        
        # print(np.sum(tf_idf == 0)/tf_idf.size)
        # print(self.dims)

        return tf_idf

    def __str__(self):
        return f"dims: {self.dims}, num_docs: {self.num_docs}"

In [6]:
tokenizer = TfIdfTokenizer()
Aik = tokenizer.fit(texts)
print(tokenizer)

dims: 80491, num_docs: 2726


In [7]:
print("shape of tf-idf matrix:", Aik.shape)
print("rate of 0 in matrix",np.sum(Aik == 0)/Aik.size)

shape of tf-idf matrix: (2726, 80491)
rate of 0 in matrix 0.9977799954175234


In [8]:
np.savez("tfidf.npz", X=Aik)

In [9]:
class TfIdfMaxdim(TfIdfTokenizer):
    def __init__(self, max_features=5000) -> None:
        super().__init__()
        self.max_features = max_features
        self._term_freq = {}
    
    def update_vocab(self, docs):
        for doc in docs:
            self.num_docs += 1
            words = set(doc)    # keep unique words
            for w in words:
                # for a word, count the num of documents that contain it
                self.document_freq[w] = self.document_freq.get(w, 0) + 1
                self._term_freq[w] = self._term_freq.get(w, 0) + doc.count(w)
        # sort by term frequency and select top max_features
        self.term_freq = dict(sorted(self._term_freq.items(), key=lambda x: x[1], reverse=True)[:self.max_features])
        self.document_freq = {k:self.document_freq[k] for k in self.term_freq.keys()}
        self.vocab_table = {w: i for i, w in enumerate(
            self.document_freq.keys())}  # word to index table
        self.dims = len(self.document_freq)  # number of unique words
    
    def get_term_freq(self):
        return dict(sorted(self._term_freq.items(), key=lambda x: x[1], reverse=True))
    
    def fit(self, docs):
        self.update_vocab(docs)
        idfs = self._idfs()
        word_freq = np.zeros((self.num_docs, self.dims)
                             )    # word frequency array
        for i, doc in enumerate(docs):
            for word in doc:
                dim_idx = self.vocab_table.get(word, None)  # get index of word
                if dim_idx is not None:
                    word_freq[i, dim_idx] += 1   # count
        # calculate frequency
        tf = word_freq / word_freq.sum(axis=1, keepdims=True)

        tf_idf = np.log(tf + 1) * idfs  # calculate tf-idf
        # normalize the representation
        tf_idf = tf_idf / np.sum(tf_idf**2, axis=1, keepdims=True)**0.5
        
        # print(np.sum(tf_idf == 0)/tf_idf.size)
        # print(self.dims)

        return tf_idf

In [10]:
tokenizer = TfIdfMaxdim(max_features=5000)
Aik = tokenizer.fit(texts)
print("tokenizer:", tokenizer)
print("shape of tf-idf matrix:", Aik.shape)
print("rate of 0 in matrix",np.sum(Aik == 0)/Aik.size)
np.savez("tfidf_maxdim.npz", X=Aik)

tokenizer: dims: 5000, num_docs: 2726
shape of tf-idf matrix: (2726, 5000)
rate of 0 in matrix 0.9757177549523111
