In [2]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import string

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pkmixer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pkmixer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\pkmixer\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
def get_words_tags(words, tag_method='by-word'):   
    tags = []
    if tag_method == 'by-word':
        for i in range(len(words)):
            cur_tag = nltk.pos_tag([words[i]])[0][1][0]
            tags.append(tags_dict.get(cur_tag, default))
    elif tag_method == 'by-context':
        tags = nltk.pos_tag(words)
#         for i in range(len(tags)):
#             tags[i] = tags_dict.get(tags[i][1], default)
    
    
    return tags

def get_lemmatized_words(lemmatizer, words):
    tags = get_words_tags(words, tag_method='by-context')
    
    tags_dict = {"J": wordnet.ADJ,
                 "N": wordnet.NOUN,
                 "V": wordnet.VERB,
                 "R": wordnet.ADV}
    default = wordnet.NOUN
    
    lemmatized_words = []
    for i in range(len(tags)):
        lemmatized_words.append(lemmatizer.lemmatize(tags[i][0], tags_dict.get(tags[i][1], default)))

    return lemmatized_words

In [4]:
def tokenizer(text):
    text = text.translate(str.maketrans('', '', string.punctuation)).translate(str.maketrans('', '', string.digits)).lower()
    words = nltk.word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    
    return get_lemmatized_words(lemmatizer, words)

In [5]:
import os

path = "bbc"

data = []
target = []
categories = os.listdir(path)

for cat_index, category in enumerate(categories):
    cur_path = os.path.join(path, category)
    files = os.listdir(cur_path)
    
    for file_name in files:
        file_path = os.path.join(cur_path, file_name)
        with open(file_path, encoding="utf8", errors='ignore') as file:
            data.append(file.read())
            target.append(cat_index)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

class NaiveBayes:
    def fit(self, x, y):
        self.x = x
        self.y = y

        self.vectorizer = CountVectorizer(tokenizer=tokenizer)
        self.freq = self.vectorizer.fit_transform(x)

        self.unique_categories = np.unique(y)
        self.words_in_cat = np.zeros((self.unique_categories.size, self.freq.shape[1]))
        self.num_cat = np.zeros(self.unique_categories.size)

        for category in self.unique_categories:
            self.words_in_cat[category] = np.sum(self.freq[category == y], axis=0)
            self.num_cat[category] = self.freq[category == y].shape[0]

    def predict_multinomial(self, documents):
        result = []
        for document in documents:
            tokens = tokenizer(document)
            cat_freq = np.zeros_like(self.num_cat, dtype=float)

            for category in self.unique_categories:
                cat_freq[category] = np.log(self.num_cat[category] / len(self.y))

                for i in range(len(tokens)):
                    word_freq = 0
                    idx = self.vectorizer.vocabulary_.get(tokens[i], -1)
                    if idx != -1:
                        word_freq = self.words_in_cat[category, idx]

                    cat_freq[category] += np.log((word_freq + 1) /
                                                 (np.sum(self.words_in_cat[category]) + self.freq.shape[1]))

            result.append(np.argmax(cat_freq))
        return result

    def predict_multidimensional(self, documents):
        result = []
        for document in documents:
            tokens = tokenizer(document)
            cat_freq = np.zeros_like(self.num_cat, dtype=float)

            for category in self.unique_categories:
                cat_freq[category] = np.log(self.num_cat[category] / len(self.y))
                probability = 0
                
                for i in range(len(tokens)):
                    word_freq = 0
                    idx = self.vectorizer.vocabulary_.get(tokens[i], -1)
                    if idx != -1:
                        word_freq = self.words_in_cat[category, idx]
                        probability = (word_freq + 1) / (np.sum(self.words_in_cat[category]) + 2)
                    else:
                        probability = 1 - (word_freq + 1) / (np.sum(self.words_in_cat[category]) + 2)
                    
                    cat_freq[category] += np.log(probability)

            result.append(np.argmax(cat_freq))
        return result

In [10]:
from sklearn.model_selection import train_test_split

def precision(pred, target):
    return np.count_nonzero(np.equal(pred, target)) / len(pred)

print("Documents in dataset: {}\nCategories: {}".format(len(data), categories))

train, test, target_train, target_test = train_test_split(data, target, test_size=0.7, stratify=target)
train, test, target_train, target_test = train[:], test[:], target_train[:], target_test[:]

clf = NaiveBayes()
clf.fit(train, target_train)


pred_multinomial = clf.predict_multinomial(test)
pred_multidimensinonal = clf.predict_multidimensional(test)

print("Multinoimal precision: {:.5}".format(precision(pred_multinomial, target_test)))
print("Multidimensional precision: {:.5}".format(precision(pred_multidimensinonal, target_test)))

Documents in dataset: 2225
Categories: ['business', 'entertainment', 'politics', 'sport', 'tech']
Multinoimal precision: 0.96213
Multidimensional precision: 0.96855
