In [23]:
%pip install -q requests nltk beautifulsoup4 contractions pandas numpy scikit_learn
import nltk
nltk.download('stopwords')

Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package stopwords to /home/genos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
import re
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
number_of_documents_per_category = 5
num_category = 2
documents = []

for i in tqdm(range(num_category)):
    for i in range(number_of_documents_per_category):
        url = f'https://wikipedia.org/wiki/Special:Random'
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        text = soup.get_text()[2000:] # to remove repeated words
        text = text.lower()
        text = re.sub(r'\[[0-9]*\]',' ',text) 
        text = re.sub(r'\s+',' ',text)
        text = re.sub(r'[^\w\s]','',text)
        text = ' '.join([word for word in text.split() if word not in stop_words])
        documents.append(text)
documents

100%|██████████| 2/2 [00:44<00:00, 22.38s/it]


['football league third division south spent two seasons never secured regular place clubs first team moved brighton hove albion joining french club fc sète referencesedit b joyce michael 2004 football league players records 18881939 soccerdata p 240 isbn 1899468676 brown tony 2003 definitive gillingham fc complete record soccerdata pp 4142 isbn 189946820x biographical article related association football england defender born 1900s stub help wikipedia expanding itvte retrieved httpsenwikipediaorgwindexphptitleharold_slyoldid1164072454 categories 1904 birthspeople appley bridgeenglish mens footballersmens association football utility playersbirmingham city fc playerstamworth fc playersgillingham fc playersbrighton hove albion fc playersfc sète 34 playersenglish football league playersmens association football fullbacksmens association football midfieldersmens association football inside forwardsfootballers lancashireenglish football defender 1900s birth stubshidden categories articles 

In [25]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import normalize

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)
unigram_count = X.toarray()  # Unigram count matrix

# Bigram probability matrix
bigram = CountVectorizer(ngram_range=(1,2), analyzer='word').fit(documents)
X_bigram = bigram.transform(documents)
sum_cols = X_bigram.sum(axis=0)
bigram_prob = normalize(X_bigram, norm='l1', axis=1) 

tfidf = TfidfTransformer().fit_transform(X) # TF-IDF matrix

print('Unigram count matrix shape:', unigram_count.shape)
print('Bigram probability matrix shape:', bigram_prob.shape)
print('TF-IDF matrix shape:', tfidf.shape)

Unigram count matrix shape: (10, 1808)
Bigram probability matrix shape: (10, 4787)
TF-IDF matrix shape: (10, 1808)


In [26]:
import nltk
from nltk.util import ngrams
from collections import Counter
import numpy as np

tokenizer = nltk.RegexpTokenizer(r'\w+')
docs_tokens = [tokenizer.tokenize(doc) for doc in documents]

vocab = set()
for doc_tokens in docs_tokens:
    vocab.update(doc_tokens)
vocab = list(vocab)

# Unigram count matrix
unigram_count = [[doc_tokens.count(word) for word in vocab] for doc_tokens in docs_tokens]

# Bigram probability matrix
bigrams = [list(ngrams(doc,2)) for doc in docs_tokens]
bigrams_freq = Counter([b for doc in bigrams for b in doc])
bigrams_prob = [[bigrams_freq[b]/sum(bigrams_freq.values()) for b in set(ngrams(doc,2))] for doc in docs_tokens]

# TF-IDF matrix 
doc_freq = {word: sum([1 for doc in docs_tokens if word in doc]) for word in vocab}
idf = {word: np.log(len(documents)/doc_freq[word]) for word in vocab}
tfidf = [[count * idf[word] for word, count in Counter(doc).items()] for doc in docs_tokens]

print(unigram_count[0][:5]) 
print(bigrams_prob[0][:5])
print(tfidf[0][:5])

[0, 1, 0, 0, 0]
[0.00024449877750611245, 0.00024449877750611245, 0.00024449877750611245, 0.0024449877750611247, 0.00024449877750611245]
[20.723265836946414, 6.907755278982138, 2.302585092994046, 1.6094379124341003, 0.9162907318741551]


In [27]:
import numpy as np

class NaiveBayes:
    def __init__(self, alpha=1):
        self.alpha = alpha
        self.vocab = None
        self.classes = None
        self.prior = None
        self.likelihood = None

    def fit(self, X, y):
        self.classes = np.unique(y)
        self.vocab = set()
        for doc in X:
            self.vocab.update(doc)
        self.vocab = list(self.vocab)

        n_instances = len(X)
        n_classes = len(self.classes)
        n_vocab = len(self.vocab)

        self.prior = np.zeros(n_classes)
        for c in self.classes:
            self.prior[c] = np.sum(y == c) / n_instances
        self.likelihood = np.zeros((n_classes, n_vocab))

        for c in self.classes:
            class_docs = [doc for doc, cls in zip(X, y) if cls == c]
            total_words = sum(len(doc) for doc in class_docs)
            word_counts = np.zeros(n_vocab)
            for doc in class_docs:
                for word in doc:
                    word_counts[self.vocab.index(word)] += 1
            self.likelihood[c] = (word_counts + self.alpha) / (total_words + n_vocab * self.alpha)


    def predict(self, X, use_unigram=True, use_bigram=True, use_tfidf=True):
        predictions = []
        for doc in X:
            scores = np.log(self.prior)
            unseen_class_score = np.log(1 / len(self.classes))  # Default score for unseen classes
            for word in doc:
                if use_unigram:
                    if word in self.vocab:
                        scores += np.log(self.likelihood[:, self.vocab.index(word)])
                    else:
                        scores += unseen_class_score
            if use_bigram:
                bigrams = list(ngrams(doc, 2))
                for bigram in bigrams:
                    if bigram in bigram_prob:
                        scores += np.log(bigram_prob[bigram])
                    else:
                        scores += unseen_class_score
            if use_tfidf:
                tfidf_val = np.array([tfidf[doc.index(word)] for word in doc if word in self.vocab])
                for i, score in enumerate(scores):
                    if len(tfidf_val) > 0:
                        scores[i] += np.sum(tfidf_val * np.log(
                          self.likelihood[i, [self.vocab.index(word) for word in doc if word in self.vocab]]
                        ))
                    else:
                        scores[i] += unseen_class_score * sum(1 for word in doc if word not in self.vocab)
            predictions.append(self.classes[np.argmax(scores)])
        return predictions

In [28]:
X = unigram_count 
y = [0, 1, 1, 0, 1, 1, 1, 0, 1, 1]

nb = NaiveBayes()
nb.fit(X, y)

In [29]:
nb.predict(documents, use_unigram=True, use_bigram=True, use_tfidf=False)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [30]:
nb.predict(documents, use_unigram=True, use_bigram=False, use_tfidf=False)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [31]:
nb.predict(documents, use_unigram=False, use_bigram=True, use_tfidf=False)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [32]:
nb.predict(documents, use_unigram=False, use_bigram=False, use_tfidf=True)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]