In [None]:
# Pinned to the version the recorded run installed so a fresh kernel reproduces the
# same environment; %pip (rather than !pip) installs into the running kernel's environment.
%pip install wikipedia==1.4.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11696 sha256=c9df260fd6b76eb8b9803f1aaa04ad8fb0b69acde7f45dcedb2318b7d700ca14
  Stored in directory: /root/.cache/pip/wheels/c2/46/f4/caa1bee71096d7b0cdca2f2a2af45cacf35c5760bee8f00948
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [None]:
import wikipedia as wiki
import pickle

In [None]:
# Names of the two document domains, listed in the same order as the integer
# labels used for the data below (0 = weather, 1 = sports).
domains = [
  'weather',
  'sports',
]

In [None]:
# Wikipedia page titles used as the corpus, six per domain.
weather_documents = [
  'Weather',
  'Weather_forecasting',
  'Weather modification',
  'Weather Underground (weather service)',
  'National Weather Service',
  'AccuWeather',
]
sports_documents = [
  'Sport',
  'BeINSports',
  'Sports Illustrated',
  'Professional sports',
  'Bally Sports',
  'Olympic sports',
]

In [None]:
def class_prob(doc_label):
  """Estimate the prior probability of every class from the document labels.

  Labels are assumed to be integers numbered from 0; the result has one entry
  for every integer up to the largest label seen, so a class id that never
  occurs gets probability 0.

  Args:
    doc_label: sequence of non-negative integer class labels, one per document.

  Returns:
    A list of floats where element ``i`` is the fraction of documents labelled
    ``i``. An empty input yields an empty list.

  Raises:
    ValueError: if any label is negative (a negative index would otherwise
      silently be counted against the wrong class).
  """
  total_docs = len(doc_label)
  if total_docs == 0:
    # nothing to estimate from; avoids max() of an empty sequence
    return []
  if min(doc_label) < 0:
    raise ValueError('class labels must be non-negative integers')

  # assuming classes numbers from 0 till n
  number_of_classes = max(doc_label)+1
  classes = [0] * number_of_classes
  for label in doc_label:
    classes[label] += 1
  classes_probability = [c/total_docs for c in classes]
  return classes_probability

In [None]:
def preprocessing(doc):
  """Clean a raw document for tokenisation.

  Every punctuation character and digit listed below is replaced by a space,
  the text is split on whitespace, words whose lower-cased form is a stop word
  are dropped, and the surviving words (original casing kept) are joined with
  single spaces.

  Args:
    doc: the raw document text.

  Returns:
    The cleaned text as a single string; empty if nothing survives.
  """
  punctuations_numbers = '!@#$%^&*()~`,.;:/"\'[]{}\\<>?=+-_|1234567890'

  stop_words = frozenset((
    'a', 'an', 'the', 'then', 'else', 'where', 'when', 'how', 'many', 'much',
    'i', 'he', 'she', 'they', 'you', 'it', 'them', 'him', 'his', 'here', 'mine', 'our',
    'ours', 'is', 'are', 'am', 'all', 'some', 'do', 'did', 'has', 'have', 'had', 'been',
    'myself', 'yours', 'can', 'could', 'nor', 'no', 'not', 'too', 'so', 'very',
    'nt', 'm', 've', 're', 'll', 's', 'd', 'yet', 'n', 't', 'at', 'from', 'those', 'be',
    'other', 'others', 'such', 'most', 'but', 'now', 'then', 'later', 'soon', 'in',
    'on', 'between', 'above', 'further', 'sooner', 'and', 'of', 'to', 'by', 'as', 'if',
    'that', 'this', 'these', 'from', 'maybe',
  ))

  # blank out punctuation and digits: each listed character maps to one space
  blanking_table = str.maketrans(punctuations_numbers, ' ' * len(punctuations_numbers))
  spaced_text = doc.translate(blanking_table)

  # keep only the words that are not stop words (compared case-insensitively)
  kept_words = []
  for word in spaced_text.split():
    if word.lower() in stop_words:
      continue
    kept_words.append(word)

  return " ".join(kept_words)

In [None]:
def getTokens(doc):
  """Return the distinct lower-cased words of a document.

  The document is split on whitespace and each word is lower-cased. Duplicates
  are removed while keeping the order of first appearance, so the result is the
  same on every run (a plain ``set`` would give a hash-seed-dependent order).

  Args:
    doc: document text, typically already cleaned by ``preprocessing``.

  Returns:
    A list of unique lower-cased tokens in first-seen order.
  """
  words = [x.lower() for x in doc.split()]
  # dict keys are unique and keep insertion order
  tokens = list(dict.fromkeys(words))
  return tokens

In [None]:
def conditional_prob(docs_tokens, docs_labels):
  """Build the per-class token table used by the classifier.

  For every label, each token is counted once per occurrence in that label's
  token lists. Each count is then divided by the token's total count over all
  labels, so for a given token the values across the classes sum to 1.
  Note that the divisor is the token's overall count, not the size of the class.

  Args:
    docs_tokens: one list of tokens per document.
    docs_labels: the label of each document, aligned with ``docs_tokens``.

  Returns:
    A dict mapping label -> dict mapping token -> normalised value. Tokens that
    never occur under a label are absent from that label's dict.
  """
  per_class_counts = {label: dict() for label in set(docs_labels)}
  overall_counts = {}

  # tally how often each token shows up under each label, and overall
  for doc_tokens, doc_label in zip(docs_tokens, docs_labels):
    class_counts = per_class_counts[doc_label]
    for tok in doc_tokens:
      class_counts[tok] = class_counts.get(tok, 0) + 1
      overall_counts[tok] = overall_counts.get(tok, 0) + 1

  # turn every per-class count into its share of the token's overall count
  for class_counts in per_class_counts.values():
    for tok in class_counts:
      class_counts[tok] /= overall_counts[tok]

  return per_class_counts

In [None]:
# Split each domain's page titles into a training part (the first
# TRAIN_DOCS_PER_DOMAIN titles) and a held-out test part (the remainder), then
# download the page text. Labels: 0 = weather, 1 = sports.
TRAIN_DOCS_PER_DOMAIN = 5
MAX_CHARS_PER_DOC = 2000  # only the first characters of each page are used

weather_train, weather_test = weather_documents[:TRAIN_DOCS_PER_DOMAIN], weather_documents[TRAIN_DOCS_PER_DOMAIN:]
sports_train, sports_test = sports_documents[:TRAIN_DOCS_PER_DOMAIN], sports_documents[TRAIN_DOCS_PER_DOMAIN:]

# labels are derived from the split itself so they cannot drift out of sync with the data
train_labels = [0]*len(weather_train) + [1]*len(sports_train)
test_labels = [0]*len(weather_test) + [1]*len(sports_test)

train_data = []
for doc_name in weather_train+sports_train:
  train_data.append(wiki.page(doc_name).content[:MAX_CHARS_PER_DOC])

test_data = []
for doc_name in weather_test+sports_test:
  test_data.append(wiki.page(doc_name).content[:MAX_CHARS_PER_DOC])

In [None]:
# Quick check of the training split: how many documents and which labels.
print(
  'train data',
  ' '.join(['number of docs', str(len(train_data))]),
  ' '.join(['docs labels', str(train_labels)]),
  sep='\n',
)

train data
number of docs 10
docs labels [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]


In [None]:
# Quick check of the held-out split: how many documents and which labels.
print(
  'test data',
  ' '.join(['number of docs', str(len(test_data))]),
  ' '.join(['docs labels', str(test_labels)]),
  sep='\n',
)

test data
number of docs 2
docs labels [0, 1]


In [None]:
# Fit the model on the training pages: class priors come from the labels, the
# per-class token table from the cleaned and tokenised documents.
classes_probability = class_prob(train_labels)

docs = list(map(preprocessing, train_data))
docs_tokens = list(map(getTokens, docs))
words_conditional_probability = conditional_prob(docs_tokens, train_labels)

# save probabilities: priors first, then the token table (predict() loads them in this order)
with open('model.pkl', 'wb') as f:
  for model_part in (classes_probability, words_conditional_probability):
    pickle.dump(model_part, f)

In [None]:
# to deal with zero probability, when the token has zero probability in the given class
def laplace_smoothing_probability(n_prob, N, K, alpha=1):
  """Return the additively (Laplace) smoothed version of a probability.

  The observed probability is converted back to a count (``n_prob*N``), and
  ``alpha`` pseudo-observations are added for each of the ``K`` features.

  Args:
    n_prob: observation probability
    N: total observations
    K: number of features
    alpha: smoothing parameter

  Returns:
    ``(n_prob*N + alpha) / (N + K*alpha)``, which is non-zero for any
    ``alpha > 0`` even when ``n_prob`` is 0.
  """
  return (n_prob*N + alpha) / (N + K * alpha)

In [None]:
def predict(test_doc):
  """Score a document against every trained class and pick the best one.

  The model (class priors and per-class token table) is read from
  ``model.pkl``. For each class ``c`` the naive Bayes style score

    p(c|w1..wn) = c1 / (c1 + c2)
    c1 = p(w1|c)  * .. * p(wn|c)  * p(c)
    c2 = p(w1|~c) * .. * p(wn|~c) * p(~c)

  is computed, where every per-token factor is Laplace-smoothed and
  ``p(w|~c)`` is taken as ``1 - p(w|c)``. The products are accumulated as sums
  of logarithms: multiplying one small factor per token directly underflows to
  0.0 for longer documents and would make ``c1 / (c1 + c2)`` divide by zero.

  Args:
    test_doc: raw text of the document to classify.

  Returns:
    A tuple ``(scores, winner_class)``: ``scores`` is a list with one value
    per class, indexed by the class label (labels are therefore assumed to be
    the integers 0..n-1), and ``winner_class`` is the index of the largest
    score. Each score is computed independently per class, so the scores are
    not normalised to sum to 1.
  """
  import math  # local import so this cell does not depend on an edited import cell

  # load probabilities
  # NOTE: pickle.load can execute arbitrary code; only load a model.pkl that this notebook wrote.
  with open('model.pkl', 'rb') as f:
    classes_probability = pickle.load(f)
    words_conditional_prob = pickle.load(f)

  # prepare test document
  test_doc = preprocessing(test_doc)
  test_doc_tokens = getTokens(test_doc)
  number_of_tokens = len(test_doc_tokens)

  target_classes_labels = words_conditional_prob.keys()
  number_of_classes = len(target_classes_labels)

  probability_belong_to_target_class = [0]*number_of_classes

  for label in target_classes_labels:
    class_prior = classes_probability[label]
    current_class_cond_prob = words_conditional_prob[label]

    # degenerate priors have no finite logarithm; the direct formula gives these values
    if class_prior >= 1:
      probability_belong_to_target_class[label] = 1.0  # c2 == 0
      continue
    if class_prior <= 0:
      probability_belong_to_target_class[label] = 0.0  # c1 == 0
      continue

    log_c1 = math.log(class_prior)
    log_c2 = math.log(1-class_prior)

    for token in test_doc_tokens:
      # if token not appear in the give class, then it has zero probability in the given class
      token_prob_given_class = current_class_cond_prob.get(token, 0)
      # as N unknown we will assume it as constant
      log_c1 += math.log(laplace_smoothing_probability(token_prob_given_class, N=1, K=number_of_tokens, alpha=1))
      log_c2 += math.log(laplace_smoothing_probability(1-token_prob_given_class, N=1, K=number_of_tokens, alpha=1))

    # c1/(c1+c2) == 1/(1+c2/c1); only ever exponentiate a non-positive number so exp cannot overflow
    log_ratio = log_c2 - log_c1
    if log_ratio > 0:
      c1_over_c2 = math.exp(-log_ratio)
      probability_belong_to_target_class[label] = c1_over_c2 / (1 + c1_over_c2)
    else:
      probability_belong_to_target_class[label] = 1 / (1 + math.exp(log_ratio))

  max_probability = max(probability_belong_to_target_class)
  winner_class = probability_belong_to_target_class.index(max_probability)
  return probability_belong_to_target_class, winner_class

In [None]:
# Classify every held-out page and show the per-class scores with the chosen class.
for held_out_doc in test_data:
  prediction = predict(held_out_doc)
  print('probabilities:', prediction[0])
  print('belong to class:', prediction[1])

probabilities: [2.969778618025228e-12, 9.437056727497458e-34]
belong to class: 0
probabilities: [2.7996882271025565e-22, 2.6871434370563706e-15]
belong to class: 1
