<a href="https://colab.research.google.com/github/mauricio201922/DataScience-2/blob/main/Aula2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Naive Bayes

### O que é o algoritmo Naive Bayes?

É uma técnica de classificação baseado no teorema de Bayes com uma suposição de
independência entre os preditores. Em termos simples, um classificador Naive Bayes
assume que a presença de uma característica particular não está relacionada com a
presença de qualquer outra característica.

Por exemplo, um fruto pode ser considerado como uma maçã se é vermelho, redondo, e
tiver um certo diâmetro. Mesmo que essas características dependam umas das outras,
todas contribuem de forma independente para a probabilidade de que este fruto é uma
maçã e é por isso que é conhecido como ‘Naiveʼ (ingênuo).



In [49]:
import re

In [50]:
def tokenize(message):
  message = message.lower()
  all_words = re.findall('[a-z0-9]+', message)
  return set(all_words)


In [51]:
re.findall('[a-z0-9]+', 'Hello World ! ? 9'.lower())

['hello', 'world', '9']

In [52]:
def count_words(training_set):
  """training set consists of pairs (message, is_spam)"""
  counts = defaultdict(lambda: [0, 0])
  for message, is_spam in training_set:
    for word in tokenize(message):
      counts[word][0 if is_spam else 1] += 1
  return counts

In [53]:
def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
  """turn the word_counts into a list of triplets w, p(w | spam) and p(w | ~spam)"""
  return [(w,
           (spams + k) / (total_spams + 2 * k),
           (non_spams + k) / (total_non_spams + 2 * k))
           for w, (spams, non_spams) in counts.items()]

In [54]:
import math

In [55]:
def spam_probabilities(word_probs, message):
  message_words = tokenize(message)
  log_prob_if_spam = log_prob_if_not_spam = 0.0

  # iterate through each word in our vocabulary
  for word, prob_if_spam, prob_if_not_spam in word_probs:

    # if *word* appears in the message, add the log probability of seeing it
    if word in message_words:
      log_prob_if_spam += math.log(prob_if_spam)
      log_prob_if_not_spam += math.log(prob_if_not_spam)

    # if *word* doesn't appear in the message add the log probability of _not_ seeing it
    # which is log(1 - probability of seeing it)
    else:
      log_prob_if_spam += math.log(1.0 - prob_if_spam)
      log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)
    prob_if_spam = math.exp(log_prob_if_spam)
    prob_if_not_spam = math.exp(log_prob_if_not_spam)
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

In [65]:
class NaiveBayesClassifier:

  def __init__(self, k=0.5):
    self.k = k
    self.word_probs = []

  def train(self, training_set):
    # count spam and non-spam messages
    num_spams = len([is_spam
                    for message, is_spam in training_set
                    if is_spam])
    num_non_spams = len(training_set) - num_spams

    # run training data through our "pipeline"
    word_counts = count_words(training_set)
    self.word_probs = word_probabilities(word_counts, 
                                         num_spams, 
                                         num_non_spams, 
                                         self.k)
    
  def classify(self, message):
    return spam_probabilities(self.word_probs, message)

In [66]:
import glob, re
path = r"/content/drive/MyDrive/Ciência da Computação/4° Ano/2º Semestre/1º Bimestre/Data Science II/Files/*"
data = []

# glob.glob returns every filename that matches the wildcarded path
for fn in glob.glob(path):
  is_spam = True if "ham" not in fn else False
  with open(fn,'r', encoding="utf8", errors='ignore') as file:
    for line in file:
      if line.startswith("Subject:"):
        # remove the leading "Subject: " and keep what's left
        subject = re.sub(r"^Subject: ", "", line).strip()
        data.append((subject, is_spam))

In [67]:
len(data)

1006

In [68]:
data[-5:-1]

[('[ILUG] Guaranteed to lose 10-12 lbs in 30 days 10.206', True),
 ('Guaranteed to lose 10-12 lbs in 30 days                          11.150',
  True),
 ('Guaranteed to lose 10-12 lbs in 30 days                          11.150',
  True),
 ('Re: Fw: User Name & Password to Membership To 5 Sites zzzz@spamassassin.taint.org pviqg',
  True)]

In [69]:
import random
from collections import defaultdict

def split_data(data, prob):
  """divide os dados en frações [prob, 1 - prob]"""
  results = [], []
  
  for row in data:
    results[0 if random.random() < prob else 1].append(row)
  return results

In [70]:
random.seed(0)
train_data, test_data = split_data(data, 0.75)

In [71]:
print('Train data size=', len(train_data),
      'Test data size=', len(test_data))

Train data size= 763 Test data size= 243


In [72]:
# criando e treinando um classificador
classifier = NaiveBayesClassifier()
classifier.train(train_data)

In [73]:
print(classifier.classify("Life Insurance - Why Pay More?"))
print(classifier.classify("This week: Deck, Tex-Edit Plus, Boom"))
print(classifier.classify("Data Science Class"))

0.6516187870497036
0.6516187870497036
0.6516187870497036


In [74]:
from collections import Counter
import math

# triplets (subject, actual is_spam, predicted spam probability)
classified = [(subject, is_spam, classifier.classify(subject))
              for subject, is_spam in test_data]

# assume that spam_probability > 0.5 corresponds to spam prediction
# and count the combinations of (actual is_spam, predicted is_spam)
counts = Counter((is_spam, spam_probability > 0.5)
                  for _, is_spam, spam_probability in classified)

In [75]:
#print(classified)
print(counts)

Counter({(True, True): 225, (True, False): 18})


In [76]:
def precision(tp, fp, fn, tn):
  return tp / (tp + fp)
def recall(tp, fp, fn, tn):
  return tp / (tp + fn)
def f1_score(tp, fp, fn, tn):
  p = precision(tp, fp, fn, tn)
  r = recall(tp, fp, fn, tn)
  return 2 * p * r / (p + r)

In [77]:
print("precision = ", precision(101, 33, 38, 704))
print("recall = ", recall(101, 33, 38, 704))
print("f1-score = ", f1_score(101, 33, 38, 704))

precision =  0.753731343283582
recall =  0.7266187050359713
f1-score =  0.73992673992674


In [78]:
classified[:1]

[('[ILUG-Social] re: Guaranteed to lose 10-12 lbs in 30 days 10.148',
  True,
  0.11471610660486675)]

In [79]:
# sort by spam_probability from smallest to largest
classified.sort(key=lambda row: row[2])

# the highest predicted spam probabilities among the non-spams
spammiest_hams_f = filter(lambda row: not row[1], classified)
spammiest_hams = list(spammiest_hams_f)[-5:]

# the lowest predicted spam probabilities among the actual spams
hammiest_spams_f = filter(lambda row: row[1], classified)
hammiest_spams = list(hammiest_spams_f)[:5]

In [80]:
print(spammiest_hams)

[]


In [81]:
print(hammiest_spams)

[('[ILUG-Social] re: Guaranteed to lose 10-12 lbs in 30 days 10.148', True, 0.11471610660486675), ('[ILUG-Social] re: Guaranteed to lose 10-12 lbs in 30 days 10.148', True, 0.11471610660486675), ('Re: Your bank account', True, 0.11471610660486675), ('Re: Hi', True, 0.11471610660486675), ('Re: PROTECT YOUR COMPUTER AGAINST HARMFUL VIRUSES! 21198', True, 0.11471610660486675)]


In [82]:
def drop_final_s(word):
  return re.sub("s$", "", word)

In [83]:
drop_final_s('hands')

'hand'