In [1]:
import re
import glob
import random
import math
from collections import defaultdict, Counter

In [2]:
def tokenize(message):
    """Return the set of distinct lowercase alphanumeric words in ``message``.

    The text is lowercased first; a word is any maximal run of the
    characters a-z and 0-9. Duplicates collapse because a set is returned.
    """
    lowered = message.lower()
    return {match.group(0) for match in re.finditer("[a-z0-9]+", lowered)}

In [3]:
def count_words(training_set):
    """Count, per word, how many spam and non-spam messages contain it.

    ``training_set`` is an iterable of ``(message, is_spam)`` pairs.
    Returns a defaultdict mapping each word to a two-element list
    ``[spam_count, non_spam_count]``; unseen words default to ``[0, 0]``.
    """
    tally = defaultdict(lambda: [0, 0])
    for text, flagged in training_set:
        # Slot 0 collects spam occurrences, slot 1 non-spam occurrences.
        slot = 0 if flagged else 1
        for token in tokenize(text):
            tally[token][slot] += 1
    return tally

def word_probabilities(counts,total_spams,total_non_spams,k=0.5):
    """Turn per-word message counts into smoothed conditional probabilities.

    ``counts`` maps each word to a ``(spam_count, non_spam_count)`` pair.
    Returns a list of ``(word, p_word_given_spam, p_word_given_non_spam)``
    tuples, where each probability is ``(count + k) / (total + 2 * k)``.
    """
    spam_denominator = total_spams + 2 * k
    non_spam_denominator = total_non_spams + 2 * k

    triples = []
    for w, (spam, non_spam) in counts.items():
        triples.append((w,
                        (spam + k) / spam_denominator,
                        (non_spam + k) / non_spam_denominator))
    return triples

In [4]:
def spam_probability(word_probs, message):
    """Estimate the probability that ``message`` is spam.

    ``word_probs`` is an iterable of
    ``(word, p_word_given_spam, p_word_given_non_spam)`` triples. Every
    vocabulary word contributes: its probability if it occurs in the
    message, otherwise one minus its probability. Log-probabilities are
    summed to avoid multiplying many small numbers.

    Each probability must lie strictly between 0 and 1, otherwise
    ``math.log`` raises ``ValueError``.

    Returns a float in [0, 1]; 0.5 when ``word_probs`` is empty.
    """
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0

    for word, prob_if_spam, prob_if_not_spam in word_probs:
        if word in message_words:
            # Word present: use the probability of seeing it.
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)
        else:
            # Word absent: use the probability of NOT seeing it.
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)

    # e^s / (e^s + e^h) == 1 / (1 + e^(h - s)). Exponentiating the two sums
    # separately underflows to 0.0 for large vocabularies and then divides
    # by zero; working with their difference keeps the result well-defined.
    log_ratio = log_prob_if_not_spam - log_prob_if_spam
    try:
        return 1.0 / (1.0 + math.exp(log_ratio))
    except OverflowError:
        # The non-spam likelihood dominates beyond float range.
        return 0.0

In [5]:
class NaiveBayesClassifier:
    """Naive Bayes spam classifier built on the module-level helper functions.

    ``k`` is the smoothing constant handed to ``word_probabilities``;
    ``word_probs`` holds the fitted per-word probabilities (empty until
    ``train`` is called).
    """

    def __init__(self, k=0.5):
        self.k = k
        self.word_probs = []

    def train(self, training_set):
        """Fit the model on a sized collection of ``(message, is_spam)`` pairs."""
        spam_total = sum(1 for _, is_spam in training_set if is_spam)
        non_spam_total = len(training_set) - spam_total

        # Count word occurrences per class, then convert them to probabilities.
        per_word_counts = count_words(training_set)
        self.word_probs = word_probabilities(
            per_word_counts, spam_total, non_spam_total, self.k
        )

    def classify(self, message):
        """Return the estimated probability that ``message`` is spam."""
        return spam_probability(self.word_probs, message)

In [6]:
# Glob pattern for the message files: every entry inside each subdirectory of ./spam
path = r"./spam/*/*"
    
def get_subject_data(path):
    """Collect ``(subject, is_spam)`` pairs from the files matching ``path``.

    ``path`` is a glob pattern. A file counts as spam unless the string
    "ham" appears somewhere in its path. Every line starting with
    "Subject:" yields one pair, with the header prefix and surrounding
    whitespace removed. Files are read as ISO-8859-1.
    """
    # Strips the leading "Subject:" marker and the whitespace after it, leaving the rest untouched
    subject_regex = re.compile(r"^Subject:\s+")
    data = []

    for fn in glob.glob(path):
        is_spam = not ("ham" in fn)

        with open(fn, 'r', encoding='ISO-8859-1') as file:
            # extend() drains the generator while the file is still open.
            data.extend(
                (subject_regex.sub("", line).strip(), is_spam)
                for line in file
                if line.startswith("Subject:")
            )

    return data

In [7]:
#helper taken from the machine_learning file
def split_data(data, prob):
    """Randomly partition ``data`` into two lists.

    Each row goes to the first list when a fresh ``random.random()`` draw
    is below ``prob``, otherwise to the second. Returns ``(first, second)``.
    """
    first, second = [], []
    for row in data:
        bucket = first if random.random() < prob else second
        bucket.append(row)
    return first, second

#extra helper that applies Bayes' theorem to get the probability of spam given a word
def p_spam_given_word(word_prob):
    """Return ``p_spam / (p_spam + p_non_spam)`` for a ``(word, p_spam, p_non_spam)`` triple."""
    _, given_spam, given_non_spam = word_prob
    total = given_spam + given_non_spam
    return given_spam / total

In [8]:
# Load every labelled subject line found under the configured glob pattern.
data = get_subject_data(path)

# Split into training and test sets (fixed seed for a reproducible split),
# then build the classifier from the training part.
random.seed(0)
train_data, test_data = split_data(data, 0.6)

classifier = NaiveBayesClassifier()
classifier.train(train_data)

# One (subject, actual label, predicted spam probability) triple per test message.
classified = [
    (subject, is_spam, classifier.classify(subject))
    for subject, is_spam in test_data
]

# Tally of (actual is_spam, predicted is_spam) pairs; a score above 0.5 counts as a spam prediction.
counts = Counter(
    (actual, score > 0.5)
    for _, actual, score in classified
)

# Order the results from least to most spam-like.
classified.sort(key=lambda entry: entry[2])

In [15]:
#najwieksze prawdopodbienstwo spamu wsrod wiadomosci niebedacych spamem
# `classified` was sorted in ascending order of predicted spam probability,
# so the tail of a filtered list holds the highest scores and the head the lowest.

# Non-spam messages with the highest predicted spam probability
spammiest_hams = [row for row in classified if not row[1]][-5:]

# Spam messages with the lowest predicted spam probability
hammiest_spams = [row for row in classified if row[1]][:5]

print('najwieksze prawdopodbienstwo spamu wsrod wiadomosci niebedacych spamem')
print("spammiest_hams", spammiest_hams,'\n')
# Label corrected: this list holds the LOWEST-scoring spam ("najmniejsze"), not the highest.
print('najmniejsze prawdopodbienstwo spamu wsrod wiadomosci bedacych spamem')
print("hammiest_spams", hammiest_spams,'\n')

# Vocabulary ordered from least to most spam-indicative.
words = sorted(classifier.word_probs, key=p_spam_given_word)

najwieksze prawdopodbienstwo spamu wsrod wiadomosci niebedacych spamem
spammiest_hams [("A-level student sues for £100,000 over 'grade fixing'", False, 0.988370041686534), ('[ILUG-Social] Re: Important - reenactor insurance needed', False, 0.993668571669194), ('The MIME information you requested (last changed 3154 Feb 14)', False, 0.9967700663238405), ('=?iso-8859-1?Q?Matrox_Parhelia=99_now_available?=', False, 0.9972146394289586), ('"I meditated in a cave for 12 years and now I\'m here to tell you', False, 0.9987530113830644)] 

najwieksze prawdopodbienstwo spamu wsrod wiadomosci bedacych spamem
hammiest_spams [('Re: girls', True, 0.0006930833060748901), ('Re: This Weekend', True, 0.0021053315905049923), ('Testing a system, please delete', True, 0.002602944652390981), ('Introducing Chase Platinum for Students with a 0% Introductory APR', True, 0.002685849319642986), ('.Message report from your contact page....//ytu855 rkq', True, 0.0031928142105761594)] 



In [14]:
# `words` is sorted ascending by p_spam_given_word: the first five entries are
# the words least indicative of spam, the last five the most indicative.
hammiest_words, spammiest_words = words[:5], words[-5:]

# Words with the highest spam probability
print('najwieksze prawdopodbienstwo spamu')
print("spammiest_words", spammiest_words, '\n')

# Words with the highest probability of not being spam
print('najwieksze prawdopodbienstwo ze nie jest spamem')
print("hammiest_words", hammiest_words)

najwieksze prawdopodbienstwo spamu
spammiest_words [('guaranteed', 0.02631578947368421, 0.00028818443804034583), ('zzzz', 0.02982456140350877, 0.00028818443804034583), ('systemworks', 0.03333333333333333, 0.00028818443804034583), ('money', 0.03333333333333333, 0.00028818443804034583), ('adv', 0.03333333333333333, 0.00028818443804034583)] 

najwieksze prawdopodbienstwo ze nie jest spamem
hammiest_words [('satalk', 0.0017543859649122807, 0.05561959654178674), ('spambayes', 0.0017543859649122807, 0.051585014409221905), ('users', 0.0017543859649122807, 0.03544668587896253), ('zzzzteana', 0.0017543859649122807, 0.02968299711815562), ('razor', 0.0017543859649122807, 0.02968299711815562)]
