In [1]:
import re
from copy import copy
import nltk
from nltk.corpus import stopwords
import numpy as np
import pymorphy2
from math import log

In [2]:
path = "./data/SMSSpamCollection"
stopWords = set(stopwords.words('english'))
morph = pymorphy2.MorphAnalyzer()

Препроцессинг данных. Чистим строки от мусора, приводим слова в инфинив и в нижний регистр.

In [3]:
def isCyrWord(word):
    return word.isalpha()

def normalizer(text):
    lower = (word.lower() for word in nltk.wordpunct_tokenize(text))
    cyr = (word for word in lower if isCyrWord(word))
    norm_form = (morph.parse(word)[0].normal_form for word in cyr)
    return ' '.join(norm_form)

def text_proccesing(text, stopWords):
    text_procces = ""
    
    text = re.search('\t.*\\n', line)[0][1:-1]
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', text)
    text = re.sub('@[^\s]+', 'USER', text)
    text = re.sub('[^a-zA-Zа-яА-Я]+', ' ', text)
    text = re.sub(' +', ' ', text)
    text = text.strip()
    
    for t in text.split():
        if t not in stopWords:
            text_procces = " ".join((text_procces, t))
    return text_procces

Считываем данные и обрабатываем их (препроцессинг).

In [4]:
data = {"labels": [], "text": []}
with open(path, 'r', encoding='utf-8') as f:
    data_file = f.readlines()
    for line in data_file:
        data['labels'].append(re.match('.*\\t', line)[0][:-1])
        data['text'].append(normalizer(text_proccesing(line, stopWords)))

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics

Делим выборку на train и valid. Смотрим как работает Наивный Байес из коробки

In [6]:
x_train, x_valid, y_train, y_valid = train_test_split(data['text'], data['labels'], test_size=0.2, random_state=43)
x_train = np.array(x_train)
y_train = np.array(y_train)
x_valid = np.array(x_valid)
y_valid = np.array(y_valid)
clf = Pipeline([('tfidf', TfidfVectorizer()), ('nb', MultinomialNB())])
clf.fit(x_train, y_train)
pred = clf.predict(x_valid)
score = metrics.f1_score(y_valid, pred, average="micro")
print(f"F1 Score: {score}")
#temp_valid = np.array([0 if t=='ham' else 1 for t in y_valid])
#temp_pred = np.array([0 if t=='ham' else 1 for t in pred])
#print(metrics.accuracy_score(temp_valid, temp_pred))

F1 Score: 0.9605381165919282


Функция кроссвалидации

In [7]:
def kcrossvalid(labels, text, kfold = 5):
    X = np.array(text)
    Y = np.array(labels)
    if len(X) == len(Y):
        klen = len(X) // kfold
        shuffled = np.array(list(range(len(X))))
        np.random.shuffle(shuffled)
        folds = []
        for i in range(kfold-1):
            folds.append(shuffled[klen*i:klen*(i+1)])
        folds.append(shuffled[klen*(kfold-1):])
        folds = np.array(folds)
        
        res_roc = []
        res_acc = []
        res_f1 = []
        for i in range(kfold):
            test = folds[i]
            h = list(range(kfold))
            h.pop(i)
            train = np.concatenate(folds[h])
            cls = NB_classifier()
            cls.fit(X[train], Y[train])
            pred = cls.predict(X[test])
            pred_ = [1 if l =="spam" else 0 for l in pred]
            y_valid_ = [1 if l =="spam" else 0 for l in Y[test]]
            res_roc.append( metrics.roc_auc_score(y_valid_, pred_) )
            res_f1.append( metrics.f1_score(y_valid_, pred_) )
            res_acc.append( metrics.accuracy_score(y_valid_, pred_) )
        return np.array(res_roc), np.array(res_f1), np.array(res_acc)

Класс нашего Наивного Байеса.

In [8]:
#topics = {'ham': 0, 'spam': 0}
class NB_classifier():
    def __init__(self):
        self.topics = None
        self.words = None
        
        
    def fit(self, X, Y, topics_c = {'ham': 0, 'spam': 0}):
        topics = copy(topics_c)
        wtopics = copy(topics)
        words = {}
        labels = Y
        lines = X
        for i in range(len(lines)):
            topics[labels[i]] += 1
            for w in lines[i].split():
            
                if w in words.keys():
                    words[w][labels[i]] += 1
                else:
                    words[w] = copy(wtopics)
                    words[w][labels[i]] += 1

        disc = np.array(list(topics.values())).sum()
        topics['spam'] /= disc
        topics['ham'] /= disc
        
        for k in words:
            sum = 0
            for i in words[k]:
                sum += words[k][i]
            for i in words[k]:
                words[k][i]=words[k][i]/sum
        self.words = words
        self.topics = topics
        
    def predict(self, X, Topics = ['spam', 'ham']):
        def probability_log(val):
            if val < 10**(-7):
                val = 10**(-7)
            return -np.log(val)
        
        result = []
        for line in X:
            minimum = 9000000
            current_topic = Topics[0]
            for t in Topics:
                temp = probability_log(self.topics[t])
                for w in line.split():
                    try:
                        temp += probability_log(self.words[w][t])
                    except KeyError:
                        continue
                if temp < minimum:
                    current_topic = t
                    minimum = temp
            result.append(current_topic)
        return result   

Создаем объект нашего классификатора и смотрим на его точность работы.

In [9]:
cls = NB_classifier()
cls.fit(x_train, y_train)
pred = cls.predict(x_valid)
pred_ = [1 if l =="spam" else 0 for l in pred]
y_valid_ = [1 if l =="spam" else 0 for l in y_valid]
metrics.f1_score(y_valid_, pred_)

0.8794326241134752

In [10]:
metrics.roc_auc_score(y_valid_, pred_)

0.8984375

Различные метрики на кроссвалидации

In [11]:
res = kcrossvalid(**data)
print(f"Roc-auc: {res[0]}, average: {res[0].mean()} \n F1_Score: {res[1]}, average: {res[1].mean()} \n Accuracy: {res[2]}, average: {res[2].mean()}")

Roc-auc: [0.92441652 0.91455298 0.93614799 0.90879684 0.85358462], average: 0.907499791179575 
 F1_Score: [0.91333333 0.89855072 0.92907801 0.89138577 0.81781377], average: 0.8900323210255721 
 Accuracy: [0.97666068 0.97486535 0.98204668 0.97396768 0.95974955], average: 0.9734579895491757
