In [1]:
import numpy as np
import pandas as pd
from collections import Counter

In [2]:
data = pd.read_table('SMSSpamCollection',header = None, names= ['Category', 'Content'])
data.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,Category,Content
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
category = data.Category
content = data.Content

In [4]:
names=list(set(category))

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

X=data["Content"]
y=data["Category"]

kf=KFold(n_splits=5, random_state=None, shuffle=True)
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

trainsize =X_train.shape[0]

In [6]:
#Выбираем индексы категории 
def IndexSelect(name):
    I=[]
    for i in range(trainsize):
        if (category[i]== name):
            I.append(i)
    return I

In [7]:
# Общие слова тестового сообщения и сообщения данной категории
def CommonWords(categ, testdic):
    testcounts=dict.fromkeys(testdic, 0)
    for i in IndexSelect(categ):
        traindic = dict(Counter(content[i].split()))
        for key in traindic:
            if key in testdic:
                testcounts[key]+=traindic[key]
    return (testcounts)

In [8]:
#Вероятность категории 
def Prior (categ):
    return (len(IndexSelect(categ))/len(category))

In [9]:
#Считаем количество слов и формируем словарь категории
def WordsnVoc(categ):
    totalwords=0
    voc=[]
    for i in IndexSelect(categ):
        traindic = dict(Counter(content[i].split()))
        totalwords+=len(traindic.keys())
        voc=list(set(voc+list(traindic.keys())))
    return (totalwords, voc)

In [10]:
values=[]
voc=[]
for categ in names:
    voc= list(set(WordsnVoc(categ)[1]+voc))
    values.append(WordsnVoc(categ)[0])
totalwords=dict(zip(names, values))
voclength=len(voc) 
print(voclength)  #Длина общего словаря
print(totalwords) #Число слов в каждой категории

13712
{'ham': 51321, 'spam': 13742}


In [11]:
def Main(testdic):
    probs=[]
    for categ in names:
        testcounts=CommonWords(categ, testdic)
        condprob=dict.fromkeys(testcounts, 0)
        p=1
        for word in testcounts:
            #вероятность того,что сообщение принадлежит данной категории, при условии, что в ней есть данное слово (+1 - защита от нулей в произведении) 
            condprob[word]=10000*(testcounts[word]+1)/(totalwords[categ] +voclength) 
            p*=condprob[word]  #умножаем для каждого слова в тестовом сообщении  
        p*=Prior(categ)        #и на вероятность категории
        probs.append(p)
    val,idx=max((val, idx) for (idx, val) in enumerate(probs))
    return names[idx]

In [12]:
y_pred=[]

for i in range(len(X_test)):
    testdict=dict(Counter(np.array(X_test)[i].split()))
    answer=Main(testdict)
    np.array(y_pred.append(answer))
    
from sklearn.metrics import accuracy_score
print(f"Accuracy: {accuracy_score(np.array(y_test), y_pred)}" )

Accuracy: 0.9892280071813285


In [13]:
#Сравниваем с реализацией на sklearn
from sklearn.naive_bayes import MultinomialNB
import scipy as sci
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)

X_train = sci.sparse.csr_matrix.todense(X_train_counts)
X_test = sci.sparse.csr_matrix.todense(X_test_counts)

model = MultinomialNB().fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(np.array(y_test), y_pred)}")

Accuracy: 0.9847396768402155
