In [1]:
import numpy as np
import pandas as pd

In [33]:
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix

# sklearn naive_bayes

In [22]:
# Load the SMS corpus: the file is tab-separated and read without a header row,
# so the two column names are supplied explicitly.
message = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=["label", "message"],
)
message.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [24]:
# Per-label summary of the message text (count, unique, top, freq)
message.groupby('label').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [8]:
def text_process(mess):
    """Tokenize a raw message into lowercase words without punctuation or stopwords.

    Steps:
      1. Drop every character that appears in ``string.punctuation``.
      2. Split the remaining text on whitespace.
      3. Lowercase each word and discard those found in NLTK's English
         stopword list.

    Parameters
    ----------
    mess : str
        Raw message text.

    Returns
    -------
    list of str
        The surviving lowercase tokens, in their original order.
    """
    # Build the stopword set once per call instead of once per word: the lookup
    # list is the same for every token, and set membership avoids a linear scan.
    stop_words = set(stopwords.words('english'))
    nopunc = ''.join(char for char in mess if char not in string.punctuation)
    lowered = (word.lower() for word in nopunc.split())
    return [word for word in lowered if word not in stop_words]

In [31]:
# Sanity-check the tokenizer on the first five messages
message['message'].head(5).apply(text_process)

0    [go, jurong, point, crazy, available, bugis, n...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, 2, wkly, comp, win, fa, cup, fin...
3        [u, dun, say, early, hor, u, c, already, say]
4    [nah, dont, think, goes, usf, lives, around, t...
Name: message, dtype: object

In [26]:
# Learn the bag-of-words vocabulary with text_process as the analyzer, then print its size.
# NOTE(review): the vocabulary is fit on every message, before the train/test split
# performed further down — confirm that using held-out text here is intended.
bow_transformer = CountVectorizer(analyzer=text_process).fit(message['message'])
print(len(bow_transformer.vocabulary_))

9530


In [27]:
# Turn every message into token counts, then re-weight the counts with TF-IDF.
# NOTE(review): the TF-IDF transformer is fit on the full corpus, so rows that later
# end up in the test split contribute to its statistics — confirm this is acceptable.
messages_bow = bow_transformer.transform(message['message'])
tfidf_transformer=TfidfTransformer().fit(messages_bow)
messages_tfidf=tfidf_transformer.transform(messages_bow)
print(messages_tfidf.shape)

(5572, 9530)


In [28]:
# Hold out 20% of the messages for evaluation. The fixed seed makes the split,
# and therefore every metric computed from it, reproducible across runs.
msg_train, msg_test, label_train, label_test = train_test_split(
    messages_tfidf, message['label'], test_size=0.2, random_state=42
)

In [29]:
# Fit multinomial Naive Bayes on the training features, then predict the held-out labels
spam_detect_model = MultinomialNB().fit(msg_train, label_train)
predictions = spam_detect_model.predict(msg_test)

In [30]:
# Classification report and confusion matrix for the sklearn model on its held-out split
print(classification_report(label_test, predictions))
print(confusion_matrix(label_test, predictions))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       971
        spam       1.00      0.76      0.86       144

   micro avg       0.97      0.97      0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115

[[971   0]
 [ 35 109]]


# Naive Bayes "by hand"

In [39]:
# Reload the corpus for the hand-written classifier and encode the labels
# numerically: ham -> 0, spam -> 1.
df = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=["label", "message"],
)
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [40]:
# Bag-of-words vocabulary: lowercased tokens with sklearn's built-in English stop words removed.
# `lowercase` is passed as a real bool: recent scikit-learn releases validate parameter
# types and reject the integer 1.
# min_df is a float, i.e. a proportion of documents, and is kept unchanged.
# NOTE(review): the vocabulary is fit on all of df, including the rows that are later
# held out as the test set — confirm this is intended.
c_vec = CountVectorizer(lowercase=True, min_df=.00001, stop_words='english')
c_vec.fit(df['message'].values)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=1, max_df=1.0, max_features=None, min_df=1e-05,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [41]:
# Positional 80/20 split (no shuffling): the first 80% of rows train the model and the
# remainder is held out. The boundary is derived from the frame length instead of a
# hard-coded row number, so it stays correct if the corpus changes.
TRAIN_FRACTION = 0.8
n_train = int(len(df) * TRAIN_FRACTION)
train_df = df.iloc[:n_train]
# reset_index returns a new frame with a fresh 0..n-1 index instead of mutating a slice
test_df = df.iloc[n_train:].reset_index(drop=True)
Y_train = train_df['label'].values
Y_test = test_df['label'].values

In [42]:
def prob_y(Y_train, num_class=2):
    """Estimate the class priors P(y) as relative label frequencies.

    Parameters
    ----------
    Y_train : array-like of int
        Class label of every training document. Each label is used as an
        index into a length-``num_class`` count vector.
    num_class : int, default 2
        Number of classes.

    Returns
    -------
    numpy.ndarray of shape (num_class,)
        ``p_y[k]`` is the fraction of training labels equal to ``k``.

    Raises
    ------
    ValueError
        If ``Y_train`` is empty (the priors would be 0/0).
    IndexError
        If a label does not index into the count vector.
    """
    labels = np.asarray(Y_train)
    if labels.shape[0] == 0:
        raise ValueError("Y_train is empty; class priors are undefined")
    n_y = np.zeros([num_class,])
    # Unbuffered in-place add, so every repeated label contributes its own count
    np.add.at(n_y, labels, 1)
    return n_y / labels.shape[0]

In [43]:
p_y = prob_y(Y_train)
p_y
# prior probability of ham / spam in the training set

array([0.86493157, 0.13506843])

In [44]:
def prob_xy(c_vec, train_df, Y_train, num_class=2):
    """Estimate per-class word likelihoods P(word | class) with add-one smoothing.

    For every class found in ``Y_train`` the training messages carrying that
    label are vectorized with ``c_vec``. Each word's likelihood is
    ``(count of word in class + 1) / (total words in class + vocabulary size)``.

    Parameters
    ----------
    c_vec : fitted vectorizer exposing ``vocabulary_`` and ``transform``
    train_df : DataFrame with ``label`` and ``message`` columns
    Y_train : array of class labels; only its unique values are used
    num_class : int, default 2
        Number of rows in the returned matrix.

    Returns
    -------
    numpy.ndarray of shape (num_class, vocabulary size)
        Rows for classes that never occur in ``Y_train`` stay all-zero.
    """
    vocab_size = len(c_vec.vocabulary_)
    likelihoods = np.zeros([num_class, vocab_size])
    # Each class denominator starts at |V| and then gains that class's total word count
    denominators = np.zeros([num_class,]) + vocab_size
    for cls in np.unique(Y_train):
        in_class = train_df['label'] == cls
        counts = c_vec.transform(train_df[in_class]['message'].values)
        smoothed_counts = np.sum(counts, axis=0) + 1
        denominators[cls] += np.sum(counts)
        likelihoods[cls] = smoothed_counts / denominators[cls]
    return likelihoods

In [45]:
# Per-class word likelihoods: one row per class, one column per vocabulary term
p_xy = prob_xy(c_vec, train_df, Y_train, 2)
p_xy

array([[2.80938334e-05, 2.80938334e-05, 5.61876668e-05, ...,
        5.61876668e-05, 2.80938334e-05, 2.80938334e-05],
       [6.11892974e-04, 1.44629248e-03, 5.56266340e-05, ...,
        5.56266340e-05, 1.11253268e-04, 5.56266340e-05]])

In [46]:
def classify(c_vec, test_df, p_xy, p_y, num_class=2):
    """Predict a class for every message in ``test_df`` with multinomial Naive Bayes.

    The score of class ``k`` for a document is computed in log space::

        log p_y[k] + sum_w count(w) * log p_xy[k, w]

    and the class with the highest score is chosen. Working with logs avoids
    the floating-point underflow of multiplying many small probabilities,
    which would make every class score 0 for long messages.

    Parameters
    ----------
    c_vec : fitted vectorizer whose ``transform`` returns a sparse count matrix
    test_df : DataFrame with a ``message`` column
    p_xy : array of shape (num_class, vocabulary size), word likelihoods per class
    p_y : array of shape (num_class,), class priors
    num_class : int, default 2

    Returns
    -------
    list
        Predicted class index for each row of ``test_df``, in order.
    """
    p_xy = np.asarray(p_xy, dtype=float)
    p_y = np.asarray(p_y, dtype=float)
    # log(0) = -inf is the intended score for a zero probability, so silence the warning
    with np.errstate(divide='ignore'):
        log_p_xy = np.log(p_xy[:num_class])
        log_p_y = np.log(p_y[:num_class])

    pred = []
    for doc in test_df['message'].values:
        counts = np.asarray(c_vec.transform([doc]).todense()).ravel()
        # Only words that occur contribute to the score; skipping zero counts
        # also avoids 0 * -inf = nan when a likelihood is exactly zero.
        present = np.flatnonzero(counts)
        scores = log_p_y + (log_p_xy[:, present] * counts[present]).sum(axis=1)
        pred.append(np.argmax(scores))
    return pred

In [47]:
# Predict a label for every held-out message with the hand-written classifier
pred = classify(c_vec, test_df, p_xy, p_y, num_class=2)

In [48]:
print(classification_report(Y_test, pred))
print(confusion_matrix(Y_test, pred))
# Metrics for the hand-written Naive Bayes predictions on test_df

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       970
           1       0.90      0.97      0.93       145

   micro avg       0.98      0.98      0.98      1115
   macro avg       0.95      0.97      0.96      1115
weighted avg       0.98      0.98      0.98      1115

[[954  16]
 [  5 140]]


#### а это из sklearn

In [49]:
# Re-print the sklearn model's metrics next to the hand-written model's for comparison.
# NOTE(review): label_test/predictions come from the random train_test_split, while the
# hand-written model was scored on the trailing rows of the file (test_df). The two test
# sets differ, so the numbers are not directly comparable.
print(classification_report(label_test, predictions))
print(confusion_matrix(label_test, predictions))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       971
        spam       1.00      0.76      0.86       144

   micro avg       0.97      0.97      0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115

[[971   0]
 [ 35 109]]
