In [1]:
from sklearn import set_config

set_config(print_changed_only=False)

In [2]:
import pandas as pd

df = pd.read_csv('https://raw.githubusercontent.com/codebasics/py/master/ML/14_naive_bayes/spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [4]:
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [5]:
df['Spam'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0)
df.sample(n=5)

Unnamed: 0,Category,Message,Spam
3469,ham,yay! finally lol. i missed our cinema trip las...,0
949,ham,Chk in ur belovd ms dict,0
3532,ham,Prepare to be pounded every night...,0
2636,ham,Tiwary to rcb.battle between bang and kochi.,0
2205,ham,Raji..pls do me a favour. Pls convey my Birthd...,0


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['Message'], df['Spam'], test_size=0.25)
len(X_train), len(X_test)

(4179, 1393)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()
X_train_vector = v.fit_transform(X_train.values)
X_train_vector.toarray()[:3]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [8]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_vector, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [9]:
model.score(v.transform(X_test), y_test)

0.9849246231155779

In [10]:
from sklearn.model_selection import cross_val_score

cross_val_score(model, v.transform(df['Message']), df['Spam'], cv=10)

array([0.97670251, 0.97849462, 0.98025135, 0.97666068, 0.97666068,
       0.98204668, 0.98204668, 0.98563734, 0.97307002, 0.98922801])

### Using Pipeline

In [11]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('mnb', MultinomialNB())
])

In [12]:
clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('mnb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [13]:
clf.score(X_test, y_test)

0.9849246231155779

In [14]:
cross_val_score(clf, df['Message'], df['Spam'])

array([0.98295964, 0.98565022, 0.98294434, 0.98294434, 0.98653501])