In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

# Create new column (Spam)

In [4]:
# def get_spam_number(x):
#     if x=='spam':
#         return 1
#     return 0

In [5]:
df['spam'] = df['Category'].apply(lambda x: 1 if x=='spam' else 0)

In [6]:
df.shape

(5572, 3)

In [7]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


# Preprocessing and Train Model

## Train test split

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [9]:
X_train.shape

(4457,)

In [10]:
X_test.shape

(1115,)

In [11]:
type(X_train)

pandas.core.series.Series

In [16]:
X_train[:4]

112                       Going for dinner.msg you after.
508     You should know now. So how's anthony. Are you...
1622    Living is very simple.. Loving is also simple....
3415                              No pic. Please re-send.
Name: Message, dtype: object

In [17]:
type(y_train)

pandas.core.series.Series

In [18]:
y_train[:4]

112     0
508     0
1622    0
3415    0
Name: spam, dtype: int64

In [21]:
type(X_train.values)

numpy.ndarray

## Transform train data to vector

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7847 sparse matrix of type '<class 'numpy.int64'>'
	with 59846 stored elements in Compressed Sparse Row format>

In [33]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [34]:
X_train_cv.shape

(4457, 7847)

In [38]:
v.get_feature_names_out()[1000:1050]

array(['answr', 'antelope', 'antha', 'anthony', 'anti', 'antibiotic',
       'any', 'anybody', 'anyhow', 'anymore', 'anyone', 'anyones',
       'anyplaces', 'anythiing', 'anythin', 'anything',
       'anythingtomorrow', 'anytime', 'anyway', 'anyways', 'anywhere',
       'aom', 'apart', 'apartment', 'apes', 'apeshit', 'aphex', 'apnt',
       'apo', 'apologetic', 'apologise', 'apologize', 'apology', 'app',
       'apparently', 'appeal', 'appear', 'applausestore', 'applebees',
       'apples', 'application', 'apply', 'applyed', 'applying',
       'appointment', 'appointments', 'appreciate', 'appreciated',
       'approaches', 'approaching'], dtype=object)

In [39]:
v.get_feature_names_out().shape

(7847,)

In [40]:
dir(v)

['__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_request_for_signature',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_feature_names',
 '_check_n_features',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sort_features',
 '_stop_words_id',
 '_validate_data',
 '_

In [41]:
v.vocabulary_

{'going': 3218,
 'for': 2985,
 'dinner': 2377,
 'msg': 4674,
 'you': 7809,
 'after': 863,
 'should': 6216,
 'know': 4013,
 'now': 4907,
 'so': 6377,
 'how': 3576,
 'anthony': 1003,
 'are': 1066,
 'bringing': 1540,
 'money': 4629,
 've': 7358,
 'school': 6030,
 'fees': 2862,
 'to': 7026,
 'pay': 5198,
 'and': 972,
 'rent': 5793,
 'stuff': 6632,
 'like': 4165,
 'that': 6897,
 'thats': 6900,
 'why': 7597,
 'need': 4785,
 'your': 7814,
 'help': 3456,
 'friend': 3050,
 'in': 3690,
 'living': 4211,
 'is': 3796,
 'very': 7372,
 'simple': 6263,
 'loving': 4292,
 'also': 936,
 'laughing': 4088,
 'too': 7065,
 'winning': 7628,
 'tooo': 7068,
 'but': 1606,
 'being': 1330,
 'difficult': 2361,
 'gud': 3313,
 'nte': 4915,
 'no': 4857,
 'pic': 5277,
 'please': 5327,
 're': 5674,
 'send': 6101,
 'argh': 1074,
 'the': 6901,
 'fuck': 3076,
 'nobody': 4860,
 'town': 7091,
 'got': 3247,
 'rumour': 5943,
 'buy': 1610,
 'apartment': 1023,
 'chennai': 1804,
 'reserve': 5821,
 'ticket': 6980,
 'on': 5003,
 's

In [43]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [44]:
np.where(X_train_np[0]!=0)

(array([ 863, 2377, 2985, 3218, 4674, 7809], dtype=int64),)

In [54]:
X_train[:4]

112                       Going for dinner.msg you after.
508     You should know now. So how's anthony. Are you...
1622    Living is very simple.. Loving is also simple....
3415                              No pic. Please re-send.
Name: Message, dtype: object

In [53]:
X_train_np[0][863]

1

## Train model

In [56]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [57]:
X_test_cv = v.transform(X_test)

In [60]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       991
           1       0.95      0.93      0.94       124

    accuracy                           0.99      1115
   macro avg       0.97      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [63]:
emails = [
    'Hey Mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

# Shortcut using Pipeline

In [64]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [65]:
clf.fit(X_train, y_train)

In [67]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       991
           1       0.95      0.93      0.94       124

    accuracy                           0.99      1115
   macro avg       0.97      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

