In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 

In [2]:
# dataframe means a table
df = pd.read_csv('data/spam.csv', encoding='utf-8')
df.head()
df.shape

(5572, 2)

In [3]:
# getting the labels counts
df['Category'].value_counts()
# df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [4]:
# updating the category column by converting it to 0 and 1
# lambda is a anonymous function, ,, 
df['spam'] = df['Category'].apply(lambda x: 1 if x=='spam' else 0)

In [5]:
# alternative way of lambda function
def get_spam_number(x):
    if x=='spam':
        return 1
    else:
        return 0

In [6]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
# splitting data into train and test
X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)
# X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2, random_state=1 , shuffle=False)

# random_state = 1 -> means the same random split will be generated each time you run the code

In [8]:
X_train.shape
# X_train.head()
X_train[:4]

2518                               Sorry, I'll call later
2717    House-Maid is the murderer, coz the man was mu...
4824                                              :-) :-)
3104    U so lousy, run already come back then half de...
Name: Message, dtype: object

In [9]:
y_test.shape
type(X_train.values)

numpy.ndarray

**Bag Of Words : Adding Count Vectorizer Matrix(Sparse Matrix)**

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
# Here coutnvectorizer is a class
# v is an object of the class
v = CountVectorizer()

# fit_transform -> fit the model and then transform it

X_train_cv = v.fit_transform(X_train.values).toarray()
X_train_cv
# type(X_train.values)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [19]:
X_train_cv.toarray()[:2]
X_train_cv.toarray()[:2][0]

X_train_cv.shape
# type(X_train_cv)

X_train_np = X_train_cv.toarray()

In [12]:
# here 7675 numbers of unique vocabulary is created
v.get_feature_names_out().shape

# getting the feature names
v.get_feature_names_out()[7372]
# dir(v)

'web2mobile'

In [13]:
v.vocabulary_

{'sorry': 6295,
 'll': 4161,
 'call': 1600,
 'later': 4032,
 'house': 3518,
 'maid': 4317,
 'is': 3750,
 'the': 6768,
 'murderer': 4645,
 'coz': 2041,
 'man': 4335,
 'was': 7337,
 'murdered': 4644,
 'on': 4922,
 'lt': 4250,
 'gt': 3267,
 'th': 6751,
 'january': 3797,
 'as': 1070,
 'public': 5468,
 'holiday': 3472,
 'all': 904,
 'govt': 3215,
 'instituitions': 3702,
 'are': 1035,
 'closed': 1845,
 'including': 3650,
 'post': 5312,
 'office': 4891,
 'understand': 7099,
 'so': 6245,
 'lousy': 4224,
 'run': 5827,
 'already': 915,
 'come': 1900,
 'back': 1180,
 'then': 6779,
 'half': 3311,
 'dead': 2190,
 'hee': 3394,
 'moji': 4550,
 'love': 4227,
 'you': 7645,
 'more': 4574,
 'than': 6752,
 'words': 7534,
 'have': 3364,
 'rich': 5755,
 'day': 2183,
 'wat': 7344,
 'doing': 2403,
 'now': 4831,
 'jetton': 3826,
 'ave': 1145,
 'if': 3602,
 'forgot': 2957,
 'oh': 4898,
 'shit': 6067,
 'thought': 6817,
 'that': 6764,
 'your': 7650,
 'trip': 6993,
 'loooooool': 4203,
 'just': 3876,
 'makes': 4327

In [14]:
X_train_np[2331]
X_train[:4]

2518                               Sorry, I'll call later
2717    House-Maid is the murderer, coz the man was mu...
4824                                              :-) :-)
3104    U so lousy, run already come back then half de...
Name: Message, dtype: object

In [15]:
np.where(X_train_np[0] != 0)

(array([1600, 4032, 4161, 6295]),)

In [23]:
X_train[:4][2518]

"Sorry, I'll call later"

**Implementing ML Model**

In [None]:
from sklearn.naive_bayes import MultinomialNB
# MultinomialNB means Naive Bayes with multiple features

model  = MultinomialNB()
model.fit(X_train_cv, y_train)

In [None]:
X_test_cv = v.transform(X_test)

**Evaluate Model Performance**

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred = model.predict(X_test_cv)
print(classification_report(y_test, y_pred))

# here we have imbalance data, cause we have more ham(not spam) than spam
# so we need to use f1 score

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       980
           1       0.95      0.90      0.93       135

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
# create train and test accuracy score
print(accuracy_score(y_train, model.predict(X_train_cv)))
print(accuracy_score(y_test, y_pred))


# confusion matrix
confusion_matrix(y_test, y_pred)

0.9930446488669509
0.9829596412556054


array([[974,   6],
       [ 13, 122]])

**Testing the Model With Real data**

In [None]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'The IRS is trying to contact you',
    'Verify your bank account',
    'Bitcoin, anyone?',
    'You have a new billing statement'
]

emails_cv = v.transform(emails)
model.predict(emails_cv)

array([0, 0, 1, 0, 1])

**We can do the overall model bulding processs in a more convenient way(easy way) by sklearn pipeline**.

Here count vectorization will be automatically done

In [None]:
from sklearn.pipeline import Pipeline
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       980
           1       0.95      0.90      0.93       135

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

