In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 

In [2]:
# Load the labelled messages into a DataFrame (a table of rows and columns).
df = pd.read_csv('data/spam.csv', encoding='utf-8')
# A cell displays only its last expression, so report just the
# (rows, columns) shape here; the table itself is previewed in a later cell.
df.shape

(5572, 2)

In [3]:
# Count how many messages carry each label to check the class balance
df['Category'].value_counts()
# equivalent attribute-style access: df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [4]:
# Derive a numeric target column from the text label: 1 for 'spam', 0 otherwise.
# The lambda is an anonymous (unnamed) function applied to every Category value.
df['spam'] = df['Category'].apply(lambda label: int(label == 'spam'))

In [5]:
# alternative way of lambda function
def get_spam_number(x):
    """Return 1 when ``x`` equals the label 'spam', otherwise 0.

    Named-function equivalent of a ``lambda x: 1 if x=='spam' else 0``.
    """
    return 1 if x == 'spam' else 0

In [6]:
# Preview the first rows; the table now includes the derived 0/1 `spam` column
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
# Split the messages (X) and their 0/1 spam labels (y) into 80% train / 20% test.
# random_state pins the shuffle, so the same split -- and therefore the same
# vocabulary, metrics and example rows -- is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df.Message, df.spam, test_size=0.2, random_state=1
)

In [8]:
# Not displayed: a cell shows only its last expression
X_train.shape
# X_train.head()
# First four training messages; the index keeps the row labels from df
X_train[:4]

4833    Rats. Hey did u ever vote for the next themes?
3606                   Jordan got voted out last nite!
4389         see you then, we're all christmassy here!
1191                                     We're done...
Name: Message, dtype: object

In [9]:
# Not displayed: a cell shows only its last expression
y_test.shape
# .values exposes the Series' data as a NumPy array (the form passed to the vectorizer)
type(X_train.values)

numpy.ndarray

**Bag Of Words : Adding Count Vectorizer Matrix(Sparse Matrix)**

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer is a class;
# v is an instance (object) of that class
v = CountVectorizer()

# fit_transform -> learn the vocabulary from the training messages (fit), then
# convert those messages into a sparse matrix of token counts (transform)

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7790 sparse matrix of type '<class 'numpy.int64'>'
	with 59331 stored elements in Compressed Sparse Row format>

In [11]:
# Densify the sparse count matrix exactly once and keep the result.
# Each toarray() call materialises the full documents x vocabulary array, so
# repeating it only to throw the value away wastes time and memory.
# Inspect slices of X_train_np (e.g. X_train_np[:2]) instead of re-converting.
X_train_np = X_train_cv.toarray()

In [33]:
# Number of distinct tokens (vocabulary size) learned from the training split.
# NOTE: this value is not displayed (only the last expression of a cell is), and it
# depends on which rows land in the training split (the recorded matrix has 7790 columns).
v.get_feature_names_out().shape

# Look up the token stored at a given column index.
# NOTE(review): 7372 is hard-coded from one particular run (where it is 'vote');
# a different split will map this index to a different token.
v.get_feature_names_out()[7372]
# dir(v)

'vote'

In [13]:
# Mapping from each learned token to its column index in the count matrix
v.vocabulary_

{'rats': 5635,
 'hey': 3470,
 'did': 2331,
 'ever': 2703,
 'vote': 7372,
 'for': 2982,
 'the': 6866,
 'next': 4819,
 'themes': 6876,
 'jordan': 3903,
 'got': 3237,
 'voted': 7373,
 'out': 5065,
 'last': 4076,
 'nite': 4843,
 'see': 6042,
 'you': 7751,
 'then': 6878,
 'we': 7463,
 're': 5646,
 'all': 944,
 'christmassy': 1846,
 'here': 3463,
 'done': 2440,
 'promotion': 5509,
 'number': 4914,
 '8714714': 722,
 'ur': 7251,
 'awarded': 1201,
 'city': 1853,
 'break': 1527,
 'and': 997,
 'could': 2051,
 'win': 7572,
 '200': 359,
 'summer': 6652,
 'shopping': 6165,
 'spree': 6465,
 'every': 2704,
 'wk': 7603,
 'txt': 7144,
 'store': 6552,
 'to': 6982,
 '88039': 727,
 'skilgme': 6255,
 'tscs087147403231winawk': 7114,
 'age16': 893,
 '50perwksub': 568,
 'had': 3336,
 'it': 3809,
 'already': 956,
 'sabarish': 5921,
 'asked': 1129,
 'me': 4469,
 'go': 3197,
 'purity': 5557,
 'of': 4950,
 'friendship': 3045,
 'between': 1369,
 'two': 7142,
 'is': 3797,
 'not': 4885,
 'about': 785,
 'smiling': 630

In [22]:
# NOTE(review): this selects ROW 2331 of the dense matrix and the result is discarded
# (it is not the cell's last expression). 2331 is also the column index of 'did' in the
# recorded vocabulary, so a column lookup such as X_train_np[0][2331] may have been
# intended -- confirm.
X_train_np[2331]
# First four training messages, for comparison with the vocabulary indices
X_train[:4]

4833    Rats. Hey did u ever vote for the next themes?
3606                   Jordan got voted out last nite!
4389         see you then, we're all christmassy here!
1191                                     We're done...
Name: Message, dtype: object

In [31]:
# Column indices with a non-zero count in the first training document, i.e. the
# vocabulary indices of the tokens that occur in that message
np.where(X_train_np[0] != 0)

(array([2331, 2703, 2982, 3470, 4819, 5635, 6866, 6876, 7372]),)

In [29]:
# Show the first training message by position. Row labels (such as 4833) come
# from the shuffled split and differ between runs, so looking a message up by a
# hard-coded label raises KeyError whenever that row is not in the slice.
X_train.iloc[0]

'Rats. Hey did u ever vote for the next themes?'

**Implementing ML Model**

In [34]:
from sklearn.naive_bayes import MultinomialNB

# MultinomialNB is the Naive Bayes variant for discrete count features,
# which is what the bag-of-words matrix contains (word counts per message)
model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [35]:
# Vectorize the test messages with the vocabulary already fitted on the training
# data (transform only, no refit) so the columns line up with X_train_cv
X_test_cv = v.transform(X_test)

**Evaluate Model Performance**

In [37]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict labels for the held-out test messages and print per-class metrics
y_pred = model.predict(X_test_cv)
print(classification_report(y_test, y_pred))

# The data is imbalanced: there are far more ham (not spam) messages than spam,
# so accuracy alone is misleading -- judge the model by the spam-class F1 score

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       980
           1       0.95      0.90      0.93       135

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [47]:
# Accuracy on the data the model was fitted on vs. the held-out test split;
# a large gap between the two would indicate overfitting
train_accuracy = accuracy_score(y_train, model.predict(X_train_cv))
test_accuracy = accuracy_score(y_test, y_pred)
print(train_accuracy)
print(test_accuracy)

# Confusion matrix: rows are the true labels, columns the predicted labels
confusion_matrix(y_test, y_pred)


0.9930446488669509
0.9829596412556054


array([[974,   6],
       [ 13, 122]])

**Testing the Model With Real data**

In [45]:
# Hand-written messages used to sanity-check the fitted model on unseen text
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'The IRS is trying to contact you',
    'Verify your bank account',
    'Bitcoin, anyone?',
    'You have a new billing statement'
]

# Reuse the fitted vectorizer so the new texts map onto the training vocabulary
emails_cv = v.transform(emails)
# One predicted label per message: 1 = spam, 0 = not spam
model.predict(emails_cv)

array([0, 0, 1, 0, 1])

**We can do the overall model-building process in a more convenient (easier) way with an sklearn Pipeline.**

Here, count vectorization is done automatically as the first step of the pipeline.

In [49]:
from sklearn.pipeline import Pipeline

# Chain vectorization and classification so raw text goes in and labels come out;
# fitting the pipeline fits the vectorizer first, then the classifier on its output
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
])

clf.fit(X_train, y_train)

In [51]:
# Predict straight from the raw test messages; the pipeline vectorizes internally.
# NOTE: this reassigns y_pred, replacing the predictions from the manual workflow.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       980
           1       0.95      0.90      0.93       135

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

