### Problem Statement : Build a spam classifier - primarily a text classification problem

* Text Classification is a NLP problem


* NLP - Natural Language Processing

#### Import Libraries

In [1]:
import pandas as pd
import numpy as np

#### Import Data

In [2]:
df = pd.read_csv('dataset/emails.csv')

In [3]:
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [4]:
df['text'][0]

"Subject: naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within three business days . affordability : your  ma

In [5]:
df['spam'].unique()

array([1, 0])

In [8]:
print(f'% of Ham : {df["spam"].value_counts()[0]/len(df)*100}%')
print(f'% of Spam : {df["spam"].value_counts()[1]/len(df)*100}%')

% of Ham : 76.11731843575419%
% of Spam : 23.88268156424581%


##### anomaly

### Vectorization or Featurization

#### Count Vector - bag of words

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

stringList = ['This is the first sentence.',
              'And this the second.',
              'How many sentences do I have?',
              'This is yet another sentence which is third sentence.']

In [10]:
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(stringList)

In [11]:
X

<4x16 sparse matrix of type '<class 'numpy.int64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [12]:
X.toarray()

array([[0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0],
       [0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 1, 1, 1, 1]])

In [13]:
vectorizer.get_feature_names()

['and',
 'another',
 'do',
 'first',
 'have',
 'how',
 'is',
 'many',
 'second',
 'sentence',
 'sentences',
 'the',
 'third',
 'this',
 'which',
 'yet']

* **'This is the first sentence.'**
    * [0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0]
    * [0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0]



* **'This is yet another sentence which is third sentence.'**

    * [0, 1, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 1, 1, 1, 1]
    * [0, 1, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 1, 1, 1, 1]

### Applying count vectorizer to our email dataset

In [14]:
spm_cv = vectorizer.fit_transform(df['text'])

In [15]:
spm_cv.shape

(5728, 37303)

In [16]:
spm_cv

<5728x37303 sparse matrix of type '<class 'numpy.int64'>'
	with 708380 stored elements in Compressed Sparse Row format>

In [17]:
df.shape

(5728, 2)

In [18]:
vectorizer.get_feature_names()

['00',
 '000',
 '0000',
 '000000',
 '00000000',
 '0000000000',
 '000000000003619',
 '000000000003991',
 '000000000003997',
 '000000000005168',
 '000000000005409',
 '000000000005411',
 '000000000005412',
 '000000000005413',
 '000000000005820',
 '000000000006238',
 '000000000006452',
 '000000000007494',
 '000000000007498',
 '000000000007876',
 '000000000010552',
 '000000000011185',
 '000000000012677',
 '000000000012734',
 '000000000012735',
 '000000000012736',
 '000000000012738',
 '000000000012741',
 '000000000012987',
 '000000000013085',
 '000000000013287',
 '000000000015384',
 '000000000015793',
 '000000000023619',
 '000000000024099',
 '000000000025307',
 '000000000025312',
 '000010220',
 '0000102317',
 '0000102374',
 '0000102789',
 '0000104281',
 '0000104282',
 '0000104486',
 '0000104631',
 '0000104730',
 '0000104776',
 '0000104778',
 '0000107043',
 '0000108729',
 '000066',
 '0001',
 '000166',
 '0002',
 '000202',
 '0003',
 '0004',
 '0005',
 '0006',
 '00076',
 '0009249480',
 '000924948

#### Train Algo on Dataset

In [19]:
from sklearn.naive_bayes import MultinomialNB

nbClx = MultinomialNB()

labels = df['spam'].values

In [20]:
nbClx.fit(spm_cv,labels)

MultinomialNB()

In [27]:
test = ['Free Money !!!',
        'You have earned $1289323498234 in your account',
        'Machine Learning is a great subject but tech folks also need to learn engineering.',
        'Hey ! Earn money while sitting in comfort of your home.']

In [28]:
test_cv = vectorizer.transform(test)

In [29]:
test_cv

<4x37303 sparse matrix of type '<class 'numpy.int64'>'
	with 31 stored elements in Compressed Sparse Row format>

In [30]:
test_pred = nbClx.predict(test_cv)

In [31]:
test_pred

array([1, 1, 0, 1])

#### Training model by splitting

In [32]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(spm_cv,
                                                    labels,
                                                    test_size=0.2,
                                                    random_state=42)

In [33]:
nbClx2 = MultinomialNB()

nbClx2.fit(X_train,y_train)

MultinomialNB()

#### Evaluate the model

In [35]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred_train = nbClx2.predict(X_train)

print(f'Train Accuracy Score :\n {accuracy_score(y_train,y_pred_train)}')
print(f'\n\nConfusion Matrix :\n {confusion_matrix(y_train,y_pred_train)}')
print(f'\n\nClassification Report :\n {classification_report(y_train,y_pred_train)}')


Train Accuracy Score :
 0.9975993016150153


Confusion Matrix :
 [[3494   10]
 [   1 1077]]


Classification Report :
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3504
           1       0.99      1.00      0.99      1078

    accuracy                           1.00      4582
   macro avg       1.00      1.00      1.00      4582
weighted avg       1.00      1.00      1.00      4582



In [37]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred_test = nbClx2.predict(X_test)

print(f'Test Accuracy Score :\n {accuracy_score(y_test,y_pred_test)}')
print(f'\n\nConfusion Matrix :\n {confusion_matrix(y_test,y_pred_test)}')
print(f'\n\nClassification Report :\n {classification_report(y_test,y_pred_test)}')


Test Accuracy Score :
 0.9912739965095986


Confusion Matrix :
 [[848   8]
 [  2 288]]


Classification Report :
               precision    recall  f1-score   support

           0       1.00      0.99      0.99       856
           1       0.97      0.99      0.98       290

    accuracy                           0.99      1146
   macro avg       0.99      0.99      0.99      1146
weighted avg       0.99      0.99      0.99      1146



# Great Job !