# Spam Checker

Dataset source (with thanks to the authors): [spam.csv from codebasics/py](https://github.com/codebasics/py/blob/master/ML/14_naive_bayes/spam.csv)

## Import Necessary Libraries

In [2]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

## Data Preparation

In [3]:
# Load the SMS dataset; each row carries a Category label and the Message text.
dataframe = pd.read_csv(filepath_or_buffer='../data/spam.csv')
dataframe

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


## Turn Category into Numeric Data

In [4]:
# Encode the string labels as integer codes. Categories inferred from strings
# are sorted, so 'ham' maps to 0 and 'spam' maps to 1.
label_codes = dataframe['Category'].astype('category').cat.codes
dataframe['Category'] = label_codes
dataframe

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


## Assigning X and y

In [5]:
# Features: the raw message text (vectorized later inside the pipeline).
X = dataframe.loc[:, 'Message']
X.head(5)

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

In [6]:
# Target: the integer-encoded label column.
y = dataframe.loc[:, 'Category']
y.head()

0    0
1    0
2    1
3    0
4    0
Name: Category, dtype: int8

## Splitting the Data into Training and Test Data

In [7]:
# Hold out 20% of the messages for final evaluation; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Cross-Validation and Pipeline Making

In [8]:
# CREATE A PIPELINE
def create_pipeline(naive_bayes):
    """Build an unfitted text-classification pipeline around a classifier.

    Parameters
    ----------
    naive_bayes : estimator
        Classifier instance used as the final ``'classifier'`` step.

    Returns
    -------
    Pipeline
        A two-step pipeline: a ``CountVectorizer`` named ``'vectorizer'``
        followed by the supplied classifier.
    """
    steps = [
        ('vectorizer', CountVectorizer()),
        ('classifier', naive_bayes),
    ]
    return Pipeline(steps)

In [9]:
# Candidate Naive Bayes variants to compare.
models = {
    'Bernoulli': BernoulliNB(),
    'Multinomial': MultinomialNB(),
}

# Score each candidate with cross_val_score (default settings) on the training
# split only, so the test split stays untouched for the final evaluation.
for name, model in models.items():
    pipeline = create_pipeline(model)
    scores = cross_val_score(pipeline, X_train, y_train)
    print(f'{name}: {scores.mean()}')

Bernoulli: 0.9732992606684684
Multinomial: 0.9831712742774028


## Model Creation

In [10]:
# Multinomial NB had the higher mean cross-validation score in the comparison
# above, so it is used for the final model.
chosen_classifier = MultinomialNB()

In [11]:
# Build the final vectorizer + classifier pipeline and fit it on the training split.
pipeline = create_pipeline(chosen_classifier)
pipeline.fit(X=X_train, y=y_train)

## Prediction

In [12]:
# Predict labels for the held-out messages and show the first five predictions.
y_pred = pipeline.predict(X=X_test)
y_pred[0:5]

array([0, 0, 0, 0, 0], dtype=int8)

## Evaluation

In [20]:
# Overall accuracy plus per-class precision / recall / F1 on the held-out test split.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Use ':.2f' with no space flag: a space in the spec (' .2f') reserves a sign
# column and prints an extra blank before positive numbers.
print(f'Score: {accuracy * 100:.2f}')
print()

# Print the report on its own line. Its first row is the column header, so
# prefixing it with a label would push the header out of alignment with the table.
print('Classification Report:')
print(report)

Score:  99.19

Classification Report:               precision    recall  f1-score   support

           0       0.99      1.00      1.00       966
           1       1.00      0.94      0.97       149

    accuracy                           0.99      1115
   macro avg       1.00      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



## Save the Model

In [14]:
# Persist the fitted pipeline (vectorizer and classifier together) so it can be
# reused later without retraining.
with open('../model/spam_checker.pkl', mode='wb') as model_file:
    pickle.dump(pipeline, model_file)

In [15]:
# Load the fitted model back from disk (pickle is already imported in the
# imports cell at the top of the notebook).
# NOTE: pickle.load can execute arbitrary code during deserialization; only
# load model files that you created yourself or otherwise trust.
with open('../model/spam_checker.pkl', 'rb') as f:
    pipeline = pickle.load(f)
