# PROBLEM STATEMENT

- The SMS Spam Collection is a set of tagged SMS messages that have been collected for SMS spam research. It contains one set of 5,574 SMS messages in English, each tagged as either ham (legitimate) or spam.

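- The files contain one message per line. Each line is composed of two columns: v1 contains the label (ham or spam) and v2 contains the raw text.

- Note: the code in this notebook loads `dataset/emails.csv` and reads the message from a `text` column and the label from a numeric `spam` column (0 = ham, 1 = spam), rather than the `v1`/`v2` columns described above.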


![image.png](attachment:image.png)

# STEP #0: LIBRARIES IMPORT


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# STEP #1: IMPORT DATASET

In [None]:
# Load the labelled messages from the CSV file into a DataFrame.
spam_df = pd.read_csv("dataset/emails.csv")

In [None]:
# Preview the first 10 rows.
spam_df.head(10)

In [None]:
# Preview the last rows (pandas' default row count).
spam_df.tail()

In [None]:
# Summary statistics of the DataFrame's columns.
spam_df.describe()

In [None]:
# Column names, dtypes and non-null counts.
spam_df.info()

# STEP #2: VISUALIZE DATASET

In [None]:
ham = spam_df[spam_df['spam']==0]

In [None]:
spam = spam_df[spam_df['spam']==1]

In [None]:
ham

In [None]:
spam

In [None]:
print( 'Spam percentage =', (len(spam) / len(spam_df) )*100,"%")

In [None]:
print( 'Ham percentage =', (len(ham) / len(spam_df) )*100,"%")

In [None]:
# Bar chart of how many messages fall in each class (0 = ham, 1 = spam).
# The column is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the variable to count, so the original
# positional call no longer plots the class counts.
sns.countplot(x='spam', data=spam_df)

# STEP #3: CREATE TESTING AND TRAINING DATASET/DATA CLEANING

# COUNT VECTORIZER EXAMPLE 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Four toy sentences used to illustrate the bag-of-words encoding.
sample_data = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Learn the vocabulary from the sentences and build their word-count matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sample_data)


In [None]:
# Vocabulary learned by the vectorizer, in the column order of the count matrix.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
print(vectorizer.get_feature_names_out())


In [None]:
# Dense view of the toy count matrix: one row per sentence, one column per vocabulary word.
print(X.toarray())  

# LET'S APPLY COUNT VECTORIZER TO OUR SPAM/HAM EXAMPLE

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Rebinds `vectorizer` (previously fitted on the toy sentences) to a fresh
# instance, then fits it on the `text` column of the real dataset.
vectorizer = CountVectorizer()
# Word-count matrix with one row per message in spam_df.
spamham_countvectorizer = vectorizer.fit_transform(spam_df['text'])


In [None]:
# Vocabulary learned from the dataset's `text` column.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
print(vectorizer.get_feature_names_out())


In [None]:
# NOTE(review): toarray() builds a dense copy of the whole count matrix, which can
# be memory-hungry when the vocabulary is large — confirm this is acceptable here.
print(spamham_countvectorizer.toarray())  

In [None]:
# Matrix dimensions: (number of messages, vocabulary size).
spamham_countvectorizer.shape

# STEP#4: TRAINING THE MODEL WITH ALL DATASET

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Target vector: the 0/1 `spam` flag of every message.
label = spam_df['spam'].values

# Fit a multinomial Naive Bayes model on the word counts of the whole dataset.
NB_classifier = MultinomialNB()
NB_classifier.fit(X=spamham_countvectorizer, y=label)

In [None]:
testing_sample = ['Free money!!!', "Hi Kim, Please let me know if you need any further information. Thanks"]
testing_sample_countvectorizer = vectorizer.transform(testing_sample)


In [None]:
test_predict = NB_classifier.predict(testing_sample_countvectorizer)
test_predict

In [None]:
# Mini Challenge!
testing_sample = ['Hello, I am Ryan, I would like to book a hotel in Bali by January 24th', 'money viagara!!!!!']


In [None]:
testing_sample = ['money viagara!!!!!', "Hello, I am Ryan, I would like to book a hotel in SF by January 24th"]
testing_sample_countvectorizer = vectorizer.transform(testing_sample)
test_predict = NB_classifier.predict(testing_sample_countvectorizer)
test_predict

# STEP#4: DIVIDE THE DATA INTO TRAINING AND TESTING PRIOR TO TRAINING

In [None]:
# Features: the word-count matrix of every message (rebinds the toy-example X).
X = spamham_countvectorizer
# Target: the 0/1 spam labels extracted from spam_df['spam'].
y = label

In [None]:
# Dimensions of the feature matrix.
X.shape

In [None]:
# Dimensions of the label vector.
y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for testing. A fixed random_state makes the
# shuffle, and therefore every metric computed from this split, reproducible.
# NOTE(review): the vectorizer was fitted on the full dataset before this split,
# so the vocabulary was built using the test messages as well.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Rebind NB_classifier to a fresh model trained on the training split only
# (the earlier instance was fitted on the whole dataset).
NB_classifier = MultinomialNB()
NB_classifier.fit(X_train, y_train)

In [None]:
# Alternative classifier, left disabled.
# NOTE(review): GaussianNB presumably rejects sparse input, so X_train would need
# to be densified (e.g. X_train.toarray()) before enabling this — confirm.
# from sklearn.naive_bayes import GaussianNB 
# NB_classifier = GaussianNB()
# NB_classifier.fit(X_train, y_train)

# STEP#5: EVALUATING THE MODEL 

In [None]:
from sklearn.metrics import classification_report, confusion_matrix


In [None]:
# Confusion matrix on the training split (rows: true labels, columns: predictions).
y_predict_train = NB_classifier.predict(X_train)
cm = confusion_matrix(y_train, y_predict_train)
# fmt='d' prints the integer counts in full; the default annotation format
# abbreviates large counts into scientific notation.
sns.heatmap(cm, annot=True, fmt='d')

In [None]:
# Predicting the Test set results
y_predict_test = NB_classifier.predict(X_test)
# Confusion matrix on the held-out split (rows: true labels, columns: predictions).
cm = confusion_matrix(y_test, y_predict_test)
# fmt='d' prints the integer counts in full; the default annotation format
# abbreviates large counts into scientific notation.
sns.heatmap(cm, annot=True, fmt='d')

In [None]:
# Text report of per-class classification metrics for the test-split predictions.
print(classification_report(y_test, y_predict_test))

# EXCELLENT JOB! YOU ARE NOW FAMILIAR WITH NAIVE BAYES, GREAT JOB!