### ----------- IMPORTING LIBRARIES -----------

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### ----------- IMPORTING DATASET -----------

In [34]:
# Load the SMS dataset; the relative path is resolved against the kernel's working directory.
data = pd.read_csv(filepath_or_buffer="spam.csv")

In [5]:
# Preview the first rows to check that the file parsed into the expected columns.
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### ----------- DATA EXPLORATION -----------

In [6]:
# Column dtypes and non-null counts for the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [7]:
# Missing values per column (isna performs the same check as isnull).
data.isna().sum()

Category    0
Message     0
dtype: int64

In [18]:
# Distinct labels present in the target column.
data["Category"].unique()

array(['ham', 'spam'], dtype=object)

In [16]:
# Rows per label, to see how balanced the two classes are.
data["Category"].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [27]:
# Frequency of each distinct message text; counts above 1 indicate repeated messages.
data["Message"].value_counts()

Sorry, I'll call later                                                                                                                                      30
I cant pick the phone right now. Pls send a message                                                                                                         12
Ok...                                                                                                                                                       10
Ok                                                                                                                                                           4
Ok.                                                                                                                                                          4
                                                                                                                                                            ..
I gotta collect da car at 6 lei.              

In [22]:
# Per-label summary (count, unique, top, freq) of the remaining columns.
data.groupby("Category").describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [23]:
# Number of rows that exactly repeat an earlier row.
data.duplicated().sum()
# Cross-check against the per-label summary (count - unique):
#   ham:   4825 - 4516 = 309
#   spam:   747 -  641 = 106
#   total:  309 +  106 = 415

415

#### TRANSFORMING THE "CATEGORY" COLUMN INTO A BINARY COLUMN USING "0" AND "1"

##### Method 1: direct mapping of the labels

In [53]:
# Encode the label as an integer flag with a vectorised comparison.
# NOTE(review): despite the column name, 1 marks "ham" and 0 marks every other
# label ("spam"). Downstream predictions must be read with that encoding in mind.
data["Spam"] = data["Category"].eq("ham").astype("int64")
data.head()

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",1
1,ham,Ok lar... Joking wif u oni...,1
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,0
3,ham,U dun say so early hor... U c already then say...,1
4,ham,"Nah I don't think he goes to usf, he lives aro...",1


##### Method 2: one-hot encoding with `pd.get_dummies`

In [37]:
# One-hot encode the label into one indicator column per category ("ham", "spam").
# The dtype is pinned because pandas >= 2.0 returns booleans by default; "uint8"
# keeps the 0/1 integer indicators that older pandas versions produced.
Spam = pd.get_dummies(data["Category"], dtype="uint8")
Spam.head()

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0


In [48]:
# Build the labelled frame from the one-hot "ham" indicator.
# Only the two original columns are taken from `data`, so a "Spam" column that an
# earlier cell already added to `data` does not end up duplicated in `data1`.
# The indicator is renamed before concatenation, which avoids an in-place rename.
# NOTE(review): 1 marks "ham" here despite the column being called "Spam".
data1 = pd.concat(
    (data[["Category", "Message"]], Spam["ham"].rename("Spam")),
    axis=1,
)
data1.head()

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",1
1,ham,Ok lar... Joking wif u oni...,1
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,0
3,ham,U dun say so early hor... U c already then say...,1
4,ham,"Nah I don't think he goes to usf, he lives aro...",1


### ----------- SPLITTING DATA -----------

In [28]:
#-----------
from sklearn.model_selection import train_test_split

In [114]:
# Features are the raw message texts; the target is the 0/1 label column.
X = data.Message
Y = data.Spam
# random_state makes the split (and every number computed from it) reproducible
# across runs; stratify keeps the ham/spam ratio identical in the train and test
# parts, which matters because the two classes are imbalanced.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

In [115]:
# Raw training messages as a NumPy array.
X_train.values

array(["Nope but i'll b going 2 sch on fri quite early lor cos mys sis got paper in da morn :-)",
       'WIN: We have a winner! Mr. T. Foley won an iPod! More exciting prizes soon, so keep an eye on ur mobile or visit www.win-82050.co.uk',
       "No I'm good for the movie, is it ok if I leave in an hourish?",
       ..., 'Where is that one day training:-)',
       'Where is it. Is there any opening for mca.',
       'I am going to film 2day da. At 6pm. Sorry da.'], dtype=object)

In [116]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training messages only and turn them into a
# sparse document-term count matrix.
cv = CountVectorizer()
X_train_count = cv.fit_transform(X_train.to_numpy())

In [117]:
# Densify only the first few rows for inspection: calling toarray() on the whole
# sparse matrix allocates every zero entry just to print a truncated preview.
X_train_count[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### ----------- MODEL TRAINING -----------

##### Using the Naive Bayes method

In [118]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the word counts. fit() returns the
# estimator itself, so the trailing expression displays the fitted model.
model = MultinomialNB().fit(X_train_count, Y_train)
model

MultinomialNB()

##### Testing the model

In [125]:
# Hand-written message intended to look like ordinary ("ham") text.
email_ham=["Hey wanna meet up for the game ?"]
# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# columns line up with the vocabulary the model was trained on.
email_ham_count=cv.transform(email_ham)
email_ham_count

<1x7723 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [126]:
# Under the encoding used for data["Spam"], 1 is the "ham" label.
model.predict(email_ham_count)

array([1], dtype=int64)

In [127]:
# Hand-written message intended to look like spam.
email_spam=["reward money click"]
# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# columns line up with the vocabulary the model was trained on.
email_spam_count=cv.transform(email_spam)
email_spam_count

<1x7723 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [128]:
# Under the encoding used for data["Spam"], 0 is the "spam" label.
model.predict(email_spam_count)

array([0], dtype=int64)

### ----------- MODEL TESTING -----------

In [132]:
# Vectorize the held-out messages with the vocabulary learned on the training
# set, then report the classifier's score on them.
X_test_count = cv.transform(X_test.to_numpy())
model.score(X_test_count, Y_test)

0.9802690582959641