# EMAIL SPAM DETECTION WITH MACHINE LEARNING

In [2]:
import pandas as pd
# Load the raw spam dataset; the CSV is decoded explicitly as ISO-8859-1.
df = pd.read_csv(filepath_or_buffer="/content/spam.csv", encoding="ISO-8859-1")

In [3]:
# Preview the loaded frame (the rendered output abbreviates the middle rows with "...").
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [4]:
# Show the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [15]:
# Count missing values per column.
df.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [17]:
# Drop the three overflow columns, which are almost entirely null.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it once the columns are already gone no
# longer raises KeyError.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")

In [18]:
# Summary of the remaining columns; for these text columns pandas reports
# count / unique / top / freq rather than numeric statistics.
df.describe()

Unnamed: 0,v1,v2
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [19]:
# Re-check missing values per column after removing the sparse columns.
df.isna().sum()

v1    0
v2    0
dtype: int64

In [20]:
# Show the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [21]:
X: pd.Series = df["v2"]  # message text column; this is what the model is trained on

In [22]:
y: pd.Series = df["v1"]  # label column holding the "ham" / "spam" strings

In [127]:
import tensorflow as tf
import numpy as np
# Tensor copies of the text and label columns; they are only used by the
# inspection expression at the end of this cell.
X_tensor = tf.constant(X)
y_tensor = tf.constant(y)

# NumPy arrays of the same columns; X_numpy is what gets split into train/test.
X_numpy = X.to_numpy()
y_numpy = y.to_numpy()

# Inspection tuple moved to the end of the cell: as a bare expression in the
# middle of the cell it was evaluated and silently discarded, so nothing was
# ever displayed.
X_tensor.shape, y_tensor.shape, X_tensor[:10], y_tensor[:10]

In [128]:
# The labels are still strings at this point, hence the object dtype.
y.dtype

dtype('O')

In [139]:
from sklearn import preprocessing
# Encode the string labels in "v1" as integers and store them in a new
# "target" column (presumably ham -> 0 and spam -> 1, given sorted class
# order -- confirm via binarizer.classes_).
binarizer = preprocessing.LabelBinarizer()
df["target"] = binarizer.fit_transform(df["v1"].to_numpy())
y_label_binarized = df.loc[:, "target"]

In [140]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the messages for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_numpy,
    y_label_binarized,
    test_size=0.2,
    random_state=42,
)

In [141]:
# Confirm the encoded training labels are numeric.
y_train.dtype

dtype('int64')

In [142]:
# Sizes of each split, in the order (X_test, y_test, X_train, y_train).
tuple(len(split) for split in (X_test, y_test, X_train, y_train))

(1115, 1115, 4457, 4457)

In [143]:
# First ten training labels by position; the index still carries the original row numbers.
y_train.iloc[:10]

1978    0
3989    1
3935    0
4078    0
4086    1
4919    0
2268    1
4696    0
3653    0
70      0
Name: target, dtype: int64

### Converting text into numbers

In [144]:
import tensorflow as tf

# TextVectorization is available directly under tf.keras.layers in current
# TensorFlow; the experimental path is kept only as a fallback for older
# releases that do not expose the stable name.
try:
    from tensorflow.keras.layers import TextVectorization
except ImportError:
    from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Using default text vectorization params, spelled out for reference.
text_vectorizer = TextVectorization(
    max_tokens=None,  # cap on vocabulary size (the OOV token counts towards it); None means no cap
    standardize="lower_and_strip_punctuation",  # lowercase the text and remove punctuation
    split="whitespace",  # tokenize on whitespace
    ngrams=None,  # no n-gram grouping; single tokens only
    output_mode="int",  # map each token to an integer index
    output_sequence_length=None,  # no fixed length: sequences are not padded/truncated to a set size
    pad_to_max_tokens=False,
)

In [145]:
# Average number of whitespace-separated tokens per training message,
# rounded to the nearest integer.
round(sum(len(message.split()) for message in X_train) / len(X_train))

15

In [146]:
# Text vectorization settings
max_vocab_length = 10000  # upper bound on the number of distinct tokens kept in the vocabulary
max_length = 15  # every message is padded or truncated to this many tokens

text_vectorizer = TextVectorization(
    output_sequence_length=max_length,
    max_tokens=max_vocab_length,
    output_mode="int",
)

In [147]:
# Build the vocabulary from the training messages only.
text_vectorizer.adapt(X_train)

### Creating an embedding layer

In [148]:
# Lookup table that maps each integer token id to a dense vector.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=max_vocab_length,  # number of token ids the table can index (the vocabulary cap)
    output_dim=128,  # size of each embedding vector
    embeddings_initializer="uniform",
    input_length=max_length,  # number of tokens in each input sequence
)

### Creating model

In [149]:
# Assemble the classifier with the Keras functional API:
# raw string -> token ids -> token embeddings -> averaged vector -> sigmoid score.

import tensorflow as tf

# One raw string per example.
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)

# Turn the text into integer token ids, then into one embedding vector per token.
token_ids = text_vectorizer(inputs)
token_embeddings = embedding_layer(token_ids)

# Condense the per-token vectors into a single vector per message by averaging.
# Alternative noted by the author: tf.keras.layers.GlobalMaxPool1D() takes the
# max feature weights instead of averaging them, and significantly improved the
# model accuracy compared to the 1-D average pooling layer.
pooled = tf.keras.layers.GlobalAveragePooling1D()(token_embeddings)

# Single sigmoid unit, since the task is binary classification.
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(pooled)

model = tf.keras.Model(inputs, outputs, name="model")


In [153]:
# Binary cross-entropy pairs with the single sigmoid output unit; Adam is used
# with its default settings and accuracy is tracked as the reported metric.
model.compile(loss = "binary_crossentropy",
              optimizer = tf.keras.optimizers.Adam(),
              metrics = ["accuracy"])

In [154]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_7 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding_3 (Embedding)     (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d_4   (None, 128)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_9 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
Non-train

In [155]:
# Train for 20 epochs; the 1-D array of messages gets a trailing axis so each
# example matches the model's (1,) string input.
model.fit(
    x=tf.expand_dims(X_train, axis=1),
    y=y_train,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f334919fa30>

### Evaluating on Test Data

In [156]:
# Score the held-out split (returns [loss, accuracy]). The messages get the
# same trailing axis that was used for training, so the input matches the
# model's declared (1,) string shape instead of relying on Keras to reshape a
# 1-D array implicitly.
model.evaluate(tf.expand_dims(X_test, axis=1), y_test)



[0.09170356392860413, 0.9820627570152283]