In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Embedding
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

In [None]:
from google.colab import files
files.upload()

In [6]:
spam = pd.read_csv("spam.csv")
spam.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
spam.shape

(5572, 2)

In [8]:
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(spam['Category'])
print(y)

[0 0 1 ... 0 0 0]


In [9]:
mensagens = spam['Message'].values
X_train, X_test, y_train, y_test = train_test_split(mensagens, y, test_size = 0.3)

In [10]:
token = Tokenizer(num_words = 1000)
token.fit_on_texts(X_train)
X_train = token.texts_to_sequences(X_train)
X_test = token.texts_to_sequences(X_test)

In [11]:
print(X_train)

[[], [86, 305, 1, 35, 16, 3, 145], [70, 4, 30, 359, 41, 491, 518, 735, 35, 31, 944, 17, 15, 4, 868, 736, 2, 737, 9, 5], [5, 492, 869, 29, 40, 39], [314, 38, 82, 76, 349, 206, 41, 161, 945, 22, 91], [70, 9, 44, 44], [94, 52, 34, 1, 36, 466, 2, 31, 179, 28, 18, 4, 124, 405, 804], [604, 870, 34, 20, 394, 2, 350, 4, 637, 33, 3, 18, 4, 100, 16, 132, 429, 430], [50, 87, 395, 77, 9, 638], [1, 186, 1, 92, 18, 95, 140], [243, 1, 871, 21, 72, 10, 45, 290, 228], [60, 4, 322, 40, 39, 40, 39, 159, 40, 39], [1, 153, 14, 162, 446, 10, 2, 52], [3, 18, 114, 394, 2, 540, 9, 115, 15, 639, 12, 315, 265, 640, 637, 2, 132, 16], [180, 872, 872], [53, 69, 3, 291, 227, 104], [305, 685, 377, 47, 16, 10], [86, 1, 186, 72, 149, 686, 96, 3, 9], [86, 76, 16, 104], [447, 3, 10, 7, 49, 13, 9, 57, 227, 33, 3], [316, 56, 106, 946, 4, 566, 26, 252, 104], [20, 3, 9, 406], [333, 317, 407, 1, 35, 13, 378, 9, 641, 873], [24, 4, 351, 116, 805, 121, 806, 174, 8, 29, 95, 7, 8, 448, 687, 36, 10, 7, 9, 29, 5, 46, 18, 3, 114], [3

In [12]:
X_train = pad_sequences(X_train, padding = "post", maxlen = 500)
X_test = pad_sequences(X_test, padding = "post", maxlen = 500)

In [15]:
len(token.word_index)

7393

In [13]:
X_train

array([[  0,   0,   0, ...,   0,   0,   0],
       [ 86, 305,   1, ...,   0,   0,   0],
       [ 70,   4,  30, ...,   0,   0,   0],
       ...,
       [  1,  61,  24, ...,   0,   0,   0],
       [765,  41, 265, ...,   0,   0,   0],
       [124,  18,   4, ...,   0,   0,   0]], dtype=int32)

In [None]:
token.word_index

In [16]:
modelo = Sequential()
modelo.add(Embedding(input_dim = len(token.word_index), output_dim = 50, input_length = 500))
modelo.add(Flatten())
modelo.add(Dense(units = 10, activation = 'relu'))
modelo.add(Dropout(0.1))
modelo.add(Dense(units = 1, activation = 'sigmoid'))

In [17]:
modelo.compile(loss = 'mean_squared_error', optimizer = 'adam', metrics = ["accuracy"])

In [18]:
modelo.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 50)           369650    
                                                                 
 flatten (Flatten)           (None, 25000)             0         
                                                                 
 dense (Dense)               (None, 10)                250010    
                                                                 
 dropout (Dropout)           (None, 10)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 619671 (2.36 MB)
Trainable params: 619671 (2.36 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [19]:
modelo.fit(X_train, y_train, epochs = 20, batch_size = 10, verbose = True, validation_data = (X_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7f1b41d441f0>

In [20]:
loss, accuracy = modelo.evaluate(X_test, y_test)
print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.01796841248869896
Accuracy:  0.980861246585846


In [21]:
nova_previsao = modelo.predict(X_test)
print(nova_previsao)

[[1.2539088e-10]
 [8.8850585e-13]
 [6.9082930e-14]
 ...
 [4.3877853e-15]
 [4.6594495e-10]
 [1.9234776e-07]]


In [22]:
prev = (nova_previsao > 0.5)
print(prev)

[[False]
 [False]
 [False]
 ...
 [False]
 [False]
 [False]]


In [23]:
cm = confusion_matrix(y_test, prev)
print(cm)

[[1442    0]
 [  32  198]]
