In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

In [4]:
spam = pd.read_csv("./data/spam.csv")
spam.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(spam['Category'])
print(y)

[0 0 1 ... 0 0 0]


In [7]:
messages = spam['Message'].values
X_train, X_test, y_train, y_test = train_test_split(messages, y, test_size=0.3)

In [8]:
token = Tokenizer(num_words=1000)
token.fit_on_texts(X_train)
token.fit_on_texts(X_train)
X_train = token.texts_to_sequences(X_train)
X_test = token.texts_to_sequences(X_test)

In [9]:
X_train = pad_sequences(X_train, padding="post", maxlen=500)
X_test = pad_sequences(X_test, padding="post", maxlen=500)

In [11]:
modelo = Sequential()
modelo.add(Embedding(input_dim=len(token.word_index),output_dim=50))
modelo.add(Flatten())
modelo.add(Dense(units=10, activation="relu"))
modelo.add(Dropout(0.1))
modelo.add(Dense(units=1, activation='sigmoid'))

In [13]:
modelo.compile(loss="mean_squared_error", optimizer="adam", metrics=["accuracy"])

In [14]:
modelo.summary()

In [15]:
modelo.fit(X_train, y_train, epochs=20, batch_size=10, verbose=True, validation_data=(X_test,y_test))

Epoch 1/20
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.8460 - loss: 0.1272 - val_accuracy: 0.8798 - val_loss: 0.0458
Epoch 2/20
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.8836 - loss: 0.0597 - val_accuracy: 0.9773 - val_loss: 0.0356
Epoch 3/20
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.9855 - loss: 0.0381 - val_accuracy: 0.9827 - val_loss: 0.0291
Epoch 4/20
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.9807 - loss: 0.0248 - val_accuracy: 0.9850 - val_loss: 0.0128
Epoch 5/20
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.9902 - loss: 0.0098 - val_accuracy: 0.9856 - val_loss: 0.0118
Epoch 6/20
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.9918 - loss: 0.0081 - val_accuracy: 0.9874 - val_loss: 0.0115
Epoch 7/20
[1m390/390

<keras.src.callbacks.history.History at 0x77c48505dc10>

In [16]:
loss, accuracy = modelo.evaluate(X_test,y_test)
print("Loss: ", loss)
print("Acurácia: ", accuracy)

[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9836 - loss: 0.0143 
Loss:  0.015592253766953945
Acurácia:  0.9820573925971985


In [17]:
nova_previsao = modelo.predict(X_test)
print(nova_previsao)

[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[[1.7358290e-07]
 [1.0000000e+00]
 [5.3051313e-11]
 ...
 [1.2240838e-05]
 [1.4677759e-07]
 [1.5008746e-06]]


In [18]:
prev = (nova_previsao > 0.5)
print(prev)

[[False]
 [ True]
 [False]
 ...
 [False]
 [False]
 [False]]


In [19]:
cm = confusion_matrix(y_test, prev)
print(cm)

[[1454   17]
 [  13  188]]
