In [1]:
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Dropout
from tensorflow.keras.models import Model


In [2]:
# One shared seed for every random number generator this notebook touches:
# Python's built-in `random`, NumPy's legacy global RNG, and TensorFlow.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [3]:
# Load the labelled messages from a CSV located relative to the notebook's
# working directory, then preview the first rows of the frame.
spam = pd.read_csv('./arquivos/spam.csv')
spam.head()


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Dimensions of the loaded frame as (rows, columns).
spam.shape

(5572, 2)

In [5]:
# Tally how many rows carry each distinct 'Category' value and print the counts.
count = spam['Category'].value_counts()
print(count)

ham     4825
spam     747
Name: Category, dtype: int64


In [6]:
# Balance the two classes by undersampling: keep every 'spam' row and draw an
# equally sized random sample of 'ham' rows. The sample size is derived from
# the data instead of being hard-coded, so the cell stays correct if the
# dataset changes.
spam_samples = spam[spam['Category'] == 'spam']
ham_samples = spam[spam['Category'] == 'ham'].sample(n=len(spam_samples), random_state=SEED)

# Combine both subsets, shuffle the rows (frac=1 returns all rows in random
# order) and rebuild a clean 0..n-1 index.
# NOTE: this rebinds `spam` to the balanced frame; the originally loaded frame
# is no longer reachable through this name after the cell has run.
spam = pd.concat([ham_samples, spam_samples]).sample(frac=1, random_state=SEED).reset_index(drop=True)

spam

Unnamed: 0,Category,Message
0,spam,"URGENT, IMPORTANT INFORMATION FOR O2 USER. TOD..."
1,spam,Panasonic & BluetoothHdset FREE. Nokia FREE. M...
2,spam,Do you want a new Video handset? 750 any time ...
3,spam,Hi if ur lookin 4 saucy daytime fun wiv busty ...
4,spam,09066362231 URGENT! Your mobile No 07xxxxxxxxx...
...,...,...
1489,spam,December only! Had your mobile 11mths+? You ar...
1490,spam,Loans for any purpose even if you have Bad Cre...
1491,spam,You have an important customer service announc...
1492,spam,URGENT! Your Mobile number has been awarded wi...


In [7]:
# Display `spam_samples`, the subset of rows whose 'Category' is 'spam'
# (these rows keep the index labels of the frame they were selected from).
spam_samples

Unnamed: 0,Category,Message
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."
...,...,...
5537,spam,Want explicit SEX in 30 secs? Ring 02073162414...
5540,spam,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5547,spam,Had your contract mobile 11 Mnths? Latest Moto...
5566,spam,REMINDER FROM O2: To get 2.50 pounds free call...


In [8]:
# Display `spam`, which by this point has been rebound to the class-balanced,
# shuffled frame with a freshly reset index.
spam

Unnamed: 0,Category,Message
0,spam,"URGENT, IMPORTANT INFORMATION FOR O2 USER. TOD..."
1,spam,Panasonic & BluetoothHdset FREE. Nokia FREE. M...
2,spam,Do you want a new Video handset? 750 any time ...
3,spam,Hi if ur lookin 4 saucy daytime fun wiv busty ...
4,spam,09066362231 URGENT! Your mobile No 07xxxxxxxxx...
...,...,...
1489,spam,December only! Had your mobile 11mths+? You ar...
1490,spam,Loans for any purpose even if you have Bad Cre...
1491,spam,You have an important customer service announc...
1492,spam,URGENT! Your Mobile number has been awarded wi...


In [9]:
# Turn the textual class labels into integer targets. LabelEncoder numbers the
# classes in sorted order, so 'ham' becomes 0 and 'spam' becomes 1.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(spam['Category'])
y


array([1, 1, 1, ..., 1, 1, 1])

In [10]:
# Raw message texts ("mensagens" = messages) as an array, aligned row-for-row
# with the encoded targets in `y`.
mensagens = spam['Message'].values

# Hold out 30% of the examples for testing; the split is seeded with SEED.
X_train, X_test, y_train, y_test = train_test_split(
    mensagens,
    y,
    test_size=0.3,
    random_state=SEED,
)