In [38]:
import pathlib
import pandas as pd

# The project root is taken to be the parent of the current working directory.
BASE_DIR = pathlib.Path('.').resolve().parent
DATA_DIR = BASE_DIR.joinpath('datasets')

# Folder for generated artifacts; created on first run, left alone afterwards.
EXPORTS_DIR = DATA_DIR.joinpath('exports')
EXPORTS_DIR.mkdir(parents=True, exist_ok=True)

# Files read from / written to the exports folder.
SPAM_DATASET_PATH = EXPORTS_DIR.joinpath('spam-dataset.csv')
SPAM_METADATA_PATH = EXPORTS_DIR.joinpath('spam-metadata.pkl')
SPAM_TOKENIZER_PATH = EXPORTS_DIR.joinpath('spam-tokenizer.json')

In [3]:
# Load the exported spam dataset. As the preview shows, the file carries its
# saved row index as an extra "Unnamed: 0" column next to label / text / source.
df = pd.read_csv(filepath_or_buffer=SPAM_DATASET_PATH)
df.head(n=5)

Unnamed: 0,label,text,source
0,ham,"Go until jurong point, crazy.. Available only ...",sms
1,ham,Ok lar... Joking wif u oni...,sms
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,sms
3,ham,U dun say so early hor... U c already then say...,sms
4,ham,"Nah I don't think he goes to usf, he lives aro...",sms


In [6]:
# Pull the two columns used below out of the frame as plain Python lists.
text = df['text'].to_list()
labels_text = df['label'].to_list()

# X: features (tokenized, padded text sequences)

In [12]:
# Integer class id for each label string.
label_map = dict(ham=0, spam=1)
# Reverse lookup, keyed by the class id rendered as a string ('0', '1').
label_map_invert = {str(class_id): name for name, class_id in label_map.items()}
# Encode every label string as its class id; an unknown label raises KeyError.
labels = [label_map[name] for name in labels_text]
label_map_invert

{'0': 'ham', '1': 'spam'}

In [22]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [24]:
# Vocabulary size limit, handed to the tokenizer as `num_words`.
MAX_WORDS = 280

# Fit the vocabulary on the corpus, then encode each text as a list of
# integer word indices.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text)

In [23]:
# Word -> integer index mapping held by the fitted tokenizer.
word_index = tokenizer.word_index

In [27]:
# Fixed length every encoded text is brought to; as the output shows,
# shorter sequences are zero-padded at the front.
MAX_SEQ_LEN = 300
X = pad_sequences(sequences=sequences, maxlen=MAX_SEQ_LEN)
X

array([[  0,   0,   0, ...,  77,  68, 187],
       [  0,   0,   0, ...,   0,  64,   8],
       [  0,   0,   0, ...,   2, 110, 104],
       ...,
       [  0,   0,   0, ...,  15,   6, 137],
       [  0,   0,   0, ..., 180,  50,  50],
       [  0,   0,   0, ..., 190, 241,  19]], dtype=int32)

# Y: targets (one-hot encoded labels)

In [28]:
from tensorflow.keras.utils import to_categorical
import numpy as np

In [31]:
# One-hot encode the integer class ids: one row per sample, one column per
# class (column 0 = ham, column 1 = spam, following label_map).
Y = to_categorical(labels)
Y

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [1., 0.]], dtype=float32)

# Train-Test split

In [34]:
import pickle
from sklearn.model_selection import train_test_split

In [35]:
# Hold out 33% of the samples for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

In [41]:
# Bundle the split arrays, the fitted tokenizer and the two size limits into
# one dict so they can be persisted together.
metadata = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'tokenizer': tokenizer,
    'MAX_SEQ_LEN': MAX_SEQ_LEN,
    'MAX_WORDS': MAX_WORDS,
}

# Pickle is a binary format, so the file has to be opened in 'wb' mode.
# NOTE: only unpickle this file from a trusted location; loading a pickle
# can execute arbitrary code.
with open(SPAM_METADATA_PATH, 'wb') as f:
    pickle.dump(metadata, f)

# Also export the tokenizer on its own as JSON. The encoding is given
# explicitly so the file is written identically on every platform instead of
# following the locale's default encoding.
tokenizer_json = tokenizer.to_json()
SPAM_TOKENIZER_PATH.write_text(tokenizer_json, encoding='utf-8')

1090335