In [36]:
import pathlib
import pandas as pd
import random

# All paths are derived from the parent of the current working directory,
# which is treated as the project root.
BASE_DIR = pathlib.Path().resolve().parent
DATASET_DIR = BASE_DIR / "datasets"
# NOTE(review): this resolves to <root>/datasets/datasets -- confirm the
# nested "datasets" folder is intentional before changing it, because the
# input CSV below is read from the same directory.
EXPORT_DIR = DATASET_DIR / 'datasets'
# Make sure the directory (and any missing parents) exists before later cells
# write into it; this is a no-op when it is already present.
EXPORT_DIR.mkdir(exist_ok=True, parents=True)

# Input: CSV read by the data-loading cell.
SPAM_DATASET_PATH = EXPORT_DIR / 'spam-dataset.csv'

# Output: pickled train/test arrays plus preprocessing metadata.
METADATA_EXPORT_PATH = EXPORT_DIR / 'spam-metadata.pkl'
# Legacy misspelled name kept as an alias because the final export cell
# still refers to it.
METADADA_EXPORT_PATH = METADATA_EXPORT_PATH

# Output: JSON serialisation of the fitted tokenizer.
TOKENIZER_EXPORT_PATH = EXPORT_DIR / 'spam-tokenizer.json'

In [5]:
# Load the labelled spam/ham messages from the CSV stored under EXPORT_DIR.
df = pd.read_csv(SPAM_DATASET_PATH)

In [6]:
# Preview the first rows to confirm the label, text and source columns loaded.
df.head()

Unnamed: 0,label,text,source
0,ham,"Go until jurong point, crazy.. Available only ...",sms-spam
1,ham,Ok lar... Joking wif u oni...,sms-spam
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,sms-spam
3,ham,U dun say so early hor... U c already then say...,sms-spam
4,ham,"Nah I don't think he goes to usf, he lives aro...",sms-spam


In [7]:
# Extract the two columns used downstream as plain Python lists. Row order is
# preserved, so position i in both lists refers to the same message.
labels, texts = (df[column].tolist() for column in ('label', 'text'))

In [8]:
# Spot-check one arbitrary position: the label and text shown should belong
# to the same message.
labels[120], texts[120]

('spam',
 'PRIVATE! Your 2004 Account Statement for 07742676969 shows 786 unredeemed Bonus Points. To claim call 08719180248 Identifier Code: 45239 Expires')

In [9]:
# Forward mapping: class name -> integer id used for the training targets.
label_legend = dict(ham=0, spam=1)
# Reverse mapping, keyed by the *string* form of the id ('0', '1') so lookups
# must stringify the integer first.
label_legend_inverted = dict(
    (str(class_id), class_name) for class_name, class_id in label_legend.items()
)
label_legend_inverted

{'0': 'ham', '1': 'spam'}

In [10]:
# Encode every string label as its integer id; an unknown label raises KeyError.
labels_as_int = list(map(label_legend.__getitem__, labels))
# Round-trip one known position back to its class name as a quick check.
label_legend_inverted[f'{labels_as_int[120]}']

'spam'

In [11]:
# Sanity check: the extracted lists must still line up with the DataFrame rows.
# random.randrange excludes the upper bound, so the index is always valid;
# random.randint(0, len(labels)) includes it and could raise IndexError.
random_idx = random.randrange(len(labels))
assert texts[random_idx] == df.iloc[random_idx].text
assert labels[random_idx] == df.iloc[random_idx].label
# The integer encoding must decode back to the original string label.
assert label_legend_inverted[str(labels_as_int[random_idx])] == labels[random_idx]

In [12]:
# Vocabulary cap passed to the tokenizer as num_words and exported with the
# training metadata under the "max_words" key.
MAX_NUM_WORDS = 280

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

# Build the word index from every message, capped by MAX_NUM_WORDS.
# NOTE(review): Keras documents num_words as keeping the most frequent
# num_words - 1 words -- confirm against the installed version.
# NOTE(review): the tokenizer is fitted on all texts before the train/test
# split, so the vocabulary also reflects test messages -- confirm that is
# acceptable for this experiment.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
# Convert each message into a list of integer word ids.
sequences = tokenizer.texts_to_sequences(texts)

In [14]:
# Fixed length every encoded message is padded/truncated to.
MAX_SEQ_LENGTH = 300
# Legacy misspelled name ("SEO" for "SEQ") kept as an alias because the
# export cell still reads it.
MAX_SEO_LENGTH = MAX_SEQ_LENGTH
# Pad the variable-length id sequences into one 2-D integer array; with the
# default settings the zeros are added at the front, as the output shows.
X = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH)
X

array([[  0,   0,   0, ...,  77,  68, 187],
       [  0,   0,   0, ...,   0,  64,   8],
       [  0,   0,   0, ...,   2, 110, 104],
       ...,
       [  0,   0,   0, ...,  15,   6, 137],
       [  0,   0,   0, ..., 180,  50,  50],
       [  0,   0,   0, ..., 190, 241,  19]], dtype=int32)

In [19]:
import numpy as np
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer class ids: one row per message, one column per
# class (column 0 = ham, column 1 = spam according to label_legend).
labels_as_int_array = np.asarray(labels_as_int)
y = to_categorical(labels_as_int_array)
y

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [1., 0.]], dtype=float32)

In [20]:
# Show the padded feature matrix next to the one-hot targets before splitting.
X, y

(array([[  0,   0,   0, ...,  77,  68, 187],
        [  0,   0,   0, ...,   0,  64,   8],
        [  0,   0,   0, ...,   2, 110, 104],
        ...,
        [  0,   0,   0, ...,  15,   6, 137],
        [  0,   0,   0, ..., 180,  50,  50],
        [  0,   0,   0, ..., 190, 241,  19]], dtype=int32),
 array([[1., 0.],
        [1., 0.],
        [0., 1.],
        ...,
        [0., 1.],
        [0., 1.],
        [1., 0.]], dtype=float32))

In [22]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.2.1-cp310-cp310-macosx_10_9_x86_64.whl (9.1 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m31m23.6 MB/s[0m eta [36m0:00:01[0m
[?25hCollecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting joblib>=1.1.1
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.2.0 scikit-learn-1.2.1 threadpoolctl-3.1.0


In [24]:
from sklearn.model_selection import train_test_split

In [34]:
# Hold out 33% of the samples for testing; random_state is fixed so the same
# split is produced on every run.
# NOTE(review): the split is not stratified -- if the spam/ham ratio is
# skewed, consider passing stratify=labels_as_int; confirm before changing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)



In [27]:
import pickle 

In [38]:
# Serialise the fitted tokenizer first; the string is written out below.
tokenizer_json = tokenizer.to_json()

# Bundle the split arrays with the preprocessing settings needed to
# reproduce the encoding when the data is loaded for training.
training_data = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    max_words=MAX_NUM_WORDS,
    max_seq_length=MAX_SEO_LENGTH,
    label_legend=label_legend,
)

# write_text returns the number of characters written, which is shown as the
# cell output.
TOKENIZER_EXPORT_PATH.write_text(tokenizer_json)

1090335

In [40]:
# Persist the training bundle as a pickle next to the tokenizer JSON.
# NOTE(review): pickle files can execute code when loaded -- only unpickle
# this artifact from a trusted location.
with METADADA_EXPORT_PATH.open('wb') as metadata_file:
    pickle.dump(training_data, metadata_file)