# Convert Dataset To Vectors, Train-Test Split, and Export

- We need to combine our datasets and turn them into actual vectors

## Loading Our Dataset

In [1]:
# Dependencies
import os
import random
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
# For Tokenizing texts
from tensorflow.keras.preprocessing.text import Tokenizer
# For uniforming our token vectors
from tensorflow.keras.preprocessing.sequence import pad_sequences
# For hot-encoding categorical data
from tensorflow.keras.utils import to_categorical
# For splitting training dataset
from sklearn.model_selection import train_test_split

In [2]:
# Datasets directories
PROJ_DIR = Path().resolve().parent
DATASETS_DIR = os.path.join(PROJ_DIR, "datasets")
EXPORTS_DIR = os.path.join(DATASETS_DIR, "exports")
SPAM_DATASET_PATH = os.path.join(EXPORTS_DIR, "spam-dataset.csv")
METADATA_PKL_PATH = os.path.join(EXPORTS_DIR, "spam-metadata.pkl")
TOKENIZER_JSON_PATH = os.path.join(EXPORTS_DIR, "spam-tokenizer.json")

In [3]:
# Import th dataset into a Pandas Dataframe
spam_df = pd.read_csv(SPAM_DATASET_PATH)

# Check result
display(spam_df.shape)
display(spam_df.head())
display(spam_df.tail())

(7528, 3)

Unnamed: 0,label,text,source
0,ham,"Go until jurong point, crazy.. Available only ...",sms-spam
1,ham,Ok lar... Joking wif u oni...,sms-spam
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,sms-spam
3,ham,U dun say so early hor... U c already then say...,sms-spam
4,ham,"Nah I don't think he goes to usf, he lives aro...",sms-spam


Unnamed: 0,label,text,source
7523,ham,I love this song because we sing it at Camp al...,youtube-spam
7524,ham,I love this song for two reasons: 1.it is abou...,youtube-spam
7525,ham,wow,youtube-spam
7526,ham,Shakira u are so wiredo,youtube-spam
7527,ham,Shakira is the best dancer,youtube-spam


## Converting Labels Into Integers

In [4]:
# Get a list of the elements in the "label" and "text" columns
labels = spam_df["label"].tolist()
texts = spam_df["text"].tolist()

# The position in the list should not change from the position in the dataframe
print(labels[3], "|", texts[3])

ham | U dun say so early hor... U c already then say...


In [5]:
# Map the labels to equivalent integers
labels_to_int_mapping = { "ham": 0, "spam": 1 }
int_to_labels_mapping = { val: key for (key, val) in labels_to_int_mapping.items() }
labels_as_int = [ labels_to_int_mapping[lbl] for lbl in labels ]

We want to verify here that our data manipulation so far is working as expected

- To make sure that we did not mess up the indices
- Because the indices are very important
- They represent the true value of the data

In [6]:
# Grab a random entry in the dataset
random_index = random.randint(0, len(labels))

# Verify that its value in the list matches its value in the dataframe
assert texts[random_index] == spam_df[["text"]].iloc[random_index].text
assert labels[random_index] == spam_df[["label"]].iloc[random_index].label

# Also verify that working with the mapping is as exepcted
assert int_to_labels_mapping[labels_as_int[random_index]] == spam_df[["label"]].iloc[random_index].label

## Tokenizing The Texts

- For tokenizing the texts, we will use the `Tokenizer` from `tensorflow.keras.preprocessing.text`

In [7]:
# Maximum number of words to consider as tokens in the text: 280
# 280 is the maximum length of a Tweet
MAX_NUM_WORDS = 280

# Instantiate a Tokenizer object
tokenizer = Tokenizer(num_words = MAX_NUM_WORDS)

# Call the tokenizer to apply on our texts
tokenizer.fit_on_texts(texts)

# Grab the sequences from the fitted tokenizer
sequences = tokenizer.texts_to_sequences(texts)

# Check result
print(sequences)

[[55, 66, 10, 123, 143, 204, 169, 77, 68, 187], [64, 8], [59, 10, 25, 4, 2, 211, 95, 2, 2, 110, 104], [8, 182, 21, 8, 181, 185, 67, 182], [1, 121, 124, 80, 2, 80, 263, 118], [94, 77, 175, 136, 129, 31, 6, 44, 101, 38, 125, 3, 41, 14, 13, 92, 64, 2, 93, 2], [208, 7, 9, 32, 38, 2, 40, 12, 113, 12, 38], [76, 212, 18, 120, 136, 76, 18, 14, 49, 2, 18, 276], [76, 4, 3, 20, 136, 2, 199, 2, 174, 26, 174, 66], [166, 18, 141, 37, 114, 8, 111, 2, 2, 5, 40, 14, 59, 26, 5, 141, 249, 59, 16], [42, 254, 33, 91, 245, 6, 1, 121, 79, 2, 89, 11, 135, 267, 96], [2, 211, 209, 47, 2, 196, 110, 6, 93, 2, 255, 78, 134, 56], [257, 3, 20, 216, 4, 90, 159, 59, 10, 88, 196, 199, 110, 5, 174, 2, 44, 104, 181, 82], [267, 136, 14, 5, 170, 2, 195, 3, 14, 11, 1, 1, 107, 18, 177, 14, 6, 35, 7, 3, 20, 136, 6, 4, 34, 49], [1, 20, 4, 16, 40, 35], [2, 18, 5, 10, 5, 246, 110, 186, 37, 118, 122, 50, 123], [163, 135, 42, 264, 118], [8, 53, 25, 225, 268, 183, 1, 147, 80, 206, 100, 1, 206], [27, 5, 144, 8, 251, 5, 144, 69, 213]

- The `sequences` is a vectorized-representation of each of the texts from our original dataset
- We can check the unique token-to-ord that are present in here with `tokenizer.word_index`

In [8]:
# Check the unique token-to-word mapping
display(tokenizer.word_index)

{'i': 1,
 'to': 2,
 'you': 3,
 'a': 4,
 'the': 5,
 'and': 6,
 'my': 7,
 'u': 8,
 'is': 9,
 'in': 10,
 'this': 11,
 'me': 12,
 'it': 13,
 'for': 14,
 'of': 15,
 'on': 16,
 'out': 17,
 'your': 18,
 '\ufeff': 19,
 'have': 20,
 'so': 21,
 'that': 22,
 'check': 23,
 'are': 24,
 '2': 25,
 'call': 26,
 'if': 27,
 'but': 28,
 'can': 29,
 'just': 30,
 'now': 31,
 'not': 32,
 'be': 33,
 'at': 34,
 'will': 35,
 'do': 36,
 'or': 37,
 'like': 38,
 'get': 39,
 'with': 40,
 'up': 41,
 "i'm": 42,
 'we': 43,
 'no': 44,
 'love': 45,
 'ur': 46,
 'from': 47,
 'please': 48,
 'all': 49,
 'com': 50,
 'lt': 51,
 'gt': 52,
 'how': 53,
 'when': 54,
 'go': 55,
 '4': 56,
 'video': 57,
 'know': 58,
 'free': 59,
 'am': 60,
 'what': 61,
 'good': 62,
 'was': 63,
 'ok': 64,
 'time': 65,
 'only': 66,
 'then': 67,
 'got': 68,
 'its': 69,
 'song': 70,
 'come': 71,
 '39': 72,
 'youtube': 73,
 'new': 74,
 'br': 75,
 'as': 76,
 'there': 77,
 'day': 78,
 'want': 79,
 'he': 80,
 'one': 81,
 'www': 82,
 'by': 83,
 'amp': 84,
 

## Finalizing the Training Data

- The challenge that we have now though is that if we want to do matrix multiplication, we want all our vectors to be of the same length
- To fix that, we want to "pad" the `sequences`
- We can fix this using `pad_sequences` from `tensorflow.keras.preprocessing.sequence`

In [9]:
# Set the max sequence length
# This might depend on how long we want the sequence to be
# This can remove some words/punctuations
MAX_SEQUENCE_LENGTH = 300

# Building our final Training Input X
X = pad_sequences(sequences, maxlen = MAX_SEQUENCE_LENGTH)

# Check result
display(X)

array([[  0,   0,   0, ...,  77,  68, 187],
       [  0,   0,   0, ...,   0,  64,   8],
       [  0,   0,   0, ...,   2, 110, 104],
       ...,
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   8,  24,  21],
       [  0,   0,   0, ...,   9,   5, 156]])

We also need to finalize our labels

- We use `to_categorical()` from `tensorflow.keras.utils` for handling categorical data
- `to_categorical()` will handle *One-Hot Encoding* of the categorical options as well

In [10]:
# Convert labels_as_int into a Numpy array: to_categorical() expect a Numpy array as argument
labels_as_int_array = np.asarray(labels_as_int)

# Building our final Training Target y
y = to_categorical(labels_as_int_array)

# Check result
display(y)

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

## Split Training Data Into Training and Testing Sets

Why do we have to split our training data?

- We need to have variance in our training data, or else we would be too focused on one way of being "right"
- Splitting the training data allows us to have multiple sets of tests to verify that our algorithm is right across variant datasets

We will use `train_test_split` from `sklearn.model_selection` for this process

In [11]:
# Splitting the Training Data into Training Set and Testing Set
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size = 0.33, 
    random_state = 777
)

## Export All

We will now export all of this splitted datasets using `pickle` so they are ready for the next phase

**Warning About `pickle`**

- It is possible for outputs of `pickle` to contain malicious data
- If someone gives you a pickle file, be wary of where it came from or you might infest your system
- Only run pickle files from trusted sources
  - It is fine if you are the one manipulating the data so you know where the data came from
  - But do not use pickle file from someone else
  - Another option is to simply ask them as `csv` files

In [12]:
# Pickle export into JSON format
# We need to convert whatever we want to export into JSON-Like format: Dictionary

# Dictionary of the training data
training_data = {
    "X_train": X_train, 
    "X_test": X_test, 
    "y_train": y_train, 
    "y_test": y_test,
    "max_num_words": MAX_NUM_WORDS,
    "max_sequence_length": MAX_SEQUENCE_LENGTH,
    "labels_to_int_mapping": labels_to_int_mapping,
    "int_to_labels_mapping": int_to_labels_mapping
}

# Also export our tokenizer
tokenizer_json = tokenizer.to_json()

# Export the training_data as a pickle file: Pickle file must be in binary
with open(METADATA_PKL_PATH, 'wb') as pickle_file:
    pickle.dump(training_data, pickle_file)
    
# Export the tokenizer as a JSON file
with open(TOKENIZER_JSON_PATH, 'w') as json_file:
    json_file.write(tokenizer_json)