
# Preprocessing Data



**In this notebook, we'll be generating our training and test data while also creating word embeddings that will be used
for our ML models.**

**Load Libraries**


In [7]:

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing import text, sequence
import pickle
import pandas as pd
import numpy as np



**Load Data**


In [8]:

with open("input/clean.txt", "rb") as f:
    text_results = pickle.load(f)

dataset =  pd.read_csv("input/master_dataset.csv")

print(len(text_results))


44898


**Split Training, Validation and Test Data**


In [9]:
inputs = np.array(text_results)
labels = dataset['Category']

X_train, X_test,y_train, y_test = train_test_split(inputs,labels, test_size=0.3, random_state=0)


**Tokenize Text and Generate Sequences**

**Tokenize Text**

In [10]:
max_words = 10000
tokenizer = text.Tokenizer(num_words=max_words)
# Tokenize words found in training dataset 
tokenizer.fit_on_texts(X_train)


**Create Sequence of Tokens**

In [13]:
X_train_sequence = tokenizer.texts_to_sequences(X_train) #Transform tokens to integers
X_test_sequence = tokenizer.texts_to_sequences(X_test)


**Pad Sequences**


In [14]:
length = 400
X_train_padding = sequence.pad_sequences(X_train_sequence,maxlen=length)
X_test_padding = sequence.pad_sequences(X_test_sequence,maxlen=length)


**Apply Word Embeddings - GloVE**


**Load Pre-trained GloVE word vectors**


**Important:** Please head to https://nlp.stanford.edu/projects/glove/ to download the **twitter.27B.100d** pre-trained word vectors.

In [20]:
GLOVE_WORD_VECTOR = "input/glove.twitter.27B.100d.model"


**Extract Word Vectors**

In [21]:
embedding_vectors = {}

with open(GLOVE_WORD_VECTOR, 'r', encoding='utf-8') as f:
    for r in f:
        val = r.split(' ')
        word = val[0]
        weights = np.asarray([float(val) for val in val[1:]])
        embedding_vectors[word] = weights
        
print("Vocab Size in GloVE:", len(embedding_vectors))


Vocab Size in GloVE: 1193515


**Generate Embedding Matrix**


In [24]:

word_index = tokenizer.word_index
embedding_dimensions = 100

if max_words is not None:
    vocabulary_length = max_words
    
else:
    vocabulary_length = len(word_index) + 1
    
    
# Initialize Embedding Matrix With Zeros

embedding_matrix = np.zeros((vocabulary_length, embedding_dimensions))

# Embed Matrix With Word Vectors

for word, i in word_index.items():
    if i < vocabulary_length:
        word_vector = embedding_vectors.get(word)
        if word_vector is not None:
            embedding_matrix[i] = word_vector



**Save Training Data, Tokenizer and Embedding Matrix as Pickle Files**


In [27]:

# Saving our training and testing makes the data shareable and frees up resources

# Save Training & Test Data
with open("input/train.pkl", "wb") as f:
    pickle.dump([X_train_padding,X_test_padding,y_train,y_test],f)
    
#Save Embedding Matrix    
    
with open("input/matrix.pkl", "wb") as f:
    pickle.dump([embedding_matrix, embedding_dimensions,vocabulary_length], f)
    
# Saving the tokenizer avoids us having to create one from scratch every time we do inferences. 
with open("input/tokenizer.pkl", "wb") as file:
    pickle.dump(tokenizer,file,protocol=pickle.HIGHEST_PROTOCOL)
