In [63]:
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding
from keras import Sequential, Input
from keras.layers import LSTM, Dense, Dropout, SpatialDropout1D
from sklearn.preprocessing import OneHotEncoder

In [2]:
train_data = pd.read_csv('dataset/train_preprocessed.csv', encoding='utf-8')
train_data.dropna()

Unnamed: 0,text,class
0,unless request information withheld comply fer...,5
1,none collect personal information computer e m...,5
2,elect location based search saved history stor...,6
3,subsidiary corporate affiliate including enfor...,5
4,use service view content provided google autom...,2
...,...,...
13545,opt targeted advertising,6
13546,web page computer visit using service clickstr...,5
13547,jibjab message sent visiting adjusting email p...,6
13548,receive store certain type information wheneve...,2


In [45]:
Y = pd.get_dummies(train_data['class']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (13550, 10)


In [47]:
X_train, X_test, y_train, y_test = train_test_split(train_data['text'], Y, 
                                                    test_size=0.3, random_state=42)

In [48]:
vectorizer = TextVectorization(max_tokens=4000, output_sequence_length=40)
text_ds = tf.data.Dataset.from_tensor_slices(X_train).batch(128)
vectorizer.adapt(text_ds)

In [49]:
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [50]:
embeddings_index = {}
with open('GloVe/glove.6B.50d.txt', encoding="utf-8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [51]:
num_tokens = len(voc) + 2
embedding_dim = 50
hits = 0
misses = 0

embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 3729 words (271 misses)


In [52]:
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

In [12]:
categories_dict = {}

for i in range(10):
    value = np.zeros((10,))
    value[i] = 1.0
    categories_dict[i] = value

categories_dict

{0: array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 1: array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]),
 2: array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]),
 3: array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]),
 4: array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]),
 5: array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]),
 6: array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]),
 7: array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]),
 8: array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]),
 9: array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])}

ERROR! Session/line number was not unique in database. History logging moved to new session 94


In [53]:
X_train_transformed = vectorizer(np.array([[s] for s in X_train])).numpy()
X_test_transformed = vectorizer(np.array([[s] for s in X_test])).numpy()

y_train_transformed = np.array(y_train)
y_test_transformed = np.array(y_test)

In [56]:
print(X_train_transformed.shape)
print(X_test_transformed.shape)
print(y_train_transformed.shape)
print(y_test_transformed.shape)

(9485, 40)
(4065, 40)
(9485, 10)
(4065, 10)


In [34]:
y_train_transformed = {}

for key, value in y_train.items():
    y_train_transformed[key] = categories_dict[value]
    
y_train_transformed = pd.Series(data=y_train_transformed)
y_train_transformed = np.array(y_train_transformed)

In [39]:
y_train_transformed[0]

array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])

In [40]:
X_train_transformed[0]

array([ 453,  526,   91,  942,  458, 2010,    3,    2,  408,  134,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0], dtype=int64)

In [41]:
y_train_transformed

array([array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]),
       array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]),
       array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]), ...,
       array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]),
       array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]),
       array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])], dtype=object)

In [42]:
X_train_transformed

array([[ 453,  526,   91, ...,    0,    0,    0],
       [ 227,   85,    3, ...,  298,  178,  477],
       [ 187,   32,  198, ...,    0,    0,    0],
       ...,
       [ 307,   63, 1580, ...,    0,    0,    0],
       [  97,  417,  362, ...,    0,    0,    0],
       [   7,    2,   38, ...,    0,    0,    0]], dtype=int64)

In [35]:
y_test_transformed = {}

for key, value in y_test.items():
    y_test_transformed[key] = categories_dict[value]
    
y_test_transformed = pd.Series(data=y_test_transformed)
y_test_transformed = np.array(y_test_transformed)

In [71]:
model = Sequential()
model.add(embedding_layer)
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(10, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 50)          200100    
                                                                 
 spatial_dropout1d_1 (Spatia  (None, None, 50)         0         
 lDropout1D)                                                     
                                                                 
 lstm_6 (LSTM)               (None, 100)               60400     
                                                                 
 dense_10 (Dense)            (None, 32)                3232      
                                                                 
 dense_11 (Dense)            (None, 10)                330       
                                                                 
Total params: 264,062
Trainable params: 63,962
Non-trainable params: 200,100
___________________________________________

In [74]:
model.fit(X_train_transformed, y_train_transformed, batch_size=64, epochs=20, 
          validation_data=(X_test_transformed, y_test_transformed))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x231bbc89fc0>

In [57]:
[print(i.shape, i.dtype) for i in model.inputs]
print("**")
[print(o.shape, o.dtype) for o in model.outputs]
print("**")
[print(l.name, l.input_shape, l.dtype) for l in model.layers]

(None, None) <dtype: 'float32'>
**
(None, None, 10) <dtype: 'float32'>
**
embedding_1 (None, None) float32
lstm_2 (None, None, 50) float32
dropout_4 (None, None, 128) float32
dense_4 (None, None, 128) float32
dropout_5 (None, None, 32) float32
dense_5 (None, None, 32) float32


[None, None, None, None, None, None]