In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import preprocessing, utils, activations, optimizers
from tensorflow.keras import Input, layers, models

In [2]:
train_data = pd.read_pickle('../data_en_es/en_es_train_data.pkl')
test_data = pd.read_pickle('../data_en_es/en_es_test_data.pkl')

In [3]:
train_data['label'] = train_data['label'].astype(int).astype(str)
test_data['label'] = train_data['label'].astype(int).astype(str)

In [4]:
variables = ['user', 'format', 'token', 'part_of_speech', 'label']
features = ['user', 'format', 'token', 'part_of_speech']

In [5]:
train_listings = pd.Series({feature : train_data[feature].tolist() for feature in variables})
test_listings = pd.Series({feature : train_data[feature].tolist() for feature in variables})

In [6]:
train_listings

user              [XEinXf5+, XEinXf5+, XEinXf5+, XEinXf5+, XEinX...
format            [reverse_translate, reverse_translate, reverse...
token             [i, am, a, boy, i, am, from, mexico, my, name,...
part_of_speech    [PRON, VERB, DET, NOUN, PRON, VERB, ADP, PROPN...
label             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
dtype: object

In [7]:
train_data_listified = train_data.groupby('user')[variables].apply(lambda data: pd.Series({feature : data[feature].tolist() for feature in variables}))
test_data_listified = test_data.groupby('user')[variables].apply(lambda data: pd.Series({feature : data[feature].tolist() for feature in variables}))

In [8]:
train_data_listified

Unnamed: 0_level_0,user,format,token,part_of_speech,label
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
++j955YG,"[++j955YG, ++j955YG, ++j955YG, ++j955YG, ++j95...","[reverse_translate, reverse_translate, reverse...","[am, i, a, boy, i, am, from, mexico, you, are,...","[ADP, PRON, DET, NOUN, PRON, VERB, ADP, PROPN,...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
+/iDvu/I,"[+/iDvu/I, +/iDvu/I, +/iDvu/I, +/iDvu/I, +/iDv...","[listen, reverse_tap, listen, listen, reverse_...","[what, what, good, morning, good, night, hello...","[PRON, PRON, ADJ, NOUN, ADJ, NOUN, INTJ, CONJ,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
+0UEF02n,"[+0UEF02n, +0UEF02n, +0UEF02n, +0UEF02n, +0UEF...","[reverse_tap, listen, reverse_tap, reverse_tap...","[what, what, hello, and, good, morning, good, ...","[PRON, PRON, INTJ, CONJ, ADJ, NOUN, ADJ, NOUN,...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
+197nchq,"[+197nchq, +197nchq, +197nchq, +197nchq, +197n...","[reverse_tap, listen, listen, listen, reverse_...","[what, what, good, morning, good, evening, hel...","[PRON, PRON, ADJ, NOUN, ADJ, NOUN, INTJ, CONJ,...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
+7lbKZrn,"[+7lbKZrn, +7lbKZrn, +7lbKZrn, +7lbKZrn, +7lbK...","[reverse_translate, listen, listen, listen, re...","[what, what, good, morning, good, night, good,...","[PRON, PRON, ADJ, NOUN, ADJ, NOUN, ADJ, NOUN, ...","[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, ..."
...,...,...,...,...,...
zv3rQx2W,"[zv3rQx2W, zv3rQx2W, zv3rQx2W, zv3rQx2W, zv3rQ...","[listen, listen, listen, listen, listen, liste...","[i, speak, english, i, am, sorry, where, i, am...","[PRON, VERB, PROPN, PRON, VERB, ADJ, ADV, PRON...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
zx+JF92P,"[zx+JF92P, zx+JF92P, zx+JF92P, zx+JF92P, zx+JF...","[reverse_translate, listen, listen, listen, li...","[what, what, i, am, very, good, please, i, am,...","[PRON, PRON, PRON, VERB, ADV, ADJ, INTJ, PRON,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
zyRFO2/B,"[zyRFO2/B, zyRFO2/B, zyRFO2/B, zyRFO2/B, zyRFO...","[listen, listen, listen, listen, listen, liste...","[i, am, from, mexico, are, you, from, mexico, ...","[PRON, VERB, ADP, PROPN, VERB, PRON, ADP, PROP...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ..."
zzZcZM/K,"[zzZcZM/K, zzZcZM/K, zzZcZM/K, zzZcZM/K, zzZcZ...","[reverse_tap, listen, reverse_tap, reverse_tap...","[what, what, a, man, good, morning, good, even...","[PRON, PRON, DET, NOUN, ADJ, NOUN, ADJ, NOUN, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [8]:
# Create index to string mappings for the features, include an index mapping for padding
# so padding will have index of 0
def feature_mapping(data, pad = "_PAD_"):
    feature_map = {}
    for var in variables:
        feature_map[var] = {}

        unique_features = list(set(data[var]))
        unique_features.insert(0, pad)

        tokens_to_index = {feature: index for index, feature in enumerate(unique_features)}
        feature_map[var] = tokens_to_index

    return feature_map



In [9]:
feature_map_train = feature_mapping(train_data)
feature_map_test = feature_mapping(test_data)

In [10]:
# Vocab sizes
print({var: len(feature_map_train[var]) for var in feature_map_train})
print({var: len(feature_map_test[var]) for var in feature_map_test})

{'user': 2594, 'format': 4, 'token': 1968, 'part_of_speech': 17, 'label': 3}
{'user': 2594, 'format': 4, 'token': 1880, 'part_of_speech': 17, 'label': 3}


In [11]:
max_length_train = train_data['user'].value_counts().max()
max_length_test = test_data['user'].value_counts().max()
max_length_train, max_length_test

(8894, 1057)

In [12]:
# We'll just pad to the length of the longest individual sequence (apparently thats 8894)
def add_padding(sequences, feature_map, maxlen = None):
    index = [[feature_map[feature] for feature in sequence] for sequence in sequences]
    index = preprocessing.sequence.pad_sequences(index, maxlen, value = feature_map["_PAD_"])
    return index

In [13]:
indexed_train_data = {var: add_padding(train_data_listified[var], feature_map_train[var], 2048) for var in feature_map_train}
Y_train = indexed_train_data.pop('label')

indexed_test_data = {var: add_padding(test_data_listified[var], feature_map_test[var], 1024) for var in feature_map_test}
Y_test = indexed_test_data.pop('label')

In [14]:
Y_train_oh = utils.to_categorical(Y_train)
Y_test_oh = utils.to_categorical(Y_test)

In [15]:
Y_train.shape, Y_test.shape

((2593, 2048), (2593, 1024))

In [17]:
indexed_train_data['user'].shape, indexed_test_data['user'].shape

((2593, 8894), (2593, 1057))

In [20]:
Y_train_oh.shape, Y_test_oh.shape

((2593, 2048, 3), (2593, 1024, 3))

In [21]:
def create_embeddings(input_dim, output_dim, input_length = None, name = None):
    input_tensor = Input(shape = (input_length, ), name = name)
    embedding_layer = layers.Embedding(input_dim = input_dim, output_dim = output_dim, input_length = input_length, mask_zero = True, name = "embedding_{}".format(name))(input_tensor)

    return input_tensor, embedding_layer

In [22]:
def build_embeddings(feature_map, max_length = None):
    input_tensors = {}
    output_tensors = {}

    input_dims = {feature: len(feature_map[feature]) for feature in feature_map}
    
    output_dims = {}
    for feature in features:
        if len(feature_map[feature]) < 10:
            output_dims[feature] = 8
        if 10 <= len(feature_map[feature]) < 1000:
            output_dims[feature] = 64
        elif len(feature_map[feature]) >= 1000:
            output_dims[feature] = 256
  
    for feature in features:
        input_tensors[feature], output_tensors[feature] = create_embeddings(input_dims[feature], output_dims[feature], max_length, feature)

    return input_tensors, output_tensors

In [23]:
input_tensors, output_tensors = build_embeddings(feature_map_train)
input_tensors, output_tensors

({'user': <tf.Tensor 'user:0' shape=(None, None) dtype=float32>,
  'format': <tf.Tensor 'format:0' shape=(None, None) dtype=float32>,
  'token': <tf.Tensor 'token:0' shape=(None, None) dtype=float32>,
  'part_of_speech': <tf.Tensor 'part_of_speech:0' shape=(None, None) dtype=float32>},
 {'user': <tf.Tensor 'embedding_user/Identity:0' shape=(None, None, 256) dtype=float32>,
  'format': <tf.Tensor 'embedding_format/Identity:0' shape=(None, None, 8) dtype=float32>,
  'token': <tf.Tensor 'embedding_token/Identity:0' shape=(None, None, 256) dtype=float32>,
  'part_of_speech': <tf.Tensor 'embedding_part_of_speech/Identity:0' shape=(None, None, 64) dtype=float32>})

In [24]:
embeddings = [output_tensors[feature] for feature in features]
embeddings_tensors = layers.Concatenate(name="embedding_all_features")(embeddings)

In [25]:
def lstm_layers(X, units, dropout = 0.0, num_layers = 1):
    for j in range(num_layers):
        X = layers.LSTM(units = units, dropout = dropout, return_sequences = True, name = "LSTM_layer_{}".format(num_layers))(X)
    return X

In [26]:
X = lstm_layers(embeddings_tensors, 256)

In [27]:
def dense_layer(X, units, dropout = 0.0, activation = 'softmax'):
    X = layers.Dropout(dropout)(X)
    X = layers.Dense(units = units, activation = activation)(X)
    return X

In [29]:
X = dense_layer(X, 256, 0.2)
outputs = dense_layer(X, 3, 0.1, 'sigmoid')

In [30]:
inputs = [input_tensors[feature] for feature in features]
LSTM_model = models.Model(inputs = inputs, outputs = outputs)
LSTM_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user (InputLayer)               [(None, None)]       0                                            
__________________________________________________________________________________________________
format (InputLayer)             [(None, None)]       0                                            
__________________________________________________________________________________________________
token (InputLayer)              [(None, None)]       0                                            
__________________________________________________________________________________________________
part_of_speech (InputLayer)     [(None, None)]       0                                            
______________________________________________________________________________________________

In [31]:
X_train = [indexed_train_data[feature] for feature in features]
X_test = [indexed_test_data[feature] for feature in features]
LSTM_model.compile(optimizer = 'adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [33]:
LSTM_model.fit(
    x = X_train,
    y = Y_train_oh,
    batch_size = 64,
    epochs = 15,
    validation_data = (X_test, Y_test_oh) 
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15

KeyboardInterrupt: 