In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import preprocessing, utils, activations, optimizers
from tensorflow.keras import Input, layers, models, metrics

In [2]:
train_data = pd.read_pickle('../data_en_es/en_es_train_data.pkl')
test_data = pd.read_pickle('../data_en_es/en_es_test_data.pkl')

In [3]:
train_data['label'] = train_data['label'].astype(int).astype(str)
test_data['label'] = train_data['label'].astype(int).astype(str)
train_data['country'] = train_data['country'].astype(str)
test_data['country'] = test_data['country'].astype(str)

In [4]:
features = ['user', 'country', 'format', 'session', 'token', 'part_of_speech', 'dependency_label']

In [5]:
train_listings = pd.Series({feature : train_data[feature].tolist() for feature in features + ['label']})
test_listings = pd.Series({feature : train_data[feature].tolist() for feature in features + ['label']})

In [6]:
train_listings, test_listings

(user                [XEinXf5+, XEinXf5+, XEinXf5+, XEinXf5+, XEinX...
 country             [['CO'], ['CO'], ['CO'], ['CO'], ['CO'], ['CO'...
 format              [reverse_translate, reverse_translate, reverse...
 session             [lesson, lesson, lesson, lesson, lesson, lesso...
 token               [i, am, a, boy, i, am, from, mexico, my, name,...
 part_of_speech      [PRON, VERB, DET, NOUN, PRON, VERB, ADP, PROPN...
 dependency_label    [nsubj, cop, det, ROOT, nsubj, cop, case, ROOT...
 label               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
 dtype: object,
 user                [XEinXf5+, XEinXf5+, XEinXf5+, XEinXf5+, XEinX...
 country             [['CO'], ['CO'], ['CO'], ['CO'], ['CO'], ['CO'...
 format              [reverse_translate, reverse_translate, reverse...
 session             [lesson, lesson, lesson, lesson, lesson, lesso...
 token               [i, am, a, boy, i, am, from, mexico, my, name,...
 part_of_speech      [PRON, VERB, DET, NOUN, PRON, VERB, ADP,

In [7]:
train_data_listified = train_data.groupby('user')[features + ['label']].apply(lambda data: pd.Series({feature : data[feature].tolist() for feature in features + ['label']}))
test_data_listified = test_data.groupby('user')[features + ['label']].apply(lambda data: pd.Series({feature : data[feature].tolist() for feature in features + ['label']}))

In [8]:
train_data_listified

Unnamed: 0_level_0,user,country,format,session,token,part_of_speech,dependency_label,label
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
++j955YG,"[++j955YG, ++j955YG, ++j955YG, ++j955YG, ++j95...","[['MX'], ['MX'], ['MX'], ['MX'], ['MX'], ['MX'...","[reverse_translate, reverse_translate, reverse...","[lesson, lesson, lesson, lesson, lesson, lesso...","[am, i, a, boy, i, am, from, mexico, you, are,...","[ADP, PRON, DET, NOUN, PRON, VERB, ADP, PROPN,...","[ROOT, iobj, det, dobj, nsubj, cop, case, ROOT...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
+/iDvu/I,"[+/iDvu/I, +/iDvu/I, +/iDvu/I, +/iDvu/I, +/iDv...","[['BR', 'CL'], ['BR', 'CL'], ['BR', 'CL'], ['B...","[listen, reverse_tap, listen, listen, reverse_...","[lesson, lesson, lesson, lesson, lesson, lesso...","[what, what, good, morning, good, night, hello...","[PRON, PRON, ADJ, NOUN, ADJ, NOUN, INTJ, CONJ,...","[ROOT, ROOT, amod, ROOT, amod, ROOT, ROOT, cc,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
+0UEF02n,"[+0UEF02n, +0UEF02n, +0UEF02n, +0UEF02n, +0UEF...","[['MX'], ['MX'], ['MX'], ['MX'], ['MX'], ['MX'...","[reverse_tap, listen, reverse_tap, reverse_tap...","[lesson, lesson, lesson, lesson, lesson, lesso...","[what, what, hello, and, good, morning, good, ...","[PRON, PRON, INTJ, CONJ, ADJ, NOUN, ADJ, NOUN,...","[ROOT, ROOT, ROOT, cc, amod, conj, amod, ROOT,...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
+197nchq,"[+197nchq, +197nchq, +197nchq, +197nchq, +197n...","[['US'], ['US'], ['US'], ['US'], ['US'], ['US'...","[reverse_tap, listen, listen, listen, reverse_...","[lesson, lesson, lesson, lesson, lesson, lesso...","[what, what, good, morning, good, evening, hel...","[PRON, PRON, ADJ, NOUN, ADJ, NOUN, INTJ, CONJ,...","[ROOT, ROOT, amod, ROOT, amod, ROOT, ROOT, cc,...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
+7lbKZrn,"[+7lbKZrn, +7lbKZrn, +7lbKZrn, +7lbKZrn, +7lbK...","[['US'], ['US'], ['US'], ['US'], ['US'], ['US'...","[reverse_translate, listen, listen, listen, re...","[lesson, lesson, lesson, lesson, lesson, lesso...","[what, what, good, morning, good, night, good,...","[PRON, PRON, ADJ, NOUN, ADJ, NOUN, ADJ, NOUN, ...","[ROOT, ROOT, amod, ROOT, amod, ROOT, amod, ROO...","[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, ..."
...,...,...,...,...,...,...,...,...
zv3rQx2W,"[zv3rQx2W, zv3rQx2W, zv3rQx2W, zv3rQx2W, zv3rQ...","[['MX'], ['MX'], ['MX'], ['MX'], ['MX'], ['MX'...","[listen, listen, listen, listen, listen, liste...","[lesson, lesson, lesson, lesson, lesson, lesso...","[i, speak, english, i, am, sorry, where, i, am...","[PRON, VERB, PROPN, PRON, VERB, ADJ, ADV, PRON...","[nsubj, ROOT, dobj, nsubj, cop, ROOT, ROOT, ns...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
zx+JF92P,"[zx+JF92P, zx+JF92P, zx+JF92P, zx+JF92P, zx+JF...","[['SV'], ['SV'], ['SV'], ['SV'], ['SV'], ['SV'...","[reverse_translate, listen, listen, listen, li...","[lesson, lesson, lesson, lesson, lesson, lesso...","[what, what, i, am, very, good, please, i, am,...","[PRON, PRON, PRON, VERB, ADV, ADJ, INTJ, PRON,...","[ROOT, ROOT, nsubj, cop, advmod, ROOT, ROOT, n...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
zyRFO2/B,"[zyRFO2/B, zyRFO2/B, zyRFO2/B, zyRFO2/B, zyRFO...","[['ES'], ['ES'], ['ES'], ['ES'], ['ES'], ['ES'...","[listen, listen, listen, listen, listen, liste...","[lesson, lesson, lesson, lesson, lesson, lesso...","[i, am, from, mexico, are, you, from, mexico, ...","[PRON, VERB, ADP, PROPN, VERB, PRON, ADP, PROP...","[nsubj, cop, case, ROOT, cop, nsubj, case, ROO...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ..."
zzZcZM/K,"[zzZcZM/K, zzZcZM/K, zzZcZM/K, zzZcZM/K, zzZcZ...","[['PE'], ['PE'], ['PE'], ['PE'], ['PE'], ['PE'...","[reverse_tap, listen, reverse_tap, reverse_tap...","[lesson, lesson, practice, practice, lesson, l...","[what, what, a, man, good, morning, good, even...","[PRON, PRON, DET, NOUN, ADJ, NOUN, ADJ, NOUN, ...","[ROOT, ROOT, det, ROOT, amod, ROOT, amod, ROOT...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [9]:
# Create index to string mappings for the features, include an index mapping for padding
# so padding will have index of 0
def feature_mapping(data, pad = "_PAD_"):
    feature_map = {}
    for var in features + ['label']:
        feature_map[var] = {}
        unique_features = list(set(data[var]))
        unique_features.insert(0, pad)

        tokens_to_index = {feature: index for index, feature in enumerate(unique_features)}
        feature_map[var] = tokens_to_index

    return feature_map



In [10]:
feature_map_train = feature_mapping(train_data)
feature_map_test = feature_mapping(test_data)

In [11]:
# Vocab sizes
print({var: len(feature_map_train[var]) for var in feature_map_train})
print({var: len(feature_map_test[var]) for var in feature_map_test})

{'user': 2594, 'country': 65, 'format': 4, 'session': 4, 'token': 1968, 'part_of_speech': 17, 'dependency_label': 42, 'label': 3}
{'user': 2594, 'country': 65, 'format': 4, 'session': 4, 'token': 1880, 'part_of_speech': 17, 'dependency_label': 41, 'label': 3}


In [12]:
max_length_train = train_data['user'].value_counts().max()
max_length_test = test_data['user'].value_counts().max()
max_length_train, max_length_test

(8894, 1057)

In [13]:
# We'll just pad to the length of the longest individual sequence (apparently thats 8894)
def add_padding(sequences, feature_map, maxlen = None):
    index = [[feature_map[feature] for feature in sequence] for sequence in sequences]
    index = preprocessing.sequence.pad_sequences(index, maxlen, value = feature_map["_PAD_"])
    return index

In [14]:
indexed_train_data = {var: add_padding(train_data_listified[var], feature_map_train[var], 2048) for var in feature_map_train}
Y_train = indexed_train_data.pop('label')

indexed_test_data = {var: add_padding(test_data_listified[var], feature_map_test[var], 1024) for var in feature_map_test}
Y_test = indexed_test_data.pop('label')

In [15]:
Y_train_oh = utils.to_categorical(Y_train)
Y_test_oh = utils.to_categorical(Y_test)

In [16]:
Y_train.shape, Y_test.shape

((2593, 2048), (2593, 1024))

In [17]:
indexed_train_data['user'].shape, indexed_test_data['user'].shape

((2593, 2048), (2593, 1024))

In [18]:
Y_train_oh.shape, Y_test_oh.shape

((2593, 2048, 3), (2593, 1024, 3))

In [19]:
def create_embeddings(input_dim, output_dim, input_length = None, name = None):
    input_tensor = Input(shape = (input_length, ), name = name)
    embedding_layer = layers.Embedding(input_dim = input_dim, output_dim = output_dim, input_length = input_length, mask_zero = True, name = "embedding_{}".format(name))(input_tensor)

    return input_tensor, embedding_layer

In [20]:
def build_embeddings(feature_map, max_length = None):
    input_tensors = {}
    output_tensors = {}

    input_dims = {feature: len(feature_map[feature]) for feature in feature_map}
    
    output_dims = {}
    for feature in features:
        if len(feature_map[feature]) < 10:
            output_dims[feature] = 8
        if 10 <= len(feature_map[feature]) < 1000:
            output_dims[feature] = 64
        elif len(feature_map[feature]) >= 1000:
            output_dims[feature] = 256
  
    for feature in features:
        input_tensors[feature], output_tensors[feature] = create_embeddings(input_dims[feature], output_dims[feature], max_length, feature)

    return input_tensors, output_tensors

In [21]:
input_tensors, output_tensors = build_embeddings(feature_map_train)
input_tensors, output_tensors

({'user': <tf.Tensor 'user:0' shape=(None, None) dtype=float32>,
  'country': <tf.Tensor 'country:0' shape=(None, None) dtype=float32>,
  'format': <tf.Tensor 'format:0' shape=(None, None) dtype=float32>,
  'session': <tf.Tensor 'session:0' shape=(None, None) dtype=float32>,
  'token': <tf.Tensor 'token:0' shape=(None, None) dtype=float32>,
  'part_of_speech': <tf.Tensor 'part_of_speech:0' shape=(None, None) dtype=float32>,
  'dependency_label': <tf.Tensor 'dependency_label:0' shape=(None, None) dtype=float32>},
 {'user': <tf.Tensor 'embedding_user/Identity:0' shape=(None, None, 256) dtype=float32>,
  'country': <tf.Tensor 'embedding_country/Identity:0' shape=(None, None, 64) dtype=float32>,
  'format': <tf.Tensor 'embedding_format/Identity:0' shape=(None, None, 8) dtype=float32>,
  'session': <tf.Tensor 'embedding_session/Identity:0' shape=(None, None, 8) dtype=float32>,
  'token': <tf.Tensor 'embedding_token/Identity:0' shape=(None, None, 256) dtype=float32>,
  'part_of_speech': <tf.

In [22]:
embeddings = [output_tensors[feature] for feature in features]
embeddings_tensors = layers.Concatenate(name="embedding_all_features")(embeddings)

In [23]:
def lstm_layers(X, units, dropout = 0.0, num_layers = 1):
    for j in range(num_layers):
        X = layers.LSTM(units = units, dropout = dropout, return_sequences = True, name = "LSTM_layer_{}".format(num_layers))(X)
    return X

In [24]:
X = lstm_layers(embeddings_tensors, 256)

In [25]:
def dense_layer(X, units, dropout = 0.0, activation = 'softmax'):
    X = layers.Dropout(dropout)(X)
    X = layers.Dense(units = units, activation = activation)(X)
    return X

In [26]:
X = dense_layer(X, 256, 0.2)
outputs = dense_layer(X, 3, 0.1, 'sigmoid')

In [27]:
inputs = [input_tensors[feature] for feature in features]
LSTM_model = models.Model(inputs = inputs, outputs = outputs)
LSTM_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user (InputLayer)               [(None, None)]       0                                            
__________________________________________________________________________________________________
country (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
format (InputLayer)             [(None, None)]       0                                            
__________________________________________________________________________________________________
session (InputLayer)            [(None, None)]       0                                            
______________________________________________________________________________________________

In [28]:
X_train = [indexed_train_data[feature] for feature in features]
X_test = [indexed_test_data[feature] for feature in features]
LSTM_model.compile(optimizer = 'Adam', loss='categorical_crossentropy', metrics=['accuracy', tf.keras.metrics.AUC(), tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])


In [29]:
LSTM_history = LSTM_model.fit(
    x = X_train,
    y = Y_train_oh,
    batch_size = 64,
    epochs = 10,
    validation_data = (X_test, Y_test_oh) 
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [31]:
LSTM_model.save('LSTM_model_en_es')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: LSTM_model_en_es/assets
