In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from eval import *
from tensorflow.keras import preprocessing, utils, activations, optimizers
from tensorflow.keras import Input, layers, models, metrics
from sklearn.metrics import roc_curve,roc_auc_score

In [2]:
train_data = pd.read_pickle('../data_fr_en/fr_en_train_data.pkl')
test_data = pd.read_pickle('../data_fr_en/fr_en_test_data.pkl')

In [3]:
train_data['label'] = train_data['label'].astype(int).astype(str)
test_data['label'] = train_data['label'].astype(int).astype(str)
train_data['country'] = train_data['country'].astype(str)
test_data['country'] = test_data['country'].astype(str)

In [4]:
features = ['user', 'country', 'format', 'session', 'token', 'part_of_speech', 'dependency_label']

In [5]:
train_listings = pd.Series({feature : train_data[feature].tolist() for feature in features + ['label']})
test_listings = pd.Series({feature : train_data[feature].tolist() for feature in features + ['label']})

In [6]:
train_listings, test_listings

(user                [YjS/mQOx, YjS/mQOx, YjS/mQOx, YjS/mQOx, YjS/m...
 country             [['CA'], ['CA'], ['CA'], ['CA'], ['CA'], ['CA'...
 format              [reverse_translate, reverse_translate, reverse...
 session             [lesson, lesson, lesson, lesson, lesson, lesso...
 token               [le, garçon, je, suis, une, femme, la, fille, ...
 part_of_speech      [DET, NOUN, PRON, VERB, DET, NOUN, DET, NOUN, ...
 dependency_label    [det, ROOT, nsubj, cop, det, ROOT, det, ROOT, ...
 label               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
 dtype: object,
 user                [YjS/mQOx, YjS/mQOx, YjS/mQOx, YjS/mQOx, YjS/m...
 country             [['CA'], ['CA'], ['CA'], ['CA'], ['CA'], ['CA'...
 format              [reverse_translate, reverse_translate, reverse...
 session             [lesson, lesson, lesson, lesson, lesson, lesso...
 token               [le, garçon, je, suis, une, femme, la, fille, ...
 part_of_speech      [DET, NOUN, PRON, VERB, DET, NOUN, DET, 

In [7]:
train_data_listified = train_data.groupby('user')[features + ['label']].apply(lambda data: pd.Series({feature : data[feature].tolist() for feature in features + ['label']}))
test_data_listified = test_data.groupby('user')[features + ['label']].apply(lambda data: pd.Series({feature : data[feature].tolist() for feature in features + ['label']}))

In [8]:
train_data_listified

Unnamed: 0_level_0,user,country,format,session,token,part_of_speech,dependency_label,label
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
++AP6DT7,"[++AP6DT7, ++AP6DT7, ++AP6DT7, ++AP6DT7, ++AP6...","[['TR'], ['TR'], ['TR'], ['TR'], ['TR'], ['TR'...","[reverse_tap, reverse_tap, reverse_tap, revers...","[lesson, lesson, lesson, lesson, lesson, lesso...","[je, suis, rouge, je, suis, riche, je, mange, ...","[PRON, VERB, ADJ, PRON, VERB, ADJ, PRON, VERB,...","[nsubj, cop, ROOT, nsubj, cop, ROOT, nsubj, RO...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
+0Dxf1dW,"[+0Dxf1dW, +0Dxf1dW, +0Dxf1dW, +0Dxf1dW, +0Dxf...","[['US'], ['US'], ['US'], ['US'], ['US'], ['US'...","[reverse_translate, reverse_translate, reverse...","[lesson, lesson, lesson, lesson, lesson, lesso...","[la, femme, le, garçon, je, suis, rouge, je, s...","[DET, NOUN, DET, NOUN, PRON, VERB, ADJ, PRON, ...","[det, ROOT, det, ROOT, nsubj, cop, ROOT, nsubj...","[0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ..."
+4kwmfjD,"[+4kwmfjD, +4kwmfjD, +4kwmfjD, +4kwmfjD, +4kwm...","[['AU'], ['AU'], ['AU'], ['AU'], ['AU'], ['AU'...","[reverse_translate, reverse_translate, reverse...","[lesson, lesson, lesson, lesson, lesson, lesso...","[la, femme, je, suis, rouge, l', homme, je, su...","[DET, NOUN, PRON, VERB, ADJ, DET, NOUN, PRON, ...","[det, ROOT, nsubj, cop, ROOT, det, ROOT, nsubj...","[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
+5fX8ua2,"[+5fX8ua2, +5fX8ua2, +5fX8ua2, +5fX8ua2, +5fX8...","[['CA'], ['CA'], ['CA'], ['CA'], ['CA'], ['CA'...","[reverse_tap, reverse_tap, reverse_tap, revers...","[lesson, lesson, lesson, lesson, lesson, lesso...","[je, suis, riche, je, suis, rouge, l', homme, ...","[PRON, VERB, ADJ, PRON, VERB, ADJ, DET, NOUN, ...","[nsubj, cop, ROOT, nsubj, cop, ROOT, det, ROOT...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
+8m/uw4+,"[+8m/uw4+, +8m/uw4+, +8m/uw4+, +8m/uw4+, +8m/u...","[['US'], ['US'], ['US'], ['US'], ['US'], ['US'...","[reverse_tap, reverse_tap, reverse_tap, revers...","[lesson, lesson, lesson, lesson, lesson, lesso...","[je, suis, rouge, je, suis, riche, je, mange, ...","[PRON, VERB, ADJ, PRON, VERB, ADJ, PRON, VERB,...","[nsubj, cop, ROOT, nsubj, cop, ROOT, nsubj, RO...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...
zdwKXRSt,"[zdwKXRSt, zdwKXRSt, zdwKXRSt, zdwKXRSt, zdwKX...","[['GB'], ['GB'], ['GB'], ['GB'], ['GB'], ['GB'...","[reverse_tap, reverse_tap, reverse_tap, revers...","[lesson, lesson, lesson, lesson, lesson, lesso...","[la, femme, je, suis, rouge, je, suis, riche, ...","[DET, NOUN, PRON, VERB, ADJ, PRON, VERB, ADJ, ...","[det, ROOT, nsubj, cop, ROOT, nsubj, cop, ROOT...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ..."
zg2Sbl5B,"[zg2Sbl5B, zg2Sbl5B, zg2Sbl5B, zg2Sbl5B, zg2Sb...","[['EG'], ['EG'], ['EG'], ['EG'], ['EG'], ['EG'...","[reverse_tap, reverse_tap, reverse_tap, revers...","[lesson, lesson, lesson, lesson, lesson, lesso...","[la, femme, le, garçon, une, pomme, je, suis, ...","[DET, NOUN, DET, NOUN, DET, NOUN, PRON, VERB, ...","[det, ROOT, det, ROOT, det, ROOT, nsubj, cop, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
zgWENKZt,"[zgWENKZt, zgWENKZt, zgWENKZt, zgWENKZt, zgWEN...","[['KE'], ['KE'], ['KE'], ['KE'], ['KE'], ['KE'...","[reverse_tap, reverse_tap, reverse_tap, revers...","[lesson, lesson, lesson, lesson, lesson, lesso...","[la, femme, je, suis, rouge, je, suis, riche, ...","[DET, NOUN, PRON, VERB, ADJ, PRON, VERB, ADJ, ...","[det, ROOT, nsubj, cop, ROOT, nsubj, cop, ROOT...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
zmKqEwIw,"[zmKqEwIw, zmKqEwIw, zmKqEwIw, zmKqEwIw, zmKqE...","[['US'], ['US'], ['US'], ['US'], ['US'], ['US'...","[reverse_translate, reverse_translate, reverse...","[lesson, lesson, lesson, lesson, lesson, lesso...","[une, pomme, l', homme, je, suis, rouge, je, s...","[DET, NOUN, DET, NOUN, PRON, VERB, ADJ, PRON, ...","[det, ROOT, det, ROOT, nsubj, cop, ROOT, nsubj...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [9]:
# Create index to string mappings for the features, include an index mapping for padding
# so padding will have index of 0
def feature_mapping(data, pad = "_PAD_"):
    feature_map = {}
    for var in features + ['label']:
        feature_map[var] = {}
        unique_features = list(set(data[var]))
        unique_features.insert(0, pad)

        tokens_to_index = {feature: index for index, feature in enumerate(unique_features)}
        feature_map[var] = tokens_to_index

    return feature_map



In [10]:
feature_map_train = feature_mapping(train_data)
feature_map_test = feature_mapping(test_data)

In [11]:
# Vocab sizes
print({var: len(feature_map_train[var]) for var in feature_map_train})
print({var: len(feature_map_test[var]) for var in feature_map_test})

{'user': 1214, 'country': 134, 'format': 4, 'session': 4, 'token': 1941, 'part_of_speech': 17, 'dependency_label': 33, 'label': 3}
{'user': 1207, 'country': 134, 'format': 4, 'session': 4, 'token': 1707, 'part_of_speech': 17, 'dependency_label': 33, 'label': 3}


In [12]:
max_length_train = train_data['user'].value_counts().max()
max_length_test = test_data['user'].value_counts().max()
max_length_train, max_length_test

(7675, 1055)

In [13]:
# We'll just pad to the length of the longest individual sequence (apparently thats 8894)
def add_padding(sequences, feature_map, maxlen = None):
    index = [[feature_map[feature] for feature in sequence] for sequence in sequences]
    index = preprocessing.sequence.pad_sequences(index, maxlen, value = feature_map["_PAD_"])
    return index

In [14]:
indexed_train_data = {var: add_padding(train_data_listified[var], feature_map_train[var], 2048) for var in feature_map_train}
Y_train = indexed_train_data.pop('label')

indexed_test_data = {var: add_padding(test_data_listified[var], feature_map_test[var]) for var in feature_map_test}
Y_test = indexed_test_data.pop('label')

In [15]:
Y_train_oh = utils.to_categorical(Y_train)
Y_test_oh = utils.to_categorical(Y_test)

In [16]:
indexed_train_data['user'].shape, indexed_test_data['user'].shape

((1213, 2048), (1206, 1055))

In [17]:
Y_train_oh.shape, Y_test_oh.shape

((1213, 2048, 3), (1206, 1055, 3))

In [18]:
def create_embeddings(input_dim, output_dim, input_length = None, name = None):
    input_tensor = Input(shape = (input_length, ), name = name)
    embedding_layer = layers.Embedding(input_dim = input_dim, output_dim = output_dim, input_length = input_length, mask_zero = True, name = "embedding_{}".format(name))(input_tensor)

    return input_tensor, embedding_layer

In [19]:
def build_embeddings(feature_map, max_length = None):
    input_tensors = {}
    output_tensors = {}

    input_dims = {feature: len(feature_map[feature]) for feature in feature_map}
    
    output_dims = {}
    for feature in features:
        if len(feature_map[feature]) < 10:
            output_dims[feature] = 8
        if 10 <= len(feature_map[feature]) < 1000:
            output_dims[feature] = 64
        elif len(feature_map[feature]) >= 1000:
            output_dims[feature] = 256
  
    for feature in features:
        input_tensors[feature], output_tensors[feature] = create_embeddings(input_dims[feature], output_dims[feature], max_length, feature)

    return input_tensors, output_tensors

In [20]:
input_tensors, output_tensors = build_embeddings(feature_map_train)
input_tensors, output_tensors

({'user': <KerasTensor: shape=(None, None) dtype=float32 (created by layer 'user')>,
  'country': <KerasTensor: shape=(None, None) dtype=float32 (created by layer 'country')>,
  'format': <KerasTensor: shape=(None, None) dtype=float32 (created by layer 'format')>,
  'session': <KerasTensor: shape=(None, None) dtype=float32 (created by layer 'session')>,
  'token': <KerasTensor: shape=(None, None) dtype=float32 (created by layer 'token')>,
  'part_of_speech': <KerasTensor: shape=(None, None) dtype=float32 (created by layer 'part_of_speech')>,
  'dependency_label': <KerasTensor: shape=(None, None) dtype=float32 (created by layer 'dependency_label')>},
 {'user': <KerasTensor: shape=(None, None, 256) dtype=float32 (created by layer 'embedding_user')>,
  'country': <KerasTensor: shape=(None, None, 64) dtype=float32 (created by layer 'embedding_country')>,
  'format': <KerasTensor: shape=(None, None, 8) dtype=float32 (created by layer 'embedding_format')>,
  'session': <KerasTensor: shape=(N

In [21]:
embeddings = [output_tensors[feature] for feature in features]
embeddings_tensors = layers.Concatenate(name="embedding_all_features")(embeddings)

In [22]:
def lstm_layers(X, units, dropout = 0.0, num_layers = 1):
    for j in range(num_layers):
        X = layers.LSTM(units = units, dropout = dropout, return_sequences = True, name = "LSTM_layer_{}".format(num_layers))(X)
    return X

In [23]:
X = lstm_layers(embeddings_tensors, 256)

In [24]:
def dense_layer(X, units, dropout, activation):
    X = layers.Dropout(dropout)(X)
    X = layers.Dense(units = units, activation = activation)(X)
    return X

In [25]:
X = dense_layer(X, 256, 0.2, 'softmax')
outputs = dense_layer(X, 3, 0.1, 'sigmoid')

In [26]:
inputs = [input_tensors[feature] for feature in features]
LSTM_model = models.Model(inputs = inputs, outputs = outputs)
LSTM_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user (InputLayer)               [(None, None)]       0                                            
__________________________________________________________________________________________________
country (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
format (InputLayer)             [(None, None)]       0                                            
__________________________________________________________________________________________________
session (InputLayer)            [(None, None)]       0                                            
______________________________________________________________________________________________

In [27]:
X_train = [indexed_train_data[feature] for feature in features]
X_test = [indexed_test_data[feature] for feature in features]

optimizer = optimizers.Adam(learning_rate = 0.001)
LSTM_model.compile(optimizer = optimizer, loss='binary_crossentropy', metrics=['accuracy', tf.keras.metrics.AUC(), tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])


In [28]:
callbacks = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
LSTM_history = LSTM_model.fit(
    x = X_train,
    y = Y_train_oh,
    batch_size = 64,
    epochs = 20,
    validation_data = (X_test, Y_test_oh),
    shuffle = True,
    callbacks = [callbacks]
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [29]:
LSTM_model.save('LSTM_model_fr_en')

INFO:tensorflow:Assets written to: LSTM_model_fr_en/assets
INFO:tensorflow:Assets written to: LSTM_model_fr_en/assets


In [30]:
# LSTM_model = models.load_model('LSTM_model_en_es/')
Y_test_pred = LSTM_model.predict(X_test)

In [50]:
def evaluate_model(Y, Y_pred):
    Y_condensed = Y.reshape(-1, 3)
    Y_pred_condensed = Y_pred.reshape(-1, 3)
    
   
    # If first vertical axis is a 0, then there's padding
    padded = np.where(Y_condensed[:, 0] == 0)[0]
    Y_condensed = Y_condensed[padded, :]
    Y_pred_condensed = Y_pred_condensed[padded, :]

    test_metrics()
    metrics = evaluate_metrics(Y_condensed[:, -1], Y_pred_condensed[:, -1])

    return metrics

In [51]:
metrics = evaluate_model(Y_test_oh, Y_test_pred)

Verified that our environment is calculating metrics correctly.


In [52]:
metrics

{'accuracy': 0.08922763748398607,
 'avglogloss': 0.8359590496177308,
 'auroc': 0.9905292502086208,
 'F1': 0.16383652858844744}

In [33]:
f = open("fr_en_test_results.txt", 'w')
f.write(str(metrics))
f.close()