# PLAN

## 1) Elmo embeddings in Keras - avec les données de l'exemple

## 2) Elmo embeddings in Keras - avec les données Toxic Comment

## 3) Based on another example code

## 4) Actually works, adapted from 3) ...

# 1) Elmo embeddings in Keras - avec les données de l'exemple

https://towardsdatascience.com/elmo-embeddings-in-keras-with-tensorflow-hub-7eb6f0145440

goes with https://github.com/strongio/keras-elmo/blob/master/Elmo%20Keras.ipynb

importing https://tfhub.dev/google/elmo/2

In [1]:
# Import our dependencies
import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
import os
import re
from keras import backend as K
import keras.layers as layers
from keras.models import Model, load_model
from keras.engine import Layer
import numpy as np

# Initialize session
sess = tf.Session()
K.set_session(sess)

Using TensorFlow backend.


### Chargement des données

In [2]:
# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
  data = {}
  data["sentence"] = []
  data["sentiment"] = []
  for file_path in os.listdir(directory):
    with tf.gfile.GFile(os.path.join(directory, file_path), "r") as f:
      data["sentence"].append(f.read())
      data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
  return pd.DataFrame.from_dict(data)

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
  pos_df = load_directory_data(os.path.join(directory, "pos"))
  neg_df = load_directory_data(os.path.join(directory, "neg"))
  pos_df["polarity"] = 1
  neg_df["polarity"] = 0
  return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
  dataset = tf.keras.utils.get_file(
      fname="aclImdb.tar.gz", 
      origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
      extract=True)

  train_df = load_dataset(os.path.join(os.path.dirname(dataset), 
                                       "aclImdb", "train"))
  test_df = load_dataset(os.path.join(os.path.dirname(dataset), 
                                      "aclImdb", "test"))

  return train_df, test_df

# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

train_df, test_df = download_and_load_datasets()
train_df.head()

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


Unnamed: 0,sentence,sentiment,polarity
0,Humphrey Bogart clearly did not want to be in ...,3,0
1,"*SPOILERS* Four men, Ed (Jon Voight), Lewis (B...",9,1
2,I can honestly say I never expected this movie...,8,1
3,This film came recommended as a good action fi...,4,0
4,"Dear Richard, I know we all loved you on Home ...",1,0


### Définition du modèle

In [23]:
class ElmoEmbeddingLayer(Layer):
    def __init__(self, **kwargs):
        self.dimensions = 1024
        self.trainable=True
        super(ElmoEmbeddingLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=self.trainable,
                               name="{}_module".format(self.name))

        self.trainable_weights += K.tf.trainable_variables(scope="^{}_module/.*".format(self.name))
        super(ElmoEmbeddingLayer, self).build(input_shape)

    def call(self, x, mask=None):
        result = self.elmo(K.squeeze(K.cast(x, tf.string), axis=1),
                      as_dict=True,
                      signature='default',
                      )['default']
        return result

    def compute_mask(self, inputs, mask=None):
        return K.not_equal(inputs, '--PAD--')

    def compute_output_shape(self, input_shape):
        print(input_shape[0])
        return (input_shape[0], self.dimensions)

In [24]:
# Function to build model

def build_model(): 
  input_text = layers.Input(shape=(1,), dtype="string")
  embedding = ElmoEmbeddingLayer()(input_text)
  dense = layers.Dense(256, activation='relu')(embedding)
  pred = layers.Dense(1, activation='sigmoid')(dense)

  model = Model(inputs=[input_text], outputs=pred)

  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.summary()
  
  return model

In [5]:
# Create datasets (Only take up to 150 words for memory)
train_text = train_df['sentence'].tolist()
train_text = [' '.join(t.split()[0:150]) for t in train_text]
train_text = np.array(train_text, dtype=object)[:, np.newaxis]
train_label = train_df['polarity'].tolist()

test_text = test_df['sentence'].tolist()
test_text = [' '.join(t.split()[0:150]) for t in test_text]
test_text = np.array(test_text, dtype=object)[:, np.newaxis]
test_label = test_df['polarity'].tolist()

In [8]:
type(train_text)

numpy.ndarray

In [14]:
train_text

array([['Humphrey Bogart clearly did not want to be in this film, and be forced to play a part-Mexican or he would have been suspended. Believe me , he made the wrong choice! Presumably, after the success of "Dodge City", Warners tried a follow-up with Errol Flynn and his usual list of buddies, like Alan Hale, Guinn (Big Boy) Williams, Frank Mc Hugh and the ever-present John Litel, but they made the huge mistake of trying to present Miriam Hopkins as a love interest for Flynn v. Randolph Scott, and as a singer to really make things bad, because she proved one thing, and that is she cannot sing. The story was not too bad, but with Bogie clearly miscast also, it turned out to be a poor Western that was overlong, and on a low budget, but in fairness, color would not have helped.'],
       ["*SPOILERS* Four men, Ed (Jon Voight), Lewis (Burt Reynolds), Drew (Ronny Cox) and Bobby (Ned Beatty), decide to go on a rafting trip on the Cahulawassee river, before it is flooded.<br /><br />They wan

In [9]:
train_text.shape

(25000, 1)

In [13]:
train_text[0,0]

'Humphrey Bogart clearly did not want to be in this film, and be forced to play a part-Mexican or he would have been suspended. Believe me , he made the wrong choice! Presumably, after the success of "Dodge City", Warners tried a follow-up with Errol Flynn and his usual list of buddies, like Alan Hale, Guinn (Big Boy) Williams, Frank Mc Hugh and the ever-present John Litel, but they made the huge mistake of trying to present Miriam Hopkins as a love interest for Flynn v. Randolph Scott, and as a singer to really make things bad, because she proved one thing, and that is she cannot sing. The story was not too bad, but with Bogie clearly miscast also, it turned out to be a poor Western that was overlong, and on a low budget, but in fairness, color would not have helped.'

In [16]:
len(train_label)

25000

In [22]:
train_label[:40]

[0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1]

In [25]:
# Build and fit
model = build_model()

None
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
elmo_embedding_layer_2 (Elmo (None, 1024)              4         
_________________________________________________________________
dense_3 (Dense)              (None, 256)               262400    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 257       
Total params: 262,661
Trainable params: 262,661
Non-trainable params: 0
_________________________________________________________________


In [6]:
model.fit(train_text, 
          train_label,
          validation_data=(test_text, test_label),
          epochs=1,
          batch_size=32)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
elmo_embedding_layer_1 (Elmo (None, 1024)              4         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               262400    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 262,661
Trainable params: 262,661
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 25000 samples
Epoch 1/1
   64/25000 [..............................] - ETA: 5:11:29 - loss: 0.7168 - acc: 0.5000

KeyboardInterrupt: 

# 2) Elmo embeddings in Keras - avec les données Toxic Comment

In [1]:
# Import à nous
import numpy as np
np.random.seed(42)
from keras.preprocessing import text, sequence

from tools import *
from embeddings import *
from models import *


# Import du tutorial sur Elmo
import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
import os
import re
from keras import backend as K
import keras.layers as layers
from keras.models import Model, load_model
from keras.engine import Layer
import numpy as np

# Initialize session
sess = tf.Session()
K.set_session(sess)

Using TensorFlow backend.


## Import des données

In [2]:
# load raw string data
data_train, y_train_all, data_test, id_test = load_data()

In [3]:
# Create datasets (Only take up to 150 words for memory)
train_text = [' '.join(t.split()[0:150]) for t in data_train]
train_text = np.array(train_text, dtype=object)[:, np.newaxis]
train_label = y_train_all.tolist()

test_text = [' '.join(t.split()[0:150]) for t in data_test]
test_text = np.array(test_text, dtype=object)[:, np.newaxis]

# pas ici, on se passe de validation_data pour model
# test_label = test_df['polarity'].tolist()

## Définition du modèle

In [4]:
class ElmoEmbeddingLayer(Layer):
    def __init__(self, **kwargs):
        self.dimensions = 1024
        self.trainable=True
        super(ElmoEmbeddingLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=self.trainable,
                               name="{}_module".format(self.name))

        self.trainable_weights += K.tf.trainable_variables(scope="^{}_module/.*".format(self.name))
        super(ElmoEmbeddingLayer, self).build(input_shape)

    def call(self, x, mask=None):
        result = self.elmo(K.squeeze(K.cast(x, tf.string), axis=1),
                      as_dict=True,
                      signature='default',
                      )['default']
        return result

    def compute_mask(self, inputs, mask=None):
        return K.not_equal(inputs, '--PAD--')

    def compute_output_shape(self, input_shape):
#         return (input_shape[0], self.dimensions)
        return (1, self.dimensions)

In [5]:
# Function to build model

def build_model(): 
  input_text = layers.Input(shape=(1,), dtype="string")
  embedding = ElmoEmbeddingLayer()(input_text)
  dense = layers.Dense(256, activation='relu')(embedding)
  pred = layers.Dense(6, activation='sigmoid')(dense)

  model = Model(inputs=[input_text], outputs=pred)

  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.summary()
  
  return model

In [6]:
# model.fit(train_text, 
#           train_label,
#           validation_data=(test_text, test_label),
#           epochs=1,
#           batch_size=32)

# on n'a pas le test label nous, donc pas de validation_data
model = build_model()
model.fit(train_text, 
          train_label,
          epochs=1,
          batch_size=32)

INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.
Instructions for updating:
Colocations handled automatically by placer.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
elmo_embedding_layer_1 (Elmo (1, 1024)                 4         
_________________________________________________________________
dense_1 (Dense)              (1, 256)                  262400    
_________________________________________________________________
dense_2 (Dense)              (1, 6)                    1542      
Total params: 263,946
Trainable params: 263,946
Non-trainable params: 0
_________________________________________________________________


ValueError: Error when checking model target: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 1 array(s), but instead got the following list of 159571 arrays: [array([[0],
       [0],
       [0],
       [0],
       [0],
       [0]]), array([[0],
       [0],
       [0],
       [0],
       [0],
       [0]]), array([[0],
       [0],
       [0],
       [0],
   ...

# 3) Based on another example code

https://github.com/tensorflow/hub/issues/149

https://github.com/PrashantRanjan09/Elmo-Tutorial

https://github.com/PrashantRanjan09/WordEmbeddings-Elmo-Fasttext-Word2Vec/blob/master/word_embeddings.py

In [1]:
# Import à nous
import numpy as np
np.random.seed(42)
from keras.preprocessing import text, sequence

from tools import *
from embeddings import *
from models import *


# Import du tutorial sur Elmo
import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
import os
import re
from keras import backend as K
import keras.layers as layers
from keras.models import Model, load_model
from keras.engine import Layer
import numpy as np

# Initialize session
sess = tf.Session()
K.set_session(sess)

Using TensorFlow backend.


## Chargement des données

In [2]:
# load raw string data
data_train, y_train_all, data_test, id_test = load_data()

In [3]:
# Create datasets (Only take up to 150 words for memory)
train_text = [' '.join(t.split()[0:150]) for t in data_train]
train_text = np.array(train_text, dtype=object)[:, np.newaxis]
train_label = y_train_all.tolist()

test_text = [' '.join(t.split()[0:150]) for t in data_test]
test_text = np.array(test_text, dtype=object)[:, np.newaxis]

## Elmo

In [4]:
def ELMoEmbedding(x):
    elmo_model = hub.Module("https://tfhub.dev/google/elmo/1", trainable=True)
    return elmo_model(tf.squeeze(tf.cast(x, tf.string)), signature="default", as_dict=True)["default"]

In [5]:
input_text = layers.Input(shape=(1,), dtype=tf.string)
embed_seq = layers.Lambda(ELMoEmbedding, output_shape=(1024,))(input_text)
x = Dense(256,activation ="relu")(embed_seq)
preds = Dense(6,activation="sigmoid")(x)

model = Model(input_text,preds)
model.compile(loss="binary_crossentropy",optimizer="adam",metrics=["accuracy"])

INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.
Instructions for updating:
Colocations handled automatically by placer.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [6]:
# model.fit(X_train_pad,y_train,epochs=10,batch_size=512,validation_data=(X_test_pad,y_test))

model.fit(train_text,train_label,epochs=2,batch_size=512)

ValueError: Error when checking model target: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 1 array(s), but instead got the following list of 159571 arrays: [array([[0],
       [0],
       [0],
       [0],
       [0],
       [0]]), array([[0],
       [0],
       [0],
       [0],
       [0],
       [0]]), array([[0],
       [0],
       [0],
       [0],
   ...

In [None]:
predictions = model.predict(X_test_pad)
predictions = [0 if i<0.5 else 1 for i in predictions]
print("Accuracy: ",accuracy_score(y_test,predictions))
print("Classification Report: ",classification_report(y_test,predictions))

# 4) Actually works, adapted from 3) ...

In [None]:
# Import à nous
import numpy as np
np.random.seed(42)
from keras.preprocessing import text, sequence

from tools import *
from embeddings import *
from models import *


# Import du tutorial sur Elmo
import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
import os
import re
from keras import backend as K
import keras.layers as layers
from keras.models import Model, load_model
from keras.engine import Layer
import numpy as np

# Initialize session
sess = tf.Session()
K.set_session(sess)

# load raw string data
data_train, y_train_all, data_test, id_test = load_data()

# Create datasets (Only take up to 150 words for memory)
train_text = [' '.join(t.split()[0:150]) for t in data_train]
train_text = np.array(train_text, dtype=object)[:, np.newaxis]
train_label = y_train_all

test_text = [' '.join(t.split()[0:150]) for t in data_test]
test_text = np.array(test_text, dtype=object)[:, np.newaxis]

def ELMoEmbedding(x):
    elmo_model = hub.Module("https://tfhub.dev/google/elmo/1", trainable=True)
    return elmo_model(tf.squeeze(tf.cast(x, tf.string)), signature="default", as_dict=True)["default"]

input_text = layers.Input(shape=(1,), dtype=tf.string)
embed_seq = layers.Lambda(ELMoEmbedding, output_shape=(1024,))(input_text)
x = Dense(256,activation ="relu")(embed_seq)
preds = Dense(6,activation="sigmoid")(x)

model = Model(input_text,preds)
model.compile(loss="binary_crossentropy",optimizer="adam",metrics=["accuracy"])

# model.fit(X_train_pad,y_train,epochs=10,batch_size=512,validation_data=(X_test_pad,y_test))

model.fit(train_text,train_label,epochs=2,batch_size=32)

# save trained nnet to disk for later use
MODEL_NAME = "elmo_sentence150_elmo1_dense256"
save_nnet(model, MODEL_NAME)

# predict
y_test_pred = model.predict(test_text, batch_size=1024)

# write submission file
submission(y_test_pred, id_test, name=MODEL_NAME)