# [Keras + Universal Sentence Encoder = Transfer Learning for text data](https://www.dlology.com/blog/keras-meets-universal-sentence-encoder-transfer-learning-for-text-data/) Tutorial
## Universal Sentence Encoder

This notebook illustrates how to access the Universal Sentence Encoder and use it for sentence similarity and sentence classification tasks.

The Universal Sentence Encoder makes getting sentence-level embeddings as easy as it has historically been to look up the embeddings for individual words. The sentence embeddings can then be trivially used to compute sentence-level meaning similarity as well as to enable better performance on downstream classification tasks using less supervised training data.


More detailed information about installing TensorFlow can be found at [https://www.tensorflow.org/install/](https://www.tensorflow.org/install/).

In [1]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
#import seaborn as sns
import tensorflow as tf
import tensorflow_hub as hub

import keras.layers as layers
from keras import backend as K
from keras.callbacks import ModelCheckpoint
from keras.models import Model
# Seed NumPy's global RNG, which the un-seeded pandas `.sample()` calls in the
# data-preparation cell draw from. NOTE: no TensorFlow seed is set anywhere in
# this notebook, so weight initialisation is not covered by this seed.
np.random.seed(10)

Using TensorFlow backend.


In [2]:
# ---- Load the raw records -------------------------------------------------
# Directory of gzip-compressed pickle shards that together form the data set.
# NOTE: unpickling can execute arbitrary code -- only read trusted shards.
data_dir = '../../dataset/df_train.pkl.gz/'

# os.listdir() returns entries in arbitrary order; sort them so the row order
# (and therefore the seeded sampling below) is the same on every machine/run.
file_list = sorted(os.listdir(data_dir))
if not file_list:
    raise FileNotFoundError('No data shards found in ' + data_dir)

# Read every shard, then concatenate once. Calling pd.concat inside the loop
# re-copies all previously loaded rows for each new shard (quadratic cost).
df_train = pd.concat(
    [pd.read_pickle(data_dir + shard, compression='gzip') for shard in file_list]
)
# Text fed to the model: mission statement followed by program description.
# NOTE(review): a missing `prgrm_dsc` makes this NaN and would break the
# `.split()` below -- only `mission` and `NTEE1` are null-filtered. TODO confirm
# that `prgrm_dsc` is always populated.
df_train['mission_prgrm'] = df_train['mission'] + '; ' + df_train['prgrm_dsc']

# ---- Draw a sample in which every category is reasonably represented ------
SAMPLE_SIZE = 40000              # rows drawn per attempt (train + held-out)
MIN_RECORDS_PER_CATEGORY = 100   # smallest acceptable per-category count
MAX_TOKENS = 100                 # tokens per record; the model cell uses the same length
PAD_TOKEN = 'NaN'                # filler appended to records shorter than MAX_TOKENS

# The null filter does not change between attempts, so compute it once.
eligible = df_train[df_train.mission.notna() & df_train.NTEE1.notna()]

# Re-draw until the rarest NTEE1 category present in the sample has at least
# MIN_RECORDS_PER_CATEGORY rows. NOTE: this loops for as long as draws keep
# containing an under-represented category.
small_num = 0
while small_num < MIN_RECORDS_PER_CATEGORY:
    trainDF = eligible.sample(SAMPLE_SIZE)
    small_num = trainDF.groupby('NTEE1')['EIN'].count().min()

#### Shuffle ####
trainDF = trainDF.sample(frac=1)

# ---- Tokenise on whitespace and force every record to MAX_TOKENS tokens ----
X = trainDF['mission_prgrm'].apply(lambda x: x.split())

# Appending MAX_TOKENS pad tokens and slicing pads short records and truncates
# long ones in a single expression.
Y = [(tokens + [PAD_TOKEN] * MAX_TOKENS)[:MAX_TOKENS] for tokens in X.values]

trainDF['missionprgrm'] = Y


In [3]:
# Training split: the first 35,000 rows of the shuffled sample, keeping the
# padded token lists as `text` and the NTEE1 code as a categorical `label`.
df_train = (
    trainDF[['missionprgrm', 'NTEE1']]
    .iloc[:35000]
    .rename(columns={'missionprgrm': 'text', 'NTEE1': 'label'})
    .astype({'label': 'category'})
)

df_train.head()

Unnamed: 0,text,label
1851452,"[TO, PROMOTE, THE, PROPAGATION, OF, WILDLIFE,,...",N
1643413,"[FRIENDS, SCHOOL, IS, AN, INDEPENDENT,, COEDUC...",B
1165595,"[Encourage, High, School, seniors, to, explore...",B
424733,"[TO, COORDINATE, AND, PROMOTE, THE, GROWTH, OF...",N
571969,"[TO, PROVIDE, RELIEF, TO, POOR,, DISTRESSED, A...",P


In [4]:
# Number of distinct label categories in the training split; the model cell
# uses it as the width of the softmax output layer.
category_counts = len(df_train['label'].cat.categories)
category_counts

25

## Wrap embed module in a Lambda layer
Explicitly cast the input as a string

In [5]:
# Stack the padded token lists into a 2-D object array (one row per record).
train_text = np.asarray(df_train['text'].tolist(), dtype=object)

# One-hot encode the categorical labels: one int8 column per category.
train_label = np.asarray(
    pd.get_dummies(df_train['label']),
    dtype=np.int8,
)

In [6]:
# Sanity check: expect (training rows, tokens per padded record).
train_text.shape

(35000, 100)

In [7]:
# Sanity check: expect (training rows, number of label categories).
train_label.shape

(35000, 25)

In [8]:
# Held-out split: every sampled row after the first 35,000.
df_test = pd.DataFrame()
df_test['text'] = trainDF['missionprgrm'][35000:]
# Encode the held-out labels with the *training* category set. Categorising
# this slice on its own would drop any class that happens to be absent from
# it, giving one-hot columns that no longer line up with the training labels
# or the model's softmax outputs. A label never seen in training becomes NaN
# (an all-zero one-hot row).
df_test['label'] = (
    trainDF['NTEE1'][35000:]
    .astype('category')
    .cat.set_categories(df_train['label'].cat.categories)
)

df_test.head()

Unnamed: 0,text,label
1923689,"[IN, AN, ENVIRONMENT, OF, CHRISTIAN, LOVE, AND...",P
2282676,"[PROVIDE, A, MEETING, PLACE, FOR, AND, ASSISTA...",W
1121670,"[TO, RAISE, FUNDS, TO, ASSIST, OTHER, ONN-PROF...",W
1997700,"[PROVIDE, SUPPORT, TO, THE, SIGMA, ALPHA, EPSI...",B
2360642,"[TO, PROVIDE, HOUSING, TO, THE, ELDERLY, OR, D...",L


In [9]:
# Stack the held-out token lists into a 2-D object array (one row per record).
test_text = np.asarray(df_test['text'].tolist(), dtype=object)

# One-hot encode the held-out labels: one int8 column per category.
test_label = np.asarray(
    pd.get_dummies(df_test['label']),
    dtype=np.int8,
)

In [10]:
# TF-Hub handle of the sentence-encoder module to load (Colab form field).
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]
# Load the module with trainable=False so it is used as a fixed encoder.
embed = hub.Module(module_url, name='text_embedding', trainable=False)

# Number of tokens in every padded record fed to the model.
SEQ_LENGTH = 100

# Second dimension of the module's 'default' output, i.e. the embedding width.
embed_size = (
    embed.get_output_info_dict()['default']
    .get_shape()[1]
    .value
)
EMBEDDING_DIM = embed_size

def Embed_sentence(sentences):
    """Embed every token of every record with the hub sentence encoder.

    Args:
        sentences: tensor of string tokens whose first axis is the batch. The
            final reshape requires exactly SEQ_LENGTH tokens per record.

    Returns:
        The encoder's 'default' output reshaped to
        (batch, SEQ_LENGTH, EMBEDDING_DIM): one vector per token.
    """
    batch_size = tf.shape(sentences)[0]
    # Fold the token axis into the batch axis so the encoder sees one flat
    # list of strings. The result is already 1-D, so no squeeze is needed
    # (squeezing would collapse a single-element batch to a scalar).
    flat_sentences = tf.cast(tf.reshape(sentences, [-1]), tf.string)
    embeddings = embed(flat_sentences, signature="default", as_dict=True)["default"]
    # Restore the (record, token) structure expected by the recurrent layers.
    return tf.reshape(embeddings, [batch_size, SEQ_LENGTH, EMBEDDING_DIM])
    

INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.


In [16]:
# Each record arrives as SEQ_LENGTH string tokens. Use the shared constant
# rather than a literal so the input width cannot drift from the reshape
# performed inside Embed_sentence.
input_text = layers.Input(shape=(SEQ_LENGTH,), dtype=tf.string)
# Per-token embeddings from the hub module, wrapped as a Keras layer.
embedding = layers.Lambda(Embed_sentence)(input_text)
# Two stacked LSTMs: the first returns the full sequence for the second to
# consume, the second returns only its final state.
lstm = layers.LSTM(units=512, return_sequences=True, activation='tanh')(embedding)
lstm2 = layers.LSTM(units=256, return_sequences=False, activation='tanh')(lstm)
drp = layers.Dropout(0.5)(lstm2)
pred = layers.Dense(128, activation='tanh')(drp)
# One softmax unit per label category.
pred1 = layers.Dense(category_counts, activation='softmax')(pred)
model = Model(inputs=[input_text], outputs=pred1)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.summary()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 100)               0         
_________________________________________________________________
lambda_3 (Lambda)            (None, 100, 512)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 100, 512)          2099200   
_________________________________________________________________
lstm_5 (LSTM)                (None, 256)               787456    
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (

## Train Keras model and save weights
This trains and saves only the weights of our Keras layers, not the embed module's weights.

In [17]:
# Write the Keras layer weights after every epoch. Saving only once, after
# fit() returns, loses all progress if the run is interrupted part-way.
checkpoint = ModelCheckpoint('./model.h5', save_weights_only=True)

with tf.Session() as session:
  K.set_session(session)
  # Initialise all graph variables and lookup tables before training.
  session.run(tf.global_variables_initializer())
  session.run(tf.tables_initializer())
  history = model.fit(train_text, 
            train_label,
            validation_data=(test_text, test_label),
            epochs=30,
            batch_size=1000,
            callbacks=[checkpoint])
  # Final save after the last epoch (same file the checkpoint writes to).
  model.save_weights('./model.h5')

Train on 35000 samples, validate on 5000 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
 1000/35000 [..............................] - ETA: 2:53 - loss: 1.3749 - acc: 0.6340

KeyboardInterrupt: 

In [45]:
# List the saved weights file to confirm it exists and check its size.
!ls -alh | grep model.h5

-rw-rw-r-- 1 isha isha  12M Dec 20 02:46 model.h5


## Make predictions

In [None]:
new_text = ["Mission statement"]
# The model takes SEQ_LENGTH tokens per record, so prepare new text the same
# way as the training data: split on whitespace, then pad with 'NaN' or
# truncate to exactly SEQ_LENGTH tokens. (A (1, 1) array holding the raw
# sentence cannot be reshaped to (batch, SEQ_LENGTH, EMBEDDING_DIM).)
new_text = np.array(
    [(text.split() + ['NaN'] * SEQ_LENGTH)[:SEQ_LENGTH] for text in new_text],
    dtype=object,
)
with tf.Session() as session:
  K.set_session(session)
  # Initialise variables and tables first, then overwrite the Keras layer
  # weights with the trained values from disk.
  session.run(tf.global_variables_initializer())
  session.run(tf.tables_initializer())
  model.load_weights('./model.h5')  
  predicts = model.predict(new_text, batch_size=32)