# [Keras + Universal Sentence Encoder = Transfer Learning for text data](https://www.dlology.com/blog/keras-meets-universal-sentence-encoder-transfer-learning-for-text-data/) Tutorial
## Universal Sentence Encoder

This notebook illustrates how to access the Universal Sentence Encoder and use it for sentence similarity and sentence classification tasks.

The Universal Sentence Encoder makes getting sentence level embeddings as easy as it has historically been to lookup the embeddings for individual words. The sentence embeddings can then be trivially used to compute sentence level meaning similarity as well as to enable better performance on downstream classification tasks using less supervised training data.


More detailed information about installing Tensorflow can be found at [https://www.tensorflow.org/install/](https://www.tensorflow.org/install/).

In [3]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import keras.layers as layers
from keras.models import Model
from keras import backend as K
np.random.seed(10)

Using TensorFlow backend.


In [4]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]

embed = hub.Module(module_url)

INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.


In [5]:
embed_size = embed.get_output_info_dict()['default'].get_shape()[1].value
embed_size

512

In [7]:
file_list=os.listdir('../../dataset/df_train.pkl.gz/')
df_train=pd.DataFrame()
for file in file_list:
    df_train=pd.concat([df_train, 
                        pd.read_pickle('../../dataset/df_train.pkl.gz/'+file, compression='gzip')])
df_train['mission_prgrm']=df_train['mission']+'; '+df_train['prgrm_dsc']

small_num=0
while small_num<100: # Make sure each category has at least 100 records.
    trainDF = df_train[df_train.mission.notna() & df_train.NTEE1.notna()].sample(40000)
    small_num=trainDF.groupby('NTEE1').count().sort_values('EIN').iloc[0]['EIN']
    
#### Shuffle ####
trainDF = trainDF.sample(frac=1)

df_train = pd.DataFrame()
df_train['text'] = trainDF['mission_prgrm'][:35000]
df_train['label'] = trainDF['NTEE1'][:35000].astype('category')

df_train.head()


Unnamed: 0,text,label
66185,DISASTER RELIEF FOR HAITI. THIS IS AN OFFSHOOT...,X
482050,PROVIDE AFFORDABLE AND SAFE HOUSING FOR ADULTS...,L
89888,Promote literacy through book ownership; For t...,B
122953,To advance the art and craft of writing by enc...,A
580623,DEFEND AND EDUCATE MEMBERSHIP; DEFENDED THE ME...,J


In [8]:
category_counts = len(df_train.label.cat.categories)
category_counts
df_train.label.cat.categories

Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
       'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y'],
      dtype='object')

## Wrap embed module in a Lambda layer
Explicitly cast the input as a string

In [9]:
def UniversalEmbedding(x):
    return embed(tf.squeeze(tf.cast(x, tf.string)), signature="default", as_dict=True)["default"]

In [10]:
train_text = df_train['text'].tolist()
train_text = np.array(train_text, dtype=object)[:, np.newaxis]

train_label = np.asarray(pd.get_dummies(df_train.label), dtype = np.int8)

In [11]:
train_text.shape

(35000, 1)

In [12]:
train_label.shape

(35000, 25)

In [13]:
df_test = pd.DataFrame()
df_test['text'] = trainDF['mission_prgrm'][35000:]
df_test['label'] = trainDF['NTEE1'][35000:].astype('category')

df_test.head()

Unnamed: 0,text,label
1608050,UCP OF CENTRAL FLORIDA IS A VOLUNTARY HEALTH A...,G
1371450,Educating Today ... Preserving for TomorrowOur...,C
2191610,TO SUPPORT THE CHARITABLE AND EDUCATIONAL ACTI...,P
35700,"THE MISSION OF MIDVALE ARTS COUNCIL, INC. IS T...",A
9972,OUR ORGANIZATION PROVIDES FOR THE OPERATION AN...,M


In [14]:
test_text = df_test['text'].tolist()
test_text = np.array(test_text, dtype=object)[:, np.newaxis]
test_label = np.asarray(pd.get_dummies(df_test.label), dtype = np.int8)

In [22]:
'''
input_text = layers.Input(shape=(1,), dtype=tf.string)
embedding = layers.Lambda(UniversalEmbedding, output_shape=(embed_size,))(input_text)
embedding1 = layers.Embedding(input_dim = 1000, output_dim = 512)(embedding)
lstm = layers.GRU(units=512, activation='tanh', return_sequences=False)(embedding1)
drp = layers.Dropout(0.2)(lstm)
dense = layers.Dense(256, activation='tanh')(drp)
pred = layers.Dense(category_counts, activation='softmax')(dense)
model = Model(inputs=[input_text], outputs=pred)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
'''

input_text = layers.Input(shape=(1,), dtype=tf.string)
embedding = layers.Lambda(UniversalEmbedding, output_shape=(embed_size,))(input_text)
dense1 = layers.Dense(512, activation='tanh')(embedding)
dense2 = layers.Dense(256, activation='tanh')(dense1)
dense3 = layers.Dense(128, activation = 'tanh')(dense2)
pred = layers.Dense(category_counts, activation='softmax')(dense3)
model = Model(inputs=[input_text], outputs=pred)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])  #rmsprop: 71
model.summary()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
lambda_5 (Lambda)            (None, 512)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 512)               262656    
_________________________________________________________________
dense_15 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_16 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_17 (Dense)             (None, 25)                3225      
Total params: 430,105
Trainable params: 430,105
Non-t

## Train Keras model and save weights
This only train and save our Keras layers not the embed module' weights.

In [23]:
with tf.Session() as session:
  K.set_session(session)
  session.run(tf.global_variables_initializer())
  session.run(tf.tables_initializer())
  history = model.fit(train_text, 
            train_label,
            validation_data=(test_text, test_label),
            epochs=10,
            batch_size=1000)
  model.save_weights('./model.h5')

Train on 35000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
!ls -alh | grep model.h5

-rw-r--r-- 1 root root 534K Jun 10 03:54 model.h5


## Make predictions

In [None]:
new_text = ["Mission statement"]
new_text = np.array(new_text, dtype=object)[:, np.newaxis]
with tf.Session() as session:
  K.set_session(session)
  session.run(tf.global_variables_initializer())
  session.run(tf.tables_initializer())
  model.load_weights('./model.h5')  
  predicts = model.predict(new_text, batch_size=32)