# [Keras + Universal Sentence Encoder = Transfer Learning for text data](https://www.dlology.com/blog/keras-meets-universal-sentence-encoder-transfer-learning-for-text-data/) Tutorial
## Universal Sentence Encoder

This notebook illustrates how to access the Universal Sentence Encoder and use it for sentence similarity and sentence classification tasks.

The Universal Sentence Encoder makes getting sentence level embeddings as easy as it has historically been to lookup the embeddings for individual words. The sentence embeddings can then be trivially used to compute sentence level meaning similarity as well as to enable better performance on downstream classification tasks using less supervised training data.


More detailed information about installing Tensorflow can be found at [https://www.tensorflow.org/install/](https://www.tensorflow.org/install/).

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import keras.layers as layers
from keras.models import Model
from keras import backend as K
np.random.seed(10)

Using TensorFlow backend.


In [4]:
file_list=os.listdir('../../dataset/df_train.pkl.gz/')
df_train=pd.DataFrame()
for file in file_list:
    df_train=pd.concat([df_train, 
                        pd.read_pickle('../../dataset/df_train.pkl.gz/'+file, compression='gzip')])
df_train['mission_prgrm']=df_train['mission']+'; '+df_train['prgrm_dsc']

small_num=0
while small_num<100: # Make sure each category has at least 100 records.
    trainDF = df_train[df_train.mission.notna() & df_train.NTEE1.notna()].sample(40000)
    small_num=trainDF.groupby('NTEE1').count().sort_values('EIN').iloc[0]['EIN']
    
#### Shuffle ####
trainDF = trainDF.sample(frac=1)

df_train = pd.DataFrame()
df_train['text'] = trainDF['mission_prgrm'][:35000]
df_train['label'] = trainDF['NTEE1'][:35000].astype('category')

df_train.head()


592785    K
592789    N
592797    L
592800    O
592823    Y
592836    B
592847    W
592851    D
592863    P
592871    A
592915    S
948786    J
948912    E
948920    H
948924    U
949011    R
949035    I
949137    F
949269    X
949279    Q
949322    C
949615    G
949640    T
951430    M
956483    V
Name: NTEE1, dtype: object


Unnamed: 0,text,label
1851452,"TO PROMOTE THE PROPAGATION OF WILDLIFE, FOSTER...",N
1643413,"FRIENDS SCHOOL IS AN INDEPENDENT, COEDUCATIONA...",B
1165595,Encourage High School seniors to explore the e...,B
424733,TO COORDINATE AND PROMOTE THE GROWTH OF TENNIS...,N
571969,"TO PROVIDE RELIEF TO POOR, DISTRESSED AND UNDE...",P


In [5]:
category_counts = len(df_train.label.cat.categories)
category_counts

Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
       'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y'],
      dtype='object')

## Wrap embed module in a Lambda layer
Explicitly cast the input as a string

In [7]:
train_text = df_train['text'].tolist()
train_text = np.array(train_text, dtype=object)[:, np.newaxis]

train_label = np.asarray(pd.get_dummies(df_train.label), dtype = np.int8)

In [8]:
train_text.shape

(35000, 1)

In [9]:
train_label.shape

(35000, 25)

In [10]:
df_test = pd.DataFrame()
df_test['text'] = trainDF['mission_prgrm'][35000:]
df_test['label'] = trainDF['NTEE1'][35000:].astype('category')

df_test.head()

Unnamed: 0,text,label
1923689,IN AN ENVIRONMENT OF CHRISTIAN LOVE AND SERVIC...,P
2282676,PROVIDE A MEETING PLACE FOR AND ASSISTANCE TO ...,W
1121670,TO RAISE FUNDS TO ASSIST OTHER ONN-PROFIT ORGA...,W
1997700,PROVIDE SUPPORT TO THE SIGMA ALPHA EPSILON FRA...,B
2360642,TO PROVIDE HOUSING TO THE ELDERLY OR DISABLED....,L


In [11]:
test_text = df_test['text'].tolist()
test_text = np.array(test_text, dtype=object)[:, np.newaxis]
test_label = np.asarray(pd.get_dummies(df_test.label), dtype = np.int8)

In [66]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]
embed = hub.Module(module_url)

embed_size = embed.get_output_info_dict()['default'].get_shape()[1].value
embed_size

SEQ_LENGTH = 500
EMBEDDING_DIM = embed_size

def Embed_pad(sentence):
    words = sentence.split()
    none = ['NaN'] * (1000-len(words))
    words = words + none
    words = tf.constant(words)
    words = tf.reshape(words, [-1])
    result = embed(words)
    result = tf.reshape(result, [1, SEQ_LENGTH, EMBEDDING_DIM]) # the second array can be constructed with tf.concat, tf.shape(words) and [-1].

    return result
    
Embed_pad("This is just a sample to check the output")

'''
x = tf.constant("This is string")
x = tf.decode_raw(x, tf.uint8)
print(x)
y = x[:4]
sess = tf.InteractiveSession()
print(y.eval())
'''

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


'\nx = tf.constant("This is string")\nx = tf.decode_raw(x, tf.uint8)\nprint(x)\ny = x[:4]\nsess = tf.InteractiveSession()\nprint(y.eval())\n'

In [60]:
input_text = layers.Input(shape=(1,), dtype=tf.string)
#embedding = layers.Embedding(UniversalEmbedding, output_dim=embed_size)(input_text)
embedding = layers.Lambda(Embed_pad)(input_text)
#embedto3 = layers.Embedding(input_dim = 1000, output_dim = 512)(embedding)
conv = layers.GRU(filters=512, activation='tanh')(embedding)
dense = layers.Dense(256, activation='tanh')(conv)
pred = layers.Dense(category_counts, activation='softmax')(dense)
model = Model(inputs=[input_text], outputs=pred)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.summary()

AttributeError: 'Tensor' object has no attribute 'split'

## Train Keras model and save weights
This only train and save our Keras layers not the embed module' weights.

In [20]:
with tf.Session() as session:
  K.set_session(session)
  session.run(tf.global_variables_initializer())
  session.run(tf.tables_initializer())
  history = model.fit(train_text, 
            train_label,
            validation_data=(test_text, test_label),
            epochs=10,
            batch_size=1000)
  model.save_weights('./model.h5')

ValueError: Error when checking input: expected input_6 to have shape (1000,) but got array with shape (1,)

In [None]:
!ls -alh | grep model.h5

-rw-r--r-- 1 root root 534K Jun 10 03:54 model.h5


## Make predictions

In [None]:
new_text = ["Mission statement"]
new_text = np.array(new_text, dtype=object)[:, np.newaxis]
with tf.Session() as session:
  K.set_session(session)
  session.run(tf.global_variables_initializer())
  session.run(tf.tables_initializer())
  model.load_weights('./model.h5')  
  predicts = model.predict(new_text, batch_size=32)