<a href="https://colab.research.google.com/github/mhuckvale/pals0039/blob/master/Answers_8_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[![PALS0039 Logo](https://www.phon.ucl.ac.uk/courses/pals0039/images/pals0039logo.png)](https://www.phon.ucl.ac.uk/courses/pals0039/)

# Exercise 8.2 Answers

Exercise developed from: [Semantic similarity with TF-Hub Universal Encoder](https://colab.research.google.com/github/tensorflow/hub/blob/master/examples/colab/semantic_similarity_with_tf_hub_universal_encoder.ipynb)

(a) set up

In [0]:
%tensorflow_version 2.x
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd


(b) load sentence encoder

In [0]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# Keep the encoder under its own name: the notebook-level name `model` is
# rebound to a Keras classifier in cell (e), and `embed` must keep calling the
# sentence encoder even when cells (c)/(d) are re-run after that cell.
sentence_encoder = hub.load(module_url)
model = sentence_encoder  # alias kept so code that expects `model` here still works
print ("module %s loaded" % module_url)

def embed(input):
  """Embed a batch of texts with the loaded sentence encoder.

  Args:
    input: batch of strings to embed (this notebook always passes a list).
      The parameter name shadows the builtin `input`; it is kept unchanged
      so existing keyword callers continue to work.

  Returns:
    The encoder's output for the batch. Callers in this notebook index it
    per message and call `.numpy()` on a row.
  """
  return sentence_encoder(input)

(c) try out a few embeddings

In [0]:
# Three inputs of increasing length: a single word, one sentence, a short paragraph.
word = "Elephant"
sentence = "I am a sentence for which I would like to get its embedding."
# The paragraph is assembled from its pieces with single spaces between them.
paragraph = " ".join([
    "Universal Sentence Encoder embeddings also support short paragraphs.",
    "There is no hard limit on how long the paragraph is. Roughly, the longer",
    "the more 'diluted' the embedding will be.",
])
messages = [word, sentence, paragraph]

def print_embedding(messg):
  """Print a message, the length of its embedding and its first five values.

  Args:
    messg: the text to embed; it is wrapped in a one-element list because
      `embed` is called with a batch.
  """
  print("Message:",messg)
  batch = embed([messg])
  # Only one message was sent, so its embedding is row 0 of the batch.
  vector = batch[0].numpy()
  print("Embedding Size:",len(vector))
  print("Embedding:",vector[:5],"...\n")

# Show the embedding summary for each sample input in turn.
for messg in messages:
  print_embedding(messg)


(d) Compare similarity of sentences

In [0]:
# Example sentences grouped by topic, so that the similarity heatmap can be
# read group by group.
messages_by_topic = {
    "Smartphones": [
        "I like my phone",
        "My phone is not good.",
        "Your cellphone looks great.",
    ],
    "Weather": [
        "Will it snow tomorrow?",
        "Recently a lot of hurricanes have hit the US",
        "Global warming is real",
    ],
    "Food and health": [
        "An apple a day, keeps the doctors away",
        "Eating strawberries is healthy",
        "Is paleo better than keto?",
    ],
    "Asking about age": [
        "How old are you?",
        "what is your age?",
    ],
}

# Flatten in topic order so sentences on the same topic stay adjacent.
messages = [text for group in messages_by_topic.values() for text in group]

# Embed every sentence in one batch.
message_embeddings = embed(messages)

# Pairwise inner products of the embeddings: entry [i, j] compares
# messages[i] with messages[j].
corr = np.inner(message_embeddings,message_embeddings)

# Explicit figure/axes interface; both axes are labelled with the sentences
# and a colour bar is added so the values can be read off the plot.
fig, ax = plt.subplots(figsize=(6,6))
im = ax.imshow(corr,origin='upper',cmap="YlOrRd",aspect='auto')
ticks = range(len(messages))
ax.set_yticks(ticks)
ax.set_yticklabels(messages)
ax.set_xticks(ticks)
ax.set_xticklabels(messages, rotation=90)
ax.set_title("Sentence similarity (inner product of embeddings)")
fig.colorbar(im, ax=ax)
plt.show()


(e) load a document classification task

In [0]:
'''Trains and evaluates a simple MLP
on the Reuters newswire topic classification task.
'''

import numpy as np
%tensorflow_version 2.x
from tensorflow.keras.datasets import reuters
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Experiment settings: vocabulary size kept, mini-batch size, training epochs.
max_words = 1000
batch_size = 32
epochs = 5

print('Loading data...')
(x_train, y_train), (x_test, y_test) = reuters.load_data(
    num_words=max_words,
    test_split=0.2,
)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

# Build an index -> word lookup. Indices 0-2 are given marker names here and
# every word index from the Reuters word list is shifted up by 3
# (this matches the default index_from=3 of load_data -- see the Keras docs).
dictionary = reuters.get_word_index()
reverse_dictionary = {0: "padding", 1: "BOS", 2: "UNK"}
reverse_dictionary.update(
    (index + 3, token) for token, index in dictionary.items()
)

# Peek at the first ten entries; indices with no entry show up as None.
print([reverse_dictionary.get(idx) for idx in range(10)])

def reuters_text(seq):
  """Decode a sequence of word indices into a space-separated string.

  Args:
    seq: iterable of integer word indices; each one is looked up in the
      notebook-level `reverse_dictionary`.

  Returns:
    The looked-up words joined by single spaces.

  Raises:
    KeyError: if an index has no entry in `reverse_dictionary`.
  """
  return " ".join(reverse_dictionary[widx] for widx in seq)

# Decode the first training sequence back to text as a sanity check.
# (These are Reuters newswires, not reviews, so the label says so.)
print("First newswire:",reuters_text(x_train[0]))

# Class count is taken as the largest label plus one
# (assumes the integer labels start at 0).
num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

print('Vectorizing sequence data...')
# Convert each index sequence into a binary matrix row via the Keras
# Tokenizer; the same tokenizer settings are applied to both splits.
tokenizer = Tokenizer(num_words=max_words)
x_train, x_test = (
    tokenizer.sequences_to_matrix(sequences, mode='binary')
    for sequences in (x_train, x_test)
)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
# Encode the integer labels of both splits with to_categorical.
y_train, y_test = (
    to_categorical(labels, num_classes)
    for labels in (y_train, y_test)
)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

print('Building model...')
# NOTE: this rebinds the notebook-level name `model`, which cell (b) bound to
# the sentence encoder.
# Network: one hidden layer of 512 ReLU units with 50% dropout, followed by a
# softmax layer with one output per class.
model = Sequential([
    Dense(512, input_shape=(max_words,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train, holding out 10% of the training data for validation.
history = model.fit(
    x_train,
    y_train,
    validation_split=0.1,
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

# Evaluate on the test split; score[0] is the loss and score[1] the accuracy
# metric requested at compile time.
score = model.evaluate(
    x_test,
    y_test,
    verbose=1,
    batch_size=batch_size,
)
print('Test score:', score[0])
print('Test accuracy:', score[1])
