<a href="https://colab.research.google.com/github/mhuckvale/pals0039/blob/master/Answers_7_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[![PALS0039 Logo](https://www.phon.ucl.ac.uk/courses/pals0039/images/pals0039logo.png)](https://www.phon.ucl.ac.uk/courses/pals0039/)

# Exercise 7.2 Answers


(a) Setup

In [0]:
import requests
import numpy as np

%tensorflow_version 2.x
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Embedding, Flatten, SimpleRNN, LSTM, GRU, Bidirectional, Dropout, TimeDistributed
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import get_file


(b) load text

In [0]:
# Download the training corpus and flatten it into one lower-cased line of text.
url = "https://www.phon.ucl.ac.uk/courses/pals0039/data/cloze-corpus.txt"
response = requests.get(url, timeout=60)
# Stop here on an HTTP error instead of tokenizing an error page as if it were the corpus.
response.raise_for_status()
raw_text = response.text.lower().replace('\n',' ')
print("Corpus has",len(raw_text),"characters")
print(raw_text[:250])

(c) Tokenize

In [0]:
# Vocabulary size limit handed to the Tokenizer (also reused later as the model's output size).
max_words=10000

# Fit the word-level tokenizer on the corpus; "UNK" is the out-of-vocabulary token.
tokenizer = Tokenizer(oov_token="UNK", num_words=max_words)
tokenizer.fit_on_texts([raw_text])
word_index = tokenizer.word_index
print(f"Found {len(word_index)} different words.")

In [0]:
# Show the first and last ten (word, index) pairs of the fitted vocabulary.
vocab_items = list(word_index.items())
print(vocab_items[:10])
print(vocab_items[-10:])


In [0]:
# Encode the whole corpus as one list of word indices.
raw_seq = tokenizer.texts_to_sequences([raw_text])[0]
print(raw_seq[:100])
print("Max index", max(raw_seq))
# Index 1 is counted as out-of-vocabulary -- presumably the index of the "UNK" token; verify.
num_oov = raw_seq.count(1)
print("%OOV", 100*num_oov/len(raw_seq))

(d) prepare for training

In [0]:
# Cut the encoded corpus into fixed-length sequences, pair every input word
# with the word that follows it, shuffle, and hold out 10% for validation.
seq_len=100
# Reserve one spare token at the end so the last input word also has a genuine
# next-word target (rolling the array would wrap round to the first corpus word).
nseq=max(len(raw_seq)-1,0)//seq_len
tokens=np.asarray(raw_seq[:nseq*seq_len+1])
seq=np.reshape(tokens[:-1],(nseq,seq_len))        # inputs: one sequence per row
seq_shift=np.reshape(tokens[1:],(nseq,seq_len))   # targets: the same words shifted one step ahead

# Seeded generator so the train/validation split is the same on every run.
rng=np.random.default_rng(0)
p=rng.permutation(nseq)
seq=seq[p]
seq_shift=seq_shift[p]

nval=nseq//10

Xval=seq[:nval,:]
yval=seq_shift[:nval,:]
Xtrain=seq[nval:,:]
ytrain=seq_shift[nval:,:]

print(Xtrain.shape,ytrain.shape)
print(Xval.shape,yval.shape)

print(Xtrain[0,:10],ytrain[0,:10])


(e) build model

In [0]:
import tensorflow as tf
def perplexity(y_true, y_pred):
    """Metric: exponential of the mean sparse categorical cross-entropy.

    y_true and y_pred are passed straight to
    tf.losses.sparse_categorical_crossentropy; the resulting values are
    averaged and exponentiated.
    """
    mean_xent = tf.reduce_mean(tf.losses.sparse_categorical_crossentropy(y_true, y_pred))
    return tf.exp(mean_xent)

# Output layer size: one unit per vocabulary index.
osize=max_words

# Embedding -> two stacked LSTMs (each returning the full sequence) ->
# softmax over the vocabulary applied at every time step.
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=64,input_length=seq_len))
model.add(LSTM(256,return_sequences=True,activation='tanh'))
model.add(LSTM(256,return_sequences=True,activation='tanh'))
model.add(TimeDistributed(Dense(osize, activation='softmax')))
#
# compile the network
model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=[perplexity])
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()

(f) train model

In [0]:
# Fit the network on the training sequences, with (Xval, yval) as validation data.
history = model.fit(
    x=Xtrain,
    y=ytrain,
    batch_size=64,
    epochs=25,
    validation_data=(Xval, yval),
)
# Uncomment to inspect the recorded training history:
#print(history.history)

In [0]:
# Mount Google Drive in the Colab runtime and save the trained model to it.
from google.colab import drive

drive.mount('/content/gdrive')
model_save_name = 'ex72.h5'
gdrive_dir = "/content/gdrive/My Drive/"
path = gdrive_dir + model_save_name
model.save(path)

(g) load test data for cloze task

In [0]:
# Reload the saved model from Google Drive. If Drive is not mounted in this
# session, uncomment the next two lines first.
#from google.colab import drive
#drive.mount('/content/gdrive')
model_save_name = 'ex72.h5'
gdrive_dir = "/content/gdrive/My Drive/"
path = gdrive_dir + model_save_name
# The custom metric is passed by name so the model compiled with it can be loaded.
model = load_model(path, custom_objects={'perplexity': perplexity})

# Alternative: download the model file from the course site instead of Drive.
#url = "https://www.phon.ucl.ac.uk/courses/pals0039/data/"+model_save_name
#file = get_file(model_save_name,url,cache_subdir="models")
#model=load_model(file, custom_objects={'perplexity': perplexity})


In [0]:
import pandas as pd

# Read the cloze test items; keep_default_na=False is passed so that default
# NA markers in the file are not converted to NaN.
cloze_test_url = "https://www.phon.ucl.ac.uk/courses/pals0039/data/cloze-test.csv"
df = pd.read_csv(cloze_test_url, keep_default_na=False)
df.head()

(h) encode the cloze test data using the tokenizer

In [0]:
# Build one text per test item from its context and query, and collect the
# answer and the '|'-separated list of alternative words.
cloze_context=[]
cloze_answer=[]
cloze_alter=[]
for context,query,answer,alternatives in zip(df.CONTEXT,df.QUERY,df.ANSWER,df.ALTERNATIVES):
  # NOTE(review): the context is included twice, as in the original code --
  # presumably so each item has at least seq_len tokens; confirm it is intended.
  # (The text is no longer stored in a variable called `str`, which shadowed
  # the builtin for every later cell.)
  cloze_context.append(context+" "+context+" "+query)
  cloze_answer.append(answer)
  cloze_alter.append(alternatives.split('|'))

# Encode all three with the tokenizer fitted on the training corpus.
cloze_context_seq=tokenizer.texts_to_sequences(cloze_context)
cloze_answer_seq=tokenizer.texts_to_sequences(cloze_answer)
cloze_alter_seq=tokenizer.texts_to_sequences(cloze_alter)

In [0]:

print(cloze_context[0])
print(cloze_context_seq[0])
print(cloze_answer[:10])
print(cloze_answer_seq[:10])
print(cloze_alter[:10])
print(cloze_alter_seq[:10])


(i) Run the model over the sequences to get the probability distribution over the next word

In [0]:
# Every model input must hold exactly seq_len word indices. Set the value
# before printing it so this cell also runs when resuming from the saved model.
seq_len=100
print(seq_len)
# chop all lists down to seq_len values
print(cloze_context_seq[:5])
first_seq=cloze_context_seq[0]
print(first_seq)
print(first_seq[-seq_len:])
lengths=[len(s) for s in cloze_context_seq]
print(min(lengths),max(lengths))
# Keep the last seq_len indices of each context. pad_sequences left-pads any
# context shorter than seq_len with 0, so the result is always a rectangular
# (n_items, seq_len) array rather than a ragged one.
cloze_context_lim=pad_sequences(cloze_context_seq,maxlen=seq_len,padding='pre',truncating='pre')
print(cloze_context_lim[0])
print(cloze_context_lim[1])
print(cloze_context_lim.shape)

In [0]:
# Run the model over the test contexts in blocks and keep only the
# distribution predicted at the final position of each sequence.
block_size=100
ntest_items=cloze_context_lim.shape[0]
# Round up so a final, partially filled block is predicted too; integer
# division alone would leave the trailing test items unscored.
nblock=(ntest_items+block_size-1)//block_size
ypred=np.zeros((ntest_items,max_words))
for i in range(nblock):
  start=i*block_size
  stop=min(start+block_size,ntest_items)
  pred=model.predict(cloze_context_lim[start:stop,:],batch_size=50)
  ypred[start:stop,:]=pred[:,-1,:]   # just pdf of last word of each sentence
print(ypred.shape)



In [0]:
# find alternative with highest probability
ntest=ypred.shape[0]
ncorrect=0
for i in range(ntest):
  alternatives=cloze_alter_seq[i]
  answer=cloze_answer_seq[i]
  # An item whose answer encodes to an empty list, or that has no
  # alternatives, cannot be scored; count it as incorrect instead of crashing.
  if len(alternatives)==0 or len(answer)==0:
    continue
  prob=ypred[i,alternatives]          # model probability of each alternative
  top_word=alternatives[int(np.argmax(prob))]
  correct_word=answer[0]              # only the first index of the encoded answer is compared
  if top_word==correct_word:
    ncorrect += 1
  #print(i,correct_word,top_word,ncorrect)

# Guard the percentage against an empty test set.
pct_correct=100*ncorrect/ntest if ntest else 0.0
print("Correct: %d/%d (%.1f%%)" % (ncorrect,ntest,pct_correct))
