<a href="https://colab.research.google.com/github/mhuckvale/pals0039/blob/master/Answers_5_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[![PALS0039 Logo](https://www.phon.ucl.ac.uk/courses/pals0039/images/pals0039logo.png)](https://www.phon.ucl.ac.uk/courses/pals0039/)

# Exercise 5.3 Answers

(a) Load in GloVe embeddings

In [0]:
import numpy as np
import pandas as pd

# Read the embedding table: column 0 holds the word, the remaining columns its vector.
# keep_default_na=False stops pandas from converting vocabulary entries that look
# like missing-value markers (e.g. "nan", "null", "n/a") into NaN, which would
# corrupt the word column used to build the look-up dictionary.
df = pd.read_csv(
  'https://www.phon.ucl.ac.uk/courses/pals0039/data/glove.6B.100d.zip',
  header=None,
  keep_default_na=False,
)
# rename() returns a new frame, so re-running this cell is harmless
df = df.rename(columns={0: "word"})
print("Read %d word embeddings of length %d" % (len(df), len(df.columns) - 1))
df.head()

(b) Build a look-up dictionary

In [0]:
# Word -> row-number look-up for the embedding table
# (if a word occurred twice, the later row would win)
word_index = {token: row for row, token in enumerate(df.word)}

print("#words", len(word_index))

# Sanity check: look a few words up and map the row numbers back to words
sample_words = ('the', 'white', 'cat')
print(*(word_index[w] for w in sample_words))
print(*(df.word[word_index[w]] for w in sample_words))

# Everything except the word column, as an array: row i is the vector of df.word[i]
word_embed = np.array(df.iloc[:, 1:])

(c) Compute word similarities

In [0]:
def get_vector(word):
  """Return the embedding row for `word`.

  The row number comes from the module-level `word_index` dictionary and the
  row itself from the module-level `word_embed` array.  A word missing from
  `word_index` raises KeyError.
  """
  row = word_index[word]
  return word_embed[row, :]

def similarity(a,b):
  """Cosine similarity of two vectors: dot(a, b) / (|a| * |b|)."""
  scale = np.linalg.norm(a) * np.linalg.norm(b)
  return np.dot(a, b) / scale

def word_similarity(a,b):
  """Similarity of two words: look up both vectors, then compare them with similarity()."""
  vec_a = get_vector(a)
  vec_b = get_vector(b)
  return similarity(vec_a, vec_b)

# Compare "cat" against three other words, one similarity per line
for other in ("dog", "crocodile", "car"):
  print(word_similarity("cat", other))


(d) Vector arithmetic on word meanings

In [0]:
def closest_word(vec):
  """Return the vocabulary word whose embedding is nearest to `vec`.

  Nearness is measured by the squared difference between `vec` and each row of
  the module-level `word_embed` array; the word is then read from `df.word` at
  the winning row position.  If several rows are equally near, the earliest
  one is returned.
  """
  # One broadcast subtraction replaces the per-row Python loop.
  diff = word_embed - vec
  # Row-wise sum of squares.  Ranking rows by this value is the same as ranking
  # them by RMS difference (dividing by the length and taking the square root
  # are both monotonic), and einsum avoids a second full-size temporary.
  sq_dist = np.einsum('ij,ij->i', diff, diff)
  # argmin has no built-in threshold: the previous running maximum started at 0,
  # so whenever every row was at least 1.0 RMS away it silently returned row 0.
  best_row = int(np.argmin(sq_dist))
  # word_embed rows are positional, so pick the word by position as well
  return df.word.iloc[best_row]

# Vector arithmetic on word meanings: prince - boy + man
analogy_vec = get_vector('prince') - get_vector('boy') + get_vector('man')
print(closest_word(analogy_vec))


(e) Part of speech corpus

In [0]:
import nltk
nltk.download('conll2000')
from nltk.corpus import conll2000
%tensorflow_version 2.x
from tensorflow.keras.layers import Dense, Embedding, Activation, Flatten
from tensorflow.keras import Sequential
from tensorflow.keras.utils import to_categorical

In [0]:
# (word, tag) pairs of the CoNLL-2000 training and test files
train_words = conll2000.tagged_words('train.txt')
test_words = conll2000.tagged_words('test.txt')

# Show the first ten pairs of each split
for split in (train_words, test_words):
  print(split[:10])


(f) Build tag index

In [0]:
# Give every tag in the training data an integer id, numbered in order of first
# appearance; id 0 is set aside for "UNK".
tag_index = {"UNK": 0}
for _word, tag in train_words:
  if tag not in tag_index:
    tag_index[tag] = len(tag_index)

print(tag_index)

(g) Map words to embedding vectors and tags to integer ids

In [0]:
# Map each tagged word to (embedding vector, integer tag id)
def gen_int_data(tagged_words,word_index,tag_index):
  """Turn (word, tag) pairs into model inputs and targets.

  Parameters:
    tagged_words: iterable of (word, tag) pairs.
    word_index:   dict mapping a lower-cased word to its row in the
                  module-level `word_embed` array.
    tag_index:    dict mapping a tag to its integer id.

  Returns:
    (X, Y) as numpy arrays: X holds one embedding row per word, Y the
    matching tag ids.

  Words are lower-cased before look-up.  A word missing from `word_index`
  gets the embedding of '.' and is counted as unknown; the unknown proportion
  is printed.  A tag missing from `tag_index` is mapped to the "UNK" id
  (KeyError only if `tag_index` has no "UNK" entry either).
  """
  X, Y = [], []
  unk_count = 0
  for word, tag in tagged_words:
    key = word.lower()
    if key not in word_index:
      # out-of-vocabulary word: fall back to the '.' embedding
      key = '.'
      unk_count += 1
    # Look the row up through the word_index argument, so the parameter is
    # honoured rather than whatever dictionary happens to be global.
    X.append(word_embed[word_index[key], :])
    # Tags never seen when tag_index was built use the reserved "UNK" id
    Y.append(tag_index[tag] if tag in tag_index else tag_index["UNK"])
  total = len(X)
  # Guard the ratio so an empty input reports 0 instead of dividing by zero
  proportion = unk_count / total if total else 0.0
  print("Data created. Unknown proportion %.3f" % proportion)
  return np.array(X),np.array(Y)

# Vectorise both splits with the same word and tag dictionaries
X_train, Y_train = gen_int_data(
  train_words, word_index, tag_index
)
X_test, Y_test = gen_int_data(
  test_words, word_index, tag_index
)


(h) Build POS model

In [0]:
def define_model(inpsize,hidsize,outsize):
  """Build, compile and summarise a network with one hidden layer.

  Parameters:
    inpsize: length of each input vector.
    hidsize: number of sigmoid units in the hidden layer.
    outsize: number of softmax outputs (one per class).

  Returns the compiled Sequential model (adam optimiser, sparse categorical
  cross-entropy loss, accuracy metric).  The model summary is printed.
  """
  layers = [
    Dense(hidsize, activation='sigmoid', input_shape=(inpsize,)),
    Dense(outsize, activation='softmax'),
  ]
  model = Sequential(layers)
  model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
  )
  model.summary()
  return model

# Input width = embedding length, 128 hidden units, one output per tag id
pos_model = define_model(
  inpsize=X_train.shape[1],
  hidsize=128,
  outsize=len(tag_index),
)


(i) Train model

In [0]:
# Five passes over the training data in mini-batches of 64,
# with 10% of it held out for validation
pos_model.fit(
  X_train,
  Y_train,
  validation_split=0.1,
  batch_size=64,
  epochs=5,
  verbose=1,
)

(j) Evaluate model

In [0]:
# Loss and accuracy on the test split
score, acc = pos_model.evaluate(X_test, Y_test, verbose=0)
print("Test accuracy: %.2f" % (acc))

# Sequential.predict_classes() was removed in TensorFlow 2.6.  Taking the arg-max
# over the softmax outputs yields the same class ids on every TF 2.x release.
Y_pred = np.argmax(pos_model.predict(X_test, batch_size=128, verbose=0), axis=-1)

print(Y_test[:10], Y_pred[:10])

y_actu = pd.Series(Y_test, name='Actual')
y_pred = pd.Series(Y_pred, name='Predicted')
df_confusion = pd.crosstab(y_actu, y_pred, margins=True)
# Divide every row by its own total (the 'All' margin column).  A plain
# `df / series` aligns the series with the COLUMN labels, which divided each
# column by an unrelated row total; .div(..., axis=0) aligns with the rows.
print(df_confusion.div(df_confusion['All'], axis=0))

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline
def plot_confusion_matrix(df_confusion, title='Confusion matrix', cmap=plt.cm.gray_r):
  """Draw a confusion table as a row-normalised percentage heat map.

  Parameters:
    df_confusion: DataFrame of counts, e.g. from pd.crosstab; its index and
                  column labels become the tick labels, and their names the
                  axis labels.
    title:        text shown above the plot.
    cmap:         matplotlib colour map for the cells.
  """
  # Percentage of each row's total.  .div(..., axis=0) matches the row sums to
  # the rows; `df / series` would align them with the column labels instead.
  rowsums = df_confusion.sum(axis=1)
  confusion = 100 * df_confusion.div(rowsums, axis=0)
  fig = plt.figure(figsize=(10,10))
  # Draw into the figure just created instead of assuming it is figure number 1
  plt.matshow(confusion, cmap=cmap, fignum=fig.number)
  plt.title(title)
  plt.colorbar()
  # Separate tick positions per axis: the table need not be square
  plt.xticks(np.arange(len(df_confusion.columns)), df_confusion.columns, rotation=45)
  plt.yticks(np.arange(len(df_confusion.index)), df_confusion.index)
  plt.ylabel(df_confusion.index.name)
  plt.xlabel(df_confusion.columns.name)
  plt.show()

# Plain counts (no margins) of actual vs predicted tag ids, drawn as a heat map
df_confusion = pd.crosstab(index=y_actu, columns=y_pred)
plot_confusion_matrix(df_confusion)


In [0]:
from nltk import ConfusionMatrix

# Invert tag_index so integer ids can be turned back into tag names
id2tab = {tag_id: tag for tag, tag_id in tag_index.items()}

# Tag names for the reference labels and for the model's predictions
POS_test = [id2tab[tag_id] for tag_id in Y_test]
POS_pred = [id2tab[tag_id] for tag_id in Y_pred]

print(ConfusionMatrix(POS_test, POS_pred))