<a href="https://colab.research.google.com/github/mhuckvale/pals0039/blob/master/Exercise_5_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[![PALS0039 Logo](https://www.phon.ucl.ac.uk/courses/pals0039/images/pals0039logo.png)](https://www.phon.ucl.ac.uk/courses/pals0039/)

# Exercise 5.3

In this exercise we load some pre-trained word embeddings and use them to build a simple part-of-speech tagger.

(a) Load in GloVe embeddings. Run the code and add comments.

In [None]:
# 
import numpy as np
import pandas as pd

# Read the zipped embedding table (it has no header row) and name the first
# column "word"; the remaining columns hold the vector components.
# NOTE(review): pandas' default NA parsing may turn tokens such as "null" or
# "nan" into missing values -- confirm against the file.
df = pd.read_csv(
  'https://www.phon.ucl.ac.uk/courses/pals0039/data/glove.6B.100d.zip',
  header=None,
).rename(columns={0: "word"})

# Report how many embeddings were read and how long each vector is, then preview.
n_rows, n_cols = df.shape
print("Read %d word embeddings of length %d" % (n_rows, n_cols - 1))
df.head()

---
(b) Build a look-up dictionary. Run the code and add comments.

In [None]:
# Map every word to its row number in the embedding table.
word_index = {word: row for row, word in enumerate(df.word)}

# Sanity check: vocabulary size, then a round trip word -> row -> word.
print("#words", len(word_index))
samples = ('the', 'white', 'cat')
print(*(word_index[w] for w in samples))
print(*(df.word[word_index[w]] for w in samples))

# Keep every column except the word itself as a NumPy matrix (one row per word).
word_embed = np.array(df.iloc[:, 1:])

---
(c) Compute word similarities. Run the code and add comments. Then try with different example words.

In [None]:
# Look up the embedding of a single word.
def get_vector(word):
  """Return the row of `word_embed` stored for `word`.

  Raises KeyError if `word` is not a key of `word_index`.
  """
  row = word_index[word]
  return word_embed[row, :]

# Cosine similarity between two vectors.
def similarity(a,b):
  """Return the cosine of the angle between vectors `a` and `b`.

  a, b -- equal-length numeric sequences or 1-D arrays.

  The result is the dot product divided by the product of the two vector
  lengths. If either vector has zero length the cosine is undefined;
  0.0 is returned in that case instead of nan (and a RuntimeWarning).
  """
  scale = np.linalg.norm(a) * np.linalg.norm(b)
  # guard against 0/0 for all-zero vectors
  if scale == 0:
    return 0.0
  return np.dot(a, b) / scale

# Similarity between two words, measured on their embeddings.
def word_similarity(a,b):
  """Return `similarity` of the vectors that `get_vector` gives for words `a` and `b`."""
  vec_a = get_vector(a)
  vec_b = get_vector(b)
  return similarity(vec_a, vec_b)

# Compare "cat" with a few other words; larger values mean more similar vectors.
for other in ("dog", "crocodile", "car"):
  print(word_similarity("cat", other))


---
(d) Vector arithmetic on word meanings. Run the code and add comments. Then try some different word examples.

In [None]:
# Find the vocabulary word whose embedding is most similar to a given vector.
def closest_word(vec,butnotword=None):
  """Return the word whose embedding has the highest cosine similarity to `vec`.

  vec        -- query vector with one value per column of `word_embed`.
  butnotword -- optional row number to leave out of the search.

  A row is only a candidate if its similarity is greater than zero; if no
  row qualifies, the word stored in row 0 is returned as a fallback.
  """
  embed = np.asarray(word_embed, dtype=float)
  query = np.asarray(vec, dtype=float)
  # Score every row in one vectorised pass rather than one Python-level
  # similarity() call (and two norm computations) per vocabulary word.
  scale = np.linalg.norm(embed, axis=1) * np.linalg.norm(query)
  with np.errstate(divide='ignore', invalid='ignore'):
    sims = (embed @ query) / scale
  # Zero-length vectors give nan/inf scores; they must never win the search.
  sims[~np.isfinite(sims)] = -np.inf
  if butnotword is not None:
    # comparing row numbers (rather than indexing) ignores values that match no row
    sims[np.arange(sims.shape[0]) == butnotword] = -np.inf
  best = int(np.argmax(sims))
  # nothing scored above zero: fall back to row 0
  if not sims[best] > 0:
    best = 0
  return df.word[best]

# Solve "a is to b as c is to ?" with vector arithmetic.
def find_analogy(a,b,c):
  """Return the word closest to vector(b) - vector(a) + vector(c).

  The row of word `b` is handed to `closest_word` as the row to leave out.
  """
  target = get_vector(b) - get_vector(a) + get_vector(c)
  excluded_row = word_index[b]
  return closest_word(target, excluded_row)

# Two worked analogies: man -> king as woman -> ?, england -> london as scotland -> ?
for a, b, c in (('man', 'king', 'woman'), ('england', 'london', 'scotland')):
  print(find_analogy(a, b, c))

---
(e) Load a part of speech corpus for training a POS tagger. Run the code and add comments.

In [None]:
# NLTK supplies the tagged corpus used to train and test the tagger
import nltk
# download the CoNLL-2000 corpus data
nltk.download('conll2000')
# corpus reader for the downloaded data
from nltk.corpus import conll2000

# line magic requesting TensorFlow 2.x; NOTE(review): this looks Colab-specific -- confirm it is available in the target environment
%tensorflow_version 2.x
from tensorflow.keras.layers import Dense, Embedding, Activation, Flatten
from tensorflow.keras import Sequential
from tensorflow.keras.utils import to_categorical

In [None]:
# Read the (word, tag) pairs of the training and test files
# and show the first ten of each.
train_words = conll2000.tagged_words('train.txt')
test_words = conll2000.tagged_words('test.txt')
for corpus in (train_words, test_words):
  print(corpus[:10])


---
(f) Build tag index. Run the code and add comments.

In [None]:
# Give every tag seen in the training data its own integer id, numbered in
# order of first appearance; id 0 is reserved for "UNK".
tag_index = {"UNK": 0}
for _word, tag in train_words:
  # only tags not seen before get a new id
  if tag not in tag_index:
    tag_index[tag] = len(tag_index)

print(tag_index)

---
(g) Map the corpus data to numbers: each word to its embedding vector and each tag to its integer index.

In [None]:
# Turn a tagged corpus into network inputs (vectors) and targets (tag ids).
def gen_int_data(tagged_words,word_index,tag_index):
  """Convert (word, tag) pairs into arrays for training or testing.

  tagged_words -- iterable of (word, tag) pairs.
  word_index   -- mapping used to decide whether a lower-cased word is known.
  tag_index    -- mapping from tag name to integer id.

  Returns (X, Y) as NumPy arrays: X holds one embedding vector per word and
  Y the matching tag id. Words missing from `word_index` are given the
  vector of '.' as a stand-in. Tags missing from `tag_index` are given the
  id of "UNK" (0 if there is no such entry) instead of raising KeyError.
  The proportion of unknown words is printed.

  NOTE: vectors come from get_vector(), which reads the notebook-level
  word_index, so the `word_index` argument should be that same mapping.
  """
  X, Y = [], []
  unk_count = 0
  total = 0  # counted here so any iterable works, not only sized sequences
  unk_tag = tag_index.get("UNK", 0)
  for word, tag in tagged_words:
    total += 1
    key = word.lower()
    if key not in word_index:
      # out-of-vocabulary word: fall back to the vector of '.'
      key = '.'
      unk_count += 1
    X.append(get_vector(key))
    Y.append(tag_index.get(tag, unk_tag))
  # guard the ratio so an empty corpus does not raise ZeroDivisionError
  unk_ratio = unk_count / total if total else 0.0
  print("Data created. Unknown proportion %.3f" % unk_ratio)
  return np.array(X), np.array(Y)

# Build the input and target arrays for the training and the test data.
X_train, Y_train = gen_int_data(tagged_words=train_words, word_index=word_index, tag_index=tag_index)
X_test, Y_test = gen_int_data(tagged_words=test_words, word_index=word_index, tag_index=tag_index)


---
(h) Build the POS tagger model. Run the code and add comments.

In [None]:
# Layer sizes: input = length of one row of X_train, output = one unit per tag.
inpsize = X_train.shape[1]
hidsize = 128
outsize = len(tag_index)

# One sigmoid hidden layer followed by a softmax layer over the tags.
model = Sequential([
  Dense(hidsize, activation='sigmoid', input_shape=(inpsize,)),
  Dense(outsize, activation='softmax'),
])

# Targets are integer tag ids, hence the sparse categorical cross-entropy loss.
model.compile(
  optimizer='adam',
  loss='sparse_categorical_crossentropy',
  metrics=['accuracy'],
)

# Print the layers of the model.
model.summary()


---
(i) Train model. Run the code and add comments.

In [None]:
# Train for 5 epochs in batches of 64, holding back 10% of the training data for validation.
model.fit(
  X_train, Y_train,
  epochs=5, batch_size=64, validation_split=0.1, verbose=1,
)

---
(j) Evaluate model. Run the code and add comments.

In [None]:
# Loss and accuracy of the trained model on the test data.
score, acc = model.evaluate(X_test, Y_test, verbose=0)
print("Test accuracy: {:.2f}".format(acc))

# Predicted tag id per test word = position of the largest network output.
Y_pred = model.predict(X_test, batch_size=128).argmax(axis=-1)

# use the pandas crosstabs function to calculate and print confusion matrix
tag_names = tag_index.keys()
y_actu = pd.Categorical.from_codes(Y_test, categories=tag_names)
y_pred = pd.Categorical.from_codes(Y_pred, categories=tag_names)
df_confusion = pd.crosstab(index=y_actu, columns=y_pred, margins=True, normalize='index', dropna=False)
df_confusion

In [None]:
# use the matplotlib matshow function to draw a confusion matrix
import matplotlib.pyplot as plt
%matplotlib inline

def plot_confusion_matrix(df_confusion, title='Confusion matrix', cmap=plt.cm.gray_r):
  """Draw `df_confusion` as a shaded matrix with labelled axes.

  df_confusion -- DataFrame to draw; rows go on the y axis, columns on the x axis.
  title        -- text shown above the plot.
  cmap         -- matplotlib colour map used to shade the cells.
  """
  fig = plt.figure(figsize=(10,10))
  # draw into the figure just created rather than assuming it is figure 1
  plt.matshow(df_confusion, cmap=cmap, fignum=fig.number)
  # honour the caller's title (a fixed string was used before)
  plt.title(title)
  # label each axis from its own dimension so the ticks stay correct even
  # when the row and column labels differ
  plt.xticks(np.arange(len(df_confusion.columns)), df_confusion.columns, rotation=45)
  plt.yticks(np.arange(len(df_confusion.index)), df_confusion.index)
  plt.ylabel(df_confusion.index.name)
  plt.xlabel(df_confusion.columns.name)
  plt.show()

# Row-normalised confusion table without the margin totals, drawn as an image.
df_confusion = pd.crosstab(index=y_actu, columns=y_pred, margins=False, normalize='index', dropna=False)
plot_confusion_matrix(df_confusion)


In [None]:
# use the NLTK ConfusionMatrix function to print a confusion matrix
from nltk import ConfusionMatrix

# build reverse index for tags (integer id -> tag name)
id2tab = {tag_id: tag for tag, tag_id in tag_index.items()}

# collect all the data as named tags
n_items = len(Y_test)
POS_test = [id2tab[Y_test[i]] for i in range(n_items)]
POS_pred = [id2tab[Y_pred[i]] for i in range(n_items)]

print(ConfusionMatrix(POS_test, POS_pred))


---
(k) Try to improve the performance of the tagger. Hints for things you might try:
<ul>
<li>Change the configuration of the DNN, its size, node types or training protocol.
<li>Reduce the size of the tagset by eliminating rare tags.
<li>Include in the input to the DNN the words on either side of the word to be tagged (or two words on each side, or ...)
</ul>

