<a href="https://colab.research.google.com/github/mhuckvale/pals0039/blob/master/Answers_5_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exercise 5.2 Answers

Adapted from [https://blog.cambridgespark.com/tutorial-build-your-own-embedding-and-use-it-in-a-neural-network](https://blog.cambridgespark.com/tutorial-build-your-own-embedding-and-use-it-in-a-neural-network-e9cde4a81296)

(a) Get the Brown Corpus

In [0]:
import warnings
# Ignore FutureWarning messages; the filter is installed before the
# libraries below are imported.
warnings.simplefilter(action='ignore', category=FutureWarning)

import nltk
# Fetch the 'brown' resource with NLTK's downloader before importing its reader.
nltk.download('brown')
from nltk.corpus import brown


(b) Get tokenized sentences from Brown.

In [0]:
# Tokenised sentences of the Brown corpus; show the first three as a sanity check.
sentences = brown.sents()
print(sentences[:3])

(c) Get the optimised Word2Vec training code from gensim [https://radimrehurek.com/gensim/models/word2vec.html](https://radimrehurek.com/gensim/models/word2vec.html)

In [0]:
from gensim.models import Word2Vec


(d) Train the Word2Vec model (takes about a minute with GPU)

In [0]:
%%time
EMB_DIM=300
w2v=Word2Vec(sentences,size=EMB_DIM,window=5,min_count=5,negative=15,iter=10)

(e) Look up some similarities

In [0]:
word_vectors = w2v.wv

# Top-five nearest neighbours for a handful of probe words.
for probe in ('Saturday', 'money', 'child'):
  result = word_vectors.similar_by_word(probe)
  print(result[:5])

# Vector arithmetic: "vehicle" minus "expensive" (top five only).
result = word_vectors.most_similar(positive=['vehicle'], negative=['expensive'])
print(result[:5])

# Classic analogy: woman + king - man (full result list).
result = word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])
print(result)

# Odd one out among a small set of words.
result = word_vectors.doesnt_match("breakfast cereal dinner lunch".split())
print(result)

# Similarity score between two words.
result = word_vectors.similarity('woman', 'man')
print(result)


(f) Build a simple part-of-speech tagger

In [0]:
# Fetch the 'conll2000' resource with NLTK's downloader before importing its reader.
nltk.download('conll2000')
from nltk.corpus import conll2000
# NOTE(review): Activation is imported but not used in the cells visible here.
from tensorflow.keras.layers import Dense, Embedding, Activation, Flatten
from tensorflow.keras import Sequential
from tensorflow.keras.utils import to_categorical
import numpy as np
import collections


In [0]:
# Tagged words from the CoNLL-2000 train and test files; the cells below
# iterate over them as (word, tag) pairs.
train_words = conll2000.tagged_words('train.txt')
test_words = conll2000.tagged_words('test.txt')
print(train_words[:10])


In [0]:
def get_tag_vocabulary(tagged_words):
  """Assign a consecutive integer id to every distinct tag.

  Args:
    tagged_words: iterable of items whose element at index 1 is the tag.

  Returns:
    dict mapping each tag to an id; ids start at 0 and follow the order in
    which tags are first encountered.
  """
  tag_ids = {}
  for entry in tagged_words:
    label = entry[1]
    if label not in tag_ids:
      # The next free id is simply the number of tags seen so far.
      tag_ids[label] = len(tag_ids)
  return tag_ids

# Map every vocabulary word to its integer index in the trained vectors.
# gensim 4 removed `KeyedVectors.vocab` and exposes the mapping directly as
# `key_to_index`; older releases keep the index on per-word entries of `vocab`.
if hasattr(word_vectors,'key_to_index'):
  word2id=dict(word_vectors.key_to_index)
else:
  word2id={ k:v.index for k,v in word_vectors.vocab.items()}

# Tag ids are derived from the training split only.
tag2id=get_tag_vocabulary(train_words)

In [0]:
def get_int_data(tagged_words,word2id,tag2id):
  """Convert (word, tag) pairs into parallel arrays of integer ids.

  A pair is dropped when its word is missing from ``word2id`` or its tag is
  missing from ``tag2id``, because neither can be represented as an id. The
  proportion of dropped pairs is printed.

  Args:
    tagged_words: sized sequence of (word, tag) pairs.
    word2id: mapping from word to integer id.
    tag2id: mapping from tag to integer id.

  Returns:
    Tuple ``(X, Y)`` of 1-D integer numpy arrays of equal length holding the
    word ids and the corresponding tag ids of the retained pairs.
  """
  X, Y = [], []
  unk_count = 0
  for word, tag in tagged_words:
    word_id = word2id.get(word)
    tag_id = tag2id.get(tag)
    # Skip pairs with an unseen tag as well as an unseen word: storing None
    # for the tag would turn Y into an object array that cannot be one-hot
    # encoded.
    if word_id is None or tag_id is None:
      unk_count += 1
      continue
    X.append(word_id)
    Y.append(tag_id)
  total = len(tagged_words)
  # Guard the ratio so an empty input reports 0 instead of dividing by zero.
  unk_proportion = unk_count / total if total else 0.0
  print("Data created. Unknown proportion %.3f" % unk_proportion)
  # An explicit integer dtype keeps empty results from defaulting to float.
  return np.array(X, dtype=int), np.array(Y, dtype=int)

X_train,Y_train = get_int_data(train_words,word2id,tag2id)
X_test,Y_test = get_int_data(test_words,word2id,tag2id)

# One-hot encode both splits with an explicit class count. Without
# num_classes, to_categorical infers the width from the largest id present
# in each array, so the two splits (and the model's output layer, which is
# sized from len(tag2id)) could end up with different widths.
NUM_TAGS=len(tag2id)
Y_train=to_categorical(Y_train,num_classes=NUM_TAGS)
Y_test=to_categorical(Y_test,num_classes=NUM_TAGS)

# Trained word vectors, used as the initial weights of the Embedding layer;
# the ids in word2id are assumed to index its rows.
embedding_matrix=word_vectors.vectors


In [0]:
HIDDEN_SIZE = 50   # units in the tagger's hidden Dense layer
BATCH_SIZE = 128   # batch size passed to fit()

def define_model(embedding_matrix, class_count):
  """Build and compile a one-word-in, one-tag-out classifier.

  The network is Embedding -> Flatten -> Dense(HIDDEN_SIZE, tanh) ->
  Dense(class_count, softmax), compiled with Adam and categorical
  cross-entropy. A summary of the model is printed.

  Args:
    embedding_matrix: 2-D array of shape (vocabulary size, embedding width)
      used as the initial weights of the Embedding layer.
    class_count: number of output classes.

  Returns:
    The compiled Sequential model.
  """
  # Take both dimensions from the matrix itself so the layer always matches
  # the supplied weights, rather than relying on a global embedding size.
  vocab_length, emb_dim = np.shape(embedding_matrix)
  model=Sequential()
  # No trainable=False is passed, so the embeddings are presumably
  # fine-tuned together with the rest of the network.
  model.add(Embedding(input_dim=vocab_length,output_dim=emb_dim,weights=[embedding_matrix],input_length=1))
  # Each example is a single word id, so flatten (1, emb_dim) to (emb_dim,).
  model.add(Flatten())
  model.add(Dense(HIDDEN_SIZE,activation='tanh'))
  model.add(Dense(class_count,activation='softmax'))
  model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
  model.summary()
  return model

# One output class per tag in the tag vocabulary.
pos_model = define_model(embedding_matrix, class_count=len(tag2id))



In [0]:
# Train the tagger for five epochs on the integer-encoded training data.
pos_model.fit(
    X_train,
    Y_train,
    batch_size=BATCH_SIZE,
    epochs=5,
    verbose=1,
)

In [0]:
def evaluate_model(model,id2word,x_test,y_test):
  """Print test accuracy and the words that are mis-tagged most often.

  Args:
    model: object providing ``evaluate(x, y)``, whose second return value is
      the accuracy, and ``predict(x)``, returning per-class scores.
    id2word: sequence mapping a word id to its word.
    x_test: 1-D array of word ids.
    y_test: 2-D one-hot array of the correct tags, one row per word id.

  Returns:
    None; results are printed.
  """
  _, acc = model.evaluate(x_test, y_test)
  print("Accuracy: %.2f" % (acc))
  # Sequential.predict_classes was removed from tf.keras in TF 2.6; for a
  # softmax output it is equivalent to the argmax over the predicted scores.
  y_pred = np.argmax(model.predict(x_test), axis=-1)
  y_true = np.argmax(y_test, axis=-1)
  # Count, per word, how many of its test occurrences were tagged wrongly.
  error_counter = collections.Counter(
      id2word[x_test[i]] for i in np.flatnonzero(y_pred != y_true))
  print("Most common errors:\n",error_counter.most_common(10))

# Invert word2id into a list in which position i holds the word with id i.
id2word = [word for word, _ in sorted(word2id.items(), key=lambda pair: pair[1])]
evaluate_model(pos_model, id2word, X_test, Y_test)
        