# 26 Jan - Dependency Parser

In [1]:
# Import libraries
import sys
import numpy as np
import time
import os
import logging
from collections import Counter
from datetime import datetime
import math

from tqdm import tqdm
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn, optim

### 1. Parsing Function

In [2]:
# Create a class for parsing
class Parsing(object):
    def __init__(self, sentence):
        self.sentence = sentence
        self.stack = ['ROOT']
        self.buffer = sentence[:]
        self.dep = []

    def parse_step(self, transition):
        if transition == 'S':
            buffer_head = self.buffer.pop(0)
            self.stack.append(buffer_head)

        elif transition == 'LA':
            dependent = self.stack.pop(-2)
            self.dep.append((self.stack[-1], dependent))
        
        elif transition == 'RA':
            dependent = self.stack.pop()
            self.dep.append((self.stack[-1], dependent))
        else:
            print(f"Unknown transition: {transition}")
    
    def parse(self, transitions):
        for transition in transitions:
            self.parse_step(transition)
        return self.dep
    
    def is_completed(self):
        return (len(self.buffer) == 0) and (len(self.stack) == 1)

In [3]:
# Test the parsing class
parsing = Parsing(['He', 'has', 'good', 'control', '.'])
parsing.parse(["S","S", "LA", "S", "S", "LA", "RA"])
parsing.stack, parsing.buffer, parsing.dep

(['ROOT', 'has'],
 ['.'],
 [('has', 'He'), ('control', 'good'), ('has', 'control')])

In [4]:
# Create a minibatch parsing function
def minibatch_parse(sentences, model, batch_size):
    dep = []
    partial_parses = [Parsing(sentence) for sentence in sentences]
    unfinished_parses = partial_parses[:]

    while unfinished_parses:
        minibatch = unfinished_parses[:batch_size]
        transitions = model.predict(minibatch)

        for transition, partial_parse in zip(transitions, minibatch):
            partial_parse.parse_step(transition)

        unfinished_parses[:] = [p for p in unfinished_parses if not p.is_completed()]
    
    dep = [parse.dep for parse in partial_parses]

    return dep

In [5]:
# Create a dummy model to predict transitions
class DummyModel(object):
    def predict(self, partial_parses):
        return [("RA" if pp.stack[1] == "right" else "LA")
                if len(pp.buffer) == 0 else "S"
                for pp in partial_parses] 

In [6]:
# Test the minibatch parsing function
sentences = [["right", "arcs", "only"],
             ["right", "arcs", "only", "again"],
             ["left", "arcs", "only"],
             ["left", "arcs", "only", "again"]]

minibatch_parse(sentences, DummyModel(), 2)

[[('arcs', 'only'), ('right', 'arcs'), ('ROOT', 'right')],
 [('only', 'again'), ('arcs', 'only'), ('right', 'arcs'), ('ROOT', 'right')],
 [('only', 'arcs'), ('only', 'left'), ('only', 'ROOT')],
 [('again', 'only'), ('again', 'arcs'), ('again', 'left'), ('again', 'ROOT')]]

### 2. Loading Data

In [7]:
# Create a function to read files in CoNLL format
def read_conll(filename):
    
    examples = []
    
    with open(filename) as f:
        i = 0
        word, pos, head, dep = [], [], [], []
        for line in f.readlines():
            i = i+1
            wa = line.strip().split('\t')

            if len(wa) == 10:
                word.append(wa[1].lower())
                pos.append(wa[4])
                head.append(int(wa[6]))
                dep.append(wa[7])
            
            elif len(word) > 0:
                examples.append({'word': word, 'pos': pos, 'head': head, 'dep': dep})
                word, pos, head, dep = [], [], [], []
        
        if len(word) > 0:
            examples.append({'word': word, 'pos': pos, 'head': head, 'dep': dep})

    return examples

In [8]:
# Create a function to load the English Penn Treebank dataset
def load_data():
    print("1. Loading data")
    train_set = read_conll("data/train.conll")
    dev_set   = read_conll("data/dev.conll")
    test_set   = read_conll("data/test.conll")
    
    #make my dataset smaller because my laptop cannot handle it
    train_set = train_set[:1000]
    dev_set   = dev_set[:500]
    test_set  = test_set[:500]
    
    return train_set, dev_set, test_set

In [9]:
# Test the load function
train_set, dev_set, test_set = load_data()

len(train_set), len(dev_set), len(test_set)

1. Loading data


(1000, 500, 500)

### 3. Parser

In [10]:
# Create a parser based on this paper: https://aclanthology.org/D14-1082.pdf
P_PREFIX = '<p>:' #indicating pos tags
D_PREFIX = '<d>:' #indicating dependency tags
UNK      = '<UNK>'
NULL     = '<NULL>'
ROOT     = '<ROOT>'

class Parser(object):

    def __init__(self, dataset):
        
        #set the root dep
        self.root_dep = 'root'
                
        #get all the dep of the dataset as list, e.g., ['root', 'acl', 'nmod', 'nmod:npmod']
        all_dep = [self.root_dep] + list(set([w for ex in dataset
                                               for w in ex['dep']
                                               if w != self.root_dep]))
        
        #1. put dep into tok2id lookup table, with D_PREFIX so we know it is dependency
        #{'D_PREFIX:root': 0, 'D_PREFIX:acl': 1, 'D_PREFIX:nmod': 2, ..., 'D_PREFIX:<NULL>': 30}
        tok2id = {D_PREFIX + l: i for (i, l) in enumerate(all_dep)}
        tok2id[D_PREFIX + NULL] = self.D_NULL = len(tok2id)
        
        #we are using "unlabeled" where we do not label with the dependency
        #thus the number of dependency relation is 1
        trans = ['L', 'R', 'S']
        self.n_deprel = 1
        
        #create a simple lookup table mapping action and id
        #e.g., tran2id: {'L': 0, 'R': 1, 'S': 2}
        #e.g., id2tran: {0: 'L', 1: 'R', 2: 'S'}
        self.n_trans = len(trans)
        self.tran2id = {t: i for (i, t) in enumerate(trans)}
        self.id2tran = {i: t for (i, t) in enumerate(trans)}

        #2. put pos tags into tok2id lookup table, with P_PREFIX so we know it is pos
        tok2id.update(build_dict([P_PREFIX + w for ex in dataset for w in ex['pos']],
                                  offset=len(tok2id)))
        tok2id[P_PREFIX + UNK]  = self.P_UNK  = len(tok2id)  #also remember the pos tags of unknown
        tok2id[P_PREFIX + NULL] = self.P_NULL = len(tok2id)
        tok2id[P_PREFIX + ROOT] = self.P_ROOT = len(tok2id)
        
        #now tok2id:  {'P_PREFIX:root': 0, 'P_PREFIX:acl': 1, ..., 'P_PREFIX:JJR': 62, 'P_PREFIX:<UNK>': 63, 'P_PREFIX:<NULL>': 64, 'P_PREFIX:<ROOT>': 65}
        
        #3. put word into tok2id lookup table
        tok2id.update(build_dict([w for ex in dataset for w in ex['word']],
                                  offset=len(tok2id)))
        tok2id[UNK]  = self.UNK = len(tok2id)
        tok2id[NULL] = self.NULL = len(tok2id)
        tok2id[ROOT] = self.ROOT = len(tok2id)
        
        #now tok2id: {'D_PREFIX:root': 0, 'D_PREFIX:acl': 1, 'D_PREFIX:nmod': 2, ..., 'memory': 340, 'mr.': 341, '<UNK>': 342, '<NULL>': 343, '<ROOT>': 344}
        
        #create id2tok
        self.tok2id = tok2id
        self.id2tok = {v: k for (k, v) in tok2id.items()}
        
        #why 18 normal features + 18 (pos) + 12 (dep)
        #18 features - top 3 words on buffer, top 3 words on stack, 
        # the first and second left most/rightmost children of the top two words on the stack
        # the leftmost of leftmost/rightmost of rightmost children of the top two words on the stack
        #18 pos - basically corresponding POS tags
        #12 dep - corresponding ARC, excluding 6 words on hte stack/buffer..
        self.n_features = 18 + 18 + 12
        self.n_tokens = len(tok2id)
    
    #function to turn train set with words to train set with id instead using tok2id
    def numericalize(self, examples):
        numer_examples = []
        for ex in examples:
            word = [self.ROOT] + [self.tok2id[w] if w in self.tok2id
                                  else self.UNK for w in ex['word']]
            pos  = [self.P_ROOT] + [self.tok2id[P_PREFIX + w] if P_PREFIX + w in self.tok2id
                                   else self.P_UNK for w in ex['pos']]
            head = [-1] + ex['head']
            dep  = [-1] + [self.tok2id[D_PREFIX + w] if D_PREFIX + w in self.tok2id
                            else -1 for w in ex['dep']]
            numer_examples.append({'word': word, 'pos': pos,
                                 'head': head, 'dep': dep})
        return numer_examples

    #function to extract features to form a feature embedding matrix
    def extract_features(self, stack, buf, arcs, ex):
             
        #ex['word']:  [55, 32, 33, 34, 35, 30], i.e., ['root', 'ms.', 'haag', 'plays', 'elianti', '.']
        #ex['pos']:   [29, 14, 14, 16, 14, 17], i.e., ['NNP', 'NNP', 'VBZ', 'NNP', '.']
        #ex['head']:  [-1, 2, 3, 0, 3, 3]  or ['root', 'compound', 'nsubj', 'root', 'dobj', 'punct']}
        #ex['dep']:   [-1, 1, 2, 0, 6, 12] or ['compound', 'nsubj', 'root', 'dobj', 'punct']

        #stack     :  [0]
        #buffer    :  [1, 2, 3, 4, 5]
        
        if stack[0] == "ROOT":
            stack[0] = 0  #start the stack with [ROOT]

        #get leftmost children based on the dependency arcs
        def get_lc(k):
            return sorted([arc[1] for arc in arcs if arc[0] == k and arc[1] < k])

        #get right most children based on the dependency arcs
        def get_rc(k):
            return sorted([arc[1] for arc in arcs if arc[0] == k and arc[1] > k],
                          reverse=True)

        p_features = [] #pos features (2a, 2b, 2c) - 18
        d_features = [] #dep features (3b, 3c) - 12
        
        #last 3 things on the stack as features
        #if the stack is less than 3, then we simply append NULL from the left
        features = [self.NULL] * (3 - len(stack)) + [ex['word'][x] for x in stack[-3:]]
        
        # next 3 things on the buffer as features
        #if the buffer is less than 3, simply append NULL
        #the reason why NULL is appended on end because buffer is read left to right
        features += [ex['word'][x] for x in buf[:3]] + [self.NULL] * (3 - len(buf))
        
        #corresponding pos tags
        p_features = [self.P_NULL] * (3 - len(stack)) + [ex['pos'][x] for x in stack[-3:]]
        p_features += [ex['pos'][x] for x in buf[:3]] + [self.P_NULL] * (3 - len(buf))
        
        #get the leftmost and rightmost children of the top two words, thus we loop 2 times
        for i in range(2):
            if i < len(stack):
                k = stack[-i-1] #-1, -2 last two in the stack
                
                #the first and second lefmost/rightmost children of the top two words (i=1, 2) on the stack
                lc = get_lc(k)  
                rc = get_rc(k)
                
                #the leftmost of leftmost/rightmost of rightmost children of the top two words on the stack:
                llc = get_lc(lc[0]) if len(lc) > 0 else []
                rrc = get_rc(rc[0]) if len(rc) > 0 else []

                #(leftmost of first word on stack, rightmost of first word, 
                # leftmost of the second word on stack, rightmost of second, 
                # leftmost of leftmost, rightmost of rightmost
                features.append(ex['word'][lc[0]] if len(lc) > 0 else self.NULL)
                features.append(ex['word'][rc[0]] if len(rc) > 0 else self.NULL)
                features.append(ex['word'][lc[1]] if len(lc) > 1 else self.NULL)
                features.append(ex['word'][rc[1]] if len(rc) > 1 else self.NULL)
                features.append(ex['word'][llc[0]] if len(llc) > 0 else self.NULL)
                features.append(ex['word'][rrc[0]] if len(rrc) > 0 else self.NULL)

                #corresponding pos
                p_features.append(ex['pos'][lc[0]] if len(lc) > 0 else self.P_NULL)
                p_features.append(ex['pos'][rc[0]] if len(rc) > 0 else self.P_NULL)
                p_features.append(ex['pos'][lc[1]] if len(lc) > 1 else self.P_NULL)
                p_features.append(ex['pos'][rc[1]] if len(rc) > 1 else self.P_NULL)
                p_features.append(ex['pos'][llc[0]] if len(llc) > 0 else self.P_NULL)
                p_features.append(ex['pos'][rrc[0]] if len(rrc) > 0 else self.P_NULL)
            
                #corresponding dep
                d_features.append(ex['dep'][lc[0]] if len(lc) > 0 else self.D_NULL)
                d_features.append(ex['dep'][rc[0]] if len(rc) > 0 else self.D_NULL)
                d_features.append(ex['dep'][lc[1]] if len(lc) > 1 else self.D_NULL)
                d_features.append(ex['dep'][rc[1]] if len(rc) > 1 else self.D_NULL)
                d_features.append(ex['dep'][llc[0]] if len(llc) > 0 else self.D_NULL)
                d_features.append(ex['dep'][rrc[0]] if len(rrc) > 0 else self.D_NULL)
                
            else:
                #attach NULL when they don't exist
                features += [self.NULL] * 6
                p_features += [self.P_NULL] * 6
                d_features += [self.D_NULL] * 6

        features += p_features + d_features
        assert len(features) == self.n_features  #assert they are 18 + 18 + 12
        return features

    #decide whether to shift, leftarc, or rightarc, based on gold parse trees
    #this is needed to create training examples which contain samples and ground truth
    def get_oracle(self, stack, buf, ex):

        #leave if the stack is only 1, thus nothing to predict....
        if len(stack) < 2:
            return self.n_trans - 1

        #predict based on the last two words on the stack
        i0 = stack[-1]
        i1 = stack[-2]

        #get the head and dependency
        h0 = ex['head'][i0]
        h1 = ex['head'][i1]
        d0 = ex['dep'][i0]
        d1 = ex['dep'][i1]

        #either shift, left arc or right arc
        #"Shift" = 2; "LA" = 0; "RA" = 1
        #if head of the second last word is the last word, then leftarc
        if (i1 > 0) and (h1 == i0):
            return 0
        #if head of the last word is the second last word, then rightarc
        #make sure nothing in the buffer has head with the last word on the stack
        #otherwise, we lose the last word.....
        elif (i1 >= 0) and (h0 == i1) and \
                (not any([x for x in buf if ex['head'][x] == i0])):
            return 1
        #otherwise shift, if something is left in buffer, otherwise, do nothing....
        else:
            return None if len(buf) == 0 else 2

    #generate training examples
    #from the training sentences and their gold parse trees 
    def create_instances(self, examples):
        all_instances = []
        
        for i, ex in enumerate(examples):
            #e.g., ex['word]: [344, 163, 99, 164, 165, 68]
            n_words = len(ex['word']) - 1  #excluding the root

            #arcs = {(head, tail, dependency label)}
            stack = [0]
            buf = [i + 1 for i in range(n_words)]  #[1, 2, 3, 4, 5]
            arcs = []
            instances = []
            
            #because that's the maximum number of shift, leftarcs, rightarcs you can have
            #this will determine the sample size of each training example
            #if given five words, we will get a sample of (10, 48) where 10 comes from 5 * 2, and 48 is n_features
            #but this for loop can be break if there is nothing left....
            for i in range(n_words * 2):

                #get the gold transition based on the parse trees
                #gold_t can be either shift(2), leftarc(0), or rightarc(1)
                gold_t = self.get_oracle(stack, buf, ex)
                
                #if gold_t is None, no need to extract features.....
                if gold_t is None:
                    break
                
                #make sure when the model predicts, we inform the current state of stack and buffer, so
                #the model is not allowed to make any illegal action, e.g., buffer is empty but trying to pop
                legal_labels = self.legal_labels(stack, buf)                
                assert legal_labels[gold_t] == 1
               
                #extract all the 48 features 
                features = self.extract_features(stack, buf, arcs, ex)
                instances.append((features, legal_labels, gold_t))
            
                #shift 
                if gold_t == 2:
                    stack.append(buf[0])
                    buf = buf[1:]
                #left arc 
                elif gold_t == 0:
                    arcs.append((stack[-1], stack[-2], gold_t))
                    stack = stack[:-2] + [stack[-1]]
                #right arc
                else:
                    arcs.append((stack[-2], stack[-1], gold_t - self.n_deprel))
                    stack = stack[:-1]
            else:
                all_instances += instances

        return all_instances

    #provide an one hot encoding of the labels
    def legal_labels(self, stack, buf):
        labels =  ([1] if len(stack) > 2  else [0]) * self.n_deprel   #left arc   But cannot ROOT <----He thus 3
        labels += ([1] if len(stack) >= 2 else [0]) * self.n_deprel   #right arc  ROOT--->He
        labels += [1] if len(buf) > 0 else [0]   #shift
        return labels
    
    #a simple function to check punctuation POS tags
    def punct(self, pos):
        return pos in ["''", ",", ".", ":", "``", "-LRB-", "-RRB-"]

    def parse(self, dataset, eval_batch_size=5000):
        sentences = []
        sentence_id_to_idx = {}
                
        for i, example in enumerate(dataset):
            
            #example['word']=[188, 186, 186, ..., 59]
            #n_words=37
            #sentence=[1, 2, 3, 4, 5,.., 37]
                        
            n_words = len(example['word']) - 1
            sentence = [j + 1 for j in range(n_words)]            
            sentences.append(sentence)
            
            #mapping the object unique id to the i            
            #The id is the object's memory address
            sentence_id_to_idx[id(sentence)] = i
            
        model = ModelWrapper(self, dataset, sentence_id_to_idx)
        dependencies = minibatch_parse(sentences, model, eval_batch_size)
                
        UAS = all_tokens = 0.0
        with tqdm(total=len(dataset)) as prog:
            for i, ex in enumerate(dataset):
                head = [-1] * len(ex['word'])
                for h, t, in dependencies[i]:
                    head[t] = h
                for pred_h, gold_h, gold_l, pos in \
                        zip(head[1:], ex['head'][1:], ex['dep'][1:], ex['pos'][1:]):
                        assert self.id2tok[pos].startswith(P_PREFIX)
                        pos_str = self.id2tok[pos][len(P_PREFIX):]
                        if (not self.punct(pos_str)):
                            UAS += 1 if pred_h == gold_h else 0
                            all_tokens += 1
                prog.update(i + 1)
        UAS /= all_tokens
        return UAS, dependencies

In [11]:
# Create a model wrapper
class ModelWrapper(object):
    def __init__(self, parser, dataset, sentence_id_to_idx):
        self.parser = parser
        self.dataset = dataset
        self.sentence_id_to_idx = sentence_id_to_idx

    def predict(self, partial_parses):
        mb_x = [self.parser.extract_features(p.stack, p.buffer, p.dep,
                                             self.dataset[self.sentence_id_to_idx[id(p.sentence)]])
                for p in partial_parses]
        mb_x = np.array(mb_x).astype('int32')
        mb_x = torch.from_numpy(mb_x).long()
        mb_l = [self.parser.legal_labels(p.stack, p.buffer) for p in partial_parses]

        pred = self.parser.model(mb_x)
        pred = pred.detach().numpy()
        
        #we need to multiply 10000 with legal labels, to force the model not to make any impossible prediction
        #other, when we parse sequentially, sometimes there is nothing in the buffer or stack, thus error....        
        pred = np.argmax(pred + 10000 * np.array(mb_l).astype('float32'), 1)
        pred = ["S" if p == 2 else ("LA" if p == 0 else "RA") for p in pred]
        
        return pred

In [12]:
# Create a simple function to create ids
def build_dict(keys, offset=0):
    count = Counter()
    for key in keys:
        count[key] += 1
    
    mc = count.most_common()
    
    return {w[0]: index + offset for (index, w) in enumerate(mc)}

In [13]:
# Test the parser
print("2. Building parser...",)
start = time.time()
parser = Parser(train_set)
print("took {:.2f} seconds".format(time.time() - start))

2. Building parser...
took 0.04 seconds


In [14]:
# Print the unnumericalized train set
print("Word: ",  train_set[1]['word'])
print("Pos:  ",  train_set[1]['pos'])
print("Head: ",  train_set[1]['head'])
print("Dep:  ",  train_set[1]['dep'])

Word:  ['ms.', 'haag', 'plays', 'elianti', '.']
Pos:   ['NNP', 'NNP', 'VBZ', 'NNP', '.']
Head:  [2, 3, 0, 3, 3]
Dep:   ['compound', 'nsubj', 'root', 'dobj', 'punct']


In [15]:
# Test the numericalize function
print("3. Numericalizing data...",)
start = time.time()
train_set = parser.numericalize(train_set)
dev_set   = parser.numericalize(dev_set)
test_set  = parser.numericalize(test_set)
print("took {:.2f} seconds".format(time.time() - start))

3. Numericalizing data...
took 0.07 seconds


In [16]:
# Print the word ids in the train set
train_set[1]['word']

[5156, 304, 1364, 1002, 2144, 87]

In [17]:
# Print the corresponding words
for i in train_set[1]['word']:
    print(parser.id2tok[i])

<ROOT>
ms.
haag
plays
elianti
.


In [18]:
# Print the pos ids in the train set
train_set[1]['pos']

[84, 42, 42, 55, 42, 46]

In [19]:
# Print the corresponding pos
for i in train_set[1]['pos']:
    print(parser.id2tok[i])

<p>:<ROOT>
<p>:NNP
<p>:NNP
<p>:VBZ
<p>:NNP
<p>:.


In [20]:
# Print the head ids in the train set
train_set[1]['head']

[-1, 2, 3, 0, 3, 3]

In [21]:
# Print the dependency ids in the train set
train_set[1]['dep']

[-1, 37, 30, 0, 23, 28]

### 4. Word Embedding

In [22]:
# Load pretrained word embeddings
print("4. Loading pretrained embeddings...",)
start = time.time()
word_vectors = {}
for line in open("data/en-cw.txt").readlines():
    we = line.strip().split() #we = word embeddings - first column: word;  the rest is embedding
    word_vectors[we[0]] = [float(x) for x in we[1:]] #{word: [list of 50 numbers], nextword: [another list], so on...}
    
#create an empty embedding matrix holding the embedding lookup table (vocab size, embed dim)
#we use random.normal instead of zeros, to keep the embedding matrix arbitrary in case word vectors don't exist....
embeddings_matrix = np.asarray(np.random.normal(0, 0.9, (parser.n_tokens, 50)), dtype='float32')

for token in parser.tok2id:
        i = parser.tok2id[token]
        if token in word_vectors:
            embeddings_matrix[i] = word_vectors[token]
        elif token.lower() in word_vectors:
            embeddings_matrix[i] = word_vectors[token.lower()]
print("Embedding matrix shape (vocab, emb size): ", embeddings_matrix.shape)
print("took {:.2f} seconds".format(time.time() - start))

4. Loading pretrained embeddings...
Embedding matrix shape (vocab, emb size):  (5157, 50)
took 2.44 seconds


### 5. Preprocessing

In [23]:
# Do preprocessing of the tranining data
print("5. Preprocessing training data...",)
start = time.time()
train_examples = parser.create_instances(train_set)
print("took {:.2f} seconds".format(time.time() - start))

5. Preprocessing training data...
took 1.36 seconds


### 6. Minibatch loader

In [24]:
# Create functions to get minibatches
def get_minibatches(data, minibatch_size, shuffle=True):
    data_size = len(data[0])
    indices = np.arange(data_size)
    if shuffle:
        np.random.shuffle(indices)
    for minibatch_start in np.arange(0, data_size, minibatch_size):
        minibatch_indices = indices[minibatch_start:minibatch_start + minibatch_size]
        yield [_minibatch(d, minibatch_indices) for d in data]

def _minibatch(data, minibatch_idx):
    return data[minibatch_idx] if type(data) is np.ndarray else [data[i] for i in minibatch_idx]

def minibatches(data, batch_size):
    x = np.array([d[0] for d in data])
    y = np.array([d[2] for d in data])
    one_hot = np.zeros((y.size, 3))
    one_hot[np.arange(y.size), y] = 1
    return get_minibatches([x, one_hot], batch_size)

### 7. Neural Network

In [25]:
# Create the neural parser model
class ParserModel(nn.Module):

    def __init__(self, embeddings, n_features=48,
                 hidden_size=400, n_classes=3, dropout_prob=0.5):

        super(ParserModel, self).__init__()
        self.n_features   = n_features
        self.n_classes    = n_classes
        self.dropout_prob = dropout_prob
        self.embed_size   = embeddings.shape[1]
        self.hidden_size  = hidden_size
        self.pretrained_embeddings = nn.Embedding(embeddings.shape[0], self.embed_size)
        self.pretrained_embeddings.weight = nn.Parameter(torch.tensor(embeddings))

        self.embed_to_hidden = nn.Linear(n_features * self.embed_size, hidden_size)
        nn.init.xavier_uniform_(self.embed_to_hidden.weight, gain=1.)
        self.dropout = nn.Dropout(p=dropout_prob)
        self.hidden_to_logits = nn.Linear(hidden_size, n_classes)
        nn.init.xavier_uniform_(self.hidden_to_logits.weight)

    def embedding_lookup(self, t):
        #t:  batch_size, n_features
        batch_size = t.size()[0]
                    
        x = self.pretrained_embeddings(t)        
        x = x.reshape(-1, self.n_features * self.embed_size)
        # x = (1024, 48 * 50)

        return x

    def forward(self, t):
        # t: (1024, 48)
        embeddings = self.embedding_lookup(t)  
    
        # embeddings: (1024, 48 * 50)
        hidden = self.embed_to_hidden(embeddings)
    
        # hidden: (1024, 200)
        hidden_activations = F.relu(hidden)
        # hidden_activations: (1024, 200)
        thin_net = self.dropout(hidden_activations)
        # thin_net: (1024, 200)
        logits = self.hidden_to_logits(thin_net)
        # logits: (1024, 3)

        return logits

In [26]:
# Crate a class to get the average
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [27]:
# Create functions for training
def train(parser, train_data, dev_data, output_path, batch_size=1024, n_epochs=10, lr=0.0005):
    
    best_dev_UAS = 0
    
    optimizer = optim.Adam(parser.model.parameters(), lr=0.001)
    loss_func = nn.CrossEntropyLoss()

    for epoch in range(n_epochs):
        print("Epoch {:} out of {:}".format(epoch + 1, n_epochs))
        dev_UAS = train_for_epoch(
            parser, train_data, dev_data, optimizer, loss_func, batch_size)
        if dev_UAS > best_dev_UAS:
            best_dev_UAS = dev_UAS
            print("New best dev UAS! Saving model.")
            torch.save(parser.model.state_dict(), output_path)
        print("")


def train_for_epoch(parser, train_data, dev_data, optimizer, loss_func, batch_size):
    
    parser.model.train()  # Places model in "train" mode, i.e. apply dropout layer
    n_minibatches = math.ceil(len(train_data) / batch_size)
    loss_meter = AverageMeter()

    with tqdm(total=(n_minibatches)) as prog:
        for i, (train_x, train_y) in enumerate(minibatches(train_data, batch_size)):
            
            #train_x:  batch_size, n_features
            #train_y:  batch_size, target(=3)
            
            optimizer.zero_grad() 
            loss = 0.
            train_x = torch.from_numpy(train_x).long()  #long() for int so embedding works....
            train_y = torch.from_numpy(train_y.nonzero()[1]).long()  #get the index with 1 because torch expects label to be single integer

            # Forward pass: compute predicted logits.
            logits = parser.model(train_x)
            # Compute loss
            loss = loss_func(logits, train_y)
            # Compute gradients of the loss w.r.t model parameters.
            loss.backward()
            # Take step with optimizer.
            optimizer.step()

            prog.update(1)
            loss_meter.update(loss.item())

    print("Average Train Loss: {}".format(loss_meter.avg))
    print("Evaluating on dev set",)
    parser.model.eval()  # Places model in "eval" mode, i.e. don't apply dropout layer
        
    dev_UAS, _ = parser.parse(dev_data)
    print("- dev UAS: {:.2f}".format(dev_UAS * 100.0))
    return dev_UAS

### 8. Training

In [28]:
#create directory if it does not exist for saving the weights...
output_dir = "output/all_features/"
output_path = output_dir + "model.weights"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    
print(80 * "=")
print("TRAINING")
print(80 * "=")
    
model = ParserModel(embeddings_matrix)
parser.model = model

start = time.time()
train(parser, train_examples, dev_set, output_path,
      batch_size=1024, n_epochs=10, lr=0.0005)

TRAINING
Epoch 1 out of 10


100%|██████████| 48/48 [00:05<00:00,  8.06it/s]


Average Train Loss: 1.0659268535673618
Evaluating on dev set


125250it [00:00, 7309030.62it/s]       


- dev UAS: 53.97
New best dev UAS! Saving model.

Epoch 2 out of 10


100%|██████████| 48/48 [00:06<00:00,  7.35it/s]


Average Train Loss: 0.3523068067928155
Evaluating on dev set


125250it [00:00, 7327994.20it/s]       


- dev UAS: 61.78
New best dev UAS! Saving model.

Epoch 3 out of 10


100%|██████████| 48/48 [00:06<00:00,  7.34it/s]


Average Train Loss: 0.27751495875418186
Evaluating on dev set


125250it [00:00, 7649604.31it/s]       


- dev UAS: 66.14
New best dev UAS! Saving model.

Epoch 4 out of 10


100%|██████████| 48/48 [00:06<00:00,  7.01it/s]


Average Train Loss: 0.23797372821718454
Evaluating on dev set


125250it [00:00, 6594072.60it/s]       


- dev UAS: 69.30
New best dev UAS! Saving model.

Epoch 5 out of 10


100%|██████████| 48/48 [00:06<00:00,  7.34it/s]


Average Train Loss: 0.2099525068576137
Evaluating on dev set


125250it [00:00, 6515398.44it/s]       


- dev UAS: 71.72
New best dev UAS! Saving model.

Epoch 6 out of 10


100%|██████████| 48/48 [00:06<00:00,  7.29it/s]


Average Train Loss: 0.18636098535110554
Evaluating on dev set


125250it [00:00, 5953024.76it/s]       


- dev UAS: 72.07
New best dev UAS! Saving model.

Epoch 7 out of 10


100%|██████████| 48/48 [00:06<00:00,  7.43it/s]


Average Train Loss: 0.16948170773684978
Evaluating on dev set


125250it [00:00, 8230116.65it/s]       


- dev UAS: 74.27
New best dev UAS! Saving model.

Epoch 8 out of 10


100%|██████████| 48/48 [00:06<00:00,  7.46it/s]


Average Train Loss: 0.15627218472460905
Evaluating on dev set


125250it [00:00, 7195504.34it/s]       


- dev UAS: 74.38
New best dev UAS! Saving model.

Epoch 9 out of 10


100%|██████████| 48/48 [00:06<00:00,  7.53it/s]


Average Train Loss: 0.14298919743547836
Evaluating on dev set


125250it [00:00, 7061355.13it/s]       


- dev UAS: 75.30
New best dev UAS! Saving model.

Epoch 10 out of 10


100%|██████████| 48/48 [00:05<00:00,  8.72it/s]


Average Train Loss: 0.1328008707302312
Evaluating on dev set


125250it [00:00, 6221565.84it/s]       

- dev UAS: 75.30






### 9. Testing

In [29]:
print(80 * "=")
print("TESTING")
print(80 * "=")

print("Restoring the best model weights found on the dev set")
parser.model.load_state_dict(torch.load(output_path))
print("Final evaluation on test set",)
parser.model.eval()
UAS, dependencies = parser.parse(test_set)
print("- test UAS: {:.2f}".format(UAS * 100.0))
print("Done!")

TESTING
Restoring the best model weights found on the dev set
Final evaluation on test set


125250it [00:00, 8056197.40it/s]       

- test UAS: 76.13
Done!





### 10. Ablation Study

In [30]:
# Modify the original parser for the ablation study
# by adding options to include dep and pos as features
class NewParser(object):

    def __init__(self, dataset, dep_in=True, pos_in=True):
        
        #set the root dep
        self.root_dep = 'root'
                
        #get all the dep of the dataset as list, e.g., ['root', 'acl', 'nmod', 'nmod:npmod']
        all_dep = [self.root_dep] + list(set([w for ex in dataset
                                               for w in ex['dep']
                                               if w != self.root_dep]))
        
        #1. put dep into tok2id lookup table, with D_PREFIX so we know it is dependency
        #{'D_PREFIX:root': 0, 'D_PREFIX:acl': 1, 'D_PREFIX:nmod': 2, ..., 'D_PREFIX:<NULL>': 30}
        tok2id = {D_PREFIX + l: i for (i, l) in enumerate(all_dep)}
        tok2id[D_PREFIX + NULL] = self.D_NULL = len(tok2id)
        
        #we are using "unlabeled" where we do not label with the dependency
        #thus the number of dependency relation is 1
        trans = ['L', 'R', 'S']
        self.n_deprel = 1
        
        #create a simple lookup table mapping action and id
        #e.g., tran2id: {'L': 0, 'R': 1, 'S': 2}
        #e.g., id2tran: {0: 'L', 1: 'R', 2: 'S'}
        self.n_trans = len(trans)
        self.tran2id = {t: i for (i, t) in enumerate(trans)}
        self.id2tran = {i: t for (i, t) in enumerate(trans)}

        #2. put pos tags into tok2id lookup table, with P_PREFIX so we know it is pos
        tok2id.update(build_dict([P_PREFIX + w for ex in dataset for w in ex['pos']],
                                  offset=len(tok2id)))
        tok2id[P_PREFIX + UNK]  = self.P_UNK  = len(tok2id)  #also remember the pos tags of unknown
        tok2id[P_PREFIX + NULL] = self.P_NULL = len(tok2id)
        tok2id[P_PREFIX + ROOT] = self.P_ROOT = len(tok2id)
        
        #now tok2id:  {'P_PREFIX:root': 0, 'P_PREFIX:acl': 1, ..., 'P_PREFIX:JJR': 62, 'P_PREFIX:<UNK>': 63, 'P_PREFIX:<NULL>': 64, 'P_PREFIX:<ROOT>': 65}
        
        #3. put word into tok2id lookup table
        tok2id.update(build_dict([w for ex in dataset for w in ex['word']],
                                  offset=len(tok2id)))
        tok2id[UNK]  = self.UNK = len(tok2id)
        tok2id[NULL] = self.NULL = len(tok2id)
        tok2id[ROOT] = self.ROOT = len(tok2id)
        
        #now tok2id: {'D_PREFIX:root': 0, 'D_PREFIX:acl': 1, 'D_PREFIX:nmod': 2, ..., 'memory': 340, 'mr.': 341, '<UNK>': 342, '<NULL>': 343, '<ROOT>': 344}
        
        #create id2tok
        self.tok2id = tok2id
        self.id2tok = {v: k for (k, v) in tok2id.items()}
        
        #why 18 normal features + 18 (pos) + 12 (dep)
        #18 features - top 3 words on buffer, top 3 words on stack, 
        # the first and second left most/rightmost children of the top two words on the stack
        # the leftmost of leftmost/rightmost of rightmost children of the top two words on the stack
        #18 pos - basically corresponding POS tags
        #12 dep - corresponding ARC, excluding 6 words on hte stack/buffer..
        self.n_features = 18

        # modification for the ablation test 1
        self.dep_in = dep_in

        if self.dep_in == True:
            self.n_features += 12
        
        # modification for the ablation test 2
        self.pos_in = pos_in
        
        if self.pos_in == True:
            self.n_features += 18

        self.n_tokens = len(tok2id)
    
    #function to turn train set with words to train set with id instead using tok2id
    def numericalize(self, examples):
        numer_examples = []
        for ex in examples:
            word = [self.ROOT] + [self.tok2id[w] if w in self.tok2id
                                  else self.UNK for w in ex['word']]
            pos  = [self.P_ROOT] + [self.tok2id[P_PREFIX + w] if P_PREFIX + w in self.tok2id
                                   else self.P_UNK for w in ex['pos']]
            head = [-1] + ex['head']
            dep  = [-1] + [self.tok2id[D_PREFIX + w] if D_PREFIX + w in self.tok2id
                            else -1 for w in ex['dep']]
            numer_examples.append({'word': word, 'pos': pos,
                                 'head': head, 'dep': dep})
        return numer_examples

    #function to extract features to form a feature embedding matrix
    def extract_features(self, stack, buf, arcs, ex):
             
        #ex['word']:  [55, 32, 33, 34, 35, 30], i.e., ['root', 'ms.', 'haag', 'plays', 'elianti', '.']
        #ex['pos']:   [29, 14, 14, 16, 14, 17], i.e., ['NNP', 'NNP', 'VBZ', 'NNP', '.']
        #ex['head']:  [-1, 2, 3, 0, 3, 3]  or ['root', 'compound', 'nsubj', 'root', 'dobj', 'punct']}
        #ex['dep']:   [-1, 1, 2, 0, 6, 12] or ['compound', 'nsubj', 'root', 'dobj', 'punct']

        #stack     :  [0]
        #buffer    :  [1, 2, 3, 4, 5]
        
        if stack[0] == "ROOT":
            stack[0] = 0  #start the stack with [ROOT]

        #get leftmost children based on the dependency arcs
        def get_lc(k):
            return sorted([arc[1] for arc in arcs if arc[0] == k and arc[1] < k])

        #get right most children based on the dependency arcs
        def get_rc(k):
            return sorted([arc[1] for arc in arcs if arc[0] == k and arc[1] > k],
                          reverse=True)

        p_features = [] #pos features (2a, 2b, 2c) - 18
        d_features = [] #dep features (3b, 3c) - 12
        
        #last 3 things on the stack as features
        #if the stack is less than 3, then we simply append NULL from the left
        features = [self.NULL] * (3 - len(stack)) + [ex['word'][x] for x in stack[-3:]]
        
        # next 3 things on the buffer as features
        #if the buffer is less than 3, simply append NULL
        #the reason why NULL is appended on end because buffer is read left to right
        features += [ex['word'][x] for x in buf[:3]] + [self.NULL] * (3 - len(buf))
        
        #corresponding pos tags
        p_features = [self.P_NULL] * (3 - len(stack)) + [ex['pos'][x] for x in stack[-3:]]
        p_features += [ex['pos'][x] for x in buf[:3]] + [self.P_NULL] * (3 - len(buf))
        
        #get the leftmost and rightmost children of the top two words, thus we loop 2 times
        for i in range(2):
            if i < len(stack):
                k = stack[-i-1] #-1, -2 last two in the stack
                
                #the first and second lefmost/rightmost children of the top two words (i=1, 2) on the stack
                lc = get_lc(k)  
                rc = get_rc(k)
                
                #the leftmost of leftmost/rightmost of rightmost children of the top two words on the stack:
                llc = get_lc(lc[0]) if len(lc) > 0 else []
                rrc = get_rc(rc[0]) if len(rc) > 0 else []

                #(leftmost of first word on stack, rightmost of first word, 
                # leftmost of the second word on stack, rightmost of second, 
                # leftmost of leftmost, rightmost of rightmost
                features.append(ex['word'][lc[0]] if len(lc) > 0 else self.NULL)
                features.append(ex['word'][rc[0]] if len(rc) > 0 else self.NULL)
                features.append(ex['word'][lc[1]] if len(lc) > 1 else self.NULL)
                features.append(ex['word'][rc[1]] if len(rc) > 1 else self.NULL)
                features.append(ex['word'][llc[0]] if len(llc) > 0 else self.NULL)
                features.append(ex['word'][rrc[0]] if len(rrc) > 0 else self.NULL)

                #corresponding pos
                p_features.append(ex['pos'][lc[0]] if len(lc) > 0 else self.P_NULL)
                p_features.append(ex['pos'][rc[0]] if len(rc) > 0 else self.P_NULL)
                p_features.append(ex['pos'][lc[1]] if len(lc) > 1 else self.P_NULL)
                p_features.append(ex['pos'][rc[1]] if len(rc) > 1 else self.P_NULL)
                p_features.append(ex['pos'][llc[0]] if len(llc) > 0 else self.P_NULL)
                p_features.append(ex['pos'][rrc[0]] if len(rrc) > 0 else self.P_NULL)
            
                #corresponding dep
                d_features.append(ex['dep'][lc[0]] if len(lc) > 0 else self.D_NULL)
                d_features.append(ex['dep'][rc[0]] if len(rc) > 0 else self.D_NULL)
                d_features.append(ex['dep'][lc[1]] if len(lc) > 1 else self.D_NULL)
                d_features.append(ex['dep'][rc[1]] if len(rc) > 1 else self.D_NULL)
                d_features.append(ex['dep'][llc[0]] if len(llc) > 0 else self.D_NULL)
                d_features.append(ex['dep'][rrc[0]] if len(rrc) > 0 else self.D_NULL)
                
            else:
                #attach NULL when they don't exist
                features += [self.NULL] * 6
                p_features += [self.P_NULL] * 6
                d_features += [self.D_NULL] * 6

        if self.dep_in == True:     #modification for the ablation test 1
            features += d_features
        
        if self.pos_in == True:     #modification for the ablation test 2
            features += p_features
        
        assert len(features) == self.n_features
        return features

    #decide whether to shift, leftarc, or rightarc, based on gold parse trees
    #this is needed to create training examples which contain samples and ground truth
    def get_oracle(self, stack, buf, ex):

        #leave if the stack is only 1, thus nothing to predict....
        if len(stack) < 2:
            return self.n_trans - 1

        #predict based on the last two words on the stack
        i0 = stack[-1]
        i1 = stack[-2]

        #get the head and dependency
        h0 = ex['head'][i0]
        h1 = ex['head'][i1]
        d0 = ex['dep'][i0]
        d1 = ex['dep'][i1]

        #either shift, left arc or right arc
        #"Shift" = 2; "LA" = 0; "RA" = 1
        #if head of the second last word is the last word, then leftarc
        if (i1 > 0) and (h1 == i0):
            return 0
        #if head of the last word is the second last word, then rightarc
        #make sure nothing in the buffer has head with the last word on the stack
        #otherwise, we lose the last word.....
        elif (i1 >= 0) and (h0 == i1) and \
                (not any([x for x in buf if ex['head'][x] == i0])):
            return 1
        #otherwise shift, if something is left in buffer, otherwise, do nothing....
        else:
            return None if len(buf) == 0 else 2

    #generate training examples
    #from the training sentences and their gold parse trees 
    def create_instances(self, examples):
        all_instances = []
        
        for i, ex in enumerate(examples):
            #e.g., ex['word]: [344, 163, 99, 164, 165, 68]
            n_words = len(ex['word']) - 1  #excluding the root

            #arcs = {(head, tail, dependency label)}
            stack = [0]
            buf = [i + 1 for i in range(n_words)]  #[1, 2, 3, 4, 5]
            arcs = []
            instances = []
            
            #because that's the maximum number of shift, leftarcs, rightarcs you can have
            #this will determine the sample size of each training example
            #if given five words, we will get a sample of (10, 48) where 10 comes from 5 * 2, and 48 is n_features
            #but this for loop can be break if there is nothing left....
            for i in range(n_words * 2):

                #get the gold transition based on the parse trees
                #gold_t can be either shift(2), leftarc(0), or rightarc(1)
                gold_t = self.get_oracle(stack, buf, ex)
                
                #if gold_t is None, no need to extract features.....
                if gold_t is None:
                    break
                
                #make sure when the model predicts, we inform the current state of stack and buffer, so
                #the model is not allowed to make any illegal action, e.g., buffer is empty but trying to pop
                legal_labels = self.legal_labels(stack, buf)                
                assert legal_labels[gold_t] == 1
               
                #extract all the 48 features 
                features = self.extract_features(stack, buf, arcs, ex)
                instances.append((features, legal_labels, gold_t))
            
                #shift 
                if gold_t == 2:
                    stack.append(buf[0])
                    buf = buf[1:]
                #left arc 
                elif gold_t == 0:
                    arcs.append((stack[-1], stack[-2], gold_t))
                    stack = stack[:-2] + [stack[-1]]
                #right arc
                else:
                    arcs.append((stack[-2], stack[-1], gold_t - self.n_deprel))
                    stack = stack[:-1]
            else:
                all_instances += instances

        return all_instances

    #provide an one hot encoding of the labels
    def legal_labels(self, stack, buf):
        labels =  ([1] if len(stack) > 2  else [0]) * self.n_deprel   #left arc   But cannot ROOT <----He thus 3
        labels += ([1] if len(stack) >= 2 else [0]) * self.n_deprel   #right arc  ROOT--->He
        labels += [1] if len(buf) > 0 else [0]   #shift
        return labels
    
    #a simple function to check punctuation POS tags
    def punct(self, pos):
        return pos in ["''", ",", ".", ":", "``", "-LRB-", "-RRB-"]

    def parse(self, dataset, eval_batch_size=5000):
        sentences = []
        sentence_id_to_idx = {}
                
        for i, example in enumerate(dataset):
            
            #example['word']=[188, 186, 186, ..., 59]
            #n_words=37
            #sentence=[1, 2, 3, 4, 5,.., 37]
                        
            n_words = len(example['word']) - 1
            sentence = [j + 1 for j in range(n_words)]            
            sentences.append(sentence)
            
            #mapping the object unique id to the i            
            #The id is the object's memory address
            sentence_id_to_idx[id(sentence)] = i
            
        model = ModelWrapper(self, dataset, sentence_id_to_idx)
        dependencies = minibatch_parse(sentences, model, eval_batch_size)
                
        UAS = all_tokens = 0.0
        with tqdm(total=len(dataset)) as prog:
            for i, ex in enumerate(dataset):
                head = [-1] * len(ex['word'])
                for h, t, in dependencies[i]:
                    head[t] = h
                for pred_h, gold_h, gold_l, pos in \
                        zip(head[1:], ex['head'][1:], ex['dep'][1:], ex['pos'][1:]):
                        assert self.id2tok[pos].startswith(P_PREFIX)
                        pos_str = self.id2tok[pos][len(P_PREFIX):]
                        if (not self.punct(pos_str)):
                            UAS += 1 if pred_h == gold_h else 0
                            all_tokens += 1
                prog.update(i + 1)
        UAS /= all_tokens
        return UAS, dependencies

#### 10.1. Test 1 (Without Dep Features)

In [31]:
# Load data
train_set, dev_set, test_set = load_data()

# Build the parser
print("2. Building parser...",)
start = time.time()
parser = NewParser(train_set, dep_in=False)                    # ***WITHOUT DEP!!!***
print("took {:.2f} seconds".format(time.time() - start))

# Numericalize the data
print("3. Numericalizing data...",)
start = time.time()
train_set = parser.numericalize(train_set)
dev_set   = parser.numericalize(dev_set)
test_set  = parser.numericalize(test_set)
print("took {:.2f} seconds".format(time.time() - start))

# Load pretrained word embeddings
print("4. Loading pretrained embeddings...",)
start = time.time()
word_vectors = {}
for line in open("data/en-cw.txt").readlines():
    we = line.strip().split() #we = word embeddings - first column: word;  the rest is embedding
    word_vectors[we[0]] = [float(x) for x in we[1:]] #{word: [list of 50 numbers], nextword: [another list], so on...}
    
#create an empty embedding matrix holding the embedding lookup table (vocab size, embed dim)
#we use random.normal instead of zeros, to keep the embedding matrix arbitrary in case word vectors don't exist....
embeddings_matrix = np.asarray(np.random.normal(0, 0.9, (parser.n_tokens, 50)), dtype='float32')

for token in parser.tok2id:
        i = parser.tok2id[token]
        if token in word_vectors:
            embeddings_matrix[i] = word_vectors[token]
        elif token.lower() in word_vectors:
            embeddings_matrix[i] = word_vectors[token.lower()]
print("Embedding matrix shape (vocab, emb size): ", embeddings_matrix.shape)
print("took {:.2f} seconds".format(time.time() - start))

# Do preprocessing of the tranining data
print("5. Preprocessing training data...",)
start = time.time()
train_examples = parser.create_instances(train_set)
print("took {:.2f} seconds".format(time.time() - start))

# Training
output_dir = "output/without_dep/"
output_path = output_dir + "model.weights"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print(80 * "=")
print("TRAINING")
print(80 * "=")
    
model = ParserModel(embeddings_matrix, n_features=36)       # ***WITHOUT DEP!!!***
parser.model = model

start = time.time()
train(parser, train_examples, dev_set, output_path,
      batch_size=1024, n_epochs=10, lr=0.0005)

# Testing
print(80 * "=")
print("TESTING")
print(80 * "=")

print("Restoring the best model weights found on the dev set")
parser.model.load_state_dict(torch.load(output_path))
print("Final evaluation on test set",)
parser.model.eval()
UAS, dependencies = parser.parse(test_set)
print("- test UAS: {:.2f}".format(UAS * 100.0))
print("Done!")

1. Loading data
2. Building parser...
took 0.03 seconds
3. Numericalizing data...
took 0.06 seconds
4. Loading pretrained embeddings...
Embedding matrix shape (vocab, emb size):  (5157, 50)
took 2.47 seconds
5. Preprocessing training data...
took 1.70 seconds
TRAINING
Epoch 1 out of 10


100%|██████████| 48/48 [00:04<00:00, 10.88it/s]


Average Train Loss: 0.8118986872335275
Evaluating on dev set


125250it [00:00, 7587731.29it/s]       


- dev UAS: 52.27
New best dev UAS! Saving model.

Epoch 2 out of 10


100%|██████████| 48/48 [00:04<00:00, 11.27it/s]


Average Train Loss: 0.3421574408809344
Evaluating on dev set


125250it [00:00, 7128910.94it/s]       


- dev UAS: 60.54
New best dev UAS! Saving model.

Epoch 3 out of 10


100%|██████████| 48/48 [00:04<00:00, 10.97it/s]


Average Train Loss: 0.265701597246031
Evaluating on dev set


125250it [00:00, 6610252.24it/s]       


- dev UAS: 64.81
New best dev UAS! Saving model.

Epoch 4 out of 10


100%|██████████| 48/48 [00:04<00:00, 10.71it/s]


Average Train Loss: 0.2282435530796647
Evaluating on dev set


125250it [00:00, 6105654.00it/s]       


- dev UAS: 67.51
New best dev UAS! Saving model.

Epoch 5 out of 10


100%|██████████| 48/48 [00:04<00:00, 10.03it/s]


Average Train Loss: 0.19723631255328655
Evaluating on dev set


125250it [00:00, 7481402.13it/s]       


- dev UAS: 69.27
New best dev UAS! Saving model.

Epoch 6 out of 10


100%|██████████| 48/48 [00:05<00:00,  9.33it/s]


Average Train Loss: 0.1745234007636706
Evaluating on dev set


125250it [00:00, 6304972.05it/s]       


- dev UAS: 70.73
New best dev UAS! Saving model.

Epoch 7 out of 10


100%|██████████| 48/48 [00:05<00:00,  9.26it/s]


Average Train Loss: 0.1584206932845215
Evaluating on dev set


125250it [00:00, 6321131.25it/s]       


- dev UAS: 72.06
New best dev UAS! Saving model.

Epoch 8 out of 10


100%|██████████| 48/48 [00:05<00:00,  9.01it/s]


Average Train Loss: 0.14116762625053525
Evaluating on dev set


125250it [00:00, 6588201.21it/s]       


- dev UAS: 72.31
New best dev UAS! Saving model.

Epoch 9 out of 10


100%|██████████| 48/48 [00:04<00:00, 10.30it/s]


Average Train Loss: 0.12918514323731264
Evaluating on dev set


125250it [00:00, 6124017.30it/s]       


- dev UAS: 72.16

Epoch 10 out of 10


100%|██████████| 48/48 [00:04<00:00, 10.51it/s]


Average Train Loss: 0.11910351381326716
Evaluating on dev set


125250it [00:00, 6157899.64it/s]       


- dev UAS: 73.63
New best dev UAS! Saving model.

TESTING
Restoring the best model weights found on the dev set
Final evaluation on test set


125250it [00:00, 6545923.89it/s]       

- test UAS: 75.34
Done!





#### 10.2. Test 2 (Without POS Features)

In [32]:
# Load data
train_set, dev_set, test_set = load_data()

# Build the parser
print("2. Building parser...",)
start = time.time()
parser = NewParser(train_set, pos_in=False)                    # ***WITHOUT POS!!!***
print("took {:.2f} seconds".format(time.time() - start))

# Numericalize the data
print("3. Numericalizing data...",)
start = time.time()
train_set = parser.numericalize(train_set)
dev_set   = parser.numericalize(dev_set)
test_set  = parser.numericalize(test_set)
print("took {:.2f} seconds".format(time.time() - start))

# Load pretrained word embeddings
print("4. Loading pretrained embeddings...",)
start = time.time()
word_vectors = {}
for line in open("data/en-cw.txt").readlines():
    we = line.strip().split() #we = word embeddings - first column: word;  the rest is embedding
    word_vectors[we[0]] = [float(x) for x in we[1:]] #{word: [list of 50 numbers], nextword: [another list], so on...}
    
#create an empty embedding matrix holding the embedding lookup table (vocab size, embed dim)
#we use random.normal instead of zeros, to keep the embedding matrix arbitrary in case word vectors don't exist....
embeddings_matrix = np.asarray(np.random.normal(0, 0.9, (parser.n_tokens, 50)), dtype='float32')

for token in parser.tok2id:
        i = parser.tok2id[token]
        if token in word_vectors:
            embeddings_matrix[i] = word_vectors[token]
        elif token.lower() in word_vectors:
            embeddings_matrix[i] = word_vectors[token.lower()]
print("Embedding matrix shape (vocab, emb size): ", embeddings_matrix.shape)
print("took {:.2f} seconds".format(time.time() - start))

# Do preprocessing of the tranining data
print("5. Preprocessing training data...",)
start = time.time()
train_examples = parser.create_instances(train_set)
print("took {:.2f} seconds".format(time.time() - start))

# Training
output_dir = "output/without_pos/"
output_path = output_dir + "model.weights"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print(80 * "=")
print("TRAINING")
print(80 * "=")
    
model = ParserModel(embeddings_matrix, n_features=30)       # ***WITHOUT POS!!!***
parser.model = model

start = time.time()
train(parser, train_examples, dev_set, output_path,
      batch_size=1024, n_epochs=10, lr=0.0005)

# Testing
print(80 * "=")
print("TESTING")
print(80 * "=")

print("Restoring the best model weights found on the dev set")
parser.model.load_state_dict(torch.load(output_path))
print("Final evaluation on test set",)
parser.model.eval()
UAS, dependencies = parser.parse(test_set)
print("- test UAS: {:.2f}".format(UAS * 100.0))
print("Done!")

1. Loading data
2. Building parser...
took 0.03 seconds
3. Numericalizing data...
took 0.05 seconds
4. Loading pretrained embeddings...
Embedding matrix shape (vocab, emb size):  (5157, 50)
took 2.31 seconds
5. Preprocessing training data...
took 1.39 seconds
TRAINING
Epoch 1 out of 10


100%|██████████| 48/48 [00:03<00:00, 12.67it/s]


Average Train Loss: 0.852983079229792
Evaluating on dev set


125250it [00:00, 6235967.10it/s]       


- dev UAS: 47.78
New best dev UAS! Saving model.

Epoch 2 out of 10


100%|██████████| 48/48 [00:03<00:00, 12.55it/s]


Average Train Loss: 0.3903315644711256
Evaluating on dev set


125250it [00:00, 7361057.30it/s]       


- dev UAS: 54.54
New best dev UAS! Saving model.

Epoch 3 out of 10


100%|██████████| 48/48 [00:04<00:00, 10.39it/s]


Average Train Loss: 0.31724644017716247
Evaluating on dev set


125250it [00:00, 6252443.75it/s]       


- dev UAS: 57.49
New best dev UAS! Saving model.

Epoch 4 out of 10


100%|██████████| 48/48 [00:04<00:00, 11.91it/s]


Average Train Loss: 0.26963266264647245
Evaluating on dev set


125250it [00:00, 6657161.38it/s]       


- dev UAS: 59.67
New best dev UAS! Saving model.

Epoch 5 out of 10


100%|██████████| 48/48 [00:03<00:00, 12.13it/s]


Average Train Loss: 0.23663212669392428
Evaluating on dev set


125250it [00:00, 5799185.06it/s]       


- dev UAS: 61.56
New best dev UAS! Saving model.

Epoch 6 out of 10


100%|██████████| 48/48 [00:04<00:00, 11.88it/s]


Average Train Loss: 0.21107783789436022
Evaluating on dev set


125250it [00:00, 7151230.94it/s]       


- dev UAS: 64.21
New best dev UAS! Saving model.

Epoch 7 out of 10


100%|██████████| 48/48 [00:03<00:00, 12.10it/s]


Average Train Loss: 0.19108616715917984
Evaluating on dev set


125250it [00:00, 6428651.90it/s]       


- dev UAS: 63.82

Epoch 8 out of 10


100%|██████████| 48/48 [00:04<00:00, 11.89it/s]


Average Train Loss: 0.17482889629900455
Evaluating on dev set


125250it [00:00, 6785716.19it/s]       


- dev UAS: 65.66
New best dev UAS! Saving model.

Epoch 9 out of 10


100%|██████████| 48/48 [00:04<00:00, 11.54it/s]


Average Train Loss: 0.15816679767643413
Evaluating on dev set


125250it [00:00, 7760689.24it/s]       


- dev UAS: 65.61

Epoch 10 out of 10


100%|██████████| 48/48 [00:03<00:00, 12.12it/s]


Average Train Loss: 0.1458960835201045
Evaluating on dev set


125250it [00:00, 5213379.14it/s]       


- dev UAS: 65.53

TESTING
Restoring the best model weights found on the dev set
Final evaluation on test set


125250it [00:00, 7813439.07it/s]       

- test UAS: 67.07
Done!





### 11. Embedding Comparison

#### 11.1. Pretrained GloVe

In [33]:
# Load data
train_set, dev_set, test_set = load_data()

# Build the parser
print("2. Building parser...",)
start = time.time()
parser = Parser(train_set)
print("took {:.2f} seconds".format(time.time() - start))

# Numericalize the data
print("3. Numericalizing data...",)
start = time.time()
train_set = parser.numericalize(train_set)
dev_set   = parser.numericalize(dev_set)
test_set  = parser.numericalize(test_set)
print("took {:.2f} seconds".format(time.time() - start))

# Load GloVe embeddings with Gensim
print("4. Loading GloVe embeddings...",)
start = time.time()

from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

#you have to put this file in some python/gensim directory; just run it and it will inform where to put....
glove_file = datapath('glove.6B.50d.txt')
glove_model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

#create an empty embedding matrix holding the embedding lookup table (vocab size, embed dim)
#we use random.normal instead of zeros, to keep the embedding matrix arbitrary in case word vectors don't exist....
embeddings_matrix = np.asarray(np.random.normal(0, 0.9, (parser.n_tokens, 50)), dtype='float32')

for token in parser.tok2id:
        i = parser.tok2id[token]
        if token in glove_model.index_to_key:
            embeddings_matrix[i] = glove_model[token]
        elif token.lower() in glove_model.index_to_key:
            embeddings_matrix[i] = glove_model[token.lower()]
print("Embedding matrix shape (vocab, emb size): ", embeddings_matrix.shape)
print("took {:.2f} seconds".format(time.time() - start))

# Do preprocessing of the tranining data
print("5. Preprocessing training data...",)
start = time.time()
train_examples = parser.create_instances(train_set)
print("took {:.2f} seconds".format(time.time() - start))

# Training
output_dir = "output/glove/"
output_path = output_dir + "model.weights"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print(80 * "=")
print("TRAINING")
print(80 * "=")
    
model = ParserModel(embeddings_matrix)
parser.model = model

start = time.time()
train(parser, train_examples, dev_set, output_path,
      batch_size=1024, n_epochs=10, lr=0.0005)

# Testing
print(80 * "=")
print("TESTING")
print(80 * "=")

print("Restoring the best model weights found on the dev set")
parser.model.load_state_dict(torch.load(output_path))
print("Final evaluation on test set",)
parser.model.eval()
UAS, dependencies = parser.parse(test_set)
print("- test UAS: {:.2f}".format(UAS * 100.0))
print("Done!")

1. Loading data
2. Building parser...
took 0.03 seconds
3. Numericalizing data...
took 0.05 seconds
4. Loading GloVe embeddings...
Embedding matrix shape (vocab, emb size):  (5157, 50)
took 24.12 seconds
5. Preprocessing training data...
took 1.45 seconds
TRAINING
Epoch 1 out of 10


100%|██████████| 48/48 [00:06<00:00,  7.26it/s]


Average Train Loss: 0.8925228640437126
Evaluating on dev set


125250it [00:00, 7733840.39it/s]       


- dev UAS: 51.26
New best dev UAS! Saving model.

Epoch 2 out of 10


100%|██████████| 48/48 [00:05<00:00,  8.26it/s]


Average Train Loss: 0.3586074939618508
Evaluating on dev set


125250it [00:00, 6508940.35it/s]       


- dev UAS: 59.85
New best dev UAS! Saving model.

Epoch 3 out of 10


100%|██████████| 48/48 [00:06<00:00,  7.44it/s]


Average Train Loss: 0.288917042935888
Evaluating on dev set


125250it [00:00, 6480836.12it/s]       


- dev UAS: 64.36
New best dev UAS! Saving model.

Epoch 4 out of 10


100%|██████████| 48/48 [00:05<00:00,  8.19it/s]


Average Train Loss: 0.2500016773119569
Evaluating on dev set


125250it [00:00, 7656628.23it/s]       


- dev UAS: 65.29
New best dev UAS! Saving model.

Epoch 5 out of 10


100%|██████████| 48/48 [00:05<00:00,  8.38it/s]


Average Train Loss: 0.2203830253953735
Evaluating on dev set


125250it [00:00, 7475014.95it/s]       


- dev UAS: 68.52
New best dev UAS! Saving model.

Epoch 6 out of 10


100%|██████████| 48/48 [00:05<00:00,  8.29it/s]


Average Train Loss: 0.19789679317424694
Evaluating on dev set


125250it [00:00, 7887936.58it/s]       


- dev UAS: 69.60
New best dev UAS! Saving model.

Epoch 7 out of 10


100%|██████████| 48/48 [00:05<00:00,  8.18it/s]


Average Train Loss: 0.17898793332278728
Evaluating on dev set


125250it [00:00, 10011559.77it/s]      


- dev UAS: 70.77
New best dev UAS! Saving model.

Epoch 8 out of 10


100%|██████████| 48/48 [00:05<00:00,  8.13it/s]


Average Train Loss: 0.16705883666872978
Evaluating on dev set


125250it [00:00, 7066484.30it/s]       


- dev UAS: 72.19
New best dev UAS! Saving model.

Epoch 9 out of 10


100%|██████████| 48/48 [00:05<00:00,  8.58it/s]


Average Train Loss: 0.1503782531556984
Evaluating on dev set


125250it [00:00, 7175454.85it/s]       


- dev UAS: 71.87

Epoch 10 out of 10


100%|██████████| 48/48 [00:05<00:00,  8.15it/s]


Average Train Loss: 0.13739651177699366
Evaluating on dev set


125250it [00:00, 6126874.22it/s]       


- dev UAS: 72.80
New best dev UAS! Saving model.

TESTING
Restoring the best model weights found on the dev set
Final evaluation on test set


125250it [00:00, 6724566.40it/s]       

- test UAS: 72.81
Done!





#### 11.2. Skip-gram from Scratch

In [34]:
# Load the Brown Corpus from NLTK
import nltk
from nltk.corpus import brown

corpus = nltk.corpus.brown.sents()

print(corpus[:3])

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ['The', 'September-October', 'term', 'jury', 'had', 'been', 'charged', 'by', 'Fulton', 'Superior', 'Court', 'Judge', 'Durwood', 'Pye', 'to', 'investigate', 'reports', 'of', 'possible', '``', 'irregularities', "''", 'in', 'the', 'hard-fought', 'primary', 'which', 'was', 'won', 'by', 'Mayor-nominate', 'Ivan', 'Allen', 'Jr.', '.']]


In [35]:
# Convert the words in the corpus into lower case
corpus = [[word.lower() for word in sent] for sent in corpus]

print(corpus[:3])

[['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', "atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['the', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'city', 'executive', 'committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'city', 'of', 'atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ['the', 'september-october', 'term', 'jury', 'had', 'been', 'charged', 'by', 'fulton', 'superior', 'court', 'judge', 'durwood', 'pye', 'to', 'investigate', 'reports', 'of', 'possible', '``', 'irregularities', "''", 'in', 'the', 'hard-fought', 'primary', 'which', 'was', 'won', 'by', 'mayor-nominate', 'ivan', 'allen', 'jr.', '.']]


In [36]:
# Remove stop words with SpaCy
import spacy
nlp = spacy.load("en_core_web_sm")
stopwords = nlp.Defaults.stop_words

for sentence in corpus:
    for word in sentence[:]:
        if word in stopwords:
            sentence.remove(word)

print(corpus[:3])

[['fulton', 'county', 'grand', 'jury', 'said', 'friday', 'investigation', "atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'evidence', "''", 'irregularities', 'took', 'place', '.'], ['jury', 'said', 'term-end', 'presentments', 'city', 'executive', 'committee', ',', 'over-all', 'charge', 'election', ',', '``', 'deserves', 'praise', 'thanks', 'city', 'atlanta', "''", 'manner', 'election', 'conducted', '.'], ['september-october', 'term', 'jury', 'charged', 'fulton', 'superior', 'court', 'judge', 'durwood', 'pye', 'investigate', 'reports', 'possible', '``', 'irregularities', "''", 'hard-fought', 'primary', 'won', 'mayor-nominate', 'ivan', 'allen', 'jr.', '.']]


In [37]:
# Remove punctutations with String
import string
punctutations = string.punctuation

for sentence in corpus:
    for word in sentence[:]:
        if word in punctutations:
            sentence.remove(word)

print(corpus[:3])

[['fulton', 'county', 'grand', 'jury', 'said', 'friday', 'investigation', "atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'evidence', "''", 'irregularities', 'took', 'place'], ['jury', 'said', 'term-end', 'presentments', 'city', 'executive', 'committee', 'over-all', 'charge', 'election', '``', 'deserves', 'praise', 'thanks', 'city', 'atlanta', "''", 'manner', 'election', 'conducted'], ['september-october', 'term', 'jury', 'charged', 'fulton', 'superior', 'court', 'judge', 'durwood', 'pye', 'investigate', 'reports', 'possible', '``', 'irregularities', "''", 'hard-fought', 'primary', 'won', 'mayor-nominate', 'ivan', 'allen', 'jr.']]


In [38]:
# Remove '``' and "''"
for sentence in corpus:
    for word in sentence[:]:
        if word == '``':
            sentence.remove(word)
        elif word == "''":
            sentence.remove(word)

print(corpus[:3])

[['fulton', 'county', 'grand', 'jury', 'said', 'friday', 'investigation', "atlanta's", 'recent', 'primary', 'election', 'produced', 'evidence', 'irregularities', 'took', 'place'], ['jury', 'said', 'term-end', 'presentments', 'city', 'executive', 'committee', 'over-all', 'charge', 'election', 'deserves', 'praise', 'thanks', 'city', 'atlanta', 'manner', 'election', 'conducted'], ['september-october', 'term', 'jury', 'charged', 'fulton', 'superior', 'court', 'judge', 'durwood', 'pye', 'investigate', 'reports', 'possible', 'irregularities', 'hard-fought', 'primary', 'won', 'mayor-nominate', 'ivan', 'allen', 'jr.']]


In [39]:
# Get the unique words in the corpus
flatten = lambda l: [item for sublist in l for item in sublist]
vocabs = list(set(flatten(corpus)))

In [40]:
# Create word2index dictionary
vocabs.append('<UNK>')
word2index = {'<UNK>': 0}

for v in vocabs:
    if word2index.get(v) is None:
        word2index[v] = len(word2index)

In [41]:
# Create a function to generate random batch of skipgrams
def random_batch(corpus, window_size, batch_size): 
    skip_grams = []
    for sentence in corpus:
        #I include the first and last words
        #so that every word could be a center word
        for i in range(len(sentence)):
            center = word2index[sentence[i]]
            context = []
            for j in range(window_size):
                if (i - window_size + j) >= 0:
                    context.append(word2index[sentence[i - window_size + j]])
            for k in range(1, window_size + 1):
                if (i + k) < len(sentence):
                    context.append(word2index[sentence[i + k]])
            for w in context:
                skip_grams.append([center, w])
    
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(skip_grams)), batch_size, replace=False)
    for i in random_index:
        random_inputs.append([skip_grams[i][0]])
        random_labels.append([skip_grams[i][1]])
            
    return np.array(random_inputs), np.array(random_labels)

In [42]:
# Test the random batch function
input_batch, target_batch = random_batch(corpus, 2, 4)

print("Input: ", input_batch)
print("Target: ", target_batch)

Input:  [[31023]
 [22592]
 [37854]
 [ 2271]]
Target:  [[40713]
 [ 4005]
 [28021]
 [43496]]


In [43]:
# Create the Skip-gram model
class Skipgram(nn.Module):
    
    def __init__(self, vocab_size, emb_size):
        super(Skipgram,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, emb_size)
        self.embedding_u = nn.Embedding(vocab_size, emb_size)
    
    def forward(self, center_words, context_words, all_vocabs):
        center_embeds = self.embedding_v(center_words)  #[batch_size, 1, emb_size]
        context_embeds = self.embedding_u(context_words)  #[batch_size, 1, emb_size]
        all_embeds    = self.embedding_u(all_vocabs)    #[batch_size, vocab_size, emb_size]
        
        scores      = context_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]

        norm_scores = all_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, vocab_size, emb_size] @ [batch_size, emb_size, 1] = [batch_size, vocab_size, 1] = [batch_size, vocab_size]

        nll = -torch.mean(torch.log(torch.exp(scores)/torch.sum(torch.exp(norm_scores), 1).unsqueeze(1))) # log-softmax
        # scalar (loss must be scalar)    
            
        return nll # negative log likelihood

In [44]:
# Create a function to convert indices to tensors
def prepare_sequence(vocabs, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], vocabs))
    return torch.LongTensor(idxs)

In [45]:
# Create a function to calculate training epoch time
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time // 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [46]:
# Set training parameters
window_size = 2
batch_size = 10
vocab_size = len(vocabs)
emb_size = 50
model = Skipgram(vocab_size, emb_size)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [47]:
# Convert the indices of all words to tensors
all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, vocab_size)

In [48]:
# Train the Skip-gram model for embedding
import time

num_epochs = 500
start = time.time()
for epoch in range(num_epochs):
    
    input_batch, target_batch = random_batch(corpus, window_size, batch_size)
    input_batch  = torch.LongTensor(input_batch)
    target_batch = torch.LongTensor(target_batch)

    optimizer.zero_grad()
    loss = model(input_batch, target_batch, all_vocabs)
    
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 100 == 0:
        end = time.time()
        epoch_mins, epoch_secs = epoch_time(start, end)

        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")

Epoch: 100 | cost: 34.287952 | time: 7m 39s
Epoch: 200 | cost: 27.399923 | time: 15m 15s
Epoch: 300 | cost: 32.196110 | time: 23m 2s
Epoch: 400 | cost: 29.706690 | time: 30m 40s
Epoch: 500 | cost: 28.325581 | time: 38m 16s


In [49]:
# Create a function to get embeddings of words
def get_embed(word):
    try:
        index = word2index[word]
    except:
        index = word2index['<UNK>']
    
    word = torch.LongTensor([index])

    center_embed  = model.embedding_v(word)
    context_embed = model.embedding_u(word)
    
    embed = (center_embed + context_embed) / 2
    
    return  embed[0].detach().numpy()

In [50]:
# Load data
train_set, dev_set, test_set = load_data()

# Build the parser
print("2. Building parser...",)
start = time.time()
parser = Parser(train_set)
print("took {:.2f} seconds".format(time.time() - start))

# Numericalize the data
print("3. Numericalizing data...",)
start = time.time()
train_set = parser.numericalize(train_set)
dev_set   = parser.numericalize(dev_set)
test_set  = parser.numericalize(test_set)
print("took {:.2f} seconds".format(time.time() - start))

# Load Skip-gram embeddings
print("4. Loading Skip-gram embeddings...",)
start = time.time() 

#create an empty embedding matrix holding the embedding lookup table (vocab size, embed dim)
#we use random.normal instead of zeros, to keep the embedding matrix arbitrary in case word vectors don't exist....
embeddings_matrix = np.asarray(np.random.normal(0, 0.9, (parser.n_tokens, 50)), dtype='float32')

for token in parser.tok2id:
        i = parser.tok2id[token]
        if token in vocabs:
            embeddings_matrix[i] = get_embed(token)
        elif token.lower() in vocabs:
            embeddings_matrix[i] = get_embed(token.lower())
print("Embedding matrix shape (vocab, emb size): ", embeddings_matrix.shape)
print("took {:.2f} seconds".format(time.time() - start))

# Do preprocessing of the tranining data
print("5. Preprocessing training data...",)
start = time.time()
train_examples = parser.create_instances(train_set)
print("took {:.2f} seconds".format(time.time() - start))

# Training
output_dir = "output/skipgram/"
output_path = output_dir + "model.weights"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print(80 * "=")
print("TRAINING")
print(80 * "=")
    
model = ParserModel(embeddings_matrix)
parser.model = model

start = time.time()
train(parser, train_examples, dev_set, output_path,
      batch_size=1024, n_epochs=10, lr=0.0005)

# Testing
print(80 * "=")
print("TESTING")
print(80 * "=")

print("Restoring the best model weights found on the dev set")
parser.model.load_state_dict(torch.load(output_path))
print("Final evaluation on test set",)
parser.model.eval()
UAS, dependencies = parser.parse(test_set)
print("- test UAS: {:.2f}".format(UAS * 100.0))
print("Done!")

1. Loading data
2. Building parser...
took 0.05 seconds
3. Numericalizing data...
took 0.06 seconds
4. Loading Skip-gram embeddings...
Embedding matrix shape (vocab, emb size):  (5157, 50)
took 18.63 seconds
5. Preprocessing training data...
took 1.61 seconds
TRAINING
Epoch 1 out of 10


100%|██████████| 48/48 [00:05<00:00,  8.32it/s]


Average Train Loss: 0.9576916539420685
Evaluating on dev set


125250it [00:00, 8629900.71it/s]       


- dev UAS: 48.82
New best dev UAS! Saving model.

Epoch 2 out of 10


100%|██████████| 48/48 [00:05<00:00,  8.01it/s]


Average Train Loss: 0.37226903935273487
Evaluating on dev set


125250it [00:00, 8017590.40it/s]       


- dev UAS: 59.15
New best dev UAS! Saving model.

Epoch 3 out of 10


100%|██████████| 48/48 [00:05<00:00,  8.37it/s]


Average Train Loss: 0.30115020585556823
Evaluating on dev set


125250it [00:00, 3960350.82it/s]       


- dev UAS: 62.45
New best dev UAS! Saving model.

Epoch 4 out of 10


100%|██████████| 48/48 [00:05<00:00,  8.23it/s]


Average Train Loss: 0.25547634344547987
Evaluating on dev set


125250it [00:00, 8026042.35it/s]       


- dev UAS: 65.13
New best dev UAS! Saving model.

Epoch 5 out of 10


100%|██████████| 48/48 [00:06<00:00,  7.98it/s]


Average Train Loss: 0.22628249631573757
Evaluating on dev set


125250it [00:00, 8019303.85it/s]       


- dev UAS: 67.57
New best dev UAS! Saving model.

Epoch 6 out of 10


100%|██████████| 48/48 [00:05<00:00,  8.27it/s]


Average Train Loss: 0.20265763780723015
Evaluating on dev set


125250it [00:00, 6609836.38it/s]       


- dev UAS: 68.77
New best dev UAS! Saving model.

Epoch 7 out of 10


100%|██████████| 48/48 [00:05<00:00,  8.33it/s]


Average Train Loss: 0.18359983650346598
Evaluating on dev set


125250it [00:00, 13341542.46it/s]      


- dev UAS: 68.55

Epoch 8 out of 10


100%|██████████| 48/48 [00:05<00:00,  8.30it/s]


Average Train Loss: 0.16783282533288002
Evaluating on dev set


125250it [00:00, 4041299.28it/s]       


- dev UAS: 70.62
New best dev UAS! Saving model.

Epoch 9 out of 10


100%|██████████| 48/48 [00:05<00:00,  8.26it/s]


Average Train Loss: 0.1548461395626267
Evaluating on dev set


125250it [00:00, 7427876.65it/s]       


- dev UAS: 71.77
New best dev UAS! Saving model.

Epoch 10 out of 10


100%|██████████| 48/48 [00:05<00:00,  8.24it/s]


Average Train Loss: 0.1405178957308332
Evaluating on dev set


125250it [00:00, 8015877.69it/s]       


- dev UAS: 72.55
New best dev UAS! Saving model.

TESTING
Restoring the best model weights found on the dev set
Final evaluation on test set


125250it [00:00, 7773320.94it/s]       

- test UAS: 73.81
Done!





### 12. Dependency Test

#### 12.1. SpaCy

In [51]:
# Load data
train_set, dev_set, test_set = load_data()

1. Loading data


In [52]:
# Choose 3 short and simple sentences from the test set for testing
# Choose sentences with less than 10 words and without '``'
word_limit = 5
test_sentences = []

for sent in test_set:
    if len(sent['word']) <= word_limit and '``' not in sent['word']:
        test_sentences.append(sent)
    
    if len(test_sentences) == 3:
        break

print(test_sentences)

[{'word': ['the', 'market', 'crumbled', '.'], 'pos': ['DT', 'NN', 'VBD', '.'], 'head': [2, 3, 0, 3], 'dep': ['det', 'nsubj', 'root', 'punct']}, {'word': ['these', 'stocks', 'eventually', 'reopened', '.'], 'pos': ['DT', 'NNS', 'RB', 'VBD', '.'], 'head': [2, 4, 4, 0, 4], 'dep': ['det', 'nsubj', 'advmod', 'root', 'punct']}, {'word': ['but', 'stocks', 'kept', 'falling', '.'], 'pos': ['CC', 'NNS', 'VBD', 'VBG', '.'], 'head': [3, 3, 0, 3, 3], 'dep': ['cc', 'nsubj', 'root', 'xcomp', 'punct']}]


In [53]:
# Format the test sentences for SpaCy
formatted_sentences = [' '.join(sent['word']) for sent in test_sentences]

print(formatted_sentences)

['the market crumbled .', 'these stocks eventually reopened .', 'but stocks kept falling .']


In [54]:
# Use SpaCy to visualize dependency trees of the chosen sentences
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
options = {"collapse_punct": False}

for sent in formatted_sentences:
    displacy.render(nlp(sent), options = options, style="dep", jupyter=True)

#### 12.2. Chaky's Model

In [55]:
# Numericalize the test sentences for Chaky's Model
parser = Parser(train_set)

numericalized_sentences = [sent['word'] for sent in parser.numericalize(test_sentences)]

print(numericalized_sentences)

[[5156, 85, 174, 5154, 87], [5156, 239, 668, 537, 5154, 87], [5156, 124, 668, 1905, 5154, 87]]


In [56]:
# Recall Chaky's Model
word_vectors = {}
for line in open("data/en-cw.txt").readlines():
    we = line.strip().split()
    word_vectors[we[0]] = [float(x) for x in we[1:]]

embeddings_matrix = np.asarray(np.random.normal(0, 0.9, (parser.n_tokens, 50)), dtype='float32')

for token in parser.tok2id:
        i = parser.tok2id[token]
        if token in word_vectors:
            embeddings_matrix[i] = word_vectors[token]
        elif token.lower() in word_vectors:
            embeddings_matrix[i] = word_vectors[token.lower()]

model = ParserModel(embeddings_matrix)
parser.model = model