In [6]:
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from pathlib import Path
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

In [7]:
def file_iterator():
    data_dir = Path('dependency_treebank')
    for data_file in filter(lambda f: os.path.isfile(data_dir/f) and f.endswith('.dp'), os.listdir(data_dir)):
        yield data_dir/data_file

In [8]:
train_set = []
test_set = []
val_set = []

train_split = 100
val_split = 150

file_counter = 0
for file in file_iterator():
    file_counter += 1
    if file_counter <= train_split:
        train_set.append(pd.read_csv(file, sep="\t", names=['token', 'pos'], usecols=[0, 1], engine='python'))
    elif file_counter <= val_split:
        val_set.append(pd.read_csv(file, sep="\t", names=['token', 'pos'], usecols=[0, 1], engine='python'))
    else:
        test_set.append(pd.read_csv(file, sep="\t", names=['token', 'pos'], usecols=[0, 1], engine='python'))

assert len(train_set) == 100
assert len(val_set) == 50
assert len(test_set) == 49

In [9]:
train_frame = pd.concat(train_set)
test_frame = pd.concat(test_set)
val_frame = pd.concat(val_set)

assert sum([e.shape[0] for e in train_set]) == train_frame.shape[0]
assert sum([e.shape[0] for e in test_set]) == test_frame.shape[0]
assert sum([e.shape[0] for e in val_set]) == val_frame.shape[0]

In [11]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

# Tokenizer.from_pretrained('bert-base-uncased')
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_frame['token'].values)
vocab_size = len(tokenizer.word_index) + 1

encoded_doc = tokenizer.texts_to_sequences(train_frame['token'].values)

# Not necessary since we encode words and not sentences 
max_length = 1
padded_docs = pad_sequences(encoded_doc, maxlen=max_length, padding='post')

In [12]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('glove.6B.100d.txt')
for line in f:
	values = line.split()
	word = values[0]
	coefs = asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [19]:
# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

In [18]:
# One hot encode train_frame['pos'].values
encoder = LabelEncoder()
encoder.fit(train_frame['pos'].values)
encoded_Y = encoder.transform(train_frame['pos'].values)
# convert integers to dummy variables (i.e. one hot encoded)
labels = np_utils.to_categorical(encoded_Y)



In [37]:
from tensorflow import keras
# define model
model = Sequential()
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=1, trainable=False)
model.add(e)
model.add(keras.layers.Bidirectional(keras.layers.LSTM(units=128), input_shape=(None, 50)))
model.add(Dense(44, activation='softmax'))
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())

# fit the model
model.fit(padded_docs, labels, epochs=50)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels)
print('Accuracy: %f' % (accuracy*100))

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 1, 100)            632700    
                                                                 
 bidirectional (Bidirectiona  (None, 256)              234496    
 l)                                                              
                                                                 
 dense_9 (Dense)             (None, 44)                11308     
                                                                 
Total params: 878,504
Trainable params: 245,804
Non-trainable params: 632,700
_________________________________________________________________
None
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epo