In [40]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import urllib.request
import gensim
import gensim.downloader as gloader


from zipfile import ZipFile
from collections import OrderedDict
from typing import List, Callable, Dict
from tqdm import tqdm


# Create Dataset

## Download data

In [3]:
# Folder (relative to the current working directory) where the raw archive is stored.
dataset_folder = os.path.join(os.getcwd(), "Datasets", "Original")

# exist_ok=True makes the call idempotent and avoids a check-then-create race.
os.makedirs(dataset_folder, exist_ok=True)

url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip"

dataset_path = os.path.join(dataset_folder, "data.zip")

if not os.path.exists(dataset_path):
    # Download to a temporary name first: if the transfer is interrupted, no
    # truncated "data.zip" is left behind that a later run would mistake for a
    # complete archive and skip re-downloading.
    partial_path = dataset_path + ".part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, dataset_path)
    finally:
        # Only present here if the download or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Successful download")

## Create Dataframe

In [4]:
# Half-open [start, stop) ranges of file ids assigned to each split.
# Any file id that falls in neither the validation nor the test range is
# labelled 'train' (train_range itself is not consulted below).
train_range = (1, 101)
val_range = (101, 151)
test_range = (151, 200)

# When True every blank-line-separated chunk of a file becomes its own row;
# when False each file is kept as a single row.
split_sentences = True

# Build the range objects once instead of once per file.
val_ids = range(*val_range)
test_ids = range(*test_range)

dataframe_rows = []
with ZipFile(dataset_path, 'r') as myzip:
    # The first archive entry is skipped — presumably the top-level folder
    # entry; verify against the archive if the source changes.
    for filename in myzip.namelist()[1:]:
        print("Extracting", filename, end='\r')

        # The file id is the last four characters before the first '.' in the name.
        file_id = int(filename.split('.')[0][-4:])

        if file_id in val_ids:
            split = 'val'
        elif file_id in test_ids:
            split = 'test'
        else:
            split = 'train'

        with myzip.open(filename) as myfile:
            content_string = myfile.read().decode('utf-8')

        if split_sentences:
            sentences = content_string.split('\n\n')
        else:
            sentences = [content_string]

        for sentence in sentences:
            # Keep only lines that have exactly three tab-separated fields;
            # each line is split once.
            fields_per_line = (line.split('\t') for line in sentence.split('\n'))
            content = [fields for fields in fields_per_line if len(fields) == 3]

            # A chunk without any valid line (e.g. trailing blank lines at the
            # end of a file) would make the unpacking below raise ValueError.
            if not content:
                continue

            # The third field of each line is not used.
            words, tags, _ = zip(*content)

            dataframe_rows.append({'file_id': file_id,
                                   'text': ' '.join(words),
                                   'tags': tags,
                                   'split': split
                                   })

df = pd.DataFrame(dataframe_rows).sort_values('file_id').reset_index(drop=True)
# ljust pads the message so it overwrites the last '\r' progress line.
print("Dataframe created.".ljust(50))

df

Dataframe created.                                


Unnamed: 0,file_id,text,tags,split
0,1,"Pierre Vinken , 61 years old , will join the b...","(NNP, NNP, ,, CD, NNS, JJ, ,, MD, VB, DT, NN, ...",train
1,1,"Mr. Vinken is chairman of Elsevier N.V. , the ...","(NNP, NNP, VBZ, NN, IN, NNP, NNP, ,, DT, NNP, ...",train
2,2,"Rudolph Agnew , 55 years old and former chairm...","(NNP, NNP, ,, CD, NNS, JJ, CC, JJ, NN, IN, NNP...",train
3,3,`` There 's no question that some of those wor...,"(``, EX, VBZ, DT, NN, IN, DT, IN, DT, NNS, CC,...",train
4,3,Workers described `` clouds of blue dust '' th...,"(NNS, VBD, ``, NNS, IN, JJ, NN, '', WDT, VBD, ...",train
...,...,...,...,...
3909,198,A line-item veto is a procedure that would all...,"(DT, JJ, NN, VBZ, DT, NN, WDT, MD, VB, DT, NN,...",test
3910,198,Sen. Kennedy said in a separate statement that...,"(NNP, NNP, VBD, IN, DT, JJ, NN, IN, PRP, VBZ, ...",test
3911,199,Trinity Industries Inc. said it reached a prel...,"(NNP, NNPS, NNP, VBD, PRP, VBD, DT, JJ, NN, TO...",test
3912,199,Terms were n't disclosed .,"(NNS, VBD, RB, VBN, .)",test


## Preprocessing

Convert to lowercase

In [5]:
# Lower-case every sentence in place; the 'tags' column is left untouched.
df['text'] = df['text'].map(lambda sentence: sentence.lower())
df

Unnamed: 0,file_id,text,tags,split
0,1,"pierre vinken , 61 years old , will join the b...","(NNP, NNP, ,, CD, NNS, JJ, ,, MD, VB, DT, NN, ...",train
1,1,"mr. vinken is chairman of elsevier n.v. , the ...","(NNP, NNP, VBZ, NN, IN, NNP, NNP, ,, DT, NNP, ...",train
2,2,"rudolph agnew , 55 years old and former chairm...","(NNP, NNP, ,, CD, NNS, JJ, CC, JJ, NN, IN, NNP...",train
3,3,`` there 's no question that some of those wor...,"(``, EX, VBZ, DT, NN, IN, DT, IN, DT, NNS, CC,...",train
4,3,workers described `` clouds of blue dust '' th...,"(NNS, VBD, ``, NNS, IN, JJ, NN, '', WDT, VBD, ...",train
...,...,...,...,...
3909,198,a line-item veto is a procedure that would all...,"(DT, JJ, NN, VBZ, DT, NN, WDT, MD, VB, DT, NN,...",test
3910,198,sen. kennedy said in a separate statement that...,"(NNP, NNP, VBD, IN, DT, JJ, NN, IN, PRP, VBZ, ...",test
3911,199,trinity industries inc. said it reached a prel...,"(NNP, NNPS, NNP, VBD, PRP, VBD, DT, JJ, NN, TO...",test
3912,199,terms were n't disclosed .,"(NNS, VBD, RB, VBN, .)",test


## Data Splitting

In [6]:
# Select the rows of each split using the 'split' label assigned when the
# dataframe was built.
split_labels = df['split']
train_data = df.loc[split_labels == 'train']
val_data = df.loc[split_labels == 'val']
test_data = df.loc[split_labels == 'test']

# Inputs (x) are the sentence strings, targets (y) the per-sentence tag tuples.
x_train, y_train = train_data['text'].values, train_data['tags'].values
x_val, y_val = val_data['text'].values, val_data['tags'].values
x_test, y_test = test_data['text'].values, test_data['tags'].values

print('Dataset splits statistics: ')
print(f'Train data: {x_train.shape}')
print(f'Validation data: {x_val.shape}')
print(f'Test data: {x_test.shape}')


Dataset splits statistics: 
Train data: (1963,)
Validation data: (1299,)
Test data: (652,)


## Apply GloVe embeddings and Tokenization

In [7]:
def get_oov_embedding(word, embedding_model, size):
    """Return a placeholder embedding for an out-of-vocabulary word.

    The vector is drawn uniformly from [-0.05, 0.05) using NumPy's global
    random state. ``word`` and ``embedding_model`` are accepted but not used
    yet; they are part of the signature so a smarter strategy can be plugged
    in later without touching the callers.

    Args:
        word: The out-of-vocabulary term (currently ignored).
        embedding_model: The embedding model in use (currently ignored).
        size: Output shape passed to ``np.random.uniform``.

    Returns:
        A NumPy array of shape ``size`` holding the random vector.
    """
    lower_bound, upper_bound = -0.05, 0.05
    return np.random.uniform(lower_bound, upper_bound, size)

In [8]:
# Project-local helpers; their implementation is not part of this notebook.
from utils.kerasTokenizer import load_embedding_model, check_OOV_terms

print("Loading GloVe embedding.")
# Embedding vector size; reused below when creating OOV vectors and when
# allocating the embedding matrix.
my_embedding_dimension = 50
# NOTE(review): presumably returns a gensim KeyedVectors-like object — later
# cells call len(), get_vector() and __setitem__() on it; confirm in utils.
my_embedding_model = load_embedding_model('glove', my_embedding_dimension)

Loading GloVe embedding.


In [17]:
print("GLOVE vocabulary (V1) size: ", len(my_embedding_model))

x_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='[UNK]')

# The vocabulary is grown incrementally, one split at a time:
#   V2 = V1 + OOV terms of the training set,
#   V3 = V2 + OOV terms of the validation set,
#   V4 = V3 + OOV terms of the test set.
# A single loop replaces three copy-pasted stanzas, which also fixes the last
# message that wrongly announced the "validation set" while fitting on x_test.
vocabulary_stages = [("training", x_train), ("validation", x_val), ("test", x_test)]

for version, (split_name, texts) in enumerate(vocabulary_stages, start=2):
    previous = version - 1
    print(f"Creating V{version} using {split_name} set (V{previous} + OOV{previous})")

    # fit_on_texts is cumulative: the tokenizer keeps the words seen in
    # earlier stages and adds the ones from this split.
    x_tokenizer.fit_on_texts(texts)

    # Give every tokenizer word that the embedding model does not know yet
    # its own vector.
    for word in tqdm(check_OOV_terms(my_embedding_model, x_tokenizer.word_index.keys())):
        my_embedding_model[word] = get_oov_embedding(word=word,
                                                     embedding_model=my_embedding_model,
                                                     size=my_embedding_dimension)

    print(f"V{version} size: ", len(my_embedding_model))

GLOVE vocabulary (V1) size:  400166
Creating V2 using training set (V1 + OOV1)


0it [00:00, ?it/s]


V2 size:  400166
Creating V3 using validation set (V2 + OOV2)


0it [00:00, ?it/s]

V3 size:  400166
Creating V4 using validation set (V3 + OOV3)



0it [00:00, ?it/s]

V4 size:  400166





In [10]:
# small variant, using x_tokenizer for indexing
# Keras Tokenizer ids start at 1 (0 is reserved, and is the value used for
# padding), so the matrix needs len(word_index) + 1 rows and each vector must
# be stored at the word's own token id. Filling rows by enumeration position
# would shift every vector by one row and leave the largest id out of range.
# Row 0 stays all-zero and serves as the padding vector.
embedding_matrix = np.zeros((len(x_tokenizer.word_index) + 1, my_embedding_dimension))
for word, token_id in x_tokenizer.word_index.items():
    embedding_matrix[token_id] = my_embedding_model.get_vector(word)


# large variant, using embedding model for indexing
# embedding_matrix = np.zeros((len(my_embedding_model), my_embedding_dimension))
# for word, i in my_embedding_model.key_to_index.items():
#     embedding_matrix[i] = my_embedding_model.get_vector(word)


In [11]:
# Flatten all per-sentence tag tuples into one whitespace-separated string,
# then list the distinct tags in order of first appearance.
tags_s = ' '.join(' '.join(sentence_tags) for sentence_tags in df['tags'])
pd.Series(tags_s.split()).unique()

array(['NNP', ',', 'CD', 'NNS', 'JJ', 'MD', 'VB', 'DT', 'NN', 'IN', '.',
       'VBZ', 'VBG', 'CC', 'VBD', 'VBN', '``', 'EX', "''", 'WDT', 'RB',
       'RP', 'TO', 'WRB', 'RBR', 'VBP', 'JJR', 'WP', 'JJS', 'PRP', ':',
       'POS', 'PRP$', '$', 'NNPS', 'WP$', '-LRB-', '-RRB-', 'PDT', 'RBS',
       'FW', 'UH', 'SYM', 'LS', '#'], dtype=object)

In [12]:
# Tokenizer for the tag sequences. Case is preserved (lower=False) and the
# filter set is a custom one, spelled out in the string below.
y_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    oov_token='[UNK]',
    filters='!"%&()*+/;<=>?@[\\]^_{|}~\t\n',
    lower=False,
)
# Each sentence's tags are joined into one space-separated string; the
# tokenizer is fitted on the tags of the whole dataframe (all splits).
y_tokenizer.fit_on_texts(df['tags'].map(' '.join).tolist())

In [13]:
# Show the tag vocabulary learned by y_tokenizer and how many entries it has.
tag_vocabulary = y_tokenizer.word_index.keys()
print(tag_vocabulary)
print(len(tag_vocabulary))

dict_keys(['[UNK]', 'NN', 'IN', 'NNP', 'DT', 'NNS', 'JJ', ',', '.', 'CD', 'VBD', 'RB', 'VB', 'CC', 'TO', 'VBN', 'VBZ', 'PRP', 'VBG', 'VBP', 'MD', 'POS', 'PRP$', '$', '``', "''", ':', 'WDT', 'JJR', 'NNPS', 'WP', 'RP', 'JJS', 'WRB', 'RBR', '-RRB-', '-LRB-', 'EX', 'RBS', 'PDT', '#', 'WP$', 'LS', 'FW', 'UH', 'SYM'])
46


## Padding

In [41]:
def _texts_to_padded(texts, length=None):
    """Tokenize ``texts`` with x_tokenizer and post-pad them to ``length``.

    With ``length=None`` pad_sequences pads to the longest sequence in ``texts``.
    """
    sequences = x_tokenizer.texts_to_sequences(texts)
    return tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=length, padding="post")


# Common length for all splits: the padded width of the whole dataframe.
maxlen = _texts_to_padded(df['text'])[0].size

x_train_pad = _texts_to_padded(x_train, maxlen)
x_val_pad = _texts_to_padded(x_val, maxlen)
x_test_pad = _texts_to_padded(x_test, maxlen)