In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import urllib.request
import gensim
import gensim.downloader as gloader


from zipfile import ZipFile
from collections import OrderedDict
from typing import List, Callable, Dict
from tqdm import tqdm


# Create Dataset

## Download data

In [None]:
# Folder (relative to the current working directory) where the raw archive is stored.
dataset_folder = os.path.join(os.getcwd(), "Datasets", "Original")

# exist_ok avoids the check-then-create race and makes the cell safe to re-run.
os.makedirs(dataset_folder, exist_ok=True)

url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip"

dataset_path = os.path.join(dataset_folder, "data.zip")

if not os.path.exists(dataset_path):
    # Download to a temporary name first: if the transfer is interrupted, no
    # truncated "data.zip" is left behind to be mistaken for a complete archive
    # (which would make every later run skip the download and fail on open).
    partial_path = dataset_path + ".part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, dataset_path)
    finally:
        # Only true when the download or the move failed; clean up the leftover.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Successful download")

## Create Dataframe

In [None]:
# File-id ranges for each split, consumed through range(*...) below
# (start inclusive, end exclusive).
# Any file id outside val_range/test_range is labelled 'train'; train_range is
# kept for reference but is not consulted by the assignment logic.
train_range = (1, 101)
val_range = (101, 151)
test_range = (151, 200)

# True: one dataframe row per blank-line-separated sentence.
# False: one dataframe row per file.
split_sentences = True

dataframe_rows = []
with ZipFile(dataset_path, 'r') as myzip:
    for member in myzip.infolist():
        # Skip directory entries explicitly instead of assuming that the first
        # archive entry (and only that one) is a directory.
        if member.is_dir():
            continue

        filename = member.filename
        print("Extracting", filename, end='\r')

        # The numeric id is taken from the last four characters before the first '.'.
        file_id = int(filename.split('.')[0][-4:])

        if file_id in range(*val_range):
            split = 'val'
        elif file_id in range(*test_range):
            split = 'test'
        else:
            split = 'train'

        with myzip.open(member) as myfile:
            content_string = myfile.read().decode('utf-8')

        if split_sentences:
            sentences = content_string.split('\n\n')
        else:
            sentences = [content_string]

        for sentence in sentences:
            # Keep only well-formed lines, i.e. exactly three tab-separated fields.
            fields_per_line = (line.split('\t') for line in sentence.split('\n'))
            content = [fields for fields in fields_per_line if len(fields) == 3]

            # A chunk with no valid line (e.g. trailing blank lines at the end of
            # a file) would make the unpacking below raise ValueError; skip it.
            if not content:
                continue

            # First column -> words, second column -> tags, third column is discarded.
            words, tags, _ = zip(*content)

            dataframe_rows.append({'file_id': file_id,
                                   'text': ' '.join(words),
                                   'tags': tags,
                                   'split': split
                                   })

# A stable sort keeps the sentences of each file in their original order;
# the default sort kind does not guarantee that for rows sharing a file_id.
df = (pd.DataFrame(dataframe_rows)
      .sort_values('file_id', kind='mergesort')
      .reset_index(drop=True))
print("Dataframe created.".ljust(50))

df

## Preprocessing

Convert the sentence text to lowercase (the tag sequences are left unchanged).

In [None]:
# Lowercase every entry of the 'text' column; other columns are untouched.
df['text'] = df['text'].map(str.lower)
df

## Data Splitting

In [None]:
# Partition the frame by the 'split' label assigned when the rows were created.
train_data, val_data, test_data = (
    df.loc[df['split'] == split_name] for split_name in ('train', 'val', 'test')
)

# Inputs are the text strings, targets are the matching tag tuples.
x_train, y_train = train_data['text'].values, train_data['tags'].values
x_val, y_val = val_data['text'].values, val_data['tags'].values
x_test, y_test = test_data['text'].values, test_data['tags'].values

print('Dataset splits statistics: ')
print(f'Train data: {x_train.shape}')
print(f'Validation data: {x_val.shape}')
print(f'Test data: {x_test.shape}')


## Apply GloVe embeddings

In [None]:
from utils.kerasTokenizer import KerasTokenizer

In [None]:
# Size of the word vectors requested from the tokenizer.
embedding_dimension = 50

# NOTE(review): KerasTokenizer comes from utils.kerasTokenizer (not visible here);
# the meaning of each option below is inferred from its name — confirm against that module.
tokenizer = KerasTokenizer(
    tokenizer_args=None,
    add_oov_terms=True,
    build_embedding_matrix=True,
    embedding_dimension=embedding_dimension,
    embedding_model_type="glove",
)

In [None]:
# Build the tokenizer vocabulary from the training texts, then print the
# summary returned by get_info().
# NOTE(review): build_vocab is defined in utils.kerasTokenizer (not shown here) —
# its exact behaviour should be confirmed there.
tokenizer.build_vocab(x_train)
print(f'Tokenizer info: {tokenizer.get_info()}')

In [None]:
# Call build_vocab again, this time with the validation texts, and print the
# updated summary.
# NOTE(review): presumably this extends the existing vocabulary with terms not
# seen before (the tokenizer was created with add_oov_terms=True) rather than
# rebuilding it from scratch — TODO confirm in utils.kerasTokenizer.
tokenizer.build_vocab(x_val)
print(f'Tokenizer info: {tokenizer.get_info()}')

In [None]:
# Call build_vocab once more with the test texts and print the updated summary.
# NOTE(review): presumably adds test-only terms to the vocabulary built so far
# (add_oov_terms=True) — TODO confirm in utils.kerasTokenizer; the order of the
# three build_vocab cells likely matters.
tokenizer.build_vocab(x_test)
print(f'Tokenizer info: {tokenizer.get_info()}')
