<a href="https://colab.research.google.com/github/maxhormazabal/depencendy_parsing/blob/main/p2_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing

## Installation of dependencies

In [1]:
!pip install conllu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Connecting to Google Drive to read the `.py` file that contains the functions to use.

In [2]:
# Getting access to Google Drive files: mount the Drive at /content/drive so
# the files stored there are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Import our own functions (they are in a .py file on Google Drive).
# The working directory is switched to the Drive project folder first, so the
# helper module below is importable and the relative "nlu_data/..." paths used
# later in the notebook are resolved against that folder.
import os
os.chdir("/content/drive/MyDrive/MASTER")
# NOTE(review): the star import hides which names come from the helper module;
# presumably readConlluDataset, transformByOracle, saveData, etc. live there — confirm.
from nlu_preprocessing_utils import *
from conllu import parse
import tensorflow as tf
import pandas as pd
import numpy as np
import math
import io

## Preprocessing in One Step

In [None]:
# Treebank location and oracle window sizes for this preprocessing run.
base_url = 'https://raw.githubusercontent.com/UniversalDependencies/UD_English-ParTUT/master/'
file_basename = 'en_partut-ud'
stack_len = 4
buffer_len = 8

# preprocessingOneStep hands back the output folder followed by the tokenized
# inputs and the encoded action / deprel targets for train, test and validation.
(
    path,
    x_train_token, action_encod_train, deprel_encod_train,
    x_test_token, action_encod_test, deprel_encod_test,
    x_val_token, action_encod_val, deprel_encod_val,
) = preprocessingOneStep(base_url, file_basename, stack_len, buffer_len)

# Create the output folder from Python instead of `!mkdir -p {path}`:
# exist_ok=True keeps the "no error if it already exists" behaviour, and the
# path is no longer word-split or interpreted by a shell.
os.makedirs(path, exist_ok=True)
saveData(
    path,
    x_train_token, action_encod_train, deprel_encod_train,
    x_test_token, action_encod_test, deprel_encod_test,
    x_val_token, action_encod_val, deprel_encod_val,
)

Data sucessfully saved on ./ nlu_data/4stack8buffer


In [None]:
# Treebank location and oracle window sizes for this preprocessing run.
base_url = 'https://raw.githubusercontent.com/UniversalDependencies/UD_English-ParTUT/master/'
file_basename = 'en_partut-ud'
stack_len = 5
buffer_len = 5

# preprocessingOneStep hands back the output folder followed by the tokenized
# inputs and the encoded action / deprel targets for train, test and validation.
(
    path,
    x_train_token, action_encod_train, deprel_encod_train,
    x_test_token, action_encod_test, deprel_encod_test,
    x_val_token, action_encod_val, deprel_encod_val,
) = preprocessingOneStep(base_url, file_basename, stack_len, buffer_len)

# Create the output folder from Python instead of `!mkdir -p {path}`:
# exist_ok=True keeps the "no error if it already exists" behaviour, and the
# path is no longer word-split or interpreted by a shell.
os.makedirs(path, exist_ok=True)
saveData(
    path,
    x_train_token, action_encod_train, deprel_encod_train,
    x_test_token, action_encod_test, deprel_encod_test,
    x_val_token, action_encod_val, deprel_encod_val,
)

Data sucessfully saved on ./ nlu_data/5stack5buffer


In [None]:
# Treebank location and oracle window sizes for this preprocessing run.
base_url = 'https://raw.githubusercontent.com/UniversalDependencies/UD_English-ParTUT/master/'
file_basename = 'en_partut-ud'
stack_len = 7
buffer_len = 10

# preprocessingOneStep hands back the output folder followed by the tokenized
# inputs and the encoded action / deprel targets for train, test and validation.
(
    path,
    x_train_token, action_encod_train, deprel_encod_train,
    x_test_token, action_encod_test, deprel_encod_test,
    x_val_token, action_encod_val, deprel_encod_val,
) = preprocessingOneStep(base_url, file_basename, stack_len, buffer_len)

# Create the output folder from Python instead of `!mkdir -p {path}`:
# exist_ok=True keeps the "no error if it already exists" behaviour, and the
# path is no longer word-split or interpreted by a shell.
os.makedirs(path, exist_ok=True)
saveData(
    path,
    x_train_token, action_encod_train, deprel_encod_train,
    x_test_token, action_encod_test, deprel_encod_test,
    x_val_token, action_encod_val, deprel_encod_val,
)

Data sucessfully saved on ./ nlu_data/7stack10buffer


## Reading the data source from its origin on GitHub

In [4]:
# English Universal Dependencies treebanks (train files shown for reference):
#   EWT:    https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-train.conllu
#   ParTUT: https://raw.githubusercontent.com/UniversalDependencies/UD_English-ParTUT/master/en_partut-ud-train.conllu
# The ParTUT treebank is the one configured below.
base_url = 'https://raw.githubusercontent.com/UniversalDependencies/UD_English-ParTUT/master/'
file_basename = 'en_partut-ud'

# Load the three splits, then build the UPOS lookup tables from the training
# split and fetch the action lookup tables.
en_train, en_test, en_val = readConlluDataset(base_url, file_basename)
en_upo2number, en_number2upo, en_nupos = getUposList(en_train)
number2action, action2number = getActionDict()

In [5]:
# Drop the copy left by a previous run before regenerating the test file.
# Done in Python and tolerant of a missing file, so a fresh run (nothing to
# delete yet) no longer prints an `rm: cannot remove ...` error.
# NOTE(review): the recorded output of this cell reports
# nlu_data/original_test_line.conllu, which is not the file removed here —
# confirm which file name is intended.
try:
    os.remove('nlu_data/original_test.conllu')
except FileNotFoundError:
    pass  # nothing to clean up on a first run
generateConlluForTesting()

Original file generated in nlu_data/original_test_line.conllu


## Transforming the data source into the initial dataset

In [6]:
# Turn each CoNLL-U split into its initial dataset; all three splits share the
# UPOS-to-number mapping that was built from the training split.
train_df, test_df, val_df = [
    conlluToDatasetForDependency(split, en_upo2number)
    for split in (en_train, en_test, en_val)
]

### Checking Projective Arcs

In [7]:
# Keep only the rows whose positions are returned by projectiveArcs
# (presumably the sentences with projective arcs only — confirm in the helper).
train_df, test_df, val_df = [
    frame.iloc[projectiveArcs(frame)]
    for frame in (train_df, test_df, val_df)
]

In [8]:
# Display the filtered training DataFrame as the cell's rich output.
train_df

Unnamed: 0,id,form,head,deprel,upos
0,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]","[Distribution, of, this, license, does, not, c...","[7, 4, 4, 1, 7, 7, 0, 12, 12, 9, 9, 7, 7]","[nsubj, case, det, nmod, aux, advmod, root, de...","[1, 2, 3, 1, 4, 5, 6, 3, 1, 7, 1, 1, 7]"
1,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]","[Creative, Commons, provides, this, informatio...","[3, 1, 0, 5, 3, 13, 13, 11, 11, 9, 13, 11, 3, 3]","[nsubj, flat, root, det, obj, case, det, punct...","[8, 8, 6, 3, 1, 2, 3, 7, 2, 7, 6, 7, 1, 7]"
2,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Creative, Commons, makes, no, warranties, reg...","[3, 1, 0, 5, 3, 5, 8, 6, 8, 12, 12, 3, 12, 15,...","[nsubj, flat, root, det, obj, acl, det, obj, a...","[8, 8, 6, 3, 1, 6, 3, 1, 6, 7, 9, 6, 1, 2, 1, ..."
3,"[1, 2]","[License, .]","[0, 1]","[root, punct]","[1, 7]"
4,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]","[The, work, is, protected, by, copyright, and,...","[2, 4, 4, 0, 6, 4, 12, 9, 7, 12, 12, 6, 4]","[det, nsubj:pass, aux:pass, root, case, obl, c...","[3, 1, 4, 6, 2, 1, 9, 7, 9, 10, 10, 1, 7]"
...,...,...,...,...,...
1776,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[From, the, 18, th, century, ,, the, desire, f...","[5, 5, 5, 3, 13, 5, 8, 13, 12, 12, 12, 8, 0, 1...","[case, det, nummod, amod, obl, punct, det, nsu...","[2, 3, 13, 10, 1, 7, 3, 1, 2, 10, 8, 1, 6, 1, ..."
1777,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[That, demand, also, led, to, the, production,...","[2, 4, 4, 0, 7, 7, 4, 11, 11, 11, 7, 18, 18, 1...","[det, nsubj, advmod, root, case, det, obl, cas...","[3, 1, 11, 6, 2, 3, 1, 2, 10, 10, 1, 7, 11, 11..."
1778,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Shakespeare, 's, works, include, the, 36, pla...","[3, 1, 4, 0, 7, 7, 4, 7, 12, 12, 12, 8, 14, 12...","[nmod, case, nsubj, root, det, nummod, obj, ac...","[8, 5, 1, 6, 3, 13, 1, 6, 2, 3, 10, 1, 2, 13, ..."
1779,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Two, plays, not, included, in, the, First, Fo...","[2, 23, 4, 2, 8, 8, 8, 4, 2, 13, 13, 13, 2, 15...","[nummod, nsubj:pass, advmod, acl, case, det, a...","[13, 1, 5, 6, 2, 3, 10, 1, 7, 3, 13, 10, 1, 9,..."


## Training the Tokenizer

In [9]:
from keras.preprocessing.text import Tokenizer

# Fitting corpus: one space-separated string made of the literal "root"
# followed by every word form of the training sentences. str.join builds it in
# a single pass instead of re-copying the growing string once per word.
text = " ".join(
    ["root"] + [word for sentence in train_df['form'] for word in sentence]
)

# filters="" switches off the Tokenizer's default character filtering;
# "<OOV>" is the token reserved for words not seen while fitting.
tokenizer = Tokenizer(oov_token="<OOV>", filters="")
tokenizer.fit_on_texts([text])
word_index = tokenizer.word_index

# Save the fitted tokenizer as JSON so it can be reloaded later.
with open('nlu_data/tokenizer.json', 'w') as outfile:
    outfile.write(tokenizer.to_json())

## Post-Oracle datasets

In [10]:
# Oracle window sizes used by the step-by-step pipeline from here on.
stack_len = 3
buffer_len = 5

# For every split, transformByOracle produces the model inputs together with
# the action and deprel targets.
x_train, action_train, deprel_train = transformByOracle(train_df, stack_len, buffer_len, en_nupos)
x_test, action_test, deprel_test = transformByOracle(test_df, stack_len, buffer_len, en_nupos)
x_val, action_val, deprel_val = transformByOracle(val_df, stack_len, buffer_len, en_nupos)

## X-Variables Tokenization

In [11]:
# Run the fitted tokenizer over the oracle inputs of each split, using the
# same stack / buffer sizes the oracle was run with.
x_train_token, x_test_token, x_val_token = [
    applyTokenizer(split_inputs, stack_len, buffer_len, tokenizer)
    for split_inputs in (x_train, x_test, x_val)
]

## Y-Variables encoding

In [12]:
# Convert the deprel labels of each split to numbers, keeping the lookup
# tables returned for that split.
# NOTE(review): deprelToNumerical is called once per split; if it derives the
# numbering from the labels it receives, the same deprel could get different
# ids in train / test / val — verify against the helper's implementation.
deprel_train, number2deprel_train, deprel2number_train = deprelToNumerical(deprel_train)
deprel_test, number2deprel_test, deprel2number_test = deprelToNumerical(deprel_test)
deprel_val, number2deprel_val, deprel2number_val = deprelToNumerical(deprel_val)

# One-hot encode the action and deprel targets, split by split.
# Training split
action_encod_train = tf.keras.utils.to_categorical(action_train)
deprel_encod_train = tf.keras.utils.to_categorical(deprel_train)
# Test split
action_encod_test = tf.keras.utils.to_categorical(action_test)
deprel_encod_test = tf.keras.utils.to_categorical(deprel_test)
# Validation split
action_encod_val = tf.keras.utils.to_categorical(action_val)
deprel_encod_val = tf.keras.utils.to_categorical(deprel_val)

### Making the deprel encodings the same width across sets, using the largest number of deprel classes

In [13]:
# The three one-hot matrices were encoded independently, so their number of
# columns can differ; the widest one sets the common width.
max_len = max(
    deprel_encod_train.shape[1],
    deprel_encod_test.shape[1],
    deprel_encod_val.shape[1],
)

# padding='post' appends the filler at the end of every row up to max_len.
# NOTE(review): pad_sequences is documented with dtype="int32" as its default,
# so the padded arrays are presumably integer-typed — confirm this is intended.
deprel_encod_train = tf.keras.utils.pad_sequences(
    deprel_encod_train, maxlen=max_len, padding='post'
)
deprel_encod_test = tf.keras.utils.pad_sequences(
    deprel_encod_test, maxlen=max_len, padding='post'
)
deprel_encod_val = tf.keras.utils.pad_sequences(
    deprel_encod_val, maxlen=max_len, padding='post'
)

Creating the folder where the data is saved, named after the sizes of the stack and the buffer

In [None]:
# The output folder is named after the oracle window sizes,
# i.e. "<stack_len>stack<buffer_len>buffer" under nlu_data/.
folder_name = str(stack_len)+"stack"+str(buffer_len)+"buffer"
path = "nlu_data/"+folder_name
# Create it from Python instead of `!mkdir -p {path}`: exist_ok=True keeps the
# "no error if it already exists" behaviour without passing the path through a shell.
os.makedirs(path, exist_ok=True)

In [None]:
# Persist the preprocessed numpy arrays so later runs can reuse them and skip
# the preprocessing steps. The files are written below the current working
# directory (set with os.chdir("/content/drive/MyDrive/MASTER")).

# Train split
np.save(f"{path}/x_train.npy", x_train_token)
np.save(f"{path}/action_train.npy", action_encod_train)
np.save(f"{path}/deprel_train.npy", deprel_encod_train)

# Test split
np.save(f"{path}/x_test.npy", x_test_token)
np.save(f"{path}/action_test.npy", action_encod_test)
np.save(f"{path}/deprel_test.npy", deprel_encod_test)

# Validation split
np.save(f"{path}/x_val.npy", x_val_token)
np.save(f"{path}/action_val.npy", action_encod_val)
np.save(f"{path}/deprel_val.npy", deprel_encod_val)

# To read a file back and rebuild the numpy array:
# new_x_data = np.load('x_data.npy',allow_pickle=True)
# new_action_data = np.load('action_data.npy',allow_pickle=True)