<a href="https://colab.research.google.com/github/maxhormazabal/depencendy_parsing/blob/main/p2_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocesamiento

## Instalación de dependencias

In [None]:
!pip install conllu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting conllu
  Downloading conllu-4.5.2-py2.py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-4.5.2


## Conectando con Google Drive para leer el archivo `.py` que contiene las funciones a utilizar.

In [None]:
# Mount Google Drive at /content/drive so the notebook can access the files stored there
# (the helper module imported in the next cell lives on Drive).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Import our own functions (they are in a .py file on Google Drive)
# The working directory is switched to the Drive project folder before the local import;
# relative paths used later in this notebook (e.g. the "nlu_data/..." output folder)
# are resolved against this directory.
import os
os.chdir("/content/drive/MyDrive/MASTER")
# NOTE(review): the star import hides which names come from nlu_preprocessing_utils.
# Presumably it provides the helpers called below (readConlluDataset, getUposList,
# getActionDict, conlluToDatasetForDependency, projectiveArcs, transformByOracle,
# applyTokenizer, deprelToNumerical) — confirm and consider importing them explicitly.
from nlu_preprocessing_utils import *
from conllu import parse
import tensorflow as tf
import pandas as pd
import numpy as np
import math

## Leyendo la fuente de datos desde su origen en GitHub

In [None]:
# English treebanks from Universal Dependencies.
# Alternative (UD_English-EWT), not used below:
# 'https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-train.conllu'
# Treebank used below (UD_English-ParTUT):
# https://raw.githubusercontent.com/UniversalDependencies/UD_English-ParTUT/master/en_partut-ud-train.conllu

# base_url + file_basename identify the treebank; per the recorded output the helper
# downloads the "-train", "-test" and "-dev" .conllu files built from them.
base_url = 'https://raw.githubusercontent.com/UniversalDependencies/UD_English-ParTUT/master/'
file_basename = 'en_partut-ud'
(en_train,en_test,en_val) = readConlluDataset(base_url,file_basename)
# UPOS tag <-> integer id lookups plus the number of distinct tags, derived from the
# training split only (the recorded output reports 17 tags, numbered from 1).
en_upo2number, en_number2upo, en_nupos = getUposList(en_train)
# Lookups between parser actions and their integer ids — presumably the transition
# actions predicted by the model; confirm in nlu_preprocessing_utils.
number2action,action2number = getActionDict()

Downloading data from https://raw.githubusercontent.com/UniversalDependencies/UD_English-ParTUT/master/en_partut-ud-train.conllu
Downloading data from https://raw.githubusercontent.com/UniversalDependencies/UD_English-ParTUT/master/en_partut-ud-test.conllu
Downloading data from https://raw.githubusercontent.com/UniversalDependencies/UD_English-ParTUT/master/en_partut-ud-dev.conllu
Total of different UPOS:  17
{'NOUN': 1, 'ADP': 2, 'DET': 3, 'AUX': 4, 'PART': 5, 'VERB': 6, 'PUNCT': 7, 'PROPN': 8, 'CCONJ': 9, 'ADJ': 10, 'ADV': 11, 'PRON': 12, 'NUM': 13, 'SCONJ': 14, 'X': 15, 'INTJ': 16, 'SYM': 17}


## Transformando la fuente de datos en el dataset inicial

In [None]:
# Convert each parsed split into its dataset form, in train/test/val order.
# All three splits share the UPOS -> id mapping obtained from the training data.
train_df, test_df, val_df = (
    conlluToDatasetForDependency(split, en_upo2number)
    for split in (en_train, en_test, en_val)
)

### Checking Projective Arcs

In [None]:
# Keep only the rows selected by projectiveArcs, using positional (.iloc) indexing.
# Presumably these are the sentences whose dependency arcs are all projective —
# confirm in nlu_preprocessing_utils.
# NOTE: this rebinds the *_df names, so the unfiltered frames are no longer available
# after this cell runs.
train_df, test_df, val_df = (
    frame.iloc[projectiveArcs(frame)]
    for frame in (train_df, test_df, val_df)
)

## Entrenando Tokenizer

In [None]:
from keras.preprocessing.text import Tokenizer

# Build the fitting corpus as one space-separated string: the literal token "root"
# followed by every word form of the training split, in order.
# A single join avoids the quadratic cost of growing a string with `+` inside a loop
# and yields exactly the same text.
words = [word for sentence in train_df['form'] for word in sentence]
text = " ".join(["root", *words])

# filters="" turns off Keras's default character filtering so no characters are
# stripped from the forms; oov_token is the placeholder used for words that were not
# seen while fitting.
tokenizer = Tokenizer(oov_token="<OOV>", filters="")
tokenizer.fit_on_texts([text])
# word -> integer id vocabulary learned from the training split only.
word_index = tokenizer.word_index

## Post-Oracle datasets

In [None]:
# Sizes passed to the oracle transform; they also name the output folder later on.
stack_len, buffer_len = 7, 10

# Run the oracle over every split (train, test, val order). Each call returns a
# triple: model inputs, action labels and dependency-relation labels.
(
    (x_train, action_train, deprel_train),
    (x_test, action_test, deprel_test),
    (x_val, action_val, deprel_val),
) = (
    transformByOracle(frame, stack_len, buffer_len, en_nupos)
    for frame in (train_df, test_df, val_df)
)

## X-Variables Tokenization

In [None]:
# Apply the tokenizer fitted on the training split to the inputs of every split,
# using the same stack/buffer sizes that were given to the oracle transform.
x_train_token, x_test_token, x_val_token = (
    applyTokenizer(features, stack_len, buffer_len, tokenizer)
    for features in (x_train, x_test, x_val)
)

## Y-Variables encoding

In [None]:
# Turn the dependency-relation labels of each split into numeric ids.
# NOTE(review): deprelToNumerical is called once per split and returns a separate pair
# of lookups each time. If the mapping is derived from the data it receives, the same
# id may stand for different relations in train/test/val — confirm in
# nlu_preprocessing_utils.
# NOTE: deprel_* are overwritten with their numeric version, so this cell is not meant
# to be re-run without re-running the oracle cell first.
deprel_train, number2deprel_train, deprel2number_train = deprelToNumerical(deprel_train)
deprel_test, number2deprel_test, deprel2number_test = deprelToNumerical(deprel_test)
deprel_val, number2deprel_val, deprel2number_val = deprelToNumerical(deprel_val)

# One-hot encode the action targets of every split...
action_encod_train, action_encod_test, action_encod_val = (
    tf.keras.utils.to_categorical(labels)
    for labels in (action_train, action_test, action_val)
)
# ...and the dependency-relation targets. No num_classes is given, so each matrix gets
# its width from the labels present in that split.
deprel_encod_train, deprel_encod_test, deprel_encod_val = (
    tf.keras.utils.to_categorical(labels)
    for labels in (deprel_train, deprel_test, deprel_val)
)

### Making the deprel encoding widths equal across the train, test and validation sets by padding them to the maximum width

In [None]:
# Widest deprel one-hot matrix among the three splits (number of columns).
max_len = max(
    encoded.shape[1]
    for encoded in (deprel_encod_train, deprel_encod_test, deprel_encod_val)
)

# Pad every row at the end ('post') up to max_len so all splits share one width.
deprel_encod_train, deprel_encod_test, deprel_encod_val = (
    tf.keras.utils.pad_sequences(encoded, maxlen=max_len, padding='post')
    for encoded in (deprel_encod_train, deprel_encod_test, deprel_encod_val)
)

In [None]:
# Sanity check: after padding, the three splits should report the same number of columns.
print(deprel_encod_train.shape,deprel_encod_test.shape,deprel_encod_val.shape)

(86430, 41) (6854, 41) (5434, 41)


Creating a folder to save the data; its name depends on the stack and buffer sizes.

In [None]:
# Output folder is named after the stack/buffer sizes, e.g. "nlu_data/7stack10buffer",
# so datasets generated with different sizes do not overwrite each other.
folder_name = str(stack_len)+"stack"+str(buffer_len)+"buffer"
path = "nlu_data/"+folder_name
# Pure-Python equivalent of `mkdir -p`: creates missing parent folders and does not fail
# if the folder already exists. Avoids interpolating a variable into a shell command.
os.makedirs(path, exist_ok=True)

In [None]:
# Persist the preprocessed arrays as .npy files so they can be re-used without
# repeating the preprocessing steps.
# Files are written under `path`, relative to the current working directory
# (set with os.chdir("/content/drive/MyDrive/MASTER")).
arrays_to_save = [
    # Train data
    ('/x_train.npy', x_train_token),
    ('/action_train.npy', action_encod_train),
    ('/deprel_train.npy', deprel_encod_train),
    # Test data
    ('/x_test.npy', x_test_token),
    ('/action_test.npy', action_encod_test),
    ('/deprel_test.npy', deprel_encod_test),
    # Validation data
    ('/x_val.npy', x_val_token),
    ('/action_val.npy', action_encod_val),
    ('/deprel_val.npy', deprel_encod_val),
]

for file_suffix, array in arrays_to_save:
    np.save(path + file_suffix, array)

In [None]:
# With the following you can read the saved files and create the numpy arrays again.
# The file names must match the ones written by the saving cell above (under `path`);
# the previous 'x_data.npy' / 'action_data.npy' names are never written by this notebook.
# allow_pickle=True is kept in case an array was stored with object dtype; only use it
# on files you generated yourself, since unpickling untrusted files can execute code.
new_x_data = np.load(path+'/x_train.npy',allow_pickle=True)
new_action_data = np.load(path+'/action_train.npy',allow_pickle=True)