<a href="https://colab.research.google.com/github/maxhormazabal/depencendy_parsing/blob/main/P2_Preprocessing_NLU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocesamiento

## Instalación de dependencias

In [1]:
!pip install conllu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting conllu
  Downloading conllu-4.5.2-py2.py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-4.5.2


## Conectando con Google Drive para leer el archivo `.py` que contiene las funciones a utilizar.

In [2]:
# Mount Google Drive inside the Colab VM so files stored there can be read
# like ordinary local files.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
# Import our own functions (they are in a .py file on Google Drive)
import os
# Switch the working directory to the Drive folder. Relative paths used later
# in the notebook (e.g. the .npy files written with np.save) resolve against it.
# Presumably this is also what makes nlu_preprocessing_utils importable — confirm.
os.chdir("/content/drive/MyDrive/MASTER")
# NOTE(review): wildcard import — the helper names called in the visible cells
# (readConlluDataset, getUposList, conlluToDatasetForDependency, transformByOracle)
# appear to come from this module. Any name it shares with the imports below
# is overridden by them, so keep this statement order.
from nlu_preprocessing_utils import *
from conllu import parse
import tensorflow as tf
import pandas as pd
import numpy as np
import math

## Leyendo la fuente de datos desde su origen en GitHub

In [4]:
# English treebanks from the Universal Dependencies GitHub repositories.
# Alternative treebank (EWT), training split:
#   https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-train.conllu
# Treebank used below (ParTUT), training split:
#   https://raw.githubusercontent.com/UniversalDependencies/UD_English-ParTUT/master/en_partut-ud-train.conllu
base_url = 'https://raw.githubusercontent.com/UniversalDependencies/UD_English-ParTUT/master/'
file_basename = 'en_partut-ud'

# Download the train / test / dev CoNLL-U files found under base_url.
en_train, en_test, en_val = readConlluDataset(base_url, file_basename)

# Derive the UPOS lookup tables from the training split only.
# Presumably: tag -> number, number -> tag, and the tag count — confirm in
# nlu_preprocessing_utils.
en_upo2number, en_number2upo, en_nupos = getUposList(en_train)

Downloading data from https://raw.githubusercontent.com/UniversalDependencies/UD_English-ParTUT/master/en_partut-ud-train.conllu
Downloading data from https://raw.githubusercontent.com/UniversalDependencies/UD_English-ParTUT/master/en_partut-ud-test.conllu
Downloading data from https://raw.githubusercontent.com/UniversalDependencies/UD_English-ParTUT/master/en_partut-ud-dev.conllu
Total of different UPOS:  17
{'NOUN': 1, 'ADP': 2, 'DET': 3, 'AUX': 4, 'PART': 5, 'VERB': 6, 'PUNCT': 7, 'PROPN': 8, 'CCONJ': 9, 'ADJ': 10, 'ADV': 11, 'PRON': 12, 'NUM': 13, 'SCONJ': 14, 'X': 15, 'INTJ': 16, 'SYM': 17}


## Transformando la fuente de datos en el dataset inicial

In [5]:
# Flatten the parsed training sentences into a DataFrame with one row per
# sentence; each column (id, form, head, deprel, upos) holds a per-token list,
# with UPOS tags already mapped to their integer codes.
train_df = conlluToDatasetForDependency(
    en_train,
    en_upo2number,
)
train_df

Unnamed: 0,id,form,head,deprel,upos
0,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]","[Distribution, of, this, license, does, not, c...","[7, 4, 4, 1, 7, 7, 0, 12, 12, 9, 9, 7, 7]","[nsubj, case, det, nmod, aux, advmod, root, de...","[1, 2, 3, 1, 4, 5, 6, 3, 1, 7, 1, 1, 7]"
1,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]","[Creative, Commons, provides, this, informatio...","[3, 1, 0, 5, 3, 13, 13, 11, 11, 9, 13, 11, 3, 3]","[nsubj, flat, root, det, obj, case, det, punct...","[8, 8, 6, 3, 1, 2, 3, 7, 2, 7, 6, 7, 1, 7]"
2,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Creative, Commons, makes, no, warranties, reg...","[3, 1, 0, 5, 3, 5, 8, 6, 8, 12, 12, 3, 12, 15,...","[nsubj, flat, root, det, obj, acl, det, obj, a...","[8, 8, 6, 3, 1, 6, 3, 1, 6, 7, 9, 6, 1, 2, 1, ..."
3,"[1, 2]","[License, .]","[0, 1]","[root, punct]","[1, 7]"
4,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]","[The, work, is, protected, by, copyright, and,...","[2, 4, 4, 0, 6, 4, 12, 9, 7, 12, 12, 6, 4]","[det, nsubj:pass, aux:pass, root, case, obl, c...","[3, 1, 4, 6, 2, 1, 9, 7, 9, 10, 10, 1, 7]"
...,...,...,...,...,...
1776,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[From, the, 18, th, century, ,, the, desire, f...","[5, 5, 5, 3, 13, 5, 8, 13, 12, 12, 12, 8, 0, 1...","[case, det, nummod, amod, obl, punct, det, nsu...","[2, 3, 13, 10, 1, 7, 3, 1, 2, 10, 8, 1, 6, 1, ..."
1777,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[That, demand, also, led, to, the, production,...","[2, 4, 4, 0, 7, 7, 4, 11, 11, 11, 7, 18, 18, 1...","[det, nsubj, advmod, root, case, det, obl, cas...","[3, 1, 11, 6, 2, 3, 1, 2, 10, 10, 1, 7, 11, 11..."
1778,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Shakespeare, 's, works, include, the, 36, pla...","[3, 1, 4, 0, 7, 7, 4, 7, 12, 12, 12, 8, 14, 12...","[nmod, case, nsubj, root, det, nummod, obj, ac...","[8, 5, 1, 6, 3, 13, 1, 6, 2, 3, 10, 1, 2, 13, ..."
1779,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Two, plays, not, included, in, the, First, Fo...","[2, 23, 4, 2, 8, 8, 8, 4, 2, 13, 13, 13, 2, 15...","[nummod, nsubj:pass, advmod, acl, case, det, a...","[13, 1, 5, 6, 2, 3, 10, 1, 7, 3, 13, 10, 1, 9,..."


In [6]:
# Run the oracle over the training frame to obtain the model inputs and the
# matching transition actions.
# NOTE(review): the meaning of the arguments 2 and 3 is not visible in this
# notebook — check transformByOracle in nlu_preprocessing_utils.
x_data, action_data = transformByOracle(train_df, 2, 3)

# Save both arrays to disk so they can be reused without repeating the
# preprocessing. The relative paths resolve against the current working
# directory (set earlier with os.chdir("/content/drive/MyDrive/MASTER")).
np.save('x_data.npy', x_data)
np.save('action_data.npy', action_data)

# Read the files back to rebuild the NumPy arrays.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code — only load .npy files you produced yourself.
new_x_data = np.load('x_data.npy', allow_pickle=True)
new_action_data = np.load('action_data.npy', allow_pickle=True)

In [30]:
# Preview rows 1 through 9 of the oracle output (row 0 is skipped by the slice);
# each row is a [transition, dependency label] pair, with 'None' as the label for Shift.
action_data[1:10]

array([['Shift', 'None'],
       ['Shift', 'None'],
       ['Left Arc', 'det'],
       ['Left Arc', 'case'],
       ['Right Arc', 'nmod'],
       ['Shift', 'None'],
       ['Shift', 'None'],
       ['Left Arc', 'advmod'],
       ['Left Arc', 'aux']], dtype='<U32')