<a href="https://colab.research.google.com/github/naokityokoyama/NLP_Embedding_Transformers/blob/main/NLP_treinando_embedding_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import libraries

In [None]:
import pandas as pd
import numpy as np
import requests
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder

from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Embedding, LSTM , GRU, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils

### Coleta dos dados

In [None]:
# Raw-file URLs of the three splits (train / dev / test) of the
# UD_Portuguese-Bosque treebank, stored in CoNLL-U format.
# NOTE(review): only `dev` is downloaded in the cells visible here; `train`
# and `test` may be used elsewhere in the notebook — confirm.
train = 'https://raw.githubusercontent.com/UniversalDependencies/UD_Portuguese-Bosque/master/pt_bosque-ud-train.conllu'
dev = 'https://raw.githubusercontent.com/UniversalDependencies/UD_Portuguese-Bosque/master/pt_bosque-ud-dev.conllu'
test = 'https://raw.githubusercontent.com/UniversalDependencies/UD_Portuguese-Bosque/master/pt_bosque-ud-test.conllu'

In [None]:
# Path to a text file under a mounted Google Drive folder.
# NOTE(review): presumably pre-trained 50-dimensional CBOW word embeddings
# (judging by the file name); it is not read by any cell visible here — confirm.
cbow = '/content/drive/MyDrive/Dataset/cbow_s50.txt'

In [None]:
# Download the dev split of the treebank.
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(dev, timeout=30)
# Fail loudly on HTTP errors instead of parsing an error page as CoNLL-U.
response.raise_for_status()
# NOTE: despite its name, `url` holds the downloaded file *content*, not an
# address; the name is kept because later cells read it.
url = response.text

In [None]:
class Coleta:
  """Extract token-level columns from CoNLL-U formatted text.

  The instance stores the raw text of a ``.conllu`` file. The attribute is
  called ``url`` for compatibility with existing callers, but it holds the
  file *content*, not an address.
  """

  def __init__(self, url):
    """Store the raw CoNLL-U text to be parsed."""
    self.url = url

  def extrair(self, tamanho=None):
    """Collect the 1st, 3rd and 4th tab-separated fields of every token line.

    Blank lines, comment lines (starting with ``#``) and rows with fewer
    than four fields are skipped, so the three returned lists always have
    the same length.

    NOTE(review): in the CoNLL-U specification the third column is LEMMA
    and the second is FORM; the third one is what ends up in the ``token``
    column of ``dataframe`` — confirm that the lemma is what is wanted.

    Args:
      tamanho: Number of leading lines of the text to scan. ``None`` (the
        default) scans every line of ``self.url``. Values larger than the
        number of lines are clamped; values <= 0 scan nothing.

    Returns:
      A tuple ``(ids, tokens, tags)`` of three lists of strings.
    """
    # Split once up front; the text used to be re-split on every iteration.
    linhas = self.url.split('\n')
    if tamanho is not None:
      linhas = linhas[:max(tamanho, 0)]

    ids, tokens, tags = [], [], []
    for linha in tqdm(linhas):
      campos = linha.split('\t')
      primeiro = campos[0]
      # Explicit checks replace the former bare ``except: pass``, which could
      # also leave the lists misaligned after a partial append on short rows.
      if not primeiro or primeiro[0] == '#' or len(campos) < 4:
        continue
      ids.append(primeiro)
      tokens.append(campos[2])
      tags.append(campos[3])
    return ids, tokens, tags

  def dataframe(self, tamanho=None):
    """Return the extracted fields as a DataFrame.

    Args:
      tamanho: Forwarded to ``extrair``; ``None`` scans the whole text.

    Returns:
      A ``pandas.DataFrame`` with the string columns ``id``, ``token`` and
      ``POS_`` (in that order), one row per extracted line. An input without
      any token line yields an empty frame with those three columns.
    """
    # Parse once and unpack, rather than running the extraction three times.
    ids, tokens, tags = self.extrair(tamanho=tamanho)
    return pd.DataFrame(
        {'id': ids, 'token': tokens, 'POS_': tags},
        columns=['id', 'token', 'POS_'],
    )

In [None]:
# Parse the downloaded text (held in `url`) into a DataFrame with the
# columns 'id', 'token' and 'POS_'.
dados = Coleta(url)
df = dados.dataframe()

In [None]:
# Model inputs are the 'token' column; targets are the 'POS_' column.
X = df['token'].values
y = df['POS_'].values

In [None]:
# Encode the string labels as integer class ids; `le` keeps the fitted
# mapping for later use.
le = LabelEncoder()
y = le.fit_transform(y)

### Tokenizer

In [None]:
# Fit a Keras Tokenizer on the tokens and replace each one by its sequence
# of integer ids.
# NOTE(review): Tokenizer() is used with its default settings. The padded
# array printed later contains 0 entries, which suggests some inputs produce
# an empty sequence (presumably punctuation removed by the default filters)
# — confirm this is intended.
token = Tokenizer() 
token.fit_on_texts(X)              
X = token.texts_to_sequences(X)

# +1 because index 0 is not assigned to any word in `word_index`
# (presumably reserved for padding — confirm).
voc_size = len(token.word_index) + 1

In [None]:
# One-hot encode the integer class ids so they match the softmax output
# layer of the model defined below.
# NOTE(review): `keras.utils.np_utils` may be missing in recent Keras
# releases, where `keras.utils.to_categorical` is the public entry point —
# confirm against the installed version.
y_dummy = np_utils.to_categorical(y.tolist())
y = y_dummy

### Split train / test

In [None]:
# Hold out 25% of the samples for evaluation.
# NOTE(review): no random_state is passed, so the split (and every metric
# reported below) presumably differs between runs — consider fixing a seed.
X_train, X_test, y_train , y_test = train_test_split(X,y, test_size=0.25)

In [None]:
# Sanity check: inputs and labels of the training split must have the same
# number of samples.
len(X_train), len(y_train)

(22882, 22882)

In [None]:
# Force every sample to exactly one id (maxlen=1), padding at the end, so
# both splits become 2-D arrays of shape (n_samples, 1).
X_train = pad_sequences(X_train, padding="post", maxlen=1)
X_test = pad_sequences(X_test, padding="post", maxlen=1)

In [None]:
# Inspect the padded training inputs (one integer id per row).
print (X_train)

[[ 2]
 [ 2]
 [ 1]
 ...
 [14]
 [19]
 [ 0]]


In [None]:
# Shapes of the one-hot labels (n_samples, n_classes) and of the padded
# inputs (n_samples, 1).
y_train.shape, X_train.shape

((22882, 17), (22882, 1))

In [None]:
print (len(token.word_index)+1)
# vocabulary size of the text (same value as `voc_size`)

### Train 

In [None]:
def neural(verbose=False, type_network=LSTM):
  """Build, compile and train a single-token classifier.

  Architecture: Embedding (``voc_size`` -> 50) -> recurrent layer with 10
  ReLU units -> Dropout(0.1) -> Dense softmax with one unit per class.

  The function reads the notebook globals ``voc_size``, ``X_train``,
  ``y_train``, ``X_test`` and ``y_test``; they must exist before the call.

  Args:
    verbose: Accepted for backward compatibility but currently unused; the
      model summary and the per-epoch log are always printed.
    type_network: Recurrent layer class to instantiate (e.g. ``LSTM`` or
      ``GRU``).

  Returns:
    The fitted ``Sequential`` model.
  """
  model = Sequential()
  embedding = Embedding(input_dim=voc_size, output_dim=50, input_length = 1)
  model.add(embedding)
  model.add(type_network(units=10,  activation='relu'))
  model.add(Dropout(0.1))
  # One output unit per class; the targets are one-hot encoded.
  model.add(Dense(units=y_train.shape[1], activation='softmax'))
  # Categorical cross-entropy is the loss that matches a softmax output with
  # one-hot targets; the previous mean-squared-error loss is meant for
  # regression and gives weak gradients for classification.
  model.compile(loss='categorical_crossentropy', optimizer ='adam', metrics=['accuracy'] )
  model.summary()
  # NOTE: the held-out test split doubles as validation data here, so the
  # validation metrics are not independent of the final evaluation.
  model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=True, validation_data=(X_test, y_test))
  return model

In [None]:
# Train the model using a GRU recurrent layer instead of the default LSTM.
modelo = neural(type_network=GRU)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 1, 50)             259700    
                                                                 
 gru (GRU)                   (None, 10)                1860      
                                                                 
 dropout_2 (Dropout)         (None, 10)                0         
                                                                 
 dense_2 (Dense)             (None, 17)                187       
                                                                 
Total params: 261,747
Trainable params: 261,747
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Evaluate

In [None]:
# Evaluate on the held-out split; yields the compiled loss and the accuracy
# metric.
loss, accuracy = modelo.evaluate(X_test, y_test)



In [None]:
# Run the evaluation on the held-out split and print the loss and accuracy.
loss, accuracy = modelo.evaluate(X_test, y_test)
print (loss, accuracy)

0.01696545071899891 0.7927373051643372


In [None]:
# Softmax outputs for every test sample: one row per sample, one column per
# class.
y_pred = modelo.predict(X_test)

In [None]:
# Index of the highest-scoring class for the first test sample.
np.argmax(y_pred[0])

7

In [None]:
# Confusion matrix over class indices: the first argument is the true class
# (argmax of the one-hot labels), the second the predicted class (argmax of
# the model outputs).
confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1))

array([[ 183,    2,    2,    0,    0,    3,    0,  118,    0,    0,   18,
           0,    0,    0,    2,    0,    0],
       [   0, 1026,    2,    0,    2,    0,    0,    3,    1,    1,    0,
           0,    1,    0,    0,    0,    0],
       [   2,    6,  204,    0,    1,    1,    0,   22,    0,    3,    1,
           0,    0,    0,    0,    0,    0],
       [   0,    0,    0,  153,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,   17,    0,    0],
       [   0,    0,    1,    0,  158,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0],
       [   0,    0,    8,    0,    0, 1045,    0,    1,    1,    7,    0,
           0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    1,    0,
           0,    0,    0,    0,    0,    0],
       [  28,    1,    5,    0,    0,    0,    0, 1262,    0,    0,   39,
           0,    0,    0,    5,    0,    0],
       [   0,    0,    0,    0,    0,   15,    0