# Introdução
Este notebook é relativo a etapa de pré-processamento do projeto TED MCTI.

## Importando as Libs

In [2]:
!pip install translators --upgrade
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting translators
  Downloading translators-5.4.2-py3-none-any.whl (29 kB)
Collecting loguru>=0.6.0
  Downloading loguru-0.6.0-py3-none-any.whl (58 kB)
[K     |████████████████████████████████| 58 kB 2.2 MB/s 
[?25hCollecting pathos>=0.2.9
  Downloading pathos-0.2.9-py3-none-any.whl (76 kB)
[K     |████████████████████████████████| 76 kB 3.3 MB/s 
[?25hCollecting PyExecJS>=1.5.1
  Downloading PyExecJS-1.5.1.tar.gz (13 kB)
Collecting requests>=2.28.1
  Downloading requests-2.28.1-py3-none-any.whl (62 kB)
[K     |████████████████████████████████| 62 kB 1.3 MB/s 
[?25hCollecting ppft>=1.7.6.5
  Downloading ppft-1.7.6.5-py2.py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 514 kB/s 
[?25hCollecting pox>=0.3.1
  Downloading pox-0.3.1-py2.py3-none-any.whl (28 kB)
Collecting multiprocess>=0.70.13
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[

In [3]:
import pandas as pd
import requests

import math
import time
import re
import contractions
import unicodedata
import translators as ts

import numpy as np
from numpy import array
import tensorflow as tf 
from tensorflow import keras

import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from keras import backend as K

Using state South Carolina server backend.


In [None]:
import nltk
nltk.download('punkt')
from nltk.stem import PorterStemmer
nltk.download('stopwords')

## Funções de treinamento

In [42]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [None]:
plt.style.use('ggplot')

def plot_history(history):
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

In [63]:
import json
import io
import shutil

def write_list(a_list, file_name):
    with open(file_name, "w") as fp:
        json.dump(a_list, fp)

def read_list(url):
    myfile = requests.get(url)
    myfile.raise_for_status()
    n_list = json.load(io.BytesIO(myfile.content))
    return n_list

def read_labels(url):
    response = requests.get(url)
    response.raise_for_status()
    data = np.load(io.BytesIO(response.content))
    return np.array(data)

def get_model(filename, url):
    response = requests.get(url, stream=True)
    with open(filename, 'wb') as fin:
        shutil.copyfileobj(response.raw, fin)

In [78]:
def train_network(word_list, labels, save_as='best weights.h5'):
  vocab = []
  for sentence in word_list:
    for word in sentence:
      if word not in vocab:
        vocab.append(word)

  vocab_size = len(set(vocab))
  input_vector = []
  i = 0
  for sentence in word_list:
    input_vector.append([one_hot(word, vocab_size, filters='') for word in sentence])

  max_size = 0
  for sentence in input_vector:
    if len(sentence) > max_size:
      max_size = len(sentence)

  input_vector = pad_sequences(input_vector, maxlen=max_size, padding='pre')

  # Split dataset into train and test data
  x_train, x_test, y_train, y_test = train_test_split(input_vector, 
                                                      labels, 
                                                      test_size=0.20, 
                                                      random_state=20)
  
  # Creating the Network
  model_NN = Sequential() 
  model_NN.add(Embedding(vocab_size, 8, input_length=len(input_vector[0])))
  model_NN.add(Flatten())
  model_NN.add(Dense(1, activation='relu'))

  # add checkpoint to save the network and stop if training doesn't improve MCTI
  checkpoint = keras.callbacks.ModelCheckpoint(save_as, monitor='val_accuracy', verbose=1, 
                              save_best_only=True, mode='max')
  callbacks_list = [checkpoint]

  model_NN.compile(optimizer='adam', loss='binary_crossentropy', 
                        metrics=['accuracy', f1_m, precision_m, recall_m])

  # Fiting  the model
  history = model_NN.fit(x_train, y_train, epochs=100, 
                        callbacks=callbacks_list, verbose=2, 
                        validation_data=(x_test, y_test), 
                        batch_size=128)

  # Evaluate the model
  [modelloss, modelaccuracy, 
  modelf1, modelprecision, 
  modelrecall] = model_NN.evaluate(x_test, y_test, verbose=0)

  #plot_history(history)
  return modelf1, (x_test, y_test)


# Dataset
Leitura dos dados MCTI

In [4]:
url= 'https://github.com/chap0lin/PPF-MCTI/blob/master/Datasets/oportunidadesrotulo%20-%20%C3%9Altima.xlsx?raw=true'
myfile = requests.get(url)
dataMCTI = pd.read_excel(myfile.content)

print("MCTI Dataset has " + str(dataMCTI.shape[0]) + " examples with " + str(dataMCTI.shape[1]) + " columns of information")

MCTI Dataset has 928 examples with 21 columns of information


In [None]:
dataMCTI.head()

Unnamed: 0.2,Ordem,Unnamed: 0,Unnamed: 0.1,atualizacao,codigo,link,opo_deadline,opo_texto,opo_texto_ele,opo_tipo,...,opo_dificuldade,rotulagem,opo_brazil,classificado,Tamanho do Texto,Tamanho doTítulo,Quantidade de Sentenças no Texto,Média Sentença,QTD Branco,opo_texto.1
0,0,0,0,220329,dfg_220329_1_000,https://www.dfg.de/en/research_funding/announc...,,"The Deutsche Forschungsgemeinschaft (DFG, Germ...","The Deutsche Forschungsgemeinschaft (DFG, Germ...",other,...,1.0,vitor,Y,S,3487,123,34,102,455,"The Deutsche Forschungsgemeinschaft (DFG, Germ..."
1,1,1,1,220329,dfg_220329_1_001,https://www.dfg.de/en/research_funding/announc...,second,"In March 2018, the Senate of the Deutsche Fors...","In March 2018, the Senate of the Deutsche Fors...",other,...,1.0,vitor,N,S,7280,92,67,108,945,"In March 2018, the Senate of the Deutsche Fors..."
2,2,2,2,220329,dfg_220329_1_002,https://www.dfg.de/en/research_funding/announc...,JSPS-DFG 2022,JSPS-DFG 2022,JSPS-DFG 2022,other,...,1.0,vitor,N,S,13,127,0,0,1,JSPS-DFG 2022
3,3,3,3,220329,dfg_220329_1_003,https://www.dfg.de/en/research_funding/announc...,########,"The Deutsche Forschungsgemeinschaft (DFG, Germ...","The Deutsche Forschungsgemeinschaft (DFG, Germ...",grant,...,1.0,vitor,N,S,5325,71,60,88,642,"The Deutsche Forschungsgemeinschaft (DFG, Germ..."
4,4,4,4,220329,dfg_220329_1_004,https://www.dfg.de/en/research_funding/announc...,,Within the current funding initiative on next ...,Within the current funding initiative on next ...,grant,...,1.0,vitor,N,S,7278,28,77,94,1002,Within the current funding initiative on next ...


In [61]:
labels = dataMCTI['opo_brazil']
labelsMCTI = np.where(labels == "Y", 1, 0)

# Base - Baseline

## Formatando opo_texto e opo_texto_ele

A partir dos dados raspados e catalogados no projeto, pode-se perceber uma divergencia em alguns dos dados contidos na coluna opo_texto e opo_texto_ele. Para realizar o trabalho de classificação precisaramos utilizar apenas uma base de texto, e para isso foram definidas as seguintes regras:



1.   Se: `opo_texto` IGUAL `opo_texto_ele` => Utilizamos `opo_texto`
2.   Se: `opo_texto` DIFERENTE `opo_texto_ele` E `opo_texto_ele` IGUAL "nan" => Utilizamos `opo_texto`
3.   Se: `opo_texto` DIFERENTE `opo_texto_ele` E `opo_texto_ele` DIFERENTE "nan" E n_tokens(`opo_texto`) < 4000 => Utilizamos `opo_texto` + `opo_texto_ele`
4.   Outros casos => Utilizamos `opo_texto`



**Nota:** dos 928 dados de treino:

*   795 obedecem a regra 1
*   18 obedecem a regra 2
*   114 obedecem a regra 3
*   1 obedece a regra 4



In [None]:
opo_texto_data = dataMCTI['opo_texto']
opo_texto_ele_data = dataMCTI['opo_texto_ele']

In [None]:
def n_tokens(text):
  return len(nltk.word_tokenize(text))

In [None]:
count_regra_1 = 0
count_regra_2 = 0
count_regra_3 = 0
count_regra_4 = 0
opo_texto_final = []

for i in range(len(opo_texto_data)):
  if opo_texto_data[i] == opo_texto_ele_data[i]:
    count_regra_1+=1
    opo_texto_final.append(opo_texto_data[i])
  elif pd.isna(opo_texto_ele_data[i]):
    count_regra_2+=1
    opo_texto_final.append(opo_texto_data[i])
  elif n_tokens(opo_texto_data[i]) < 4000:
    count_regra_3+=1
    opo_texto_final.append(opo_texto_data[i]+". "+opo_texto_ele_data[i])
  else:
    count_regra_4+=1
    opo_texto_final.append(opo_texto_data[i])

print(count_regra_1)
print(count_regra_2)
print(count_regra_3)
print(count_regra_4)

## Removendo caracteres especiais

In [None]:
def remove_accented_chars(text):
    text = unicodedata.normalize('NFC', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

In [None]:
opo_texto_sem_caracteres_especiais = []
for opo in opo_texto_final:
  opo_texto_sem_caracteres_especiais.append(remove_accented_chars(opo))

print(opo_texto_final[1])
print(opo_texto_sem_caracteres_especiais[1])

## Traduzindo para Inglês

In [None]:
google_max_input_limit = 5000
def translate_with_google(text):
  if len(text) < google_max_input_limit:
    return ts.google(text, to_language="en")
  else:
    sentences = nltk.sent_tokenize(text)
    output = ""
    for sentence in sentences:
      output = output + ts.google(sentence, to_language="en")
    return output

In [None]:
opo_texto_traduzido = []
for i, opo in enumerate(opo_texto_sem_caracteres_especiais):
  if dataMCTI["opo_titulo"][i][:4] == "Bols":
    opo_texto_traduzido.append(translate_with_google(opo))
  else:
    opo_texto_traduzido.append(opo)

print(opo_texto_sem_caracteres_especiais[450])
print(opo_texto_traduzido[450])

## Tokenizando

In [None]:
vocab = []
for sentence in sentencesMCTIList:
  for word in sentence:
    if word not in vocab:
      vocab.append(word)

vocab_size = len(set(vocab))
print(vocab_size)

23784


In [None]:
sentencesMCTIList_base = []
for sentence in opo_texto_traduzido:
  sentencesMCTIList_base.append(nltk.word_tokenize(sentence))
print(sentencesMCTIList_base[450])

['OPPORTUNITY', 'FOR', 'A', '(', '01', ')', 'TECHNICAL', 'TRS', 'Training', 'Vacancy', 'TRS', '(', 'TT-III', ')', 'with', 'FAPESP', 'Scholarship', 'in', 'the', 'Young', 'Researcher', 'IMMODULATION', 'PROJECT', 'OF', 'Iron', 'Homeostasis', 'and', 'regularity', 'of', 'the', 'Signification', 'of', 'Tyrosine', 'Quinase', 'TAM', 'During', 'infection', 'by', 'Mycobacterium', 'tuberculosis', ':', 'targets', 'for', 'development', 'of', 'host-targeted', 'immunopharmacological', 'therapies', ',', 'developed', 'at', 'the', 'Department', 'of', 'Bioquica', 'and', 'Immunology', ',', 'Ribeiro', 'Preto', 'Medical', 'School', 'of', 'the', 'University', 'of', 'So', 'Paulo', '(', 'FMRP-USP', ')', ',', 'under', 'the', 'coordination', 'of', 'Dr.', 'Diego', 'Lus', 'Costa', '.', 'The', 'selected', 'candidate', 'assist', 'in', 'the', 'maintenance', 'and', 'genotyping', 'of', 'transgenic', 'mice', 'colnies', ',', 'reagent', 'and', 'consumable', 'monitoring', 'and', 'monitoring', 'and', 'technician', 'support',

## Treinando a rede

In [None]:
i = 0
best_i = 0
best_f1_base = 0
test_data = []
while i < 10:
  print("STARTING TRAINING #" + str(i))
  current_f1_base, test_data = train_network(sentencesMCTIList_base, labelsMCTI, 
                              "best weights base-" + str(i) + ".h5")
  if current_f1_base > best_f1_base:
    best_f1_base = current_f1_base
    best_i = i
  i+=1
print("Best base data preprocessing F1 Score: " + str(best_f1_base))
print(best_i)

# Experimentos de pré-processamento
Condução de experimentos com o objetivo de encontrar a combinação de técnicas de pré-processamento que entreguem os melhores resultados.

## Pontuação e Capitalização

### 1. Expandir Contrações

In [None]:
sentencesExpanded = []
for sentence in opo_texto_traduzido:
  sentencesExpanded.append(contractions.fix(sentence))
print(sentencesExpanded[450])

OPPORTUNITY FOR A (01) TECHNICAL TRS Training Vacancy TRS (TT-III) with FAPESP Scholarship in the Young Researcher IMMODULATION PROJECT OF IRON HOMEOSTY AND REGISTRATION OF THE SIGNAL TYPERSINE SIGNAL QUINASINE TAM DURING MYCOBACTERIUM TUBERCULOSIS: TREATS FOR DEVELOPMENT of host-targeted immunopharmacological therapies, developed at the Department of Bioquica and Immunology, Ribeiro Preto Medical School of the University of So Paulo (FMRP-USP), under the coordination of Dr. Diego Lus Costa. The selected candidate assist in the maintenance and genotyping of transgenic mice colnies, reagent and consumable monitoring and monitoring and technician support for laboratory. @USP.br.More information about requirements and benefits of FAPESP TT-III Scholarship is at fapesp.br/3098 and fapesp.br/3162.


In [None]:
sentencesMCTIList_xp1 = []
for sentence in sentencesExpanded:
  sentencesMCTIList_xp1.append(nltk.word_tokenize(sentence))
print(sentencesMCTIList_xp1[450])

['OPPORTUNITY', 'FOR', 'A', '(', '01', ')', 'TECHNICAL', 'TRS', 'Training', 'Vacancy', 'TRS', '(', 'TT-III', ')', 'with', 'FAPESP', 'Scholarship', 'in', 'the', 'Young', 'Researcher', 'IMMODULATION', 'PROJECT', 'OF', 'IRON', 'HOMEOSTY', 'AND', 'REGISTRATION', 'OF', 'THE', 'SIGNAL', 'TYPERSINE', 'SIGNAL', 'QUINASINE', 'TAM', 'DURING', 'MYCOBACTERIUM', 'TUBERCULOSIS', ':', 'TREATS', 'FOR', 'DEVELOPMENT', 'of', 'host-targeted', 'immunopharmacological', 'therapies', ',', 'developed', 'at', 'the', 'Department', 'of', 'Bioquica', 'and', 'Immunology', ',', 'Ribeiro', 'Preto', 'Medical', 'School', 'of', 'the', 'University', 'of', 'So', 'Paulo', '(', 'FMRP-USP', ')', ',', 'under', 'the', 'coordination', 'of', 'Dr.', 'Diego', 'Lus', 'Costa', '.', 'The', 'selected', 'candidate', 'assist', 'in', 'the', 'maintenance', 'and', 'genotyping', 'of', 'transgenic', 'mice', 'colnies', ',', 'reagent', 'and', 'consumable', 'monitoring', 'and', 'monitoring', 'and', 'technician', 'support', 'for', 'laboratory',

In [None]:
vocab = []
maxsize_x1 = 0
for sentence in sentencesMCTIList_xp1:
  if len(sentence) > maxsize_x1:
    maxsize_x1 = len(sentence)
  for word in sentence:
    if word not in vocab:
      vocab.append(word)

vocab_size_x1 = len(set(vocab))
print(str(vocab_size_x1))
print(maxsize_x1)

23763
5636


In [None]:
i = 0
best_i = 0
best_f1_xp1 = 0
test_data = []
while i < 10:
  print("STARTING TRAINING #" + str(i))
  current_f1_xp1, test_data = train_network(sentencesMCTIList_xp1, labelsMCTI, 
                              "best weights xp1-" + str(i) + ".h5")
  if current_f1_xp1 > best_f1_xp1:
    best_f1_xp1 = current_f1_xp1
    best_i = i
  i+=1
print("Best XP1 data preprocessing F1 Score: " + str(best_f1_xp1))
print(best_i)

### 2. Expandir Contrações + Transformar texto em minúsculo

In [None]:
sentencesLowered = []
for sentence in sentencesExpanded:
  sentencesLowered.append(sentence.lower())
print(sentencesLowered[450])

opportunity for a (01) technical trs training vacancy trs (tt-iii) with fapesp scholarship in the young researcher immodulation project of iron homeosty and registration of the signal typersine signal quinasine tam during mycobacterium tuberculosis: treats for development of host-targeted immunopharmacological therapies, developed at the department of bioquica and immunology, ribeiro preto medical school of the university of so paulo (fmrp-usp), under the coordination of dr. diego lus costa. the selected candidate assist in the maintenance and genotyping of transgenic mice colnies, reagent and consumable monitoring and monitoring and technician support for laboratory. @usp.br.more information about requirements and benefits of fapesp tt-iii scholarship is at fapesp.br/3098 and fapesp.br/3162.


In [None]:
sentencesMCTIList_xp2 = []
for sentence in sentencesLowered:
  sentencesMCTIList_xp2.append(nltk.word_tokenize(sentence))
print(sentencesMCTIList_xp2[450])

['opportunity', 'for', 'a', '(', '01', ')', 'technical', 'trs', 'training', 'vacancy', 'trs', '(', 'tt-iii', ')', 'with', 'fapesp', 'scholarship', 'in', 'the', 'young', 'researcher', 'immodulation', 'project', 'of', 'iron', 'homeosty', 'and', 'registration', 'of', 'the', 'signal', 'typersine', 'signal', 'quinasine', 'tam', 'during', 'mycobacterium', 'tuberculosis', ':', 'treats', 'for', 'development', 'of', 'host-targeted', 'immunopharmacological', 'therapies', ',', 'developed', 'at', 'the', 'department', 'of', 'bioquica', 'and', 'immunology', ',', 'ribeiro', 'preto', 'medical', 'school', 'of', 'the', 'university', 'of', 'so', 'paulo', '(', 'fmrp-usp', ')', ',', 'under', 'the', 'coordination', 'of', 'dr.', 'diego', 'lus', 'costa', '.', 'the', 'selected', 'candidate', 'assist', 'in', 'the', 'maintenance', 'and', 'genotyping', 'of', 'transgenic', 'mice', 'colnies', ',', 'reagent', 'and', 'consumable', 'monitoring', 'and', 'monitoring', 'and', 'technician', 'support', 'for', 'laboratory',

In [None]:
vocab = []
maxsize_x2 = 0
for sentence in sentencesMCTIList_xp2:
  if len(sentence) > maxsize_x2:
    maxsize_x2 = len(sentence)
  for word in sentence:
    if word not in vocab:
      vocab.append(word)

vocab_size_x2 = len(set(vocab))
print(str(vocab_size_x2))
print(maxsize_x2)

20320
5629


In [None]:
i = 0
best_i = 0
best_f1_xp2 = 0
test_data = []
while i < 10:
  print("STARTING TRAINING #" + str(i))
  current_f1_xp2, test_data = train_network(sentencesMCTIList_xp2, labelsMCTI, 
                              "best weights xp2-" + str(i) + ".h5")
  if current_f1_xp2 > best_f1_xp2:
    best_f1_xp2 = current_f1_xp2
    best_i = i
  i+=1
print("Best XP2 data preprocessing F1 Score: " + str(best_f1_xp2))
print(best_i)

### 3. Expandir Contrações + Remover Pontuação

In [None]:
sentencesWithoutPunctuation = []
for sentence in sentencesExpanded:
  sentencesWithoutPunctuation.append(remove_special_characters(sentence, remove_digits=True))
print(sentencesWithoutPunctuation[450])

OPPORTUNITY FOR A  TECHNICAL TRS Training Vacancy TRS TTIII with FAPESP Scholarship in the Young Researcher IMMODULATION PROJECT OF IRON HOMEOSTY AND REGISTRATION OF THE SIGNAL TYPERSINE SIGNAL QUINASINE TAM DURING MYCOBACTERIUM TUBERCULOSIS TREATS FOR DEVELOPMENT of hosttargeted immunopharmacological therapies developed at the Department of Bioquica and Immunology Ribeiro Preto Medical School of the University of So Paulo FMRPUSP under the coordination of Dr Diego Lus Costa The selected candidate assist in the maintenance and genotyping of transgenic mice colnies reagent and consumable monitoring and monitoring and technician support for laboratory USPbrMore information about requirements and benefits of FAPESP TTIII Scholarship is at fapespbr and fapespbr


In [None]:
sentencesMCTIList_xp3 = []
for sentence in sentencesWithoutPunctuation:
  sentencesMCTIList_xp3.append(nltk.word_tokenize(sentence))
print(sentencesMCTIList_xp3[450])

['OPPORTUNITY', 'FOR', 'A', 'TECHNICAL', 'TRS', 'Training', 'Vacancy', 'TRS', 'TTIII', 'with', 'FAPESP', 'Scholarship', 'in', 'the', 'Young', 'Researcher', 'IMMODULATION', 'PROJECT', 'OF', 'IRON', 'HOMEOSTY', 'AND', 'REGISTRATION', 'OF', 'THE', 'SIGNAL', 'TYPERSINE', 'SIGNAL', 'QUINASINE', 'TAM', 'DURING', 'MYCOBACTERIUM', 'TUBERCULOSIS', 'TREATS', 'FOR', 'DEVELOPMENT', 'of', 'hosttargeted', 'immunopharmacological', 'therapies', 'developed', 'at', 'the', 'Department', 'of', 'Bioquica', 'and', 'Immunology', 'Ribeiro', 'Preto', 'Medical', 'School', 'of', 'the', 'University', 'of', 'So', 'Paulo', 'FMRPUSP', 'under', 'the', 'coordination', 'of', 'Dr', 'Diego', 'Lus', 'Costa', 'The', 'selected', 'candidate', 'assist', 'in', 'the', 'maintenance', 'and', 'genotyping', 'of', 'transgenic', 'mice', 'colnies', 'reagent', 'and', 'consumable', 'monitoring', 'and', 'monitoring', 'and', 'technician', 'support', 'for', 'laboratory', 'USPbrMore', 'information', 'about', 'requirements', 'and', 'benefits

In [None]:
vocab = []
maxsize_x3 = 0
for sentence in sentencesMCTIList_xp3:
  if len(sentence) > maxsize_x3:
    maxsize_x3 = len(sentence)
  for word in sentence:
    if word not in vocab:
      vocab.append(word)

vocab_size_x3 = len(set(vocab))
print(str(vocab_size_x3))
print(maxsize_x3)

22116
4950


In [None]:
i = 0
best_i = 0
best_f1_xp3 = 0
test_data = []
while i < 10:
  print("STARTING TRAINING #" + str(i))
  current_f1_xp3, test_data = train_network(sentencesMCTIList_xp3, labelsMCTI, 
                              "best weights xp3-" + str(i) + ".h5")
  if current_f1_xp3 > best_f1_xp3:
    best_f1_xp3 = current_f1_xp3
    best_i = i
  i+=1
print("Best XP3 data preprocessing F1 Score: " + str(best_f1_xp3))
print(best_i)

### 4. Expandir Contrações + Remover Pontuação + Transformar texto em minúsculo

In [None]:
sentencesLoweredFinal = []
for sentence in sentencesWithoutPunctuation:
  sentencesLoweredFinal.append(sentence.lower())
print(sentencesLoweredFinal[450])

opportunity for a  technical trs training vacancy trs ttiii with fapesp scholarship in the young researcher immodulation project of iron homeosty and registration of the signal typersine signal quinasine tam during mycobacterium tuberculosis treats for development of hosttargeted immunopharmacological therapies developed at the department of bioquica and immunology ribeiro preto medical school of the university of so paulo fmrpusp under the coordination of dr diego lus costa the selected candidate assist in the maintenance and genotyping of transgenic mice colnies reagent and consumable monitoring and monitoring and technician support for laboratory uspbrmore information about requirements and benefits of fapesp ttiii scholarship is at fapespbr and fapespbr


In [None]:
sentencesMCTIList_xp4 = []
for sentence in sentencesLoweredFinal:
  sentencesMCTIList_xp4.append(nltk.word_tokenize(sentence))
print(sentencesMCTIList_xp4[450])

['opportunity', 'for', 'a', 'technical', 'trs', 'training', 'vacancy', 'trs', 'ttiii', 'with', 'fapesp', 'scholarship', 'in', 'the', 'young', 'researcher', 'immodulation', 'project', 'of', 'iron', 'homeosty', 'and', 'registration', 'of', 'the', 'signal', 'typersine', 'signal', 'quinasine', 'tam', 'during', 'mycobacterium', 'tuberculosis', 'treats', 'for', 'development', 'of', 'hosttargeted', 'immunopharmacological', 'therapies', 'developed', 'at', 'the', 'department', 'of', 'bioquica', 'and', 'immunology', 'ribeiro', 'preto', 'medical', 'school', 'of', 'the', 'university', 'of', 'so', 'paulo', 'fmrpusp', 'under', 'the', 'coordination', 'of', 'dr', 'diego', 'lus', 'costa', 'the', 'selected', 'candidate', 'assist', 'in', 'the', 'maintenance', 'and', 'genotyping', 'of', 'transgenic', 'mice', 'colnies', 'reagent', 'and', 'consumable', 'monitoring', 'and', 'monitoring', 'and', 'technician', 'support', 'for', 'laboratory', 'uspbrmore', 'information', 'about', 'requirements', 'and', 'benefits

In [None]:
vocab = []
maxsize_x4 = 0
for sentence in sentencesMCTIList_xp4:
  if len(sentence) > maxsize_x4:
    maxsize_x4 = len(sentence)
  for word in sentence:
    if word not in vocab:
      vocab.append(word)

vocab_size_x4 = len(set(vocab))
print(str(vocab_size_x4))
print(maxsize_x4)

18614
4950


In [None]:
i = 0
best_i = 0
best_f1_xp4 = 0
test_data = []
while i < 10:
  print("STARTING TRAINING #" + str(i))
  current_f1_xp4, test_data = train_network(sentencesMCTIList_xp4, labelsMCTI, 
                              "best weights xp4-" + str(i) + ".h5")
  if current_f1_xp4 > best_f1_xp4:
    best_f1_xp4 = current_f1_xp4
    best_i = i
  i+=1
print("Best XP4 data preprocessing F1 Score: " + str(best_f1_xp4))
print(best_i)

## Simplificação do Conteúdo

### 5. Xp4 + Stemização

In [None]:
ps = PorterStemmer()

In [None]:
sentencesStemmed = []
for sentence in sentencesMCTIList_xp4:
  sentenceList = []
  for word in sentence:
    sentenceList.append(ps.stem(word))
  sentencesStemmed.append(' '.join(w for w in sentenceList))
print(sentencesStemmed[450])

opportun for a technic tr train vacanc tr ttiii with fapesp scholarship in the young research immodul project of iron homeosti and registr of the signal typersin signal quinasin tam dure mycobacterium tuberculosi treat for develop of hosttarget immunopharmacolog therapi develop at the depart of bioquica and immunolog ribeiro preto medic school of the univers of so paulo fmrpusp under the coordin of dr diego lu costa the select candid assist in the mainten and genotyp of transgen mice colni reagent and consum monitor and monitor and technician support for laboratori uspbrmor inform about requir and benefit of fapesp ttiii scholarship is at fapespbr and fapespbr


In [None]:
sentencesMCTIList_xp5 = []
for sentence in sentencesStemmed:
  sentencesMCTIList_xp5.append(nltk.word_tokenize(sentence))
print(sentencesMCTIList_xp5[450])

['opportun', 'for', 'a', 'technic', 'tr', 'train', 'vacanc', 'tr', 'ttiii', 'with', 'fapesp', 'scholarship', 'in', 'the', 'young', 'research', 'immodul', 'project', 'of', 'iron', 'homeosti', 'and', 'registr', 'of', 'the', 'signal', 'typersin', 'signal', 'quinasin', 'tam', 'dure', 'mycobacterium', 'tuberculosi', 'treat', 'for', 'develop', 'of', 'hosttarget', 'immunopharmacolog', 'therapi', 'develop', 'at', 'the', 'depart', 'of', 'bioquica', 'and', 'immunolog', 'ribeiro', 'preto', 'medic', 'school', 'of', 'the', 'univers', 'of', 'so', 'paulo', 'fmrpusp', 'under', 'the', 'coordin', 'of', 'dr', 'diego', 'lu', 'costa', 'the', 'select', 'candid', 'assist', 'in', 'the', 'mainten', 'and', 'genotyp', 'of', 'transgen', 'mice', 'colni', 'reagent', 'and', 'consum', 'monitor', 'and', 'monitor', 'and', 'technician', 'support', 'for', 'laboratori', 'uspbrmor', 'inform', 'about', 'requir', 'and', 'benefit', 'of', 'fapesp', 'ttiii', 'scholarship', 'is', 'at', 'fapespbr', 'and', 'fapespbr']


In [None]:
vocab = []
maxsize_x5 = 0
for sentence in sentencesMCTIList_xp5:
  if len(sentence) > maxsize_x5:
    maxsize_x5 = len(sentence)
  for word in sentence:
    if word not in vocab:
      vocab.append(word)

vocab_size_x5 = len(set(vocab))
print(str(vocab_size_x5))
print(maxsize_x5)

14317
4950


In [None]:
i = 0
best_i = 0
best_f1_xp5 = 0
test_data = []
while i < 10:
  print("STARTING TRAINING #" + str(i))
  current_f1_xp5, test_data = train_network(sentencesMCTIList_xp5, labelsMCTI, 
                              "best weights xp5-" + str(i) + ".h5")
  if current_f1_xp5 > best_f1_xp5:
    best_f1_xp5 = current_f1_xp5
    best_i = i
  i+=1
print("Best XP5 data preprocessing F1 Score: " + str(best_f1_xp5))
print(best_i)

### 6. Xp4 + Lematização

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

def spacy_lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [None]:
sentencesLemmatized = []
for sentence in sentencesLoweredFinal:
  sentencesLemmatized.append(spacy_lemmatize_text(sentence))
print(sentencesLemmatized[450])

opportunity for a   technical trs training vacancy trs ttiii with fapesp scholarship in the young researcher immodulation project of iron homeosty and registration of the signal typersine signal quinasine tam during mycobacterium tuberculosis treat for development of hosttargete immunopharmacological therapy develop at the department of bioquica and immunology ribeiro preto medical school of the university of so paulo fmrpusp under the coordination of dr diego lus costa the select candidate assist in the maintenance and genotyping of transgenic mouse colnie reagent and consumable monitoring and monitoring and technician support for laboratory uspbrmore information about requirement and benefit of fapesp ttiii scholarship be at fapespbr and fapespbr


In [None]:
sentencesMCTIList_xp6 = []
for sentence in sentencesLemmatized:
  sentencesMCTIList_xp6.append(nltk.word_tokenize(sentence))
print(sentencesMCTIList_xp6[450])

['opportunity', 'for', 'a', 'technical', 'trs', 'training', 'vacancy', 'trs', 'ttiii', 'with', 'fapesp', 'scholarship', 'in', 'the', 'young', 'researcher', 'immodulation', 'project', 'of', 'iron', 'homeosty', 'and', 'registration', 'of', 'the', 'signal', 'typersine', 'signal', 'quinasine', 'tam', 'during', 'mycobacterium', 'tuberculosis', 'treat', 'for', 'development', 'of', 'hosttargete', 'immunopharmacological', 'therapy', 'develop', 'at', 'the', 'department', 'of', 'bioquica', 'and', 'immunology', 'ribeiro', 'preto', 'medical', 'school', 'of', 'the', 'university', 'of', 'so', 'paulo', 'fmrpusp', 'under', 'the', 'coordination', 'of', 'dr', 'diego', 'lus', 'costa', 'the', 'select', 'candidate', 'assist', 'in', 'the', 'maintenance', 'and', 'genotyping', 'of', 'transgenic', 'mouse', 'colnie', 'reagent', 'and', 'consumable', 'monitoring', 'and', 'monitoring', 'and', 'technician', 'support', 'for', 'laboratory', 'uspbrmore', 'information', 'about', 'requirement', 'and', 'benefit', 'of', '

In [None]:
vocab = []
maxsize_x6 = 0
for sentence in sentencesMCTIList_xp6:
  if len(sentence) > maxsize_x6:
    maxsize_x6 = len(sentence)
  for word in sentence:
    if word not in vocab:
      vocab.append(word)

vocab_size_x6 = len(set(vocab))
print(str(vocab_size_x6))
print(maxsize_x6)

16191
4950


In [None]:
i = 0
best_i = 0
best_f1_xp6 = 0
test_data = []
while i < 10:
  print("STARTING TRAINING #" + str(i))
  current_f1_xp6, test_data = train_network(sentencesMCTIList_xp6, labelsMCTI, 
                              "best weights xp6-" + str(i) + ".h5")
  if current_f1_xp6 > best_f1_xp6:
    best_f1_xp6 = current_f1_xp6
    best_i = i
  i+=1
print("Best XP6 data preprocessing F1 Score: " + str(best_f1_xp6))
print(best_i)

### 7. Xp4 + Stemização + Remoção de StopWords

In [None]:
stop_words = nltk.corpus.stopwords.words('english')

In [None]:
sentencesStemStopped = []
for sentence in sentencesStemmed:
  sentencesStemStopped.append(remove_stopwords(sentence, is_lower_case=False))
print(sentencesStemStopped[450])

opportun technic tr train vacanc tr ttiii fapesp scholarship young research immodul project iron homeosti registr signal typersin signal quinasin tam dure mycobacterium tuberculosi treat develop hosttarget immunopharmacolog therapi develop depart bioquica immunolog ribeiro preto medic school univers paulo fmrpusp coordin dr diego lu costa select candid assist mainten genotyp transgen mice colni reagent consum monitor monitor technician support laboratori uspbrmor inform requir benefit fapesp ttiii scholarship fapespbr fapespbr


In [None]:
sentencesMCTIList_xp7 = []
for sentence in sentencesStemStopped:
  sentencesMCTIList_xp7.append(nltk.word_tokenize(sentence))
print(sentencesMCTIList_xp7[450])

['opportun', 'technic', 'tr', 'train', 'vacanc', 'tr', 'ttiii', 'fapesp', 'scholarship', 'young', 'research', 'immodul', 'project', 'iron', 'homeosti', 'registr', 'signal', 'typersin', 'signal', 'quinasin', 'tam', 'dure', 'mycobacterium', 'tuberculosi', 'treat', 'develop', 'hosttarget', 'immunopharmacolog', 'therapi', 'develop', 'depart', 'bioquica', 'immunolog', 'ribeiro', 'preto', 'medic', 'school', 'univers', 'paulo', 'fmrpusp', 'coordin', 'dr', 'diego', 'lu', 'costa', 'select', 'candid', 'assist', 'mainten', 'genotyp', 'transgen', 'mice', 'colni', 'reagent', 'consum', 'monitor', 'monitor', 'technician', 'support', 'laboratori', 'uspbrmor', 'inform', 'requir', 'benefit', 'fapesp', 'ttiii', 'scholarship', 'fapespbr', 'fapespbr']


In [None]:
vocab = []
maxsize_x7 = 0
for sentence in sentencesMCTIList_xp7:
  if len(sentence) > maxsize_x7:
    maxsize_x7 = len(sentence)
  for word in sentence:
    if word not in vocab:
      vocab.append(word)

vocab_size_x7 = len(set(vocab))
print(str(vocab_size_x7))
print(maxsize_x7)

14210
2817


In [None]:
i = 0
best_i = 0
best_f1_xp7 = 0
test_data = []
while i < 10:
  print("STARTING TRAINING #" + str(i))
  current_f1_xp7, test_data = train_network(sentencesMCTIList_xp7, labelsMCTI, 
                              "best weights xp7-" + str(i) + ".h5")
  if current_f1_xp7 > best_f1_xp7:
    best_f1_xp7 = current_f1_xp7
    best_i = i
  i+=1
print("Best XP7 data preprocessing F1 Score: " + str(best_f1_xp7))
print(best_i)

### 8. Xp4 + Lematização + Remoção de StopWords

In [None]:
sentencesLemStopped = []
for sentence in sentencesLemmatized:
  sentencesLemStopped.append(remove_stopwords(sentence, is_lower_case=False))
print(sentencesLemStopped[450])

opportunity technical trs training vacancy trs ttiii fapesp scholarship young researcher immodulation project iron homeosty registration signal typersine signal quinasine tam mycobacterium tuberculosis treat development hosttargete immunopharmacological therapy develop department bioquica immunology ribeiro preto medical school university paulo fmrpusp coordination dr diego lus costa select candidate assist maintenance genotyping transgenic mouse colnie reagent consumable monitoring monitoring technician support laboratory uspbrmore information requirement benefit fapesp ttiii scholarship fapespbr fapespbr


In [None]:
sentencesMCTIList_xp8 = []
for sentence in sentencesLemStopped:
  sentencesMCTIList_xp8.append(nltk.word_tokenize(sentence))
print(sentencesMCTIList_xp8[450])

['opportunity', 'technical', 'trs', 'training', 'vacancy', 'trs', 'ttiii', 'fapesp', 'scholarship', 'young', 'researcher', 'immodulation', 'project', 'iron', 'homeosty', 'registration', 'signal', 'typersine', 'signal', 'quinasine', 'tam', 'mycobacterium', 'tuberculosis', 'treat', 'development', 'hosttargete', 'immunopharmacological', 'therapy', 'develop', 'department', 'bioquica', 'immunology', 'ribeiro', 'preto', 'medical', 'school', 'university', 'paulo', 'fmrpusp', 'coordination', 'dr', 'diego', 'lus', 'costa', 'select', 'candidate', 'assist', 'maintenance', 'genotyping', 'transgenic', 'mouse', 'colnie', 'reagent', 'consumable', 'monitoring', 'monitoring', 'technician', 'support', 'laboratory', 'uspbrmore', 'information', 'requirement', 'benefit', 'fapesp', 'ttiii', 'scholarship', 'fapespbr', 'fapespbr']


In [None]:
vocab = []
maxsize_x8 = 0
for index, sentence in enumerate(sentencesMCTIList_xp8):
  if len(sentence) > maxsize_x8:
    maxsize_x8 = len(sentence)
  for word in sentence:
    if word not in vocab:
      vocab.append(word)

vocab_size_x8 = len(set(vocab))
print(str(vocab_size_x8))
print(maxsize_x8)

16078
2726


In [None]:
i = 0
best_i = 0
best_f1_xp8 = 0
test_data = []
while i < 10:
  print("STARTING TRAINING #" + str(i))
  current_f1_xp8, test_data = train_network(sentencesMCTIList_xp8, labelsMCTI, 
                              "best weights xp8-" + str(i) + ".h5")
  if current_f1_xp8 > best_f1_xp8:
    best_f1_xp8 = current_f1_xp8
    best_i = i
  i+=1
print("Best XP8 data preprocessing F1 Score: " + str(best_f1_xp8))
print(best_i)

# Resultados e Discussões
Apanhado geral dos resultados obtidos com os experimentos e conclusão das melhores técnicas a serem utilizadas para o nosso problema específico.

## Resumo

In [1]:
from tabulate import tabulate

In [3]:
resumo_dados = [
    ["Base", "Textos originais", 89.78, 84.20, 79.09, 90.95, 417.77, 23788, 5636],
    ["xp1", "Expandindo contrações", 88.71, 81.59, 71.54, 97.33, 414.72, 23768, 5636],
    ["xp2", "xp1 + minúsculo", 90.32, 85.64, 77.19, 97.44, 368.38, 20322, 5629],
    ["xp3", "xp1 - pontuação", 91.94, 87.73, 79.66, 98.72, 386.65, 22121, 4950],
    ["xp4", "xp2 - pontuação", 90.86, 86.61, 80.85, 94.25, 326.83, 18616, 4950],
    ["xp5", "xp4 + stemização", 91.94, 87.68, 78.47, 100.00, 257.96, 14319, 4950], 
    ["xp6", "xp4 + lematização", 89.78, 85.06, 79.66, 91.87, 282.645, 16194, 4950], 
    ["xp7", "xp5 - stopwords", 92.47, 88.46, 79.66, 100.00, 210.32, 14212, 2817], 
    ["xp8", "xp6 - stopwords", 92.47, 88.46, 79.66, 100.00, 225.58, 16081, 2726], 
]
headers = ["Experimento", "Descrição", "Acurácia (%)", "F1-score", "Recall", "Precisão", "Tempo de treino(s)","N tokens únicos", "Tamanho máximo de sentença"]
print(tabulate(resumo_dados, headers))

Experimento    Descrição                Acurácia (%)    F1-score    Recall    Precisão    Tempo de treino(s)    N tokens únicos    Tamanho máximo de sentença
-------------  ---------------------  --------------  ----------  --------  ----------  --------------------  -----------------  ----------------------------
Base           Textos originais                89.78       84.2      79.09       90.95               417.77               23788                          5636
xp1            Expandindo contrações           88.71       81.59     71.54       97.33               414.72               23768                          5636
xp2            xp1 + minúsculo                 90.32       85.64     77.19       97.44               368.38               20322                          5629
xp3            xp1 - pontuação                 91.94       87.73     79.66       98.72               386.65               22121                          4950
xp4            xp2 - pontuação                 90.86

## XP7 vs XP8

Os dois experimentos apresentaram ótimos resultados de acurácia, f1-score, recall e precisão. Obtiveram os menores tempos de treinamento e os menores tamanhos de sentença. Qualquer uma das técnicas pode ser escolhida para a sequencia do trabalho.

Dentre as duas excelentes opções precisamos julgar qual deve ser escolhida.
XP7: Possui menor tempo de treinamento, menor número de tokens únicos
XP8: Possui menor tamanhos máximos

O critério utilizado para a escolha foi o custo computacional necessário para treinar os modelos de representação vetorial (word-embedding, sentence-embeddings, document-embedding). O tempo de treinamento é tão próximo que não possuiu um peso tão grande para a análise.
- Keras Embedding:
  - Para o keras embedding o número de tokens únicos maior apenas significa um one-hot-encoding de maior vocabulario, mas isso não aumenta o tamanho da rede.
  - Já o tamanho da maior string modifica o tamanho do input da rede necessária pra treinar e isso é transcrito também na quantidade de pesos da camada seguinte.
- Word2Vec:
  - No Word2Vec o número de tokens únicos maior significa um maior tempo para o pré-treino da rede. Como esse pré-treinamento só será feito uma vez, a interferência não é tão grande. Além disso o arquivo de pesos pré-treinados do Word2Vec será maior, e ele precisará ser carregado em memória para traduzir o input para a representação vetorial. Porém, ele também pode ser descarregado da memória logo após a tradução, e não deve influenciar na memória gasta em treino.
  - Já o tamanho maior da string quase não interfere no treinamento do Word2Vec, porém no input da rede final, e na quantidade de dados carregados em memória após a representação vetorial.
- Longformer:
  - O longformer já utilizado já foi pré-treinado com muito mais tokens do que os do trabalho, então o número de tokens únicos é indiferente.
  - O tamanho maior da string também interfere muito pouco, considerando que os dois valores (2817 e 2726) são abaixo dos 4096 do tamanho da rede. Porém devemos considerar que quando o notebook for aplicado para o uso do ministério possuir um modelo capaz de reduzir mais o tamanho do input pode significar menos informação sendo truncada ou desconsiderada por ser maior que o limite máximo da rede.

Diante da análise descrita, o melhor modelo de pré-processamento para esta etapa do projeto é a do **experimento 8**, que possui o menor tamanho de input para o treinamento e tende a reduzir o tamanho da entrada para o uso futuro no ministério, possivelmente impedindo de que informação seja truncada por ser maior que o limite máximo da rede.
Vale notar que o tamanho da rede, para o caso do Keras embedding ou Word2Vec, pode ser definido pelo grupo de classificação para o valor julgado mais adequado, não necessitando se ater aos valores de encontrados no pré-processamento. Essa definição do tamanho da rede precisará ser feita depois do teste dos  modelo, que utilizarão os tamanhos mínimos encontrados aqui no pré-processamento, no caso desses modelos obterem os melhores resultados. 

## Melhor modelo

In [105]:
sentencesMCTIList_xp8 = read_list("https://github.com/chap0lin/PPF-MCTI/blob/master/Pre-processamento/xp8_list.json?raw=true")
sentencesMCTIList_xp8_sentences = read_list("https://github.com/chap0lin/PPF-MCTI/blob/master/Pre-processamento/xp8_sent.json?raw=true")
labels = read_labels("https://github.com/chap0lin/PPF-MCTI/blob/master/Pre-processamento/labels.npy?raw=true")

In [74]:
vocab = []
for sentence in sentencesMCTIList_xp8:
  for word in sentence:
    if word not in vocab:
      vocab.append(word)

vocab_size = len(set(vocab))
input_vector = []
i = 0
for sentence in sentencesMCTIList_xp8:
  input_vector.append([one_hot(word, vocab_size, filters='') for word in sentence])

max_size = 0
for sentence in input_vector:
  if len(sentence) > max_size:
    max_size = len(sentence)

input_vector = pad_sequences(input_vector, maxlen=max_size, padding='pre')

In [75]:
x_train, x_test, y_train, y_test = train_test_split(input_vector, 
                                                      labels, 
                                                      test_size=0.20, 
                                                      random_state=20)

In [76]:
get_model("best weights xp8.h5", "https://github.com/chap0lin/PPF-MCTI/blob/master/Pre-processamento/Pesos/best%20weights%20xp8.h5?raw=true")

In [None]:
path = "best weights xp8.h5"

reconstructed_model_NN = keras.models.load_model(path, 
                                                 custom_objects={'f1_m':f1_m, 
                                                                 "precision_m":precision_m, 
                                                                 "recall_m":recall_m})
# evaluate the model
loss, accuracy, f1_score, precision, recall = reconstructed_model_NN.evaluate(x_test, 
                                                                              y_test, 
                                                                              verbose=0)

print("XP8:")
print('Accuracy NN: %f' % (accuracy*100))
print('f1_score NN: %f' % (f1_score*100))
print('precision NN: %f' % (precision*100))
print('recall NN: %f' % (recall*100))

XP8:
Accuracy NN: 92.473119
f1_score NN: 88.460702
precision NN: 100.000000
recall NN: 79.660153


# Conclusão

Agora que possuímos o melhor modelo de pré-procesamento, o último passo é gerar a planilha com os campos `opo_pre` e `opo_pre_tkn` contendo o texto pré-processado em formato de sentença e tokens respectivamente.



In [107]:
dataMCTI['opo_pre_tkn'] = sentencesMCTIList_xp8
dataMCTI['opo_pre'] = sentencesMCTIList_xp8_sentences
dataMCTI.head()

Unnamed: 0.2,Ordem,Unnamed: 0,Unnamed: 0.1,atualizacao,codigo,link,opo_deadline,opo_texto,opo_texto_ele,opo_tipo,...,opo_brazil,classificado,Tamanho do Texto,Tamanho doTítulo,Quantidade de Sentenças no Texto,Média Sentença,QTD Branco,opo_texto.1,opo_pre_tkn,opo_pre
0,0,0,0,220329,dfg_220329_1_000,https://www.dfg.de/en/research_funding/announc...,,"The Deutsche Forschungsgemeinschaft (DFG, Germ...","The Deutsche Forschungsgemeinschaft (DFG, Germ...",other,...,Y,S,3487,123,34,102,455,"The Deutsche Forschungsgemeinschaft (DFG, Germ...","[deutsche, forschungsgemeinschaft, dfg, german...",deutsche forschungsgemeinschaft dfg german res...
1,1,1,1,220329,dfg_220329_1_001,https://www.dfg.de/en/research_funding/announc...,second,"In March 2018, the Senate of the Deutsche Fors...","In March 2018, the Senate of the Deutsche Fors...",other,...,N,S,7280,92,67,108,945,"In March 2018, the Senate of the Deutsche Fors...","[march, senate, deutsche, forschungsgemeinscha...",march senate deutsche forschungsgemeinschaft d...
2,2,2,2,220329,dfg_220329_1_002,https://www.dfg.de/en/research_funding/announc...,JSPS-DFG 2022,JSPS-DFG 2022,JSPS-DFG 2022,other,...,N,S,13,127,0,0,1,JSPS-DFG 2022,[jspsdfg],jspsdfg
3,3,3,3,220329,dfg_220329_1_003,https://www.dfg.de/en/research_funding/announc...,########,"The Deutsche Forschungsgemeinschaft (DFG, Germ...","The Deutsche Forschungsgemeinschaft (DFG, Germ...",grant,...,N,S,5325,71,60,88,642,"The Deutsche Forschungsgemeinschaft (DFG, Germ...","[deutsche, forschungsgemeinschaft, dfg, german...",deutsche forschungsgemeinschaft dfg german res...
4,4,4,4,220329,dfg_220329_1_004,https://www.dfg.de/en/research_funding/announc...,,Within the current funding initiative on next ...,Within the current funding initiative on next ...,grant,...,N,S,7278,28,77,94,1002,Within the current funding initiative on next ...,"[within, current, funding, initiative, next, g...",within current funding initiative next generat...


In [108]:
dataMCTI.to_excel("oportunidades_final_pre_processado.xlsx", index=False)