In [1]:
import pandas as pd

import numpy as np

import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import LabelEncoder

from transformers import AutoModel, AutoTokenizer

from optimum.bettertransformer import BetterTransformer

import torch

import progressbar

import csv

import sys
np.set_printoptions(threshold=sys.maxsize, linewidth=sys.maxsize)

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# Abrindo o objeto dataframe
with open(r'df-tokenizado.pickle', 'rb') as pickledfile:
    df = pickle.load(pickledfile)

In [3]:
y = LabelEncoder().fit_transform(X = df['Assunto'].values.reshape(-1,1))

  y = column_or_1d(y, warn=True)


<h2>Modelo Random Forest - Bag of Words</h2>

In [6]:
# Abrindo o bag-of-words
with open(r'bow.pickle', 'rb') as pickledfile:
    x = pickle.load(pickledfile)

In [15]:
model = RandomForestClassifier()
cv = StratifiedKFold(n_splits = 10)
result = cross_validate(model, x, y, cv = cv, return_estimator=True, return_train_score=True)

In [16]:
result

{'fit_time': array([809.31716967, 755.54001474, 775.33718395, 775.72851467,
        808.54982686, 792.01552534, 787.40804529, 774.25614858,
        744.84567404, 785.46699023]),
 'score_time': array([11.29275179, 11.10722494, 11.09892225, 10.61786509, 10.70986724,
        10.9108572 , 10.29583907, 10.68187094, 10.3858459 , 10.412848  ]),
 'estimator': [RandomForestClassifier(),
  RandomForestClassifier(),
  RandomForestClassifier(),
  RandomForestClassifier(),
  RandomForestClassifier(),
  RandomForestClassifier(),
  RandomForestClassifier(),
  RandomForestClassifier(),
  RandomForestClassifier(),
  RandomForestClassifier()],
 'test_score': array([0.80153378, 0.83908502, 0.84966283, 0.8769007 , 0.94182203,
        0.93203755, 0.93428534, 0.86275288, 0.75247917, 0.83670501]),
 'train_score': array([0.99753184, 0.99665036, 0.99669443, 0.99667974, 0.99615085,
        0.99616554, 0.99609209, 0.99663567, 0.99739962, 0.99651814])}

In [23]:
# Salvando o resultado do random-forest com bow
with open(r'random-forest-bow.pickle', 'wb') as pickledfile:
    pickle.dump(result, pickledfile, protocol=pickle.HIGHEST_PROTOCOL)

<h2>Modelo Random Forest - TF-IDF</h2>

In [4]:
# Abrindo o tf-idf
with open(r'tfidf.pickle', 'rb') as pickledfile:
    x2 = pickle.load(pickledfile)

In [5]:
model2 = RandomForestClassifier()
cv2 = StratifiedKFold(n_splits = 10)
result2 = cross_validate(model2, x2, y, cv = cv2, return_estimator=True, return_train_score=True)

In [6]:
result2

{'fit_time': array([967.43807101, 785.01799035, 789.53803682, 830.43593884,
        834.32326269, 812.32375121, 862.92664886, 866.80971766,
        846.82298827, 810.53587508]),
 'score_time': array([2.96615481, 2.95326281, 3.20907474, 2.88188934, 3.19067645,
        2.66003966, 2.67481351, 2.02461958, 3.60629368, 2.57268906]),
 'estimator': [RandomForestClassifier(),
  RandomForestClassifier(),
  RandomForestClassifier(),
  RandomForestClassifier(),
  RandomForestClassifier(),
  RandomForestClassifier(),
  RandomForestClassifier(),
  RandomForestClassifier(),
  RandomForestClassifier(),
  RandomForestClassifier()],
 'test_score': array([0.79293931, 0.82837498, 0.84728282, 0.87266958, 0.94023536,
        0.92674864, 0.93296311, 0.86209176, 0.7508925 , 0.83577945]),
 'train_score': array([0.99753184, 0.99663567, 0.99669443, 0.99667974, 0.99613616,
        0.99616554, 0.99609209, 0.99663567, 0.99739962, 0.99651814])}

In [8]:
# Salvando o resultado do random-forest com tfidf
with open(r'random-forest-tfidf.pickle', 'wb') as pickledfile:
    pickle.dump(result2, pickledfile, protocol=pickle.HIGHEST_PROTOCOL)

<h2>Modelo Random Forest - BERT</h2>

In [15]:
bertmodel = BetterTransformer.transform(AutoModel.from_pretrained('neuralmind/bert-large-portuguese-cased'))

The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


In [16]:
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-large-portuguese-cased')

In [22]:
# Obtendo o último hidden state para o token [CLS]

widgets = [' [',
         progressbar.Timer(),
         '] ',
           progressbar.Bar('*'),' (',
           progressbar.ETA(), ') ',
          ]
bar = progressbar.ProgressBar(maxval = len(df), widgets=widgets).start()

# salvando para CSV, assim caso o computador desligue, o progresso não será perdido
with open(r'embeddings.csv', 'w') as csvfile:
  csvwriter = csv.writer(csvfile, delimiter=';', lineterminator = '\n')
  csvwriter.writerow(['Index', 'Embedding'])

  with torch.no_grad():
    for i in range(len(df)):
      embeddings = bertmodel(tokenizer.encode(df['Texto tratado'][i], return_tensors='pt', max_length=512, truncation=True))['last_hidden_state'][0][0].cpu().detach().numpy()
      # salvando para CSV, assim caso o computador desligue, o progresso não será perdido
      csvwriter.writerow([i, embeddings])
      bar.update(i+1)

 [Elapsed Time: 3 days, 1:14:09] |*********************     | (ETA:  14:11:06) 

In [3]:
# [Segunda execução] Levantando o último ID e iniciando novamente de onde parou a última execução
with open(r'embeddings.csv') as f:
    last_id = sum(1 for line in f) - 2

In [18]:
# [Segunda execução] Obtendo o último hidden state para o token [CLS]

widgets = [' [',
         progressbar.Timer(),
         '] ',
           progressbar.Bar('*'),' (',
           progressbar.ETA(), ') ',
          ]
bar = progressbar.ProgressBar(maxval = len(df) - (last_id+1) , widgets=widgets).start()

# salvando para CSV, assim caso o computador desligue, o progresso não será perdido
with open(r'embeddings2.csv', 'w') as csvfile:
  csvwriter = csv.writer(csvfile, delimiter=';', lineterminator = '\n')
  csvwriter.writerow(['Index', 'Embedding'])

  with torch.no_grad():
    for i in range(last_id+1, len(df)):
      embeddings = bertmodel(tokenizer.encode(df['Texto tratado'][i], return_tensors='pt', max_length=512, truncation=True))['last_hidden_state'][0][0].cpu().detach().numpy()
      # salvando para CSV, assim caso o computador desligue, o progresso não será perdido
      csvwriter.writerow([i, embeddings])
      bar.update(i - last_id)

 [Elapsed Time: 12:03:43] |**********************************| (ETA:  0:00:00) 

In [46]:
embeddings = pd.concat(
                        [
                            pd.read_csv(r'embeddings.csv', sep=';'), 
                            pd.read_csv(r'embeddings2.csv', sep=';')
                        ]
                      )

In [47]:
embeddings.index = embeddings['Index'].values
embeddings.drop('Index', axis=1, inplace=True)

In [48]:
for i in range(len(embeddings)):
    row = embeddings['Embedding'][i].replace('[', '').replace(']', '').split(' ')
    while '' in row:
        row.remove('')
    row = np.array([float(elem) for elem in row])
    embeddings.loc[i, 'Embedding'] = row

In [62]:
model3 = RandomForestClassifier()
cv3 = StratifiedKFold(n_splits = 10)
result3 = cross_validate(model3, embeddings['Embedding'].to_list(), y, cv = cv3, return_estimator=True, return_train_score=True)

In [65]:
result3['test_score']

0.8196482877165148