In [1]:
import pandas as pd
import os
from pathlib import Path
import numpy as np

In [2]:
import torch
from transformers import  AutoTokenizer, AutoModel
from torch import nn

In [26]:
from src.sentenceregbr import SentenceRegBr
from src.preprocessing import PreProcessing

In [4]:
#Bert 
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

In [132]:
BASE_DIR = Path(os.path.abspath('')).resolve()
objSentecesRegBr = SentenceRegBr()
### Ds Labels
labels = {"outros":0
          ,"terceiro_setor":1
          ,"nao_classificado1":2
          ,"nao_classificado2":3
          ,"nao_classificado3":4
          }

  Abbreviation: [1.7426] art
  Abbreviation: [1.9803] art


[nltk_data] Downloading package stopwords to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Loading Dataset

In [133]:
dfRegBr = pd.read_parquet(Path(os.path.join(BASE_DIR,'dataset','regbr','dados_regbr_sentencas.parquet')),engine="fastparquet")

In [134]:
dfRegBr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 883635 entries, 0 to 883634
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   id_sentenca          883635 non-null  object
 1   ordem                883635 non-null  int64 
 2   id_doc_base          883635 non-null  int64 
 3   tokens_total         883635 non-null  int64 
 4   tokens_no_stopwords  883635 non-null  int64 
 5   cat_sentenca         883635 non-null  int64 
 6   text_sentenca        883635 non-null  object
dtypes: int64(5), object(2)
memory usage: 47.2+ MB


In [135]:
dfDocumentRegBr = dfRegBr["id_doc_base"].drop_duplicates().sample(n=250, replace=False).to_list()

In [136]:
dfSentencesDocs  = dfRegBr[dfRegBr["id_doc_base"].isin(dfDocumentRegBr)]
dfSentencesDocs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3782 entries, 552 to 880888
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id_sentenca          3782 non-null   object
 1   ordem                3782 non-null   int64 
 2   id_doc_base          3782 non-null   int64 
 3   tokens_total         3782 non-null   int64 
 4   tokens_no_stopwords  3782 non-null   int64 
 5   cat_sentenca         3782 non-null   int64 
 6   text_sentenca        3782 non-null   object
dtypes: int64(5), object(2)
memory usage: 236.4+ KB


In [137]:
dfOrder = dfSentencesDocs.sort_values(['id_doc_base','ordem'],ascending=True)

##### Group Small sentences into the same document

In [138]:
minTokens = 30 
idDocument = -1
totalTokes = 0
dfTmp = pd.DataFrame()
dfOld = pd.DataFrame()

txtConcat = ""
totalTokens = 0
idSentencaConcat = ""

for index,row in dfOrder.iterrows(): 
    
    dfLocal = dfOrder.loc[[index]].copy()
    totalTokens = totalTokens + row["tokens_no_stopwords"]
    
    if idDocument == -1:
        idDocument = row["id_doc_base"]
    
    if idDocument != row["id_doc_base"]:
        dfTmp = pd.concat([dfTmp,dfLocal])
        idDocument = row["id_doc_base"]
        
        if(len(dfOld))>0:
            dfTmp = pd.concat([dfTmp,dfOld])
            dfOld = pd.DataFrame()
            
        txtConcat =""
        idSentencaConcat = ""
        totalTokens = 0

    elif totalTokens > minTokens:
        if str(txtConcat).strip() == "":
            dfTmp = pd.concat([dfTmp,dfLocal])
        else:
            txtConcat = f'{txtConcat} {row["text_sentenca"]}'
            idSentencaConcat = f'{idSentencaConcat}; {row["id_sentenca"]}'
            
            dfLocal["text_sentenca"] = str(txtConcat).strip()
            dfLocal["id_sentenca"] = str(idSentencaConcat).strip()
            dfTmp = pd.concat([dfTmp,dfLocal])
                
        txtConcat =""
        idSentencaConcat = ""
        totalTokens = 0
        dfOld = pd.DataFrame()
    else:
        txtConcat = f'{txtConcat} {row["text_sentenca"]}'
        if idSentencaConcat == "":
            idSentencaConcat = f'{row["id_sentenca"]}'
        else:
            idSentencaConcat = f'{idSentencaConcat}; {row["id_sentenca"]}'
            
        dfOld = dfLocal.copy()
        dfOld["text_sentenca"] = str(txtConcat).strip()
        dfOld["id_sentenca"] = str(idSentencaConcat).strip()   



In [139]:
dfTest = dfTmp.copy()
dfTest.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2387 entries, 319420 to 118445
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id_sentenca          2387 non-null   object
 1   ordem                2387 non-null   int64 
 2   id_doc_base          2387 non-null   int64 
 3   tokens_total         2387 non-null   int64 
 4   tokens_no_stopwords  2387 non-null   int64 
 5   cat_sentenca         2387 non-null   int64 
 6   text_sentenca        2387 non-null   object
dtypes: int64(5), object(2)
memory usage: 149.2+ KB


In [145]:
import torch.nn.functional as F

## Pedict
def predict(txt):
    
    txt = str(txt)
    if(len(txt)) < 50:
        return 2
    
    THRESHOLD = 0.5

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    

    encoding = tokenizer.encode_plus(
      txt,
      truncation=True,
      add_special_tokens=True,
      max_length=512,
      return_token_type_ids=False,
      padding="max_length",
      return_attention_mask=True,
      return_tensors='pt',
    )

    
    mask = encoding['attention_mask'].to(device)
    input_id = encoding['input_ids'].squeeze(1).to(device)
   
    output = model(input_id,mask)
    output.argmax(dim=1)
    
    prob = torch.nn.functional.softmax(output,dim=1)
    preds = output.detach().cpu().numpy()
    pred_val = np.argmax(preds)
    preval_prob = prob[0].tolist()[pred_val]*100
    pred_val_label = list(labels.keys())[list(labels.values()).index(pred_val)] 
    
    #result = [[idx,item] for idx,item in enumerate(prob[0].tolist())]
    #print(result)
    #print(result)
    #result = [( list(labels.keys())[list(labels.values()).index(idx)],item *100) for idx,item in enumerate(prob[0].tolist())]
    #print(result)
    #return [pred_val_label,preval_prob]
    return pred_val_label
    
    
predict(objPreProcessing.clearText("DEVERÃO SER IMPLEMENTADAS AÇÕES DE PUBLICIDADE DE UTILIDADE PÚBLICA, QUE ASSEGUREM A LISURA E IGUALDADE DE PARTICIPAÇÃO DAS ASSOCIAÇÕES E COOPERATIVAS DE CATADORES DE MATERIAIS RECICLÁVEIS NO PROCESSO DE HABILITAÇÃO."))

'terceiro_setor'

In [141]:
## This this class extenteds nn.Module and define the BertClassfier 
class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 5)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [74]:
model = BertClassifier()
model.load_state_dict(torch.load(os.path.join(BASE_DIR,f'models/model_state.save_3'))["model_state_dict"])
device = torch.device("cuda")
model.to(device)


Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29794, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [142]:
dfTest.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2387 entries, 319420 to 118445
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id_sentenca          2387 non-null   object
 1   ordem                2387 non-null   int64 
 2   id_doc_base          2387 non-null   int64 
 3   tokens_total         2387 non-null   int64 
 4   tokens_no_stopwords  2387 non-null   int64 
 5   cat_sentenca         2387 non-null   int64 
 6   text_sentenca        2387 non-null   object
dtypes: int64(5), object(2)
memory usage: 149.2+ KB


In [146]:
dfTest["cat_sentenca"] = dfTest["text_sentenca"].apply(predict)

In [147]:
dfTest["cat_sentenca"].value_counts()

outros            2337
terceiro_setor      50
Name: cat_sentenca, dtype: int64

In [148]:
dfTest[dfTest["cat_sentenca"]=='terceiro_setor'].to_excel(os.path.join(BASE_DIR,f'dataset/predicted/predict_16082022_v1.xlsx'))