## ARTIFICIAL INTELLIGENCE ON LEGAL LANGUAGE PROCESSING: USING DEEP LEARNING TO FOUND THE REGULATORY LAW FRAMEWORK FOR THE THIRD SECTOR

### Mauricio Barros de Jesus - mauriciobajesus@gmail.com 
### Source code inspired by McCormick & Ryan (2019): Chris McCormick and Nick Ryan. (2019, July 22). BERT Fine-Tuning Tutorial with PyTorch. Retrieved from https://mccormickml.com/2019/07/22/BERT-fine-tuning/


In [2]:
from transformers import AutoModel, AutoTokenizer, get_linear_schedule_with_warmup, BertForSequenceClassification, AdamW, BertConfig
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch
from torch import nn
import numpy as np
import pandas as pd
from torch.optim import Adam
from tqdm import tqdm
import re
import unicodedata
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib import rc
import os
from pathlib import Path
from sklearn.utils import resample, shuffle
import random

import seaborn as sns

from sklearn.metrics import mean_absolute_error, matthews_corrcoef, auc, roc_curve
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score, accuracy_score, f1_score, recall_score 

import time
import datetime

import warnings
warnings.filterwarnings('ignore')

In [4]:
# Render matplotlib figures inside the notebook output cells.
%matplotlib inline
# Ask the inline backend for high-resolution ("retina") figure output.
%config InlineBackend.figure_format='retina'

In [5]:
# Tokenizer for the same pretrained checkpoint that the classifier is built from.
# NOTE(review): do_lower_case=True is requested although the checkpoint name says
# "cased" — confirm this matches the setting used when the fine-tuned weights were
# trained before changing it, since it affects the token ids fed to the model.
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=True)

In [6]:
# Root folder for every dataset/model path below: the directory the kernel runs in.
BASE_DIR = Path.cwd().resolve()

# Number of sentences sent through the model per forward pass.
batch_size = 32

# Maximum sequence length (in tokens) accepted by the BERT model.
max_length_bert = 512

In [7]:
# Load the sentence-level table (one row per sentence) from the parquet file.
df_regbr = pd.read_parquet(
    BASE_DIR / 'dataset' / 'regbr' / 'dados_regbr_sentencas_v3.parquet',
    engine='fastparquet',
)


In [8]:
# Show column dtypes and non-null counts of the loaded sentence table.
df_regbr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1330190 entries, 0 to 1330189
Data columns (total 5 columns):
 #   Column               Non-Null Count    Dtype 
---  ------               --------------    ----- 
 0   id_sentenca          1330190 non-null  object
 1   id_doc_base          1330190 non-null  int64 
 2   tokens_total         1330190 non-null  int64 
 3   tokens_no_stopwords  1330190 non-null  int64 
 4   text_sentenca        1330190 non-null  object
dtypes: int64(3), object(2)
memory usage: 50.7+ MB


In [9]:
# Placeholder label: predictSentence() takes a labels argument but never passes it
# to the model, so any integer works here. The column is overwritten with the
# predicted classes once inference has run.
df_regbr["cat"] = 3

In [10]:
# Build the sequence-classification architecture on top of the Portuguese BERT
# checkpoint; the fine-tuned weights are loaded over it right below.
model = BertForSequenceClassification.from_pretrained(
    'neuralmind/bert-base-portuguese-cased',  # Portuguese BERT checkpoint with a cased vocabulary.
    num_labels = 2,  # Two output labels: binary classification.
    output_attentions = False,  # Do not return attention weights.
    output_hidden_states = False,  # Do not return all hidden states.
)

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook still runs (slowly) on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned weights onto the CPU first: this works regardless of the
# device the checkpoint was saved from, and the model is moved to `device` after.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code —
# only load checkpoints from a trusted source.
model.load_state_dict(
    torch.load(
        os.path.join(BASE_DIR, 'models', 'model_state.save_3'),
        map_location="cpu",
    )["model_state_dict"]
)
model.to(device)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29794, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [11]:
def predictSentence(sentences, labels, max_length=512):
    """Classify sentences with the fine-tuned BERT model, one mini-batch at a time.

    Relies on the notebook-level globals ``tokenizer``, ``model``, ``device``
    and ``batch_size``.

    Args:
        sentences: sequence of raw text strings to classify.
        labels: sequence of integer labels aligned with ``sentences``. They are
            never fed to the model; they are only echoed back, batch by batch,
            under ``"true_labels"``.
        max_length: number of token ids every sentence is padded or truncated
            to, special tokens included. Defaults to 512.

    Returns:
        dict with three lists, each holding one numpy array per mini-batch:
        ``"pred_vals"`` (argmax class index per sentence), ``"predictions"``
        (raw logits) and ``"true_labels"`` (the labels that were passed in).
        All three lists are empty when ``sentences`` is empty.

    Raises:
        ValueError: if ``sentences`` and ``labels`` differ in length.
    """
    sentences = list(sentences)
    labels = list(labels)
    if len(sentences) != len(labels):
        raise ValueError(
            'sentences and labels must have the same length '
            '({} != {})'.format(len(sentences), len(labels))
        )

    # Put model in evaluation mode (disables dropout).
    model.eval()

    # Tracking variables
    predictions, true_labels, pred_vals = [], [], []

    # Tokenize lazily, one mini-batch at a time, so that only a single batch of
    # padded id/mask tensors is alive at any moment instead of the whole corpus.
    for start in range(0, len(sentences), batch_size):
        batch_sentences = sentences[start:start + batch_size]
        batch_labels = labels[start:start + batch_size]

        # The tokenizer call will:
        #   (1) Tokenize each sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        #   (5) Pad or truncate each sentence to `max_length`.
        #   (6) Create attention masks that separate padding from real tokens.
        encoded = tokenizer(
            batch_sentences,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',   # replaces the deprecated pad_to_max_length=True
            truncation=True,        # explicit, instead of relying on the implicit default
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Move the batch to the same device as the model.
        b_input_ids = encoded['input_ids'].to(device)
        b_input_mask = encoded['attention_mask'].to(device)

        # No gradients are needed for inference; skipping them saves memory
        # and speeds up prediction.
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)

        # First element of the model output holds the logits.
        logits = outputs[0].detach().cpu().numpy()
        label_ids = torch.tensor(batch_labels).numpy()

        # Predicted class = index of the largest logit in each row.
        pred_vals.append(np.argmax(logits, axis=1).flatten())

        # Store raw logits and the labels that came with this batch.
        predictions.append(logits)
        true_labels.append(label_ids)

    return {"pred_vals": pred_vals, "predictions": predictions, "true_labels": true_labels}

In [12]:
# Feed every sentence in the frame (together with its placeholder label) to the classifier.
sentences = df_regbr["text_sentenca"].tolist()
labels = df_regbr["cat"].tolist()
resp = predictSentence(sentences, labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [13]:
# predictSentence() returns one array of class ids per batch; join them into a
# single vector and overwrite the placeholder labels with the predicted classes.
df_regbr["cat"] = np.concatenate(resp["pred_vals"])

In [35]:
# Sentence ids whose predicted class is manually overridden to 0 further down.
# NOTE(review): the "FP" suffix presumably stands for false positives — confirm.
setencesFP = [
    '41652004_SENT_752750',
    '3411966_SENT_8433868',
    '5351979_SENT_6295772',
    '42132004_SENT_3986864',
    '42871990_SENT_3600713',
    '47272016_SENT_1952602',
    '48022017_SENT_8729859',
    '69191993_SENT_3719152',
    '281581991_SENT_3444194',
    '289091994_SENT_1502504',
    '297901999_SENT_5875010',
    '610411994_SENT_5111085',
    '626631998_SENT_6829088',
    '627081998_SENT_7480837',
    '653522005_SENT_6431382',
    '655132005_SENT_4616080',
    '656222005_SENT_2523068',
    '682422014_SENT_5573007',
    '685792015_SENT_1673217',
    '689752017_SENT_8788943',
    '691492017_SENT_654702',
    '692212017_SENT_5100941',
    '2139032019_SENT_7673157',
    '2141132020_SENT_4313056',
    '6102532020_SENT_8307676',
    '6102532020_SENT_1258393',
    '42186162001_SENT_9965428',
]

In [46]:
# Manual correction: force class 0 on every sentence whose id is listed in setencesFP.
df_regbr.loc[df_regbr["id_sentenca"].isin(setencesFP), "cat"] = 0

In [47]:
# Inspect the frame again now that the `cat` column holds the predicted classes.
df_regbr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1330190 entries, 0 to 1330189
Data columns (total 6 columns):
 #   Column               Non-Null Count    Dtype 
---  ------               --------------    ----- 
 0   id_sentenca          1330190 non-null  object
 1   id_doc_base          1330190 non-null  int64 
 2   tokens_total         1330190 non-null  int64 
 3   tokens_no_stopwords  1330190 non-null  int64 
 4   text_sentenca        1330190 non-null  object
 5   cat                  1330190 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 60.9+ MB


In [48]:
# Persist the full sentence table, including the predicted `cat` column.
df_regbr.to_parquet(
    BASE_DIR / 'dataset' / 'results' / 'dados_regbr_sentencas_predicted_25092022_v4.parquet',
    engine="fastparquet",
)


In [49]:
# Number of sentences assigned to each class.
df_regbr["cat"].value_counts()

0    1327679
1       2511
Name: cat, dtype: int64

In [50]:
# Split the sentences by predicted class; copies keep later edits off df_regbr.
dfThirdSector = df_regbr.loc[df_regbr["cat"].eq(1)].copy()
dfNotThirdSector = df_regbr.loc[df_regbr["cat"].eq(0)].copy()

In [51]:
# Count the distinct source documents that contain at least one class-1 sentence.
dfThirdSector[["id_doc_base"]].drop_duplicates().info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 710 entries, 15318 to 1328866
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   id_doc_base  710 non-null    int64
dtypes: int64(1)
memory usage: 11.1 KB


In [52]:
# Export the class-1 sentences twice: parquet for reuse, Excel for manual review.
dfThirdSector.to_parquet(
    BASE_DIR / 'dataset' / 'results' / 'dados_regbr_sentencas_predicted_25092022_third_sec_v4.parquet',
    engine="fastparquet",
)
dfThirdSector.to_excel(
    BASE_DIR / 'dataset' / 'results' / 'dados_regbr_sentencas_predicted_25092022_third_sec_v4.xlsx'
)


In [53]:
# Number of class-1 sentences per source document, most frequent first.
# Naming the index and the count column explicitly yields the columns
# ['id_doc_base', 'qtd_terc_setor'] regardless of how the installed pandas
# version labels the output of value_counts().
dfThirdSectorQtdOcorrencias = (
    dfThirdSector["id_doc_base"]
    .value_counts()
    .rename_axis("id_doc_base")
    .reset_index(name="qtd_terc_setor")
)
dfThirdSectorQtdOcorrencias.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 710 entries, 0 to 709
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   id_doc_base     710 non-null    int64
 1   qtd_terc_setor  710 non-null    int64
dtypes: int64(2)
memory usage: 11.2 KB


In [54]:
# Number of class-0 sentences per source document, most frequent first.
# Naming the index and the count column explicitly yields the columns
# ['id_doc_base', 'qtd_not_terc_setor'] regardless of how the installed pandas
# version labels the output of value_counts().
dfNotThirdSectorQtdOcorrencias = (
    dfNotThirdSector["id_doc_base"]
    .value_counts()
    .rename_axis("id_doc_base")
    .reset_index(name="qtd_not_terc_setor")
)
dfNotThirdSectorQtdOcorrencias.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50994 entries, 0 to 50993
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   id_doc_base         50994 non-null  int64
 1   qtd_not_terc_setor  50994 non-null  int64
dtypes: int64(2)
memory usage: 796.9 KB


In [56]:
# Peek at the first two rows of the per-document class-1 sentence counts.
dfThirdSectorQtdOcorrencias.head(2)

Unnamed: 0,id_doc_base,qtd_terc_setor
0,2130192014,259
1,687262016,157


In [57]:
# One row per document with both counts side by side. The outer join keeps
# documents that appear in only one of the two tables.
dfResumoOcorrencias = pd.merge(
    dfNotThirdSectorQtdOcorrencias,
    dfThirdSectorQtdOcorrencias,
    on="id_doc_base",
    how="outer",
)

In [58]:
# Inspect the merged summary; the outer join leaves nulls where a document is
# missing from one of the two count tables.
dfResumoOcorrencias.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50995 entries, 0 to 50994
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id_doc_base         50995 non-null  int64  
 1   qtd_not_terc_setor  50994 non-null  float64
 2   qtd_terc_setor      710 non-null    float64
dtypes: float64(2), int64(1)
memory usage: 1.6 MB


In [59]:
# A missing count means the document has no sentence of that class: treat it as 0.
dfResumoOcorrencias = dfResumoOcorrencias.fillna(0)

In [60]:
# Share of a document's sentences that were classified as third sector:
#   metrica = qtd_terc_setor / (qtd_terc_setor + qtd_not_terc_setor)
# Computed with column arithmetic rather than a row-by-row apply.
dfResumoOcorrencias["metrica"] = dfResumoOcorrencias["qtd_terc_setor"] / (
    dfResumoOcorrencias["qtd_terc_setor"] + dfResumoOcorrencias["qtd_not_terc_setor"]
)

In [61]:
# Order the documents from the highest to the lowest third-sector share.
dfResumoOcorrencias = dfResumoOcorrencias.sort_values(by="metrica", ascending=False)


In [62]:
def calcFaixa(metrica):
    """Map a document's third-sector share to a descriptive band.

    Args:
        metrica: fraction of the document's sentences classified as third sector.

    Returns:
        'Focus on Third Sector' for values >= 0.6, 'Addresses Third Sector'
        for values >= 0.3, 'Mentions to Third Sector' for any other value
        above 0, and 'Non Third Sector' otherwise.
    """
    # Check the bands from the most to the least demanding threshold.
    if metrica >= 0.6:
        return 'Focus on Third Sector'
    if metrica >= 0.3:
        return 'Addresses Third Sector'
    if metrica > 0:
        return 'Mentions to Third Sector'
    return 'Non Third Sector'

In [63]:
# Label every document with the band that matches its third-sector share.
dfResumoOcorrencias["faixa"] = dfResumoOcorrencias["metrica"].map(calcFaixa)


In [64]:
# Number of documents that fall into each band.
pd.DataFrame(dfResumoOcorrencias["faixa"].value_counts()).reset_index()

Unnamed: 0,index,faixa
0,Non Third Sector,50285
1,Mentions to Third Sector,678
2,Addresses Third Sector,29
3,Focus on Third Sector,3


In [65]:
# Documents in which at least half of the sentences were classified as third
# sector (showing at most the first 100 rows).
dfResumoOcorrencias[dfResumoOcorrencias["metrica"]>=0.5].head(100)

Unnamed: 0,id_doc_base,qtd_not_terc_setor,qtd_terc_setor,metrica,faixa
50994,6109152022,0.0,1.0,1.0,Focus on Third Sector
23432,675922011,8.0,15.0,0.652174,Focus on Third Sector
42569,663082007,5.0,9.0,0.642857,Focus on Third Sector
47913,2142152022,3.0,4.0,0.571429,Addresses Third Sector
46334,294291996,4.0,5.0,0.555556,Addresses Third Sector
8710,4164871998,27.0,33.0,0.55,Addresses Third Sector
12299,282461991,17.0,20.0,0.540541,Addresses Third Sector
5358,691902017,49.0,53.0,0.519608,Addresses Third Sector


In [66]:
# Export the complete per-document summary to Excel.
dfResumoOcorrencias.to_excel(
    BASE_DIR / 'dataset' / 'results' / 'dados_regbr_sentencas_resumo_ocorrencias_25092022_v4.xlsx'
)


In [67]:

# Export only the documents that have at least one third-sector sentence (metrica > 0).
dfResumoOcorrencias.loc[dfResumoOcorrencias["metrica"].gt(0)].to_excel(
    BASE_DIR / 'dataset' / 'results' / 'dados_regbr_sentencas_resumo_ocorrencias_third_sector_25092022_v4.xlsx'
)


In [68]:
# Size and schema of the subset of documents with a non-zero third-sector share.
dfResumoOcorrencias[dfResumoOcorrencias["metrica"]>0].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 710 entries, 50994 to 1
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id_doc_base         710 non-null    int64  
 1   qtd_not_terc_setor  710 non-null    float64
 2   qtd_terc_setor      710 non-null    float64
 3   metrica             710 non-null    float64
 4   faixa               710 non-null    object 
dtypes: float64(3), int64(1), object(1)
memory usage: 33.3+ KB


In [69]:
# Final look at the schema of the sentence-level frame.
df_regbr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1330190 entries, 0 to 1330189
Data columns (total 6 columns):
 #   Column               Non-Null Count    Dtype 
---  ------               --------------    ----- 
 0   id_sentenca          1330190 non-null  object
 1   id_doc_base          1330190 non-null  int64 
 2   tokens_total         1330190 non-null  int64 
 3   tokens_no_stopwords  1330190 non-null  int64 
 4   text_sentenca        1330190 non-null  object
 5   cat                  1330190 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 60.9+ MB
