In [2]:
from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertForPreTraining
import nltk
from nltk.corpus import stopwords
import pandas as pd
from tqdm import tqdm
import re
import string
import torch
import torch.optim as optim

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## ***Creazione tokenizzatore LawBERT-IT***

In [3]:
# Carico BERT base

tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-italian-xxl-uncased')  
model = AutoModel.from_pretrained('dbmdz/bert-base-italian-xxl-uncased')

Some weights of the model checkpoint at dbmdz/bert-base-italian-xxl-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
def parse_text(text):

    # Remove number and date
    text = re.sub(r'[0-9]', ' ', text)
    text = re.sub('\d+\/\d+\/\d+', ' ', text)

    # Remove stopwords
    sent_text = nltk.word_tokenize(text.lower())
    return [word for word in sent_text if not word in stopwords.words('italian')]

-> Recupero dati <br />
-> Tokenizzazione frasi in parole <br />
-> Controllo che siano nel vocabolario <br />
-> Aggiunta se compaiono almeno 50 volte

In [6]:
df = pd.read_json("dataset/constitutional_court_rulings.json")

sentences = list(set(df.fatti)) + list(set(df.motivi)) + list(set(df.decisione))

new_word = []
count_new_word = []
all_words = []

for s in tqdm(sentences):
    sent_text = nltk.sent_tokenize(s)
    for sentence in sent_text:
        words = parse_text(sentence)
        all_words += words

        for w in words:
            emb = tokenizer.encode_plus(w)

            if len(emb['input_ids']) > 3:
                if w not in new_word:
                    new_word.append(w)
                    count_new_word.append(1)
                else:
                    count_new_word[new_word.index(w)] += 1

d = {"word": new_word, "num": count_new_word}
df = pd.DataFrame(d)

df = df.sort_values('num', axis=0, ascending=False)
df = df[df['num'] > 50]

100%|██████████| 702/702 [02:08<00:00,  5.46it/s]


In [7]:
ind_drop = []
word_add = []
part_of_word_to_append = []
count_part_of_word_to_append = []

# Controllo parole contententi punteggiatura (es l'ambiente -> l / ambinete)
for i, r in df.iterrows():
    punct = ""

    for p in string.punctuation:
        if p in r.word:
            punct = p
            break

    if punct != "":
        ind_drop.append(i)
        arr = r.word.split(punct)
        for el in arr:
            if len(tokenizer.encode_plus(el)['input_ids']) > 3:
                part_of_word_to_append.append(el)
                count_part_of_word_to_append.append(r.num)  

In [8]:
df = df.drop(ind_drop)
df = pd.concat([df, pd.DataFrame({"word": part_of_word_to_append, "num": count_part_of_word_to_append})]).reset_index(drop=True)

df

Unnamed: 0,word,num
0,legittimità,659
1,proc,648
2,cassazione,598
3,civ,396
4,processuali,384
...,...,...
140,omesso,53
141,enpals,53
142,indennità,53
143,inps,51


In [9]:
print("#tokens in tokenizer:", len(tokenizer))
print("Appending new tokens...")
tokenizer.add_tokens(df.word.tolist())
print("#tokens in tokenizer:", len(tokenizer))

print()
print("Modifica Embedding Matrix")
model.resize_token_embeddings(len(tokenizer))

#tokens in tokenizer: 31102
Appending new tokens...
#tokens in tokenizer: 31224

Modifica Embedding Matrix


Embedding(31224, 768)

In [10]:
tokenizer.save_pretrained('models/LawBERT-IT')
model.save_pretrained('models/LawBERT-IT')

# ***Addestramento nuove parole con NSP e MLM***

In [18]:
tokenizer = BertTokenizer.from_pretrained('models/LawBERT-IT')
model = BertForPreTraining.from_pretrained('models/LawBERT-IT')

Some weights of BertForPreTraining were not initialized from the model checkpoint at models/LawBERT-IT and are newly initialized: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Estrazione delle frasi dalle risposte senza punteggio

In [4]:
def all_punct(s):
    for c in s:
        if c not in string.punctuation and c not in '123456789':
            return False
    return True

In [5]:
df_sent = pd.read_json("dataset/constitutional_court_rulings.json")

process = list(set(df_sent.fatti)) + list(set(df_sent.motivi)) + list(set(df_sent.decisione))
bag = []
for proc in process:
    bag += [s.lower() for s in proc.split('.') if s.strip() != '' and not all_punct(s)]

len(process), len(bag)

(702, 46200)

Preparazione dati per NSP (50% frasi consecutive, 50% no)

In [6]:
import random

sentence_a = []
sentence_b = []
label = []

for ans in process:
    sentences = [sentence for sentence in ans.split('.') if sentence != '']

    while len(sentences) > 0:

        if len(sentences) > 1:
            start = random.randint(0, len(sentences) - 2)
        else:
            start = 0

        if len(sentences) > 1 and random.random() >= 0.49:
            sentence_a.append(sentences.pop(start))
            sentence_b.append(sentences.pop(start))
            label.append(0)
        else:
            sentence_a.append(sentences[start])

            cand = bag[random.randint(0, len(bag)-1)]
            while cand == sentences[start]:
                cand = bag[random.randint(0, len(bag)-1)]

            sentences.pop(start)

            sentence_b.append(cand)
            label.append(1)

len(label), len([l for l in label if l == 0]), len([l for l in label if l == 1])

(32171, 16060, 16111)

In [7]:
# Genera gli embedding.

inputs = tokenizer(sentence_a, sentence_b, 
                   return_tensors='pt',
                   max_length=512, 
                   truncation=True, 
                   padding='max_length')

inputs['next_sentence_label'] = torch.LongTensor([label]).T

Preparazione dati per MLM (Masck del 15% dei token in input_ids)

In [8]:
inputs['labels'] = inputs.input_ids.detach().clone()

In [9]:
def check_added_vocab(id):
    for k, i in tokenizer.get_added_vocab().items():
        if id == i:
            return True
    return False

In [10]:
SPECIAL_TOKENS = [101, 102, 103, 104]

masked = 0
num_tokens = 0

for emb in tqdm(inputs.input_ids):
    for i in range(len(emb)):
        if emb[i] == 0:
            break
        elif check_added_vocab(emb[i]) and random.uniform(0.0, 1.0) < 0.50:
            emb[i] = 104
            masked += 1
        elif emb[i] not in SPECIAL_TOKENS and random.uniform(0.0, 1.0) < 0.12:
            emb[i] = 104
            masked += 1

        num_tokens += 1

print("\nMASKED:", str(round(100*masked/num_tokens, 2)) + "%")

100%|██████████| 32171/32171 [07:10<00:00, 74.76it/s] 


MASKED: 12.66%





In [11]:
indexes = list(range(0, len(inputs.input_ids)))

random.seed(11)
random.shuffle(indexes)

train_input_ids = []
train_attention = []
train_type = []
train_nsp = []
train_label= []

test_input_ids = []
test_attention = []
test_type = []
test_nsp = []
test_label= []

for i in range(len(inputs.input_ids)):
    if i < round(0.8*len(indexes)):
        train_input_ids.append(inputs.input_ids[i])
        train_attention.append(inputs.attention_mask[i])
        train_type.append(inputs.token_type_ids[i])
        train_nsp.append(inputs.next_sentence_label[i])
        train_label.append(inputs.labels[i])

    else:
        test_input_ids.append(inputs.input_ids[i])
        test_attention.append(inputs.attention_mask[i])
        test_type.append(inputs.token_type_ids[i])
        test_nsp.append(inputs.next_sentence_label[i])
        test_label.append(inputs.labels[i])

train_inputs = {'input_ids': train_input_ids, 
                'token_type_ids': train_type, 
                'attention_mask': train_attention, 
                'next_sentence_label': train_nsp, 
                'labels': train_label}

test_inputs = {'input_ids': test_input_ids, 
               'token_type_ids': test_type, 
               'attention_mask': test_attention, 
               'next_sentence_label': test_nsp, 
               'labels': test_label}

In [12]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        
    def __len__(self):
        return len(self.encodings['input_ids'])

In [13]:
train_dataset = CustomDataset(train_inputs)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)


val_dataset= CustomDataset(test_inputs)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=8, shuffle=True)

In [19]:
import torch.optim as optim
import copy
import numpy as np

epochs = 10
best_loss = np.Inf
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

optim = optim.AdamW(model.parameters())

for epoch in range(epochs):
    model.train()

    for batch in tqdm(train_loader):
        optim.zero_grad()

        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        next_sentence_label = batch['next_sentence_label'].to(device)
        labels = batch['labels'].to(device)
       
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        next_sentence_label=next_sentence_label,
                        labels=labels,
                        return_dict=True)
        
        loss = outputs.loss
        loss.backward()
        optim.step()


    # Validazione

    model.eval()
    val_loss = 0
    val_loop = tqdm(val_loader, leave=True)

    for batch in val_loop:

        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        next_sentence_label = batch['next_sentence_label'].to(device)
        labels = batch['labels'].to(device)

        val_outputs = model(input_ids, attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            next_sentence_label=next_sentence_label,
                            labels=labels,
                            return_dict=True)

        val_loss += outputs.loss.item()

    val_loss /= len(train_loader)

    if val_loss < best_loss:
        model_copy = copy.deepcopy(model)
        best_loss = val_loss

    print('Epoch: ' + str(epoch) + " - Val loss: " + str(val_loss))

  
100%|██████████| 3218/3218 [15:56<00:00,  3.36it/s]
100%|██████████| 805/805 [01:14<00:00, 10.78it/s]


Epoch: 0 - Val loss: 0.7711525497560993


100%|██████████| 3218/3218 [15:57<00:00,  3.36it/s]
100%|██████████| 805/805 [01:14<00:00, 10.78it/s]


Epoch: 1 - Val loss: 0.2251476455060794


100%|██████████| 3218/3218 [15:57<00:00,  3.36it/s]
100%|██████████| 805/805 [01:14<00:00, 10.76it/s]


Epoch: 2 - Val loss: 0.36101895841180054


100%|██████████| 3218/3218 [15:56<00:00,  3.36it/s]
100%|██████████| 805/805 [01:14<00:00, 10.78it/s]


Epoch: 3 - Val loss: 0.3442353695501371


100%|██████████| 3218/3218 [15:56<00:00,  3.36it/s]
100%|██████████| 805/805 [01:14<00:00, 10.78it/s]


Epoch: 4 - Val loss: 0.3946357580206274


100%|██████████| 3218/3218 [15:56<00:00,  3.36it/s]
100%|██████████| 805/805 [01:14<00:00, 10.78it/s]


Epoch: 5 - Val loss: 0.2867971424585251


100%|██████████| 3218/3218 [15:55<00:00,  3.37it/s]
100%|██████████| 805/805 [01:14<00:00, 10.78it/s]


Epoch: 6 - Val loss: 0.2776728586205197


100%|██████████| 3218/3218 [15:56<00:00,  3.37it/s]
100%|██████████| 805/805 [01:14<00:00, 10.78it/s]


Epoch: 7 - Val loss: 0.861450067360702


100%|██████████| 3218/3218 [15:56<00:00,  3.36it/s]
100%|██████████| 805/805 [01:14<00:00, 10.78it/s]


Epoch: 8 - Val loss: 0.26131701995453677


100%|██████████| 3218/3218 [15:55<00:00,  3.37it/s]
100%|██████████| 805/805 [01:14<00:00, 10.77it/s]

Epoch: 9 - Val loss: 0.2651150921918799





In [None]:
tokenizer.save_pretrained('models/LawBERT-IT_trained')
model.save_pretrained('models/LawBERT-IT_trained')