<a href="https://colab.research.google.com/github/ougrid/my-knowledge-resource/blob/master/legalact.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model

In [None]:
def split_chunk(tokenizer=None, max_length=512, stride=356, sentence=''):
  """Tokenize overlapping windows of ``sentence``.

  Windows start at offsets 0, ``stride``, 2 * ``stride``, ... below
  ``len(sentence)`` and each covers up to ``max_length`` items. The windows
  are taken with ordinary sequence slicing, so for a ``str`` they are
  character windows and for a ``list`` they are sub-lists.

  Args:
    tokenizer: Callable invoked once per window as
      ``tokenizer(window, truncation=True, padding='max_length', return_tensors='pt')``.
    max_length: Window width in items of ``sentence``. It is not forwarded
      to the tokenizer.
    stride: Distance between two consecutive window starts.
    sentence: Sequence to split.

  Returns:
    A list holding one tokenizer result per window, in window order. The
    list is empty when ``sentence`` is empty.
  """
  window_starts = range(0, len(sentence), stride)
  return [
      tokenizer(sentence[start:start + max_length], truncation=True, padding='max_length', return_tensors='pt')
      for start in window_starts
  ]

In [None]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Checkpoint shared by the classification wrapper, the bare encoder and the tokenizer.
model_name_or_path = 'studio-ousia/mluke-base'
# Sequence-classification wrapper; in this cell it is only used to borrow its last child module.
classifier_model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, )
# Bare encoder, loaded separately from the same checkpoint.
encoder = AutoModel.from_pretrained(model_name_or_path)
# Wrap the wrapper's last child module in a Sequential.
# NOTE(review): presumably that child is the classification head, which the load log
# reports as newly initialized (classifier.weight / classifier.bias) — confirm.
classifier = torch.nn.Sequential((list(classifier_model.children())[-1]))
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/923 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

Some weights of LukeForSequenceClassification were not initialized from the model checkpoint at studio-ousia/mluke-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/416 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

entity_vocab.json:   0%|          | 0.00/439M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
class Retrieve_Chunk_Mluke(nn.Module):
  """Classify a (context, legal act, question) triple from its most relevant chunks.

  ``forward`` splits the context with ``chunk_function``, encodes every chunk,
  the legal act and the question with ``encoder``, keeps the ``topk`` chunks
  most cosine-similar to the legal act and to the question, blends each group
  into one softmax-weighted embedding, max-pools the two embeddings
  element-wise and feeds the result to ``classifier``.

  Args:
    encoder: Module whose output exposes ``pooler_output``.
    classifier: Module applied to the pooled embedding.
    tokenizer: Callable used for the legal act and the question, and handed
      on to ``chunk_function``.
    chunk_function: Callable invoked as
      ``chunk_function(sentence=..., max_length=..., stride=..., tokenizer=...)``
      and returning the encoder inputs, one per chunk.
    chunk_max_length: Forwarded to ``chunk_function`` as ``max_length``.
    chunk_stride: Forwarded to ``chunk_function`` as ``stride``.
    topk: Maximum number of chunks kept per query.
  """

  def __init__(self, encoder=None, classifier=None, tokenizer=None, chunk_function=None, chunk_max_length=256, chunk_stride=128, topk=2):
    super().__init__()

    # is_available has to be *called*; the bare function object is always
    # truthy, which would select 'cuda' even on CPU-only machines.
    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # model
    self.encoder = encoder.to(self.device)
    self.classifier = classifier.to(self.device)

    # tokenizer
    self.tokenizer = tokenizer

    # chunk
    self.chunk_max_length = chunk_max_length
    self.chunk_stride = chunk_stride
    self.chunk_function = chunk_function

    self.topk = topk

  def _weighted_topk_sum(self, chunk_embeddings, similarities, k):
    """Return the softmax-weighted sum of the ``k`` most similar chunk embeddings.

    The result keeps a leading dimension of size 1.
    """
    top = torch.topk(similarities, k=k)
    # Weights are aligned with the top-k *positions*; top.indices is only used
    # to gather the matching chunk rows. The scores are detached so the
    # weights act as constants during backprop.
    # NOTE(review): drop .detach() if gradients should flow through the weights.
    weights = torch.softmax(top.values.detach(), dim=-1)
    selected = chunk_embeddings[top.indices]
    return torch.sum(selected * weights.unsqueeze(-1), dim=0, keepdim=True)

  def forward(self, sentence, legal_act, question):
    """Score one example.

    Args:
      sentence: Context, passed unchanged to ``chunk_function``.
      legal_act: Legal act text, passed to the tokenizer.
      question: Question text, passed to the tokenizer.

    Returns:
      The classifier output flattened to one dimension.
    """
    # tokenize
    tokens = self.chunk_function(sentence=sentence, max_length=self.chunk_max_length, stride=self.chunk_stride, tokenizer=self.tokenizer)
    legal_token = self.tokenizer(legal_act, truncation=True, padding='max_length', return_tensors='pt').to(self.device)
    question_token = self.tokenizer(question, truncation=True, padding='max_length', return_tensors='pt').to(self.device)

    # embedding: chunk embeddings are concatenated along dim 0
    # (assumes each pooler_output is (1, hidden), giving one row per chunk — TODO confirm)
    tokens_embedded = torch.cat([self.encoder(**x.to(self.device)).pooler_output for x in tokens])
    legal_embedded = self.encoder(**legal_token).pooler_output
    question_embedded = self.encoder(**question_token).pooler_output

    # cosine similarity of every chunk against each query
    legal_simi = F.cosine_similarity(tokens_embedded, legal_embedded)
    quest_simi = F.cosine_similarity(tokens_embedded, question_embedded)

    # there may be fewer chunks than the requested top-k
    k = min(tokens_embedded.size(0), self.topk)

    # softmax-weighted blend of the best chunks for each query
    sum_top_legal_embeddings = self._weighted_topk_sum(tokens_embedded, legal_simi, k)
    sum_top_question_embeddings = self._weighted_topk_sum(tokens_embedded, quest_simi, k)

    # stack the two blended embeddings and max-pool across them
    concat = torch.stack([sum_top_legal_embeddings, sum_top_question_embeddings])
    pooled, _ = torch.max(concat, dim=0, keepdim=True)

    # classify
    return self.classifier(pooled).view(-1)

In [None]:
# Assemble the chunk-retrieval classifier from the pretrained parts loaded above.
model = Retrieve_Chunk_Mluke(
    encoder=encoder,
    classifier=classifier,
    tokenizer=tokenizer,
    chunk_function=split_chunk,
    chunk_max_length=128,
    chunk_stride=64,
    topk=3,
)

In [None]:
import gc
# Ask PyTorch to release its unused cached GPU memory, then run a garbage
# collection pass; the cell displays the value returned by gc.collect().
torch.cuda.empty_cache()
gc.collect()

30

# PyTorch

## DataLoader

In [None]:
import pandas as pd
# Load the train and test tables from the Colab working directory.
# The dataset classes in this notebook read the `context`, `legal_act` and
# `question` columns, plus `answer` for training rows.
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the training rows for validation; the fixed random_state
# keeps the split reproducible. Note that `train_df` is rebound to the
# remaining 75%.
train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)

In [None]:
from torch.utils.data import Dataset, DataLoader

class ContextRetrieveClassifierTrainDataset(Dataset):
    """Labelled examples backed by a pandas DataFrame.

    Each item is a dict with the string fields ``sentence`` (from the
    ``context`` column), ``legal_act`` and ``question``, plus ``label``: the
    ``answer`` column one-hot encoded as a float tensor.

    Args:
        dataframe: Table with ``context``, ``legal_act``, ``question`` and
            ``answer`` columns.
        num_classes: Width of the one-hot label vector.
    """

    def __init__(self, dataframe=None, num_classes=2):
        self.dataframe = dataframe
        self.num_classes = num_classes

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        # one_hot only accepts integer index tensors; casting first keeps this
        # working when pandas parsed the label column as float or bool.
        label_index = torch.tensor(int(row['answer']), dtype=torch.long)
        return {
            'sentence': str(row['context']),
            'legal_act': str(row['legal_act']),
            'question': str(row['question']),
            'label': F.one_hot(label_index, self.num_classes).float(),
        }

class ContextRetrieveClassifierTestDataset(Dataset):
    """Unlabelled examples backed by a pandas DataFrame.

    Each item is a dict of strings with the keys ``sentence`` (taken from the
    ``context`` column), ``legal_act`` and ``question``.
    """

    def __init__(self, dataframe=None):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        record = self.dataframe.iloc[idx]
        # (output key, source column) pairs, in output order
        field_to_column = (
            ('sentence', 'context'),
            ('legal_act', 'legal_act'),
            ('question', 'question'),
        )
        return {field: str(record[column]) for field, column in field_to_column}

In [None]:
# Training examples are reshuffled every epoch.
train_dataset = ContextRetrieveClassifierTrainDataset(train_df)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=1)

# Evaluation does not need a random order; keeping it fixed makes the
# validation pass reproducible.
val_dataset = ContextRetrieveClassifierTrainDataset(val_df)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=1)

# The test loader must NOT shuffle: predictions are later written into the
# submission frame by position, so they have to come out in file order.
test_dataset = ContextRetrieveClassifierTestDataset(test_df)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=1)

## Train Loop

In [None]:
from tqdm import tqdm
from torch.optim import AdamW

torch.backends.cudnn.benchmark = True
torch.backends.cudnn.enabled = True

# is_available has to be called; the bare function object is always truthy.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Fine-tuning parameters
num_epochs = 10
accumulation_steps = 8  # number of single-example batches accumulated per optimizer step
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = nn.CrossEntropyLoss()

model.to(device)
# Loss scaling is only meaningful on CUDA; disable it explicitly elsewhere.
scaler = torch.cuda.amp.GradScaler(enabled=(device == 'cuda'))

# Lowest mean validation loss seen so far; drives checkpointing.
best_val_loss = float('inf')

# Fine-tuning loop
for epoch in range(num_epochs):
    # Training
    model.train()
    # Start every epoch from clean gradients.
    optimizer.zero_grad()
    num_train_batches = len(train_dataloader)
    train_loss_total = 0.0
    for idx, batch in tqdm(enumerate(train_dataloader), total=num_train_batches):
        torch.cuda.empty_cache()

        context = batch.pop('sentence')
        legal_act = batch.pop('legal_act')
        question = batch.pop('question')
        answer = batch.pop('label')

        outputs = model(context, legal_act, question)
        loss = loss_fn(outputs, answer.view(-1).to(device))
        scaler.scale(loss).backward()
        train_loss_total += loss.item()

        # Step after every full group of `accumulation_steps` examples, and
        # once more at the end of the epoch so a partial group is not carried
        # over into the next epoch.
        is_group_end = (idx + 1) % accumulation_steps == 0
        is_last_batch = (idx + 1) == num_train_batches
        if is_group_end or is_last_batch:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

    # Validation
    model.eval()
    num_val_batches = len(val_dataloader)
    val_loss_total = 0.0
    with torch.no_grad():
        for idx, batch in tqdm(enumerate(val_dataloader), total=num_val_batches):
            context = batch.pop('sentence')
            legal_act = batch.pop('legal_act')
            question = batch.pop('question')
            answer = batch.pop('label')

            val_outputs = model(context, legal_act, question)
            val_loss_total += loss_fn(val_outputs, answer.view(-1).to(device)).item()

    # Report and compare epoch means rather than the loss of whichever single
    # example happened to come last.
    train_loss = train_loss_total / max(num_train_batches, 1)
    val_loss = val_loss_total / max(num_val_batches, 1)

    # Keep the checkpoint with the best mean validation loss.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model, '/content/model.pt')
    print(f"Epoch {epoch+1}/{num_epochs}: Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

100%|██████████| 3321/3321 [10:15<00:00,  5.40it/s]
100%|██████████| 1108/1108 [00:50<00:00, 21.93it/s]


Epoch 1/10: Loss: 0.1944, Val Loss: 0.2885


  5%|▍         | 158/3321 [00:29<09:44,  5.42it/s]


KeyboardInterrupt: 

## Submit

In [None]:
import torch

In [None]:
# Submission template; its `answer` column is overwritten with the model's
# predictions further down.
submit_df = pd.read_csv('/content/sample_submission.csv')

In [None]:
# The checkpoint is a pickled module object, so the Retrieve_Chunk_Mluke class
# must already be defined in this session before loading.
# SECURITY: unpickling executes arbitrary code — only load checkpoints you
# created yourself.
load_device = 'cuda' if torch.cuda.is_available() else 'cpu'
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime;
# weights_only=False is spelled out because a full module (not just tensors)
# was saved, and newer PyTorch releases default to weights_only=True.
model = torch.load('/content/model.pt', map_location=load_device, weights_only=False)
# The pickled `device` attribute reflects the training machine; forward()
# reads it when moving inputs, so resync it with where the weights are now.
model.device = load_device
model.eval()

In [None]:
# Run the model over the test loader without tracking gradients and collect
# one predicted class index per batch, in loader order.
predicts = []
with torch.no_grad():
    for batch in tqdm(test_dataloader, total=len(test_dataloader)):
        context = batch.pop('sentence')
        legal_act = batch.pop('legal_act')
        question = batch.pop('question')

        predict = model(context, legal_act, question)
        # index of the highest score, moved to the CPU and stored as a plain int
        predicted_class = torch.argmax(predict, -1).to('cpu')
        predicts.append(int(predicted_class))

In [None]:
# Predictions are assigned by position, so `predicts` must be in the same row
# order as the submission template.
# NOTE(review): assumes sample_submission.csv lists rows in the same order as
# test.csv — confirm.
submit_df['answer'] = predicts
submit_df.to_csv('/content/submission.csv', index=False)

In [None]:
# !kaggle competitions submit -c legal-act-classification -f submission.csv -m "Message"