In [1]:
# Enable IPython's autoreload extension in mode 2, so edited modules
# (e.g. the project's `src` package imported below) are re-imported
# automatically instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import transformers
from transformers import LayoutLMv2Processor, LayoutLMForTokenClassification, AdamW, BertForTokenClassification,\
    BertTokenizer, LayoutLMTokenizer, AutoTokenizer, AutoModelForMaskedLM, pipeline, DataCollatorForLanguageModeling
from datasets import load_dataset
import torch
from torch.nn.functional import cross_entropy
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import logging
from torch.utils.data import DataLoader, Dataset
from torcheval.metrics.functional import multiclass_f1_score, multiclass_confusion_matrix
from copy import deepcopy, copy
import seaborn as sns
import pandas as pd
from huggingface_hub import notebook_login

import os 
while 'notebooks' in os.getcwd():
    os.chdir("..")

import re
from typing import List, Dict
    
from src.preprocessing.make_dataset import ImageLayoutDataset, PatternExploitingDataset, SplitWordsDataset
from src.model.mlm.trainer import MLMTrainer
import warnings
warnings.filterwarnings("ignore")

# Iterative PET

In [3]:
# Run configuration: names used for the log file, compute device, and logging.
# `os.makedirs(..., exist_ok=True)` replaces `!mkdir logs`, which printed an
# error whenever the directory already existed and depended on a POSIX shell.
os.makedirs("logs", exist_ok=True)
model_name = "iPET"
dataset_name = "FUNSD"
# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# All `logging.info` calls in this notebook are written to this file.
logging.basicConfig(
    filename=f'logs/{model_name}_{dataset_name}.log',
    encoding='utf-8',
    level=logging.INFO,
)

mkdir: cannot create directory ‘logs’: File exists


## Logging in to the Hugging Face Hub

In [4]:
!pip install ipywidgets


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [13]:
# Interactive Hugging Face Hub login. Cells further down call `push_to_hub`
# and load checkpoints from the "peulsilva/..." repositories.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Importing models and data

In [4]:
# Tokenizer of the "distilbert-base-uncased" checkpoint, the same checkpoint
# name the masked-LM models below are initialised from. It is passed to the
# datasets and to every MLMTrainer.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [5]:
# Load the "nielsr/funsd" dataset; its 'train' and 'test' splits are used below.
dataset = load_dataset("nielsr/funsd")

In [6]:
def pattern0(
    phrase : str,
    word : str,
    tokenizer
):
    """Build the first cloze prompt: a three-way question followed by the mask.

    Args:
        phrase: Text inserted between single quotes as the context.
        word: The word the question asks about.
        tokenizer: Any object exposing a ``mask_token`` attribute.

    Returns:
        The prompt string, ending with the tokenizer's mask token.
    """
    mask_token = tokenizer.mask_token
    return f"In the phrase '{phrase}', the {word} is in the header, in a question, or in an answer? {mask_token}"

def pattern1(
    phrase : str, 
    word : str,
    tokenizer
):
    """Build the second cloze prompt: an open "where" question followed by the mask.

    Args:
        phrase: Text inserted between single quotes as the context.
        word: The word the question asks about.
        tokenizer: Any object exposing a ``mask_token`` attribute.

    Returns:
        The prompt string, ending with the tokenizer's mask token.
    """
    # NOTE(review): the wording ("where is the ... is situated at?") is kept
    # exactly as-is; changing it would change the prompt models are trained on.
    mask_token = tokenizer.mask_token
    return f"In the phrase '{phrase}', where is the {word} is situated at? {mask_token}"

def pattern3(
    phrase : str,
    word : str, 
    tokenizer
):
    """Build a "Question: ... Answer: <mask>" cloze prompt.

    The earlier version stopped after ``"Question: In the phrase '...', "``:
    it ignored ``word`` and ``tokenizer`` and contained no mask token. The
    original prefix is kept and the prompt is completed with the same
    three-way question used by ``pattern0``.

    Args:
        phrase: Text inserted between single quotes as the context.
        word: The word the question asks about.
        tokenizer: Any object exposing a ``mask_token`` attribute.

    Returns:
        The prompt string, ending with the tokenizer's mask token.
    """
    # NOTE(review): the wording after the original prefix is a proposal —
    # confirm it before adding this pattern to `pattern_list`.
    return (
        f"Question: In the phrase '{phrase}', "
        f"is the {word} in the header, in a question, or in an answer? "
        f"Answer: {tokenizer.mask_token}"
    )

In [7]:
# Patterns actually used below: one dataset and one model are built per entry.
# `pattern3` is defined above but deliberately not part of this list.
pattern_list = [pattern0, pattern1,]

In [8]:
# One training dataset per prompt pattern, all built from the 'train' split.
# train_datas[k] corresponds to pattern_list[k].
train_datas = [
    SplitWordsDataset(dataset['train'], tokenizer, pattern_fn)
    for pattern_fn in pattern_list
]

100%|██████████| 149/149 [00:00<00:00, 2475.08it/s]
100%|██████████| 149/149 [00:00<00:00, 2524.32it/s]


In [15]:
# One test dataset per prompt pattern, all built from the 'test' split.
# test_datas[k] corresponds to pattern_list[k].
test_datas = [
    SplitWordsDataset(dataset['test'], tokenizer, pattern_fn)
    for pattern_fn in pattern_list
]

100%|██████████| 50/50 [00:00<00:00, 1874.63it/s]
100%|██████████| 50/50 [00:00<00:00, 2133.44it/s]


In [9]:
# Label word -> integer class id; handed to every MLMTrainer below.
verbalizer = dict(none=0, question=1, answer=2, header=3)

## Train all models with patterns

In [16]:
# Pick the device from what torch can actually see. The unconditional
# `device = "cuda"` that used to be here overrode the availability check made
# in the setup cell and made `.to(device)` fail on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"

# One freshly initialised DistilBERT masked-LM per pattern.
models = [
    AutoModelForMaskedLM.from_pretrained("distilbert-base-uncased").to(device)
    for _ in pattern_list
]

# One trainer per model, in the same order as `pattern_list`.
trainers = [
    MLMTrainer(model, tokenizer, verbalizer)
    for model in models
]


In [17]:
# Train the k-th model on the dataset built with the k-th pattern, then publish
# the trainer's best checkpoint to the Hub under a per-pattern repository name.
for idx, (trainer, train_data) in enumerate(zip(trainers, train_datas)):
    logging.info(f"Training the {idx}th model")

    trainer.compile(train_data, n_shots=10)

    repo_id = f"peulsilva/ipet-pattern{idx}-10-shots"
    trainer.best_model.push_to_hub(repo_id)
    

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [01:26<00:00,  8.64s/it]
100%|██████████| 10/10 [00:56<00:00,  5.69s/it]
100%|██████████| 10/10 [01:25<00:00,  8.54s/it]
100%|██████████| 10/10 [00:56<00:00,  5.62s/it]
100%|██████████| 10/10 [01:24<00:00,  8.44s/it]
100%|██████████| 10/10 [00:54<00:00,  5.45s/it]
100%|██████████| 10/10 [01:23<00:00,  8.32s/it]
100%|██████████| 10/10 [00:54<00:00,  5.42s/it]
100%|██████████| 10/10 [01:22<00:00,  8.24s/it]
100%|██████████| 10/10 [00:54<00:00,  5.40s/it]
100%|██████████| 10/10 [01:22<00:00,  8.30s/it]
100%|██████████| 10/10 [00:53<00:00,  5.38s/it]
100%|██████████| 10/10 [01:22<00:00,  8.22s/it]
100%|██████████| 10/10 [00:54<00:00,  5.42s/it]
100%|██████████| 10/10 [01:22<00:00,  8.26s/it]
100%|██████████| 10/10 [00:53<00:00,  5.39s/it]
100%|██████████| 10/10 [01:21<00:00,  8.20s/it]
100%|██████████| 10/10 [00:53<00:00,  5.38s/it]
100%|██████████| 10/10 [01:22<00:00,  8.23s/it]
100%|██████████| 10/10 [00:53<00:00,  5.38s/it]


pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

100%|██████████| 10/10 [01:22<00:00,  8.24s/it]
100%|██████████| 10/10 [00:54<00:00,  5.47s/it]
100%|██████████| 10/10 [01:22<00:00,  8.25s/it]
100%|██████████| 10/10 [00:53<00:00,  5.38s/it]
100%|██████████| 10/10 [01:23<00:00,  8.33s/it]
100%|██████████| 10/10 [00:53<00:00,  5.36s/it]
100%|██████████| 10/10 [01:21<00:00,  8.16s/it]
100%|██████████| 10/10 [00:54<00:00,  5.48s/it]
100%|██████████| 10/10 [01:21<00:00,  8.17s/it]
100%|██████████| 10/10 [00:54<00:00,  5.44s/it]
100%|██████████| 10/10 [01:22<00:00,  8.24s/it]
100%|██████████| 10/10 [00:54<00:00,  5.46s/it]
100%|██████████| 10/10 [01:21<00:00,  8.14s/it]
100%|██████████| 10/10 [00:54<00:00,  5.43s/it]
100%|██████████| 10/10 [01:21<00:00,  8.14s/it]
100%|██████████| 10/10 [00:54<00:00,  5.43s/it]
100%|██████████| 10/10 [01:21<00:00,  8.17s/it]
100%|██████████| 10/10 [00:53<00:00,  5.38s/it]
100%|██████████| 10/10 [01:22<00:00,  8.23s/it]
100%|██████████| 10/10 [00:53<00:00,  5.37s/it]


pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

## Implementing iPET

In [10]:
# `models` is a list of generations; each generation holds one model per
# pattern. Generation 0 is loaded from the checkpoints published by the
# training cell above. The count follows `pattern_list` (it used to be a
# hard-coded 2) so it always agrees with the `trainers` list built below.
models = [[
    AutoModelForMaskedLM.from_pretrained(f"peulsilva/ipet-pattern{i}-10-shots").to(device)
    for i in range(len(pattern_list))
]]

# One trainer per generation-0 model, in pattern order.
trainers = [
    MLMTrainer(model, tokenizer, verbalizer)
    for model in models[0]
]

In [14]:
# iPET loop. For every generation j:
#   1. each pattern's training data is re-labelled by the previous
#      generation's model of the *next* pattern (with two patterns this is
#      "the other one", exactly what the former `1 - i` index selected);
#   2. a fresh DistilBERT is trained per pattern on that re-labelled data,
#      scored, and pushed to the Hub.
n_models = 5  # generations trained on top of generation 0
n_patterns = len(pattern_list)

# Keep only generation 0 so the cell can be re-run after an interruption
# without starting from a half-trained generation left at the end of `models`.
models = models[:1]

# T_ij[g][i] is the dataset used for pattern i in generation g.
T_ij = [deepcopy(train_datas)]
for j in range(n_models):
    # Shot budget of this generation; the log messages now report the value
    # that is actually used (they previously printed (j+1)*5).
    n_shots = 10 * (j + 1)
    logging.info(f"Evaluating dataset with {n_shots} shots")

    # evaluate
    new_T = []
    for i in range(n_patterns):
        # Model of a different pattern from the latest generation acts as labeler.
        labeler = models[-1][(i + 1) % n_patterns]
        trainer = MLMTrainer(labeler, tokenizer, verbalizer)
        generated_labels, _, _ = trainer.evaluate(
            T_ij[-1][i],
            labeler,
            n_shots=n_shots,
            return_generated_dataset=True
        )

        logging.info("Ended generating labels")

        # Copy only the dataset of pattern i (not every pattern's dataset) and
        # overwrite its labels with the generated ones. Samples beyond
        # len(generated_labels) keep their original labels.
        relabelled = deepcopy(train_datas[i])
        n_relabelled = min(len(relabelled), len(generated_labels))
        for sample_idx in range(n_relabelled):
            for item_idx in range(len(relabelled[sample_idx])):
                relabelled[sample_idx][item_idx]['label'] = generated_labels[sample_idx][item_idx]
        new_T.append(relabelled)

    T_ij.append(new_T)

    logging.info(f"Training models with {n_shots} shots")
    # train
    models.append([
        AutoModelForMaskedLM.from_pretrained("distilbert-base-uncased").to(device)
        for _ in range(n_patterns)
    ])
    for i in range(n_patterns):
        trainer = MLMTrainer(
            models[-1][i],
            tokenizer,
            verbalizer
        )

        trainer.compile(
            T_ij[-1][i],
            n_shots,
            n_validation=0
        )

        # NOTE(review): this call passes tokenizer/verbalizer positionally and
        # unpacks two values, while the labelling call above unpacks three —
        # confirm both shapes against MLMTrainer.evaluate.
        y_true, y_pred = trainer.evaluate(
            train_datas[i][100:],
            models[-1][i],
            tokenizer,
            verbalizer,
            n_shots = 10
        )

        f1 = multiclass_f1_score(
            y_pred,
            y_true
        )

        logging.info(f"f1 - score: {f1}")

        # Only one model is trained per (generation, pattern), so it is pushed
        # directly. The former `best_model` bookkeeping left `best_model` as
        # None whenever f1 was 0, making `push_to_hub` raise AttributeError.
        models[-1][i].push_to_hub(f"peulsilva/ipet-model{j+1}-pattern{i}-10-shots")

100%|██████████| 10/10 [00:44<00:00,  4.45s/it]
100%|██████████| 10/10 [00:43<00:00,  4.34s/it]
100%|██████████| 10/10 [01:22<00:00,  8.26s/it]
100%|██████████| 10/10 [00:39<00:00,  3.95s/it]
100%|██████████| 10/10 [01:23<00:00,  8.38s/it]
100%|██████████| 10/10 [00:40<00:00,  4.05s/it]
100%|██████████| 10/10 [01:25<00:00,  8.56s/it]
100%|██████████| 10/10 [00:40<00:00,  4.05s/it]
100%|██████████| 10/10 [01:25<00:00,  8.57s/it]
100%|██████████| 10/10 [00:40<00:00,  4.02s/it]
 60%|██████    | 6/10 [00:56<00:37,  9.36s/it]


KeyboardInterrupt: 