<a href="https://colab.research.google.com/github/meti-94/TextClassification/blob/main/Task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget https://guillaumejaume.github.io/FUNSD/dataset.zip
!unzip -q dataset 
!pip install -q transformers

--2021-09-25 13:49:24--  https://guillaumejaume.github.io/FUNSD/dataset.zip
Resolving guillaumejaume.github.io (guillaumejaume.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to guillaumejaume.github.io (guillaumejaume.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16838830 (16M) [application/zip]
Saving to: ‘dataset.zip’


2021-09-25 13:49:24 (133 MB/s) - ‘dataset.zip’ saved [16838830/16838830]

[K     |████████████████████████████████| 2.8 MB 5.3 MB/s 
[K     |████████████████████████████████| 52 kB 1.4 MB/s 
[K     |████████████████████████████████| 895 kB 42.2 MB/s 
[K     |████████████████████████████████| 3.3 MB 29.9 MB/s 
[K     |████████████████████████████████| 636 kB 49.3 MB/s 
[?25h

In [2]:
!pip install -q pyyaml==5.1
# workaround: install old version of pytorch since detectron2 hasn't released packages for pytorch 1.9 (issue: https://github.com/facebookresearch/detectron2/issues/3158)
!pip install -q torch==1.8.0+cu101 torchvision==0.9.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html

# install detectron2 that matches pytorch 1.8
# See https://detectron2.readthedocs.io/tutorials/install.html for instructions
!pip install -q detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html
# exit(0)  # After installation, you need to "restart runtime" in Colab. This line can also restart runtime

[?25l[K     |█▏                              | 10 kB 19.9 MB/s eta 0:00:01[K     |██▍                             | 20 kB 10.5 MB/s eta 0:00:01[K     |███▋                            | 30 kB 8.7 MB/s eta 0:00:01[K     |████▉                           | 40 kB 8.2 MB/s eta 0:00:01[K     |██████                          | 51 kB 5.1 MB/s eta 0:00:01[K     |███████▏                        | 61 kB 5.6 MB/s eta 0:00:01[K     |████████▍                       | 71 kB 5.5 MB/s eta 0:00:01[K     |█████████▋                      | 81 kB 6.1 MB/s eta 0:00:01[K     |██████████▊                     | 92 kB 4.9 MB/s eta 0:00:01[K     |████████████                    | 102 kB 5.1 MB/s eta 0:00:01[K     |█████████████▏                  | 112 kB 5.1 MB/s eta 0:00:01[K     |██████████████▍                 | 122 kB 5.1 MB/s eta 0:00:01[K     |███████████████▌                | 133 kB 5.1 MB/s eta 0:00:01[K     |████████████████▊               | 143 kB 5.1 MB/s eta 0:00:01[K   

In [4]:
!pip install -q datasets==1.9.0

In [15]:
from torch.utils.data import Dataset
import glob
from os.path import join
from PIL import Image
import json
import sys
from transformers import AutoTokenizer
from transformers import LayoutLMv2Processor
from tqdm import tqdm 
import numpy as np 
import torch
from datasets import Features, Sequence, Value, Array2D, Array3D
import pandas as pd
from datasets import Dataset
import datasets


model_checkpoint = "microsoft/layoutlmv2-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
processor = LayoutLMv2Processor.from_pretrained(model_checkpoint, revision="no_ocr")


def read_image(image_path=None):
  '''Reading image into PIL object for future use'''
  image = Image.open(image_path).convert("RGB")
  return image

def normalize_bbox(image, bboxes):
    '''bboxes Normalization according to the expected input of the model'''
    width, height = image.size
    normalized_bboxes = []
    for bbox in bboxes:
      for part in bbox: 
        normalized_bboxes.append([
                                  int(1000 * (part[0] / width)),
                                  int(1000 * (part[1] / height)),
                                  int(1000 * (part[2] / width)),
                                  int(1000 * (part[3] / height)),
                                  ])
    return normalized_bboxes

def flatten_annotations(annotation):
  '''Reading the annotation file related to each image and extract the maximum number of complete questions (Question & Answer) from it'''
  form = json.load(open(annotation, 'r'))['form']
  words, bboxes, questions, answers = [], [], [], []
  for idx, semantic_entity in enumerate(form):
    if semantic_entity['label']=="question":
      temp_question = [word['text'] for word in semantic_entity['words']]
      links = semantic_entity['linking']
      if len(links)==1:
        links = links[0]
        if links[0]==idx:
          temp_answer = [word['text'] for word in form[links[1]]['words']]
          questions.append(temp_question)
          answers.append(temp_answer)
    temp_words = [word['text'] for word in semantic_entity['words']] 
    temp_bboxes = [word['box'] for word in semantic_entity['words']]
    words.append(temp_words)
    bboxes.append(temp_bboxes)
  return words, bboxes, questions, answers



class FUNSDDataset(Dataset):
  '''Wrapper class for creating datasets related to this task (transparency)'''
  def __init__(self, data_path=None):
    self.annotations = sorted(glob.glob(join(data_path, "annotations/*.json")))
    self.images = sorted(glob.glob(join(data_path, "images/*.png")))
    # self.data = []
    image_list, question_list, word_list, bbox_list, start_list, end_list= [], [], [], [], [], []
    for img, ann in tqdm(zip(self.images, self.annotations), total=len(self.images)):
      image = read_image(img)
      words, bboxes, questions, answers = flatten_annotations(ann)
      bboxes = normalize_bbox(image, bboxes)
      flatten_words, flatten_bboxes = [], []
      for word, bbox in zip(words, bboxes):
        flatten_words+=[item for item in word]
      assert len(flatten_words)==len(bboxes)
      
        
      for idx, (question, answer) in enumerate(zip(questions, answers)):
        index = words.index(answer)
        offset = len(tokenizer.tokenize(' '.join(question))) + 2
        start_pos = 0
        for _idx in range(index):
          start_pos+=len(tokenizer.tokenize(' '.join(words[_idx])))
        start_pos+=offset
        end_pos=start_pos+len(tokenizer.tokenize(' '.join(words[index])))

        image_list.append(image)
        question_list.append(' '.join(question)),
        word_list.append(flatten_words), 
        bbox_list.append(bboxes)
        start_list.append(start_pos)
        end_list.append(end_pos)
    

    
    self.encoding = processor(image_list, 
                         question_list, 
                         word_list, 
                         boxes=bbox_list,
                         padding=True,
                         truncation=True, 
                         return_tensors="pt")
    self.encoding['start_positions'] = torch.tensor(start_list, dtype=torch.int64)
    self.encoding['end_positions'] = torch.tensor(end_list, dtype=torch.int64)
 

  def __getitem__(self, idx):
    return (
            self.encoding['image'][idx],
            self.encoding['input_ids'][idx],
            self.encoding['token_type_ids'][idx],
            self.encoding['attention_mask'][idx],
            self.encoding['bbox'][idx],
            self.encoding['start_positions'][idx],
            self.encoding['end_positions'][idx],
            
            )
  def __len__(self):
    return len(self.encoding['image'])



  


    

In [16]:
from torch.utils.data import DataLoader
from transformers import AutoModelForQuestionAnswering
from torch.utils.data import random_split


model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

train_ds = FUNSDDataset('./dataset/training_data')
train_ds, valid_ds = random_split(train_ds, [int(len(train_ds)*85/100),
                                            len(train_ds)-int(len(train_ds)*85/100)])
test_ds = FUNSDDataset('./dataset/testing_data')

train_dl = DataLoader(train_ds, batch_size=2, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=2, shuffle=False)
test_dl = DataLoader(test_ds, batch_size=2, shuffle=False)



Some weights of the model checkpoint at microsoft/layoutlmv2-base-uncased were not used when initializing LayoutLMv2ForQuestionAnswering: ['layoutlmv2.visual.backbone.bottom_up.res3.2.conv1.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.6.conv1.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.14.conv3.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.9.conv3.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.18.conv2.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.10.conv3.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.18.conv3.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.0.conv1.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res5.1.conv2.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res3.2.conv2.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.13.conv1.norm.num_batches_tracked'

In [None]:
from transformers import AdamW
from tqdm import notebook

logging_step = 20
validation_step = 100
max_step = 1000

optimizer = AdamW(model.parameters(), lr=5e-5)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

model.train()
global_step = 0
losses = []
overfit = False

for epoch in range(20):  # loop over the dataset multiple times
   for idx, batch in notebook.tqdm(enumerate(train_dl), total=len(train_dl), desc='Training'):
      image, input_ids, token_type_ids, attention_mask, bbox, start_positions, end_positions = batch
      # get the inputs;
      input_ids = input_ids.to(device)
      attention_mask = attention_mask.to(device)
      token_type_ids = token_type_ids.to(device)
      bbox = bbox.to(device)
      image = image.to(device)
      start_positions = start_positions.to(device)
      end_positions = end_positions.to(device)

      # zero the parameter gradients
      optimizer.zero_grad()

      outputs = model(
                input_ids=input_ids, 
                attention_mask=attention_mask, 
                token_type_ids=token_type_ids,
                bbox=bbox, 
                image=image, 
                start_positions=start_positions, 
                end_positions=end_positions
                )
      loss = outputs.loss
      losses.append(loss)
      loss.backward()
      optimizer.step()
      global_step+=1
      if global_step%logging_step==0:
        print(f"\nCurrent Step Loss : {loss}\t Total Loss Amount: {sum(losses)/len(losses)}")
        
      if global_step%validation_step==0:
        validation_losses= []
        model.eval()
        for batch in notebook.tqdm(valid_dl, desc="Validation", total=len(valid_dl)):
          with torch.no_grad():
            image, input_ids, token_type_ids, attention_mask, bbox, start_positions, end_positions = batch
            # get the inputs;
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            bbox = bbox.to(device)
            image = image.to(device)
            start_positions = start_positions.to(device)
            end_positions = end_positions.to(device)
            outputs = model(
                        input_ids=input_ids, 
                        attention_mask=attention_mask, 
                        token_type_ids=token_type_ids,
                        bbox=bbox, 
                        image=image, 
                        start_positions=start_positions, 
                        end_positions=end_positions
                      )
            loss = outputs.loss
            validation_losses.append(loss)
        print(f"\nValidation Loss : {sum(validation_losses)/len(validation_losses)}")
      if global_step==max_step:
        break 

Training:   0%|          | 0/538 [00:00<?, ?it/s]


Current Step Loss : 5.146202087402344	 Total Loss Amount: 5.84937858581543

Current Step Loss : 4.546136856079102	 Total Loss Amount: 5.38253927230835

Current Step Loss : 4.432736396789551	 Total Loss Amount: 5.199274063110352

Current Step Loss : 5.144530296325684	 Total Loss Amount: 5.130845546722412

Current Step Loss : 5.335842132568359	 Total Loss Amount: 5.033699035644531


Validation:   0%|          | 0/95 [00:00<?, ?it/s]


Validation Loss : 4.631657123565674

Current Step Loss : 4.287839889526367	 Total Loss Amount: 4.957679271697998

Current Step Loss : 4.477996826171875	 Total Loss Amount: 4.876849174499512

Current Step Loss : 4.087594985961914	 Total Loss Amount: 4.808403491973877

Current Step Loss : 3.9338085651397705	 Total Loss Amount: 4.7309088706970215

Current Step Loss : 4.718257904052734	 Total Loss Amount: 4.6870245933532715


Validation:   0%|          | 0/95 [00:00<?, ?it/s]


Validation Loss : 4.21128511428833

Current Step Loss : 3.838202476501465	 Total Loss Amount: 4.632599830627441

Current Step Loss : 3.5602810382843018	 Total Loss Amount: 4.598925590515137

Current Step Loss : 4.1642584800720215	 Total Loss Amount: 4.557181358337402

Current Step Loss : 3.8690104484558105	 Total Loss Amount: 4.525932312011719

Current Step Loss : 3.9017961025238037	 Total Loss Amount: 4.48093318939209


Validation:   0%|          | 0/95 [00:00<?, ?it/s]


Validation Loss : 3.8732082843780518

Current Step Loss : 3.8893039226531982	 Total Loss Amount: 4.452239036560059

Current Step Loss : 3.8251774311065674	 Total Loss Amount: 4.425135612487793

Current Step Loss : 3.546475887298584	 Total Loss Amount: 4.408913612365723

Current Step Loss : 3.364604949951172	 Total Loss Amount: 4.379490375518799

Current Step Loss : 3.5814926624298096	 Total Loss Amount: 4.353222370147705


Validation:   0%|          | 0/95 [00:00<?, ?it/s]


Validation Loss : 3.8058884143829346

Current Step Loss : 4.138347625732422	 Total Loss Amount: 4.327860355377197

Current Step Loss : 3.6083922386169434	 Total Loss Amount: 4.304250240325928

Current Step Loss : 3.5692577362060547	 Total Loss Amount: 4.2671051025390625

Current Step Loss : 3.573460578918457	 Total Loss Amount: 4.241180419921875

Current Step Loss : 2.4295260906219482	 Total Loss Amount: 4.197805404663086


Validation:   0%|          | 0/95 [00:00<?, ?it/s]


Validation Loss : 2.892900228500366

Current Step Loss : 3.7299537658691406	 Total Loss Amount: 4.146336078643799


Training:   0%|          | 0/538 [00:00<?, ?it/s]


Current Step Loss : 2.522223949432373	 Total Loss Amount: 4.098518371582031

Current Step Loss : 1.7866212129592896	 Total Loss Amount: 4.050302982330322

Current Step Loss : 1.987915277481079	 Total Loss Amount: 4.00130033493042

Current Step Loss : 2.5092310905456543	 Total Loss Amount: 3.9450502395629883


Validation:   0%|          | 0/95 [00:00<?, ?it/s]


Validation Loss : 2.5260093212127686

Current Step Loss : 2.5057153701782227	 Total Loss Amount: 3.89431095123291

Current Step Loss : 2.6079866886138916	 Total Loss Amount: 3.853663682937622

Current Step Loss : 2.0662550926208496	 Total Loss Amount: 3.813488006591797

Current Step Loss : 1.8702971935272217	 Total Loss Amount: 3.76977801322937

Current Step Loss : 1.6485062837600708	 Total Loss Amount: 3.738530158996582


Validation:   0%|          | 0/95 [00:00<?, ?it/s]


Validation Loss : 2.2602763175964355

Current Step Loss : 3.9583067893981934	 Total Loss Amount: 3.6901676654815674

Current Step Loss : 1.414733648300171	 Total Loss Amount: 3.658895492553711

Current Step Loss : 2.598820686340332	 Total Loss Amount: 3.6219875812530518

Current Step Loss : 1.6181442737579346	 Total Loss Amount: 3.5755741596221924

Current Step Loss : 1.5763241052627563	 Total Loss Amount: 3.5324089527130127


Validation:   0%|          | 0/95 [00:00<?, ?it/s]


Validation Loss : 2.1662447452545166

Current Step Loss : 1.4150452613830566	 Total Loss Amount: 3.5070242881774902

Current Step Loss : 3.6413304805755615	 Total Loss Amount: 3.4675536155700684

Current Step Loss : 3.854444742202759	 Total Loss Amount: 3.4395334720611572

Current Step Loss : 3.9362101554870605	 Total Loss Amount: 3.4043478965759277

Current Step Loss : 0.3715912103652954	 Total Loss Amount: 3.3595223426818848


Validation:   0%|          | 0/95 [00:00<?, ?it/s]


Validation Loss : 1.582265019416809

Current Step Loss : 3.441511392593384	 Total Loss Amount: 3.325178384780884

Current Step Loss : 1.8745851516723633	 Total Loss Amount: 3.2885122299194336

Current Step Loss : 1.6150054931640625	 Total Loss Amount: 3.252798080444336

Current Step Loss : 3.092306137084961	 Total Loss Amount: 3.2173728942871094

Current Step Loss : 0.6550390720367432	 Total Loss Amount: 3.186917781829834


Validation:   0%|          | 0/95 [00:00<?, ?it/s]


Validation Loss : 1.3967273235321045


Training:   0%|          | 0/538 [00:00<?, ?it/s]


Current Step Loss : 1.260823130607605	 Total Loss Amount: 3.145381212234497

Current Step Loss : 0.8233095407485962	 Total Loss Amount: 3.105252981185913

Current Step Loss : 0.1752593219280243	 Total Loss Amount: 3.064014196395874

Current Step Loss : 0.23772740364074707	 Total Loss Amount: 3.0306508541107178

Current Step Loss : 3.8998873233795166	 Total Loss Amount: 2.9922707080841064


Validation:   0%|          | 0/95 [00:00<?, ?it/s]


Validation Loss : 1.0711238384246826

Current Step Loss : 1.0071219205856323	 Total Loss Amount: 2.9544894695281982

Current Step Loss : 0.7112074494361877	 Total Loss Amount: 2.9186477661132812

Current Step Loss : 2.993238925933838	 Total Loss Amount: 2.8851497173309326

Current Step Loss : 0.5936439037322998	 Total Loss Amount: 2.847911834716797

Current Step Loss : 1.3485641479492188	 Total Loss Amount: 2.8201966285705566


Validation:   0%|          | 0/95 [00:00<?, ?it/s]


Validation Loss : 1.122207522392273

Current Step Loss : 4.272470474243164	 Total Loss Amount: 2.786332607269287

Current Step Loss : 3.4399406909942627	 Total Loss Amount: 2.758347749710083

Current Step Loss : 0.9427847862243652	 Total Loss Amount: 2.7324724197387695

Current Step Loss : 0.20255303382873535	 Total Loss Amount: 2.70450496673584

Current Step Loss : 1.0132272243499756	 Total Loss Amount: 2.6828725337982178


Validation:   0%|          | 0/95 [00:00<?, ?it/s]


Validation Loss : 1.0179753303527832

Current Step Loss : 0.38098469376564026	 Total Loss Amount: 2.655363082885742

Current Step Loss : 1.344041109085083	 Total Loss Amount: 2.6261889934539795

Current Step Loss : 0.7111283540725708	 Total Loss Amount: 2.6156442165374756

Current Step Loss : 0.14259693026542664	 Total Loss Amount: 2.5874650478363037

Current Step Loss : 0.07731502503156662	 Total Loss Amount: 2.558445692062378


Validation:   0%|          | 0/95 [00:00<?, ?it/s]

In [None]:
actual_starts = []
actual_ends = []
predicted_starts = []
predicted_ends = []
model.eval()
for batch in notebook.tqdm(test_dl, desc="Testing"):
  with torch.no_grad():
    image, input_ids, token_type_ids, attention_mask, bbox, start_positions, end_positions = batch
    # get the inputs;
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    token_type_ids = token_type_ids.to(device)
    bbox = bbox.to(device)
    image = image.to(device)
    start_positions = start_positions.to(device)
    end_positions = end_positions.to(device)

    # zero the parameter gradients
    optimizer.zero_grad()

    # forward + backward + optimize
    outputs = model(
              input_ids=input_ids, 
              attention_mask=attention_mask, 
              token_type_ids=token_type_ids,
              bbox=bbox, 
              image=image, 
              # start_positions=start_positions, 
              # end_positions=end_positions
              )
    # step 3: get start_logits and end_logits
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits
    
    predicted_start_idx = torch.argmax(start_logits, dim=1).cpu().detach().numpy().tolist()
    predicted_end_idx = torch.argmax(end_logits, dim=1).cpu().detach().numpy().tolist()
    actual_start_idx = start_positions.cpu().detach().numpy().tolist()
    actual_end_idx = end_positions.cpu().detach().numpy().tolist()
    
    predicted_starts+=predicted_start_idx
    predicted_ends+=predicted_end_idx
    actual_starts+=actual_start_idx
    actual_ends+=actual_end_idx

counter=0
for idx in range(len(test_ds)):
  if (actual_starts[idx]==predicted_starts[idx]
      and actual_ends[idx]==predicted_ends[idx]):
    counter+=1
print(f'Accuracy on Span Detection: {counter/len(test_ds)}')

    



Testing:   0%|          | 0/226 [00:00<?, ?it/s]

Accuracy on Span Detection: 0.5486725663716814
