In [28]:
! pip install transformers datasets seqeval numpy tokenizers evaluate



In [29]:
import json
import numpy as np
import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
from itertools import chain
from datasets import Dataset



In [30]:
# Read the training examples from the JSON file at the path below.
with open("/content/drive/MyDrive/ColabNotebooks/train.json") as f:
  data = json.load(f)

# Gather every distinct label used by any example, in sorted order, then build
# the two lookup tables between label strings and their integer ids.
unique_labels = sorted({label for example in data for label in example["labels"]})
id2label = dict(enumerate(unique_labels))
label2id = {label: i for i, label in id2label.items()}


In [31]:

def str2arr(arrstr):
  """Convert a stringified list such as "['a', 'b']" into a list of strings.

  All single and double quote characters are removed, the first and last
  characters (the enclosing brackets) are dropped, and the remainder is split
  on commas with surrounding whitespace stripped from each item.

  Note: because every quote character is removed, items that themselves
  contain an apostrophe or quote lose it, and items containing a comma are
  split.

  Args:
    arrstr: String representation of a list, including its brackets.

  Returns:
    A list of the item strings; an empty list when nothing is between the
    brackets.
  """
  inner = arrstr.replace("'", "").replace('"', '')[1:-1]
  # "".split(',') would yield [''], so handle the empty list explicitly.
  if not inner.strip():
    return []
  return [item.strip() for item in inner.split(',')]

# Tokenizers already loaded, keyed by model name, so that mapping `tokenize`
# over a dataset does not reload the tokenizer for every example.
_TOKENIZER_CACHE = {}


def _get_tokenizer(model_name):
  """Return the tokenizer for `model_name`, loading it only on first use."""
  if model_name not in _TOKENIZER_CACHE:
    _TOKENIZER_CACHE[model_name] = AutoTokenizer.from_pretrained(model_name)
  return _TOKENIZER_CACHE[model_name]


def tokenize(sample, label2id=label2id, max_length=128, model_name='microsoft/deberta-v3-base'):
  """Tokenize one pre-split example and align its word labels to sub-tokens.

  Args:
    sample: Mapping with a "tokens" list (words) and a "labels" list (one
      label string per word). This handles a single example, not a batch.
    label2id: Mapping from label string to integer id.
    max_length: Sequences are truncated and padded to exactly this length.
    model_name: Checkpoint name passed to `AutoTokenizer.from_pretrained`.

  Returns:
    A dict with 'input_ids', 'attention_mask' and 'labels', each of length
    `max_length`. In 'labels', the first sub-token of every word carries that
    word's label id; special tokens, padding and the remaining sub-tokens of a
    word are set to -100.

  Note:
    Alignment relies on `word_ids()`, which is only available on fast
    tokenizers — presumably what AutoTokenizer returns for this checkpoint;
    confirm if the model is changed.
  """
  tokenizer = _get_tokenizer(model_name)
  text = [t + " " for t in sample["tokens"]]
  word_labels = [label2id[l] for l in sample["labels"]]
  tokenized_inputs = tokenizer(text, padding="max_length", truncation=True, max_length=max_length, is_split_into_words=True)

  # A word may be split into several sub-tokens, so labels must be looked up by
  # word index rather than by position in the token sequence.
  labels = []
  previous_word_id = None
  for word_id in tokenized_inputs.word_ids():
    if word_id is None or word_id == previous_word_id or word_id >= len(word_labels):
      # Special/padding token, continuation sub-token, or a word with no label.
      labels.append(-100)
    else:
      labels.append(word_labels[word_id])
    previous_word_id = word_id

  return {'input_ids': tokenized_inputs['input_ids'], 'attention_mask': tokenized_inputs['attention_mask'], 'labels': labels}


def create_model(model_name='microsoft/deberta-v3-base'):
   """Load a token-classification model with this notebook's label set.

   Args:
      model_name: Checkpoint name or path passed to `from_pretrained`.

   Returns:
      An `AutoModelForTokenClassification` instance configured with
      `len(unique_labels)` labels and the module-level `id2label` /
      `label2id` mappings.
   """
   # Use the requested checkpoint; it was previously hard-coded here, so the
   # `model_name` argument had no effect.
   model = AutoModelForTokenClassification.from_pretrained(
      model_name,
      num_labels=len(unique_labels),
      id2label=id2label,
      label2id=label2id,
   )
   return model


def createTrainer(model, train_dataset, eval_dataset=None):
   """Build a `Trainer` for `model` with this notebook's fixed hyperparameters.

   Args:
      model: The model to train.
      train_dataset: Dataset used for training.
      eval_dataset: Optional dataset used for evaluation.

   Returns:
      The configured `Trainer`.
   """
   training_args = TrainingArguments(
      output_dir="./results",
      num_train_epochs=3,
      per_device_train_batch_size=4,
      warmup_steps=500,
      weight_decay=0.01,
      logging_dir="./logs",
      learning_rate=5e-5,
   )

   trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset)
   # Hand the trainer back to the caller; without this the function returned None.
   return trainer




In [None]:


# `tokenize` handles one example at a time (it iterates sample["tokens"] as a
# flat list of words), so map it per example rather than with batched=True,
# which would pass lists of token lists.
# NOTE(review): `dataset` is not defined in the visible cells — presumably
# built from `data` (e.g. Dataset.from_list(data)); confirm before running.
tokenized_dataset = dataset.map(tokenize)


In [27]:

# Preview the first row's tokens with a single trailing space appended to each.
text = list(map(lambda tok: tok + " ", df.iloc[0]['tokens']))
text

['Design ',
 'Thinking ',
 'for ',
 'innovation ',
 'reflexion ',
 '- ',
 'Avril ',
 '2021 ',
 '- ',
 'Nathalie ',
 'Sylla ',
 '\n\n ',
 'Challenge ',
 '& ',
 'selection ',
 '\n\n ',
 'The ',
 'tool ',
 'I ',
 'use ',
 'to ',
 'help ',
 'all ',
 'stakeholders ',
 'finding ',
 'their ',
 'way ',
 'through ',
 'the ',
 'complexity ',
 'of ',
 'a ',
 'project ',
 'is ',
 'the ',
 '  ',
 'mind ',
 'map ',
 '. ',
 '\n\n ',
 'What ',
 'exactly ',
 'is ',
 'a ',
 'mind ',
 'map ',
 '? ',
 'According ',
 'to ',
 'the ',
 'definition ',
 'of ',
 'Buzan ',
 'T. ',
 'and ',
 'Buzan ',
 'B. ',
 '( ',
 '1999 ',
 ', ',
 'Dessine ',
 '- ',
 'moi ',
 '  ',
 "l'intelligence ",
 '. ',
 'Paris ',
 ': ',
 'Les ',
 'Éditions ',
 "d'Organisation ",
 '. ',
 ') ',
 ', ',
 'the ',
 'mind ',
 'map ',
 '( ',
 'or ',
 'heuristic ',
 'diagram ',
 ') ',
 'is ',
 'a ',
 'graphic ',
 '  ',
 'representation ',
 'technique ',
 'that ',
 'follows ',
 'the ',
 'natural ',
 'functioning ',
 'of ',
 'the ',
 'mind ',
 'and