In [1]:
import transformers
import json
import os
import torch
transformers.__version__

  from .autonotebook import tqdm as notebook_tqdm


'4.34.0'

### Explore the data

In [1]:
import json
import pandas as pd
from datasets import Dataset, load_from_disk
import torch
import os

from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from transformers import Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

# CSV that lists every clause category together with its description,
# expected answer format and group.
file_name = 'category_descriptions.csv'

df = pd.read_csv(file_name)

# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,Category (incl. context and answer),Description,Answer Format,Group
0,Category: Document Name,Description: The name of the contract,Answer Format: Contract Name,Group: -
1,Category: Parties,Description: The two or more parties who signe...,Answer Format: Entity or individual names,Group: -
2,Category: Agreement Date,Description: The date of the contract,Answer Format: Date (mm/dd/yyyy),Group: 1
3,Category: Effective Date,Description: The date when the contract is eff...,Answer Format: Date (mm/dd/yyyy),Group: 1
4,Category: Expiration Date,Description: On what date will the contract's ...,Answer Format: Date (mm/dd/yyyy) / Perpetual,Group: 1


In [3]:
df.shape

(41, 4)

In [4]:
# Each cell of the category column looks like "Category: <clause name>".
# Keep only the text after the last colon: these are all the supported clauses.
category_cells = df['Category (incl. context and answer)'].values.tolist()
types = [cell.rsplit(':', 1)[-1].strip() for cell in category_cells]
print(types)

['Document Name', 'Parties', 'Agreement Date', 'Effective Date', 'Expiration Date', 'Renewal Term', 'Notice Period to Terminate Renewal', 'Governing Law', 'Most Favored Nation', 'Non-Compete', 'Exclusivity', 'No-Solicit of Customers', 'Competitive Restriction Exception', 'No-Solicit of Employees', 'Non-Disparagement', 'Termination for Convenience', 'Rofr/Rofo/Rofn', 'Change of Control', 'Anti-Assignment', 'Revenue/Profit Sharing', 'Price Restrictions', 'Minimum Commitment', 'Volume Restriction', 'IP Ownership Assignment', 'Joint IP Ownership', 'License Grant', 'Non-Transferable License', 'Affiliate License-Licensor', 'Affiliate License-Licensee', 'Unlimited/All-You-Can-Eat-License', 'Irrevocable or Perpetual License', 'Source Code Escrow', 'Post-Termination Services', 'Audit Rights', 'Uncapped Liability', 'Cap on Liability', 'Liquidated Damages', 'Warranty Duration', 'Insurance', 'Covenant Not to Sue', 'Third Party Beneficiary']


In [6]:
# Persist the supported clause names, one name per line.
with open('clauses_support.txt', 'w') as clause_file:
    clause_file.writelines(clause + '\n' for clause in types)

In [7]:
# Pretrained checkpoint and the max_length / stride values handed to the tokenizer.
model_id = "distilbert-base-cased-distilled-squad"
max_length, stride = 384, 128

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForQuestionAnswering.from_pretrained(model_id).to(device)

# Absolute path of the current working directory.
cur_path = os.path.abspath(os.curdir)

In [8]:
def load_json(file_name):
    """Read a JSON file and return the decoded Python object.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        The parsed JSON content (a dict for the SQuAD-style files used in
        this notebook).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which would garble non-ASCII contract text on some systems.
    with open(file_name, 'r', encoding='utf-8') as f:
        return json.load(f)

def get_trained_data(training_data_path='sample.json'):
    """Flatten a SQuAD-style JSON file into one record per answered question.

    The file is expected to have the layout
    ``{'data': [{'paragraphs': [{'context': str, 'qas': [...]}]}]}``.
    Every question that has at least one answer becomes one record holding
    the question's own fields (minus ``is_impossible``), the paragraph
    ``context`` and an ``answers`` dict of the form
    ``{'text': [str], 'answer_start': [int]}``. Only the first answer of a
    question is kept; questions without answers are dropped.

    Args:
        training_data_path: Path of the JSON file to read.

    Returns:
        list[dict]: The flattened records.
    """
    train_data = load_json(training_data_path)

    real_train_ds = []
    for doc in train_data['data']:
        for paragraph in doc['paragraphs']:
            context = paragraph['context']
            for qas in paragraph['qas']:
                answers = qas['answers']
                # `answers` is a list, so the previous `== 0` comparison never
                # matched; test for emptiness to skip unanswered questions.
                if not answers:
                    continue

                # Build fresh dicts instead of mutating the loaded JSON in place.
                first_answer = answers[0]
                record = {key: value for key, value in qas.items() if key != 'is_impossible'}
                record['context'] = context
                record['answers'] = dict(
                    first_answer,
                    text=[first_answer['text']],
                    answer_start=[first_answer['answer_start']],
                )
                real_train_ds.append(record)

    return real_train_ds


def _get_dataset(real_train_ds):
    """Turn the flattened records into a ``Dataset`` and print how many rows it has.

    Args:
        real_train_ds: Records accepted by ``pd.DataFrame`` (here a list of dicts).

    Returns:
        The ``Dataset`` built from those records via a pandas DataFrame.
    """
    dataset = Dataset.from_pandas(pd.DataFrame(real_train_ds))
    row_count = len(dataset)
    print(row_count)
    return dataset


def preprocess_training_examples(examples):
    """Tokenize a batch of question/context pairs and attach answer token labels.

    Relies on the module-level ``tokenizer``, ``max_length`` and ``stride``.

    Args:
        examples: Batched columns. ``examples["question"]`` and
            ``examples["context"]`` are sequences of strings, and each entry
            of ``examples["answers"]`` provides ``"answer_start"`` and
            ``"text"`` whose first elements are the answer's character offset
            and string.

    Returns:
        The tokenizer output with ``offset_mapping`` and
        ``overflow_to_sample_mapping`` removed and ``start_positions`` /
        ``end_positions`` added, one label pair per tokenized feature.
    """
    questions = [q.strip() for q in examples["question"]]
    # Only the context (second sequence) is truncated; overflowing tokens are
    # returned as additional features, so one example may yield several.
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # These two entries are only needed to compute the labels, so they are
    # removed from the returned inputs.
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        # Map this feature back to the example it was cut from.
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        # Character span of the answer inside the full context.
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        # (tokens whose sequence id is 1, i.e. the second input sequence).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            # Walk forward past every token that starts at or before the
            # answer start; the token just before the stop point is the label.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # Walk backward past every token that ends at or after the answer
            # end; the token just after the stop point is the label.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


def get_dataset(json_path, data_path='tmp_data'):
    """Return the tokenized train/test splits, building and caching them if needed.

    When ``data_path`` already exists the splits are loaded from disk.
    Otherwise they are built from ``json_path``, split 90/10 into train and
    test, saved to ``data_path`` and returned.

    Args:
        json_path: SQuAD-style JSON file used when no cache exists.
        data_path: Directory used as the on-disk cache of the split dataset.

    Returns:
        The split dataset, indexable with ``'train'`` and ``'test'``.
    """
    if os.path.exists(data_path):
        print("Start to load dataset from disk!")
        return load_from_disk(data_path)

    print("Start to build dataset based on JSON file")
    real_data = get_trained_data(json_path)
    dataset = _get_dataset(real_data)
    dataset = dataset.map(preprocess_training_examples, batched=True, remove_columns=dataset.column_names)
    # split to train and test dataset
    split_ds = dataset.train_test_split(test_size=.1)
    split_ds.save_to_disk(data_path)
    # Return the split (not the unsplit dataset) so that the first run and the
    # cached runs hand back the same structure to callers using
    # dataset['train'] / dataset['test'].
    return split_ds


def _get_latest_checkpoint(folder_path):
    """Used to get the latest checkpoint to get the latest trained model.

    Args:
        folder_path (_type_): _description_

    Returns:
        _type_: _description_
    """
    check_point_folder_list = os.listdir(folder_path)
    if len(check_point_folder_list) == 0:
        print("No checkpoint folder get")
        return
    # get the lastest one
    latest_folder = sorted(check_point_folder_list, key=lambda x: x.split('-')[-1], reverse=True)

    return latest_folder[0]


def _dump_json_metric(model_name, info_dict, metric_path='metrics'):
    """Write the training/evaluation info of one model to a JSON file.

    The file is written to ``<cur_path>/<metric_path>/<model_name>``. Once
    all runs are finished, that folder can be scanned to compare the metrics
    and pick the best trained model for prediction.

    Args:
        model_name: File name (relative to the metric folder) to write to.
        info_dict: JSON-serializable object to dump.
        metric_path: Folder name, relative to the module-level ``cur_path``.

    Raises:
        TypeError: If ``info_dict`` is not JSON serializable.
        OSError: If the folder or file cannot be created.
    """
    metric_path = os.path.join(cur_path, metric_path)
    model_path = os.path.join(metric_path, model_name)
    # `os.mkdirs(..., exists=True)` does not exist; `os.makedirs` with
    # `exist_ok=True` creates the folder when missing and is a no-op otherwise.
    # Creating the file's parent also covers model names containing a '/'.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    with open(model_path, 'w') as f:
        print("Start to dump json to metric path: {}".format(model_path))
        f.write(json.dumps(info_dict))
        


In [9]:
# Load the tokenized dataset cached under 'tmp_data' if that folder exists,
# otherwise build it from sample.json.
dataset = get_dataset(json_path='sample.json')


Start to load dataset from disk!


In [7]:
# A per-device train batch size of 64 was measured to need about 10GB of GPU memory.
args = TrainingArguments(
    model_id,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    per_device_train_batch_size=64,
    # Only request mixed precision when CUDA is present: the notebook falls
    # back to the CPU when no GPU is available.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    tokenizer=tokenizer
)
training_info = trainer.train()

# The Trainer method is `evaluate`; the previous `evluate` spelling raised
# AttributeError after training had already finished.
evalution_info = trainer.evaluate()
info_dic = {'training': training_info, 'evalute': evalution_info}
# _dump_json_metric(model_name=model_id, info_dict=info_dic)

  0%|          | 0/174 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 21%|██▏       | 37/174 [04:49<17:19,  7.59s/it]

In [7]:
import json
def load_json(file_name):
    """Parse the JSON document stored at ``file_name`` and return the result."""
    with open(file_name, 'r') as json_file:
        return json.load(json_file)


train_data = load_json('sample.json')

# The loaded object is a dict, so this counts its top-level keys,
# not the number of training examples.
len(train_data)

2

```python
{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}
```

In [14]:
# let's create a func to make the real data with 
import copy

train_data = load_json('sample.json')

def get_trained_data(train_data):
    """Flatten SQuAD-style data into one record per answered question.

    ``train_data`` is expected to have the layout
    ``{'data': [{'paragraphs': [{'context': str, 'qas': [...]}]}]}``.
    Every question that has at least one answer becomes one record holding
    the question's own fields (minus ``is_impossible``), the paragraph
    ``context`` and an ``answers`` dict of the form
    ``{'text': [str], 'answer_start': [int]}``. Only the first answer of a
    question is kept; questions without answers are dropped.

    ``train_data`` itself is left untouched, so the function can be called
    again on the same object (e.g. when the cell is re-run).

    Args:
        train_data: Parsed JSON content as described above.

    Returns:
        list[dict]: The flattened records.
    """
    real_train_ds = []
    for doc in train_data['data']:
        for paragraph in doc['paragraphs']:
            context = paragraph['context']
            for qas in paragraph['qas']:
                answers = qas['answers']
                # `answers` is a list, so the previous `== 0` comparison never
                # matched; test for emptiness to skip unanswered questions.
                if not answers:
                    continue

                # Build fresh dicts rather than rewriting the input in place:
                # the in-place version turned `answers` into a dict, which made
                # a second call on the same data fail with KeyError: 0.
                first_answer = answers[0]
                record = {key: value for key, value in qas.items() if key != 'is_impossible'}
                record['context'] = context
                record['answers'] = dict(
                    first_answer,
                    text=[first_answer['text']],
                    answer_start=[first_answer['answer_start']],
                )
                real_train_ds.append(record)

    return real_train_ds

# Flatten the nested JSON into one record per question that has at least one answer.
real_train_ds = get_trained_data(train_data)

len(real_train_ds)

132

In [16]:
import pandas as pd
from datasets import Dataset
# Filter out the records whose answers are empty
# new_ds = [x for x in real_train_ds if x['answers'] != []]

# len(new_ds)

# One DataFrame row per flattened record.
# NOTE: this rebinds `df`, which an earlier cell used for the category descriptions table.
df = pd.DataFrame(real_train_ds)

dataset = Dataset.from_pandas(df)
print(len(dataset))

132


In [10]:
max_length = 384
stride = 128


def preprocess_training_examples(examples):
    """Tokenize a batch of question/context pairs and attach answer token labels.

    Relies on the module-level ``tokenizer``, ``max_length`` and ``stride``.
    NOTE(review): a function with the same name is also defined in an earlier
    cell of this notebook; whichever cell ran last wins.

    Args:
        examples: Batched columns. ``examples["question"]`` and
            ``examples["context"]`` are sequences of strings, and each entry
            of ``examples["answers"]`` provides ``"answer_start"`` and
            ``"text"`` whose first elements are the answer's character offset
            and string.

    Returns:
        The tokenizer output with ``offset_mapping`` and
        ``overflow_to_sample_mapping`` removed and ``start_positions`` /
        ``end_positions`` added, one label pair per tokenized feature.
    """
    questions = [q.strip() for q in examples["question"]]
    # Only the context (second sequence) is truncated; overflowing tokens are
    # returned as additional features, so one example may yield several.
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # These two entries are only needed to compute the labels, so they are
    # removed from the returned inputs.
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        # Map this feature back to the example it was cut from.
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        # Character span of the answer inside the full context.
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        # (tokens whose sequence id is 1, i.e. the second input sequence).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            # Walk forward past every token that starts at or before the
            # answer start; the token just before the stop point is the label.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # Walk backward past every token that ends at or after the answer
            # end; the token just after the stop point is the label.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Tokenize in batches and drop the original columns, keeping only what
# preprocess_training_examples returns.
train_ds_new = dataset.map(preprocess_training_examples, batched=True, remove_columns=dataset.column_names)



[{'answers': [{'text': 'DISTRIBUTOR AGREEMENT', 'answer_start': 44}],
  'id': 'LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGREEMENT__Document Name',
  'question': 'Highlight the parts (if any) of this contract related to "Document Name" that should be reviewed by a lawyer. Details: The name of the contract',
  'is_impossible': False},
 {'answers': [{'text': 'Distributor', 'answer_start': 244},
   {'text': 'Electric City Corp.', 'answer_start': 148},
   {'text': 'Electric City of Illinois L.L.C.', 'answer_start': 49574},
   {'text': 'Company', 'answer_start': 197},
   {'text': 'Electric City of Illinois LLC', 'answer_start': 212}],
  'id': 'LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGREEMENT__Parties',
  'question': 'Highlight the parts (if any) of this contract related to "Parties" that should be reviewed by a lawyer. Details: The two or more parties who signed the contract',
  'is_impossible': False},
 {'answers': [{'text': '7th day of September, 1999.', 'answer_start': 263}],
  'id': 

In [11]:
from transformers import TrainingArguments

args = TrainingArguments(
    "bert-finetuned-squad",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    # Only request mixed precision when CUDA is present: the notebook falls
    # back to the CPU when no GPU is available.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
)

from transformers import Trainer

# NOTE(review): eval_dataset is the training set itself; with
# evaluation_strategy="no" it is not used during training, but calling
# trainer.evaluate() afterwards would score the model on its training data.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds_new,
    eval_dataset=train_ds_new,
    tokenizer=tokenizer,
)
trainer.train()

82

In [65]:
from torch.utils.data import DataLoader, Dataset


class MyData(Dataset):
    """Map-style dataset that tokenizes one flattened QA record per access.

    ``data`` is an indexable collection of records (dicts with at least
    ``question``, ``context`` and ``answers``).
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        """Return the tokenized features for record ``i``.

        NOTE(review): a long context can produce more than one feature for a
        single record, so items may differ in size; confirm that the collate
        function used with the DataLoader can handle that.
        """
        example = self.data[i]
        # preprocess_training_examples expects batched columns (column name ->
        # list of values). Passing a single record made it iterate over the
        # characters of the question, so wrap the record as a batch of one.
        batch = {key: [value] for key, value in example.items()}
        return preprocess_training_examples(batch)
    
# NOTE: this rebinds `dataset`, replacing whatever earlier cells assigned to that name.
dataset = MyData(real_train_ds)

# Shuffled batches of two items each.
loader = DataLoader(dataset, batch_size=2, shuffle=True)
    
    


In [36]:
train_data['data'][0]['paragraphs'][0]['qas'][0]

{'answers': [{'text': 'DISTRIBUTOR AGREEMENT', 'answer_start': 44}],
 'id': 'LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGREEMENT__Document Name',
 'question': 'Highlight the parts (if any) of this contract related to "Document Name" that should be reviewed by a lawyer. Details: The name of the contract',
 'is_impossible': False}