In [4]:
import pandas as pd
from datasets import DatasetDict, Dataset
import json

In [5]:
def parse_file(file_name):
    """Flatten an annotated-documents JSON file into one row per annotation.

    The file must contain a top-level ``documents`` list and a ``labels``
    mapping. For every document, only the first entry of
    ``annotation_sets`` is read; each key of its ``annotations`` mapping is
    looked up in ``labels`` to obtain the hypothesis text.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        pandas.DataFrame with the columns ``doc_id`` (document ``id``),
        ``sentence1`` (document ``text``), ``sentence2`` (the label's
        ``hypothesis``), ``label`` (the annotation's ``choice``) and
        ``spans`` (the annotation's ``spans``), in file order. The frame has
        these columns and zero rows when there are no annotations.

    Raises:
        KeyError: If an expected key is missing, including an annotation key
            that has no entry in ``labels``.
        IndexError: If a document has an empty ``annotation_sets`` list.
    """
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # locale default encoding.
    with open(file_name, encoding='utf-8') as f:
        data = json.load(f)
    documents = data['documents']
    labels = data['labels']
    rows = []
    for doc in documents:
        doc_id = doc['id']
        text = doc['text']
        # Only the first annotation set is used.
        annotations = doc['annotation_sets'][0]['annotations']
        for hyp_key, annotation in annotations.items():
            rows.append([
                doc_id,
                text,
                labels[hyp_key]['hypothesis'],
                annotation['choice'],
                annotation['spans'],
            ])
    return pd.DataFrame(rows, columns=['doc_id', 'sentence1', 'sentence2', 'label', 'spans'])

In [6]:
# Path template for the per-split JSON files; `{split}` is filled in below.
prefix = '../data/contract-nli/{split}.json'
dataset = DatasetDict()
for split in ['train', 'test', 'dev']:
    path = prefix.format(split=split)
    # parse_file returns a pandas DataFrame, so build the Dataset with the
    # DataFrame constructor (from_dict is documented for plain dicts).
    # preserve_index=False keeps the DataFrame index out of the columns.
    dataset[split] = Dataset.from_pandas(parse_file(path), preserve_index=False)
# Keep only the text pair and its label.
dataset = dataset.remove_columns(
    ['doc_id', 'spans']
)
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 7191
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 2091
    })
    dev: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 1037
    })
})

In [7]:
# Rename the "label" column to "gold_label" in every split.
# Not idempotent: a second run fails because "label" no longer exists.
dataset = dataset.rename_column(
    original_column_name="label",
    new_column_name="gold_label",
)

In [9]:
# Upload all splits to the Hugging Face Hub as a private dataset repository.
# Needs network access and Hub credentials in the environment.
dataset.push_to_hub(repo_id="presencesw/contract-nli", private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/551 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/presencesw/contract-nli/commit/844542cec871b8a496ad622471565a98acc72f48', commit_message='Upload dataset', commit_description='', oid='844542cec871b8a496ad622471565a98acc72f48', pr_url=None, pr_revision=None, pr_num=None)