# WNUT 2020: Wet Lab Protocol

- Paper: https://aclanthology.org/2020.wnut-1.33/
- Data and code repository: https://github.com/jeniyat/WNUT_2020_NER

In [1]:
from pathlib import Path

# Hugging Face Hub account under which the dataset repository is published.
hub_username = 'marcov'

# Corpus identity: the name doubles as the local folder name and the Hub
# repository name; the URL is the GitHub archive of the source repository.
corpus_name = 'NER_ENGLISH_WNUT_2020'
corpus_url = 'https://github.com/jeniyat/WNUT_2020_NER/archive/master.zip'

# Local layout: every corpus gets its own folder below the shared data root.
data_dir = Path('../data')
corpus_dir = data_dir.joinpath(corpus_name)

# Create the folder (and any missing parents) up front; re-running is a no-op.
corpus_dir.mkdir(parents=True, exist_ok=True)

In [2]:
!wget -nc {corpus_url} -P {corpus_dir}

--2024-04-28 12:12:23--  https://github.com/jeniyat/WNUT_2020_NER/archive/master.zip
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/jeniyat/WNUT_2020_NER/zip/refs/heads/master [following]
--2024-04-28 12:12:23--  https://codeload.github.com/jeniyat/WNUT_2020_NER/zip/refs/heads/master
Resolving codeload.github.com (codeload.github.com)... 140.82.112.10
Connecting to codeload.github.com (codeload.github.com)|140.82.112.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘../data/NER_ENGLISH_WNUT_2020/master.zip’

master.zip              [               <=>  ]   6.69M  2.00MB/s    in 3.3s    

2024-04-28 12:12:27 (2.00 MB/s) - ‘../data/NER_ENGLISH_WNUT_2020/master.zip’ saved [7015905]



In [6]:
from zipfile import ZipFile

# Unpack the downloaded GitHub archive next to the zip file itself.
archive_path = corpus_dir / 'master.zip'

with ZipFile(archive_path) as archive:
    archive.extractall(path=corpus_dir)

In [7]:
from ai_den.utils.datasets import read_conll_directory
from datasets import DatasetDict

# Field name -> column index handed to read_conll_directory for every split.
# The files are read with the token text in column 0 and the NER tag in
# column 1 (the previous, unused value of this mapping had the two swapped).
column_format = {'text': 0, 'ner': 1}

# Dataset split name -> folder below `data/` in the extracted archive that
# holds the CoNLL files of that split. Insertion order defines split order.
split_dirs = {
    'train': 'train_data',
    'validation': 'dev_data',
    'test': 'test_data',
    'test_2020': 'test_data_2020',
}

# Read every split with identical settings; only the source folder differs.
# The originating file name of each example is stored in the `file` field.
ds = DatasetDict({
    split: read_conll_directory(
        path=corpus_dir,
        glob=f'**/data/{split_dir}/Conll_Format/*.txt',
        columns=column_format,
        filename_field='file',
    )
    for split, split_dir in split_dirs.items()
})

ds

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['file', 'text', 'ner'],
        num_rows: 8444
    })
    validation: Dataset({
        features: ['file', 'text', 'ner'],
        num_rows: 2862
    })
    test: Dataset({
        features: ['file', 'text', 'ner'],
        num_rows: 2813
    })
    test_2020: Dataset({
        features: ['file', 'text', 'ner'],
        num_rows: 3562
    })
})

In [8]:
# Preview the train split as a pandas DataFrame; being the last expression in
# the cell, the frame is rendered as the cell output.
ds['train'].to_pandas()

Unnamed: 0,file,text,ner
0,protocol_3_conll.txt,"[Standard, RNA, Synthesis, (, E2050, )]","[O, B-Reagent, B-Method, O, O, O]"
1,protocol_3_conll.txt,"[Thaw, the, necessary, kit, components, .]","[B-Action, O, B-Modifier, B-Reagent, I-Reagent..."
2,protocol_3_conll.txt,"[Mix, and, pulse-spin, in, microfuge, to, coll...","[B-Action, O, B-Action, O, B-Device, O, B-Acti..."
3,protocol_3_conll.txt,"[Keep, on, ice, .]","[B-Action, O, B-Location, O]"
4,protocol_3_conll.txt,"[Assemble, the, reaction, at, room, temperatur...","[B-Action, O, B-Reagent, O, B-Temperature, I-T..."
...,...,...,...
8439,protocol_622_conll.txt,"[Mix, the, following, components, in, a, steri...","[B-Action, O, B-Modifier, B-Reagent, O, O, B-M..."
8440,protocol_622_conll.txt,"[Reactions, containing, restriction, enzyme, d...","[B-Reagent, O, B-Reagent, I-Reagent, I-Reagent..."
8441,protocol_622_conll.txt,"[Reactions, with, sheared/nebulized, DNA, or, ...","[B-Reagent, O, B-Reagent, I-Reagent, O, B-Reag..."
8442,protocol_622_conll.txt,"[Immediately, inactivate, enzyme, in, the, blu...","[B-Modifier, B-Action, B-Reagent, O, O, B-Meth..."


In [9]:
# Publish all splits to a private dataset repository named after the corpus,
# under the configured Hub account.
repo_id = f'{hub_username}/{corpus_name}'

ds.push_to_hub(repo_id=repo_id, private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/marcov/NER_ENGLISH_WNUT_2020/commit/26842a879f13aed91a975b08e12ab3c5157f954b', commit_message='Upload dataset', commit_description='', oid='26842a879f13aed91a975b08e12ab3c5157f954b', pr_url=None, pr_revision=None, pr_num=None)