# MIT Restaurant Corpus

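The MIT Restaurant Corpus is a semantically tagged corpus of restaurant-related queries, distributed as separate training and test files in BIO format. This notebook downloads both files, loads them into a `DatasetDict`, and pushes the result to the Hugging Face Hub.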

- https://groups.csail.mit.edu/sls/downloads/

In [1]:
from pathlib import Path

# Hub account under which the dataset repository is published.
hub_username = 'marcov'

# Corpus identity: the name doubles as the local folder name and the Hub
# repository name; the URL is the remote folder holding the raw .bio files.
corpus_name = 'NER_ENGLISH_RESTAURANT'
corpus_url = 'https://groups.csail.mit.edu/sls/downloads/restaurant'

# File names of the two splits, identical remotely and locally.
train_filename = 'restauranttrain.bio'
test_filename = 'restauranttest.bio'

# Local storage: one sub-folder per corpus under the shared data directory.
data_dir = Path('../data')
corpus_dir = data_dir.joinpath(corpus_name)
# Create the folder up front; a no-op when it already exists.
corpus_dir.mkdir(parents=True, exist_ok=True)

In [2]:
!wget -nc {corpus_url}/{train_filename} -P {corpus_dir}
!wget -nc {corpus_url}/{test_filename} -P {corpus_dir}

--2024-04-28 12:17:38--  https://groups.csail.mit.edu/sls/downloads/restaurant/restauranttrain.bio
Resolving groups.csail.mit.edu (groups.csail.mit.edu)... 128.30.2.44
Connecting to groups.csail.mit.edu (groups.csail.mit.edu)|128.30.2.44|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 759757 (742K)
Saving to: ‘../data/NER_ENGLISH_RESTAURANT/restauranttrain.bio’


2024-04-28 12:17:40 (442 KB/s) - ‘../data/NER_ENGLISH_RESTAURANT/restauranttrain.bio’ saved [759757/759757]

--2024-04-28 12:17:40--  https://groups.csail.mit.edu/sls/downloads/restaurant/restauranttest.bio
Resolving groups.csail.mit.edu (groups.csail.mit.edu)... 128.30.2.44
Connecting to groups.csail.mit.edu (groups.csail.mit.edu)|128.30.2.44|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 155722 (152K)
Saving to: ‘../data/NER_ENGLISH_RESTAURANT/restauranttest.bio’


2024-04-28 12:17:41 (393 KB/s) - ‘../data/NER_ENGLISH_RESTAURANT/restauranttest.bio’ saved [155722/155722]



In [3]:
from ai_den.utils.datasets import read_conll_file
from datasets import DatasetDict

# Column layout handed to read_conll_file.
# NOTE(review): presumably maps each field name to its column index in the
# .bio files (tag in column 0, token in column 1) — confirm against
# read_conll_file.
column_format = {'text': 1, 'ner': 0}

# Which downloaded file backs which split.
split_files = {'train': train_filename, 'test': test_filename}

# Read every split with the same column layout and bundle them together.
ds = DatasetDict({
    split: read_conll_file(corpus_dir / filename, column_format)
    for split, filename in split_files.items()
})

# Display the split sizes and feature names.
ds

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'ner'],
        num_rows: 7660
    })
    test: Dataset({
        features: ['text', 'ner'],
        num_rows: 1521
    })
})

In [4]:
# Eyeball the training split as a DataFrame to sanity-check tokens and tags.
train_split = ds['train']
train_split.to_pandas()

Unnamed: 0,text,ner
0,"[2, start, restaurants, with, inside, dining]","[B-Rating, I-Rating, O, O, B-Amenity, I-Amenity]"
1,[34],[O]
2,"[5, star, resturants, in, my, town]","[B-Rating, I-Rating, O, B-Location, I-Location..."
3,"[98, hong, kong, restaurant, reasonable, prices]","[O, B-Restaurant_Name, I-Restaurant_Name, O, B..."
4,"[a, great, lunch, spot, but, open, till, 2, a,...","[O, O, O, O, O, B-Hours, I-Hours, I-Hours, I-H..."
...,...,...
7655,"[yes, please, locate, the, nearest, seafood, r...","[O, O, O, O, B-Location, B-Cuisine, O]"
7656,"[yes, we, are, looking, for, a, formal, restau...","[O, O, O, O, O, O, B-Amenity, O, O, O, O, O, O..."
7657,"[yes, we, need, a, to, stop, at, five, guys, f...","[O, O, O, O, O, O, O, B-Restaurant_Name, I-Res..."
7658,"[yes, we, need, to, find, a, cheap, deli, with...","[O, O, O, O, O, O, B-Price, B-Cuisine, O, B-Ra..."


In [5]:
# Publish all splits to a private Hub repository named <username>/<corpus>.
hub_repo_id = f'{hub_username}/{corpus_name}'
ds.push_to_hub(repo_id=hub_repo_id, private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/marcov/NER_ENGLISH_RESTAURANT/commit/d71bb5c95ac4e10fa596f69bd7f464499739c537', commit_message='Upload dataset', commit_description='', oid='d71bb5c95ac4e10fa596f69bd7f464499739c537', pr_url=None, pr_revision=None, pr_num=None)