# Create a new Grobid "flavour" for law and the humanities


## Preparation

Install and start the Grobid Docker container. For a local server:

```bash
docker pull lfoppiano/grobid:latest-dh-law
docker run -d --rm  -p 8070:8070 lfoppiano/grobid:latest-dh-law
```

Then rename `.env.dist` to `.env` and adapt the `GROBID_SERVER_URL` environment variable to point to the Grobid server's HTTP endpoint.


## Create a training set

For background, see the Grobid documentation on training and on specialized processes:
- https://grobid.readthedocs.io/en/latest/Training-the-models-of-Grobid/
- https://grobid.readthedocs.io/en/latest/Grobid-specialized-processes/


In [2]:
import requests
import zipfile
import os
import glob
from tqdm import tqdm
from dotenv import load_dotenv

# Read the Grobid endpoint from the local .env file.
# load_dotenv() makes the entries of .env available through os.environ;
# GROBID_SERVER_URL is None if the variable is not defined anywhere.
load_dotenv()
GROBID_SERVER_URL = os.environ.get('GROBID_SERVER_URL')

def create_training(input_file_path, output_dir_path):
    """Ask Grobid to generate training data for one PDF and unpack it.

    Posts the PDF to the server's ``/api/createTraining`` endpoint with the
    ``article/dh-law-footnotes`` flavor and extracts the ZIP archive returned
    by the server into ``output_dir_path``.

    Args:
        input_file_path: Path of the PDF file to upload.
        output_dir_path: Directory that receives the extracted files.

    Raises:
        RuntimeError: If ``GROBID_SERVER_URL`` is not configured.
        requests.HTTPError: If the server answers with an error status.
        zipfile.BadZipFile: If the response body is not a valid ZIP archive.
    """
    import io  # local import: only needed to unpack the archive in memory

    if not GROBID_SERVER_URL:
        # Without this guard the request would go to "None/api/createTraining".
        raise RuntimeError(
            'GROBID_SERVER_URL is not set; define it in the .env file'
        )

    url = f'{GROBID_SERVER_URL}/api/createTraining'

    # The context manager closes the PDF handle even if the request fails.
    with open(input_file_path, 'rb') as pdf:
        response = requests.post(
            url,
            files={'input': pdf},
            # Plain form field, not a file upload, so it goes into `data`.
            data={'flavor': 'article/dh-law-footnotes'},
        )

    # Fail with the HTTP error instead of a confusing BadZipFile later on.
    response.raise_for_status()

    # Unpack straight from memory; no temporary file.zip is left behind
    # when extraction fails.
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(output_dir_path)

# Send every PDF of batch 1 to Grobid; each document gets its own output
# folder named after the PDF file (without the extension).
pdf_files = glob.glob('../batches/batch_1/0_input/*.pdf')
for pdf_file in tqdm(pdf_files):
    pdf_stem, _ = os.path.splitext(os.path.basename(pdf_file))
    output_dir = os.path.join('../batches/batch_1/1_generated', pdf_stem)
    os.makedirs(output_dir, exist_ok=True)
    create_training(pdf_file, output_dir)

100%|██████████| 8/8 [00:32<00:00,  4.03s/it]
