In [None]:
from src.common import config

## 1. Scraping

In [None]:
import src.scraping as scr

In [None]:
# Top-level `await` relies on the notebook kernel's running event loop.
# Binding the result to `_` keeps the return value out of the cell output.
# NOTE(review): presumably collects the document ids used by the later steps — confirm in src.scraping.
_ = await scr.scrape_ids()

In [None]:
# Awaited as a bare expression: a non-None return value would be rendered as this cell's output.
# NOTE(review): the surrounding pipeline cells bind their result to `_` instead; confirm the difference is intended.
await scr.download_docs()

In [None]:
# Synchronous step (no `await`); the return value is discarded via `_` so nothing is displayed.
# NOTE(review): presumably works on the files fetched by the download step — confirm in src.scraping.
_ = scr.extract_text()

In [None]:
# Synchronous step (no `await`); the return value is discarded via `_` so nothing is displayed.
# NOTE(review): presumably turns the extracted text into structured records — confirm in src.scraping.
_ = scr.parse_docs()

## 2. Labeling

In [None]:
from src.labeling import label_docs

In [None]:
# Awaits the labeling coroutine on the notebook kernel's event loop.
# Binding the result to `_` keeps the return value out of the cell output.
# NOTE(review): inputs and outputs are decided inside src.labeling — confirm which files it reads and writes.
_ = await label_docs()

## 3. Augmentation

In [None]:
from src.augmentation import create_augmentations

In [None]:
# Awaits the augmentation coroutine on the notebook kernel's event loop.
# Binding the result to `_` keeps the return value out of the cell output.
# NOTE(review): presumably produces the file at config.DOCS_AUGMENTED_JSONL read in section 4 — confirm in src.augmentation.
_ = await create_augmentations()

## 4. Train and Test Sets

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

In [None]:
# Load the augmented documents (JSON Lines: one record per line) and order the
# rows by `id`, which gives the seeded steps below a fixed starting order.
df = (
    pd.read_json(config.DOCS_AUGMENTED_JSONL, lines=True)
    .sort_values("id")
)

In [None]:
# Hold out one third of the rows as the test split, stratified on `decision`
# so both parts keep the label proportions of `df`. The fixed seed makes the
# partition repeatable for the same input frame.
# NOTE(review): the split is row-level; if the augmented file holds several rows
# derived from the same source document, they can end up on both sides — confirm.
train_unbalanced, test = train_test_split(
    df,
    stratify=df["decision"],
    test_size=1 / 3,
    shuffle=True,
    random_state=42,
)

In [None]:
# Balance the training split: draw `n` rows from every `decision` class, where
# `n` is the size of the rarest class, then shuffle all rows (frac=1) so the
# classes are no longer grouped together. Both draws use a fixed seed.
n = train_unbalanced["decision"].value_counts().min()
train = train_unbalanced.groupby("decision").sample(n=n, random_state=42)
train = train.sample(frac=1, random_state=42)

In [None]:
# Replace the shuffled index labels of both splits with a fresh 0..len-1 range;
# drop=True discards the old labels instead of keeping them as a column.
train, test = (frame.reset_index(drop=True) for frame in (train, test))

In [None]:
# Convert each pandas split into a `Dataset` and bundle the two under the
# split names "train" and "test".
train_dataset, test_dataset = Dataset.from_pandas(train), Dataset.from_pandas(test)

dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

In [None]:
# Persist both splits under the data directory defined in the project config.
dataset.save_to_disk(config.DATA_DIR / "BGH-CivAppeals-GenderCF")