In [16]:
import datasets
import os


# Maps each two-character line marker to the entry field it fills, listed in
# the order the fields are emitted as dataset columns.
_IGT_LINE_FIELDS = {
    '\\t': 'transcription',
    '\\m': 'segmentation',
    '\\p': 'pos_glosses',
    '\\g': 'glosses',
    '\\l': 'translation',
}

# Fields that are left unset (rather than stored as "") when their line is empty.
_IGT_OPTIONAL_FIELDS = frozenset({'pos_glosses', 'glosses'})


def _parse_igt_file(path: str):
    """Parses one IGT text file into a list of entry dicts (only the fields seen)."""
    entries = []
    current_entry = {}

    # Decode explicitly as UTF-8 so the result does not depend on the platform locale
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.strip() == "":
                # A blank line closes whatever entry is in progress
                if current_entry:
                    entries.append(current_entry)
                    current_entry = {}
                continue

            field = _IGT_LINE_FIELDS.get(line[:2])
            if field is None or field in current_entry:
                # Unknown marker, or a field already filled for the current entry:
                # the line is skipped (deliberately best-effort, as before)
                continue

            # The value starts after the marker and its one-character separator
            value = line[3:].strip()
            if value or field not in _IGT_OPTIONAL_FIELDS:
                current_entry[field] = value

            if field == 'translation':
                # The translation is the last line of an entry, so save it now
                entries.append(current_entry)
                current_entry = {}

    # The file may end without a trailing blank line
    if current_entry:
        entries.append(current_entry)
    return entries


def _collect_igt_entries(path: str):
    """Returns the entries of one file, or of every `.txt` item in a directory."""
    if not os.path.isdir(path):
        return _parse_igt_file(path)

    entries = []
    # Sorted so the row order is reproducible; os.listdir order is arbitrary
    for name in sorted(os.listdir(path)):
        if name.endswith(".txt"):
            entries.extend(_collect_igt_entries(os.path.join(path, name)))
    return entries


def load_data_file(path: str) -> "datasets.Dataset":
    """Loads IGT data from a file, or from the `.txt` files of a directory.

    Each entry is a group of lines marked `\\t` (transcription), `\\m`
    (segmentation), `\\p` (POS glosses), `\\g` (glosses) and `\\l` (translation).
    An entry ends at its `\\l` line, at a blank line, or at end of file.

    Args:
        path: A single data file, or a directory whose items ending in `.txt`
            are loaded in sorted name order and concatenated.

    Returns:
        A `datasets.Dataset` with one row per entry, for both the file and the
        directory case. The columns are the fields that occur in at least one
        entry; an entry that lacks one of them holds `None` there.
    """
    entries = _collect_igt_entries(path)

    # Give every row the same keys. `Dataset.from_list` takes its columns from
    # the first row only, so an incomplete first entry would otherwise drop a
    # column for the whole dataset.
    present = {field for entry in entries for field in entry}
    columns = [field for field in _IGT_LINE_FIELDS.values() if field in present]
    rows = [{column: entry.get(column) for column in columns} for entry in entries]

    return datasets.Dataset.from_list(rows)


In [27]:
# Rewrites for the surface lines, applied in order: put a space before
# ? . ! , when they directly follow a word character, then replace a hyphen
# that is followed by whitespace or end of string with a single space.
_SURFACE_RULES = (
    (r"(\w)\?", r"\1 ?"),
    (r"(\w)\.", r"\1 ."),
    (r"(\w)\!", r"\1 !"),
    (r"(\w)\,", r"\1 ,"),
    (r"\-(\s|$)", " "),
)

# Rewrites for the gloss lines, applied in order. The hyphen rule runs first,
# and . ! ? are only split off when followed by whitespace or end of string,
# so a dot inside a gloss label is left untouched.
_GLOSS_RULES = (
    (r"\-(\s|$)", " "),
    (r"(\w)\.(\s|$)", r"\1 . "),
    (r"(\w)\!(\s|$)", r"\1 ! "),
    (r"(\w)\?(\s|$)", r"\1 ? "),
)

# Which rule set applies to which column.
_COLUMN_RULES = {
    'transcription': _SURFACE_RULES,
    'segmentation': _SURFACE_RULES,
    'glosses': _GLOSS_RULES,
    'pos_glosses': _GLOSS_RULES,
}


def cleanup_data(dataset: "datasets.Dataset") -> "datasets.Dataset":
    """Normalizes punctuation and dangling hyphens in the IGT text columns.

    Args:
        dataset: Dataset with some or all of the columns `transcription`,
            `segmentation`, `glosses` and `pos_glosses`. Columns that are not
            present are skipped; any other column is passed through unchanged.

    Returns:
        A new `datasets.Dataset` built from the cleaned pandas frame.
    """
    df = dataset.to_pandas()

    for column, rules in _COLUMN_RULES.items():
        if column not in df.columns:
            # Nothing to clean, e.g. data without POS glosses
            continue
        cleaned = df[column]
        # Order matters: later rules see the output of earlier ones
        for pattern, replacement in rules:
            cleaned = cleaned.str.replace(pattern, replacement, regex=True)
        df[column] = cleaned

    return datasets.Dataset.from_pandas(df)


# Source path for each split; the 'eval' split is read from the dev data
SPLIT_SOURCES = {
    'train': '../data/usp-train-track2-uncovered',
    'eval': '../data/usp-dev-track2-uncovered',
    'test': '../data/usp-test-track2-uncovered',
}

# Load and clean every split, keeping the train / eval / test order
splits = datasets.DatasetDict({
    split_name: cleanup_data(load_data_file(source_path))
    for split_name, source_path in SPLIT_SOURCES.items()
})

splits

DatasetDict({
    train: Dataset({
        features: ['transcription', 'segmentation', 'pos_glosses', 'glosses', 'translation'],
        num_rows: 9774
    })
    eval: Dataset({
        features: ['transcription', 'segmentation', 'pos_glosses', 'glosses', 'translation'],
        num_rows: 232
    })
    test: Dataset({
        features: ['transcription', 'segmentation', 'pos_glosses', 'glosses', 'translation'],
        num_rows: 633
    })
})

In [29]:
# Upload every split in `splits` to the Hub repository 'lecslab/usp-igt'.
# NOTE(review): presumably requires being logged in with write access to that
# repository — confirm before re-running.
splits.push_to_hub('lecslab/usp-igt')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]