In [1]:
from pathlib import Path
from math import floor

In [2]:
# Directory holding the raw example files, and the vocabulary file that is
# written next to that directory (as its sibling, not inside it).
data_path = Path("data") / "all_data"
vocab_output_path = data_path.with_name("vocab.txt")

In [3]:
# Fraction of the valid examples assigned to each dataset split
TRAIN_PER, TEST_PER, VAL_PER = 0.6, 0.2, 0.2

# Tally of files seen so far: stem -> {suffix: count}
occurrences_count = dict()

In [4]:
# Count how many files exist for every (stem, suffix) combination in data_path.
# The tally is rebuilt from scratch so that re-running this cell does not add
# to the counts of a previous run (which would make every pair look duplicated
# and leave no valid pairs).
occurrences_count = {}

# iterdir() yields entries in arbitrary order. Sorting fixes the insertion
# order of the dictionary, so everything derived from that order (the list of
# valid pairs and the dataset splits sliced from it) is the same on every run.
for file in sorted(data_path.iterdir()):
    # Only regular files can be one half of a .gui/.png pair
    if not file.is_file():
        continue
    stem = file.stem
    suffix = file.suffix
    suffix_counts = occurrences_count.setdefault(stem, {})
    suffix_counts[suffix] = suffix_counts.get(suffix, 0) + 1

In [5]:
# Keep only the stems that come with exactly one .gui file and exactly one
# .png file; stems with a missing or duplicated partner are left out.
valid_pairs = [
    stem
    for stem, per_suffix in occurrences_count.items()
    if per_suffix.get(".gui", 0) == 1 and per_suffix.get(".png", 0) == 1
]

In [6]:
# Size of each split, rounded down to a whole number of examples
number_of_examples = len(valid_pairs)
train_split = floor(number_of_examples * TRAIN_PER)
validation_split = floor(number_of_examples * VAL_PER)
test_split = floor(number_of_examples * TEST_PER)

# Carve the list into three consecutive slices. The test slice runs to the
# end of the list, so it also picks up any examples left over from rounding
# down above (test_split itself is not used for slicing).
validation_start = train_split
validation_end = validation_start + validation_split
train_set = valid_pairs[:validation_start]
validation_set = valid_pairs[validation_start:validation_end]
test_set = valid_pairs[validation_end:]


In [7]:
# Write each split to "<split>_dataset.txt" next to the data directory,
# one example name per line.
dataset_splits = {"train": train_set, "validation": validation_set, "test": test_set}
for key, value in dataset_splits.items():
    filepath = data_path.parent / f'{key}_dataset.txt'
    # Pin the encoding: open()'s default depends on the platform locale, which
    # can fail on (or garble) example names containing non-ASCII characters.
    with open(filepath, "w", encoding="utf-8") as writer:
        writer.writelines(f"{example}\n" for example in value)

In [8]:
# Collect the unique tokens that occur across all .gui files (a set, so each
# token is kept once no matter how often it appears).
all_tokens = set()

for file in data_path.glob("*.gui"):
    # Pin the encoding so reading does not depend on the platform locale
    with open(file, "r", encoding="utf-8") as reader:
        # Treat line breaks as spaces and detach ", " commas so that the comma
        # becomes a token of its own, then split on whitespace.
        data = reader.read().replace('\n', ' ').replace(', ', ' , ').split()
        all_tokens.update(data)

# Write the vocabulary as one space-separated line; sorting makes the file
# content independent of set iteration order.
with open(vocab_output_path, "w", encoding="utf-8") as writer:
    writer.write(" ".join(sorted(all_tokens)))

print(f'Found a total of {number_of_examples} valid examples')
print(f'Writing vocab with {len(all_tokens)} tokens to {vocab_output_path}')

Found a total of 1699 valid examples
Writing vocab with 13 tokens to data\vocab.txt
