In [95]:
# This notebook has been almost entirely generated using ChatGPT (ChatGPT 4o, November 26, 2024)

In [96]:
import json
import random

In [122]:
TRAINING_RATIO = 0.6
VALIDATION_RATIO = 0.2
TEST_RATIO = 0.2

TOTAL_LINES_SMALL = 200
TOTAL_LINES_MEDIUM = 600

In [98]:
# step 1 Load the JSON file
file_path = "data/verses_clauses_dict.json"  # Replace with your JSON file path
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [99]:
# Extract complexities
complexities = [item['complexity'] for item in data.values()]

# Count occurrences of each complexity type
total = len(complexities)
complex_count = complexities.count("complex")
simple_count = complexities.count("simple")

# Calculate percentages
complex_percentage = (complex_count / total) * 100
simple_percentage = (simple_count / total) * 100

In [100]:
# Print results
print(f"Total Entries: {total}")
print(f"Complex: {complex_count} ({complex_percentage:.2f}%)")
print(f"Simple: {simple_count} ({simple_percentage:.2f}%)")


Total Entries: 13084
Complex: 1434 (10.96%)
Simple: 11650 (89.04%)


In [101]:
# step 2: Generate fine-tuning datasets: Separate `complex` and `simple` entries
complex_items = {k: v for k, v in data.items() if v['complexity'] == 'complex'}
simple_items = {k: v for k, v in data.items() if v['complexity'] == 'simple'}

In [102]:
# Step 3: Shuffle keys for randomness
complex_keys = list(complex_items.keys())
simple_keys = list(simple_items.keys())
random.shuffle(complex_keys)
random.shuffle(simple_keys)

In [103]:
# Step 4: Distribute `complex` items across datasets
complex_ratio_training = TRAINING_RATIO
complex_ratio_validation = VALIDATION_RATIO
complex_ratio_test = TEST_RATIO

# Total number of `complex` entries
num_complex = len(complex_keys)

# Calculate how many `complex` items go into each dataset
num_training_complex = int(num_complex * complex_ratio_training)
num_validation_complex = int(num_complex * complex_ratio_validation)
num_test_complex = num_complex - num_training_complex - num_validation_complex  # Remaining for test

# Split `complex` items
complex_training = complex_keys[:num_training_complex]
remaining_complex = complex_keys[num_training_complex:]
complex_validation = remaining_complex[:num_validation_complex]
complex_test = remaining_complex[num_validation_complex:]

In [104]:
# Step 5: Split `simple` items across datasets with adjustable ratios
training_ratio = 0.6  
validation_ratio = 0.20  
test_ratio = 0.20  

# Total number of `simple` entries
num_simple = len(simple_keys)

# Calculate how many `simple` items go into each dataset
num_training_simple = int(num_simple * training_ratio)
num_validation_simple = int(num_simple * validation_ratio)
num_test_simple = num_simple - num_training_simple - num_validation_simple  # Remaining for test

# Split `simple` items
simple_training = simple_keys[:num_training_simple]
remaining_simple = simple_keys[num_training_simple:]
validation_simple = remaining_simple[:num_validation_simple]
test_simple = remaining_simple[num_validation_simple:]

In [105]:
# Step 6: Combine splits to form datasets
training = {**{k: complex_items[k] for k in complex_training},
            **{k: simple_items[k] for k in simple_training}}
validation = {**{k: complex_items[k] for k in complex_validation},
              **{k: simple_items[k] for k in validation_simple}}
test = {**{k: complex_items[k] for k in complex_test},
        **{k: simple_items[k] for k in test_simple}}

In [106]:
# Step 7: Output statistics and save datasets
print(f"Training dataset: {len(training)} entries")
print(f"Validation dataset: {len(validation)} entries")
print(f"Test dataset: {len(test)} entries")

Training dataset: 7850 entries
Validation dataset: 2616 entries
Test dataset: 2618 entries


In [107]:
# verify overlapping between dataset (we want none)
training_keys = set(training.keys())
validation_keys = set(validation.keys())
test_keys = set(test.keys())

print(f"Overlap between training and validation: {training_keys & validation_keys}")
print(f"Overlap between training and test: {training_keys & test_keys}")
print(f"Overlap between validation and test: {validation_keys & test_keys}")

Overlap between training and validation: set()
Overlap between training and test: set()
Overlap between validation and test: set()


In [108]:
# Check distribution of complex and simple elements between datasets
training_complex = sum(1 for v in training.values() if v.get("complexity") == "complex")
validation_complex = sum(1 for v in validation.values() if v.get("complexity") == "complex")
test_complex = sum(1 for v in test.values() if v.get("complexity") == "complex")

print(f"Complex in training: {training_complex}")
print(f"Complex in validation: {validation_complex}")
print(f"Complex in test: {test_complex}")

Complex in training: 860
Complex in validation: 286
Complex in test: 288


In [109]:
# Optional: Save datasets to JSON files with complexity
with open("data/fine_tuning_datasets/training_with_complexity.json", "w", encoding='utf-8') as f:
    json.dump(training, f, ensure_ascii=False, indent=4)
with open("data/fine_tuning_datasets/validation_with_complexity.json", "w", encoding='utf-8') as f:
    json.dump(validation, f, ensure_ascii=False, indent=4)
with open("data/fine_tuning_datasets/test_with_complexity.json", "w", encoding='utf-8') as f:
    json.dump(test, f, ensure_ascii=False, indent=4)

In [110]:
# Optional: Save datasets to JSON files WITHOUT complexity

def remove_complexity_field(dataset):
    return {key: {k: v for k, v in value.items() if k != "complexity"} for key, value in dataset.items()}

In [111]:
# Save dataset without complexity

training_no_complexity = remove_complexity_field(training)
validation_no_complexity = remove_complexity_field(validation)
test_no_complexity = remove_complexity_field(test)

with open("data/fine_tuning_datasets/training.json", "w", encoding='utf-8') as f:
    json.dump(training_no_complexity, f, ensure_ascii=False, indent=4)
with open("data/fine_tuning_datasets/validation.json", "w", encoding='utf-8') as f:
    json.dump(validation_no_complexity, f, ensure_ascii=False, indent=4)
with open("data/fine_tuning_datasets/test.json", "w", encoding='utf-8') as f:
    json.dump(test_no_complexity, f, ensure_ascii=False, indent=4)

In [120]:
# Checking simple and complex repartitions in the datasets

# step 1 Load the JSON files
file_path = "data/fine_tuning_datasets/training_small_with_complexity.json"  # Replace with your JSON file path
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [121]:
# Extract complexities
complexities = [item['complexity'] for item in data.values()]

# Count occurrences of each complexity type
total = len(complexities)
complex_count = complexities.count("complex")
simple_count = complexities.count("simple")

# Calculate percentages
complex_percentage = (complex_count / total) * 100
simple_percentage = (simple_count / total) * 100

# Print results
print(f"Total Entries: {total}")
print(f"Complex: {complex_count} ({complex_percentage:.2f}%)")
print(f"Simple: {simple_count} ({simple_percentage:.2f}%)")

Total Entries: 200
Complex: 72 (36.00%)
Simple: 128 (64.00%)


#### Dataset 200 elements

In [123]:
# Calculate how many lines for `complex` and `simple` in each small dataset
training_complex_count = int(TOTAL_LINES_SMALL * TRAINING_RATIO * complex_ratio_training)
validation_complex_count = int(TOTAL_LINES_SMALL * VALIDATION_RATIO * complex_ratio_validation)
test_complex_count = int(TOTAL_LINES_SMALL * TEST_RATIO * complex_ratio_test)

training_simple_count = TOTAL_LINES_SMALL - training_complex_count
validation_simple_count = TOTAL_LINES_SMALL - validation_complex_count
test_simple_count = TOTAL_LINES_SMALL - test_complex_count

# Step 1: Shuffle the `complex` and `simple` keys for random selection
random.shuffle(complex_keys)
random.shuffle(simple_keys)

# Step 2: Select `complex` entries for small datasets
training_complex_small = complex_keys[:training_complex_count]
remaining_complex_small = complex_keys[training_complex_count:]
validation_complex_small = remaining_complex_small[:validation_complex_count]
test_complex_small = remaining_complex_small[validation_complex_count:validation_complex_count + test_complex_count]

# Step 3: Select `simple` entries for small datasets
training_simple_small = simple_keys[:training_simple_count]
remaining_simple_small = simple_keys[training_simple_count:]
validation_simple_small = remaining_simple_small[:validation_simple_count]
test_simple_small = remaining_simple_small[validation_simple_count:validation_simple_count + test_simple_count]

# Step 4: Combine `complex` and `simple` entries for small datasets
training_small = {**{k: complex_items[k] for k in training_complex_small},
                  **{k: simple_items[k] for k in training_simple_small}}
validation_small = {**{k: complex_items[k] for k in validation_complex_small},
                    **{k: simple_items[k] for k in validation_simple_small}}
test_small = {**{k: complex_items[k] for k in test_complex_small},
              **{k: simple_items[k] for k in test_simple_small}}

# Step 5: Verify and save the small datasets

training_small_no_complexity = remove_complexity_field(training_small)
validation_small_no_complexity = remove_complexity_field(validation_small)
test_small_no_complexity = remove_complexity_field(test_small)

In [126]:
print(f"Training dataset: {len(training_small)} entries")
print(f"Validation dataset: {len(validation_small)} entries")
print(f"Test dataset: {len(test_small)} entries")

Training dataset: 200 entries
Validation dataset: 200 entries
Test dataset: 200 entries


In [124]:
# Save the small datasets to JSON files (with complexity)
with open("data/fine_tuning_datasets/training_small_with_complexity.json", "w", encoding='utf-8') as f:
    json.dump(training_small, f, ensure_ascii=False, indent=4)
with open("data/fine_tuning_datasets/validation_small_with_complexity.json", "w", encoding='utf-8') as f:
    json.dump(validation_small, f, ensure_ascii=False, indent=4)
with open("data/fine_tuning_datasets/test_small_with_complexity.json", "w", encoding='utf-8') as f:
    json.dump(test_small, f, ensure_ascii=False, indent=4)

In [125]:
# Save the small datasets to JSON files
with open("data/fine_tuning_datasets/training_small.json", "w", encoding='utf-8') as f:
    json.dump(training_small_no_complexity, f, ensure_ascii=False, indent=4)
with open("data/fine_tuning_datasets/validation_small.json", "w", encoding='utf-8') as f:
    json.dump(validation_small_no_complexity, f, ensure_ascii=False, indent=4)
with open("data/fine_tuning_datasets/test_small.json", "w", encoding='utf-8') as f:
    json.dump(test_small_no_complexity, f, ensure_ascii=False, indent=4)

#### Dataset 600 elements

In [129]:
# Calculate how many lines for `complex` and `simple` in each medium dataset
training_complex_count = int(TOTAL_LINES_MEDIUM * TRAINING_RATIO * complex_ratio_training)
validation_complex_count = int(TOTAL_LINES_MEDIUM * VALIDATION_RATIO * complex_ratio_validation)
test_complex_count = int(TOTAL_LINES_MEDIUM * TEST_RATIO * complex_ratio_test)

training_simple_count = TOTAL_LINES_MEDIUM - training_complex_count
validation_simple_count = TOTAL_LINES_MEDIUM - validation_complex_count
test_simple_count = TOTAL_LINES_MEDIUM - test_complex_count

# Step 1: Shuffle the `complex` and `simple` keys for random selection
random.shuffle(complex_keys)
random.shuffle(simple_keys)

# Step 2: Select `complex` entries for medium datasets
training_complex_medium = complex_keys[:training_complex_count]
remaining_complex_medium = complex_keys[training_complex_count:]
validation_complex_medium = remaining_complex_medium[:validation_complex_count]
test_complex_medium = remaining_complex_medium[validation_complex_count:validation_complex_count + test_complex_count]

# Step 3: Select `simple` entries for medium datasets
training_simple_medium = simple_keys[:training_simple_count]
remaining_simple_medium = simple_keys[training_simple_count:]
validation_simple_medium = remaining_simple_medium[:validation_simple_count]
test_simple_medium = remaining_simple_medium[validation_simple_count:validation_simple_count + test_simple_count]

# Step 4: Combine `complex` and `simple` entries for medium datasets
training_medium = {**{k: complex_items[k] for k in training_complex_medium},
                  **{k: simple_items[k] for k in training_simple_medium}}
validation_medium = {**{k: complex_items[k] for k in validation_complex_medium},
                    **{k: simple_items[k] for k in validation_simple_medium}}
test_medium = {**{k: complex_items[k] for k in test_complex_medium},
              **{k: simple_items[k] for k in test_simple_medium}}

# Step 5: Verify and save the medium datasets

training_medium_no_complexity = remove_complexity_field(training_medium)
validation_medium_no_complexity = remove_complexity_field(validation_medium)
test_medium_no_complexity = remove_complexity_field(test_medium)

In [130]:
print(f"Training dataset: {len(training_medium)} entries")
print(f"Validation dataset: {len(validation_medium)} entries")
print(f"Test dataset: {len(test_medium)} entries")

Training dataset: 600 entries
Validation dataset: 600 entries
Test dataset: 600 entries


In [131]:
# Save the medium datasets to JSON files (with complexity)
with open("data/fine_tuning_datasets/training_medium_with_complexity.json", "w", encoding='utf-8') as f:
    json.dump(training_medium, f, ensure_ascii=False, indent=4)
with open("data/fine_tuning_datasets/validation_medium_with_complexity.json", "w", encoding='utf-8') as f:
    json.dump(validation_medium, f, ensure_ascii=False, indent=4)
with open("data/fine_tuning_datasets/test_medium_with_complexity.json", "w", encoding='utf-8') as f:
    json.dump(test_medium, f, ensure_ascii=False, indent=4)

In [132]:
# Save the medium datasets to JSON files
with open("data/fine_tuning_datasets/training_medium.json", "w", encoding='utf-8') as f:
    json.dump(training_medium_no_complexity, f, ensure_ascii=False, indent=4)
with open("data/fine_tuning_datasets/validation_medium.json", "w", encoding='utf-8') as f:
    json.dump(validation_medium_no_complexity, f, ensure_ascii=False, indent=4)
with open("data/fine_tuning_datasets/test_medium.json", "w", encoding='utf-8') as f:
    json.dump(test_medium_no_complexity, f, ensure_ascii=False, indent=4)