In [1]:
# This notebook has been almost entirely generated using ChatGPT (ChatGPT 4o, November 26, 2024), and the help of Olivier Lauzanne

In [2]:
import json
import random

In [3]:
# The file containing these datasets is trial_1 (seed(0))

random.seed(0)

In [4]:
TRAINING_RATIO = 0.6
VALIDATION_RATIO = 0.2
TEST_RATIO = 0.2

TOTAL_LINES_SMALL = 200
TOTAL_LINES_MEDIUM = 1000

In [5]:
example_simple = """User input: "חזון ישעיהו בןאמוץ אשר חזה עליהודה וירושלם בימי עזיהו יותם אחז יחזקיהו מלכי יהודה"
Expected output: ["חזון ישעיהו בןאמוץ","אשר חזה עליהודה וירושלם בימי עזיהו יותם אחז יחזקיהו מלכי יהודה"]"""

example_complex = """User input: "יביא יהוה עליך ועלעמך ועלבית אביך ימים אשר לאבאו למיום סוראפרים מעל יהודה את מלך אשור פ"
Expected output: ["יביא יהוה עליך ועלעמך ועלבית אביך ימים את מלך אשור פ" ,"אשר לאבאו למיום" ,"סוראפרים מעל יהודה"]"""


SYSTEM_PROMPT = f"Identify clauses in this Biblical Hebrew verse and return a JSON list containing the clauses, as shown in the following examples. Do not write anything else than the JSON list in your output. Example 1 {example_simple} Example 2 {example_complex}"

In [6]:
# Function to save a dataset as JSONL

def save_as_jsonl(filename, dataset):
    """Convert a JSON file into JSONL format."""
    with open(filename, "w", encoding="utf-8") as f:
        for verse, record in dataset.items():
            # Include the verse as part of the JSON object
            record_with_verse = {"verse": verse, **record}
            f.write(json.dumps(record_with_verse, ensure_ascii=False) + "\n")
            

def remove_complexity_field(dataset):
    """Remove the complexity field from a json file."""
    return {key: {k: v for k, v in value.items() if k != "complexity"} for key, value in dataset.items()}


def prepare_finetuning_data(system_prompt, dataset, output_json_file, output_jsonl_file):
    """
    Converts a dataset into the format expected for fine-tuning and saves it to JSON and JSONL files.

    Parameters:
        dataset (dict): The dataset with verses as keys and clauses as values.
        output_json_file (str): Path to save the output JSON file.
        output_jsonl_file (str): Path to save the output JSONL file.
    """
    # System message content
    system_message = {
        "role": "system",
        "content": SYSTEM_PROMPT
    }

    # Create the formatted dataset
    formatted_data = []
    for verse, content in dataset.items():
        # Add system message
        messages = [system_message]

        # Add user message
        user_message = {
            "role": "user",
            "content": f"Parse this verse: {verse}"
        }
        messages.append(user_message)

        # Add assistant message
        assistant_message = {
            "role": "assistant",
            "content": json.dumps(content["clauses"], ensure_ascii=False)
        }
        messages.append(assistant_message)

        # Add to formatted data
        formatted_data.append({"messages": messages})

    # Randomise the order of the lines in formatted data
    random.shuffle(formatted_data)
    
    # Save as JSON
    with open(output_json_file, "w", encoding="utf-8") as f:
        json.dump(formatted_data, f, ensure_ascii=False, indent=4)

    # Save as JSONL
    with open(output_jsonl_file, "w", encoding="utf-8") as f:
        for entry in formatted_data:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

In [7]:
# step 1 Load the JSON file
file_path = "data/verses_clauses_dict.json"  # Replace with your JSON file path
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [8]:
# Extract complexities
complexities = [item['complexity'] for item in data.values()]

# Count occurrences of each complexity type
total = len(complexities)
complex_count = complexities.count("complex")
simple_count = complexities.count("simple")

# Complex / simple ratios
complex_ratio = complex_count / total
simple_ratio = simple_count / total

# Calculate percentages
complex_percentage = (complex_count / total) * 100
simple_percentage = (simple_count / total) * 100

In [9]:
# Print results
print(f"Total Entries: {total}")
print(f"Complex: {complex_count} ({complex_percentage:.2f}%)")
print(f"Simple: {simple_count} ({simple_percentage:.2f}%)")

Total Entries: 13084
Complex: 1434 (10.96%)
Simple: 11650 (89.04%)


In [11]:
# step 2: Generate fine-tuning simple datasets: only keep simple entries
complex_items = {k: v for k, v in data.items() if v['complexity'] == 'complex'}
simple_items = {k: v for k, v in data.items() if v['complexity'] == 'simple'}

In [12]:
# Step 3: Shuffle keys for randomness
complex_keys = list(complex_items.keys())
simple_keys = list(simple_items.keys())
random.shuffle(complex_keys)
random.shuffle(simple_keys)

In [13]:
# Step 4: Distribute `complex` items across datasets
complex_ratio_training = TRAINING_RATIO
complex_ratio_validation = VALIDATION_RATIO
complex_ratio_test = TEST_RATIO

# Total number of `complex` entries
num_complex = len(complex_keys)

# Calculate how many `complex` items go into each dataset
num_training_complex = int(num_complex * complex_ratio_training)
num_validation_complex = int(num_complex * complex_ratio_validation)
num_test_complex = num_complex - num_training_complex - num_validation_complex  # Remaining for test

# Split `complex` items
complex_training = complex_keys[:num_training_complex]
remaining_complex = complex_keys[num_training_complex:]
complex_validation = remaining_complex[:num_validation_complex]
complex_test = remaining_complex[num_validation_complex:]

In [37]:
# Step 5: Split `simple` items across datasets with adjustable ratios
training_ratio = TRAINING_RATIO 
validation_ratio = VALIDATION_RATIO  
test_ratio = TEST_RATIO 

# Total number of `simple` entries
num_simple = len(simple_keys)

# Calculate how many `simple` items go into each dataset
num_training_simple = int(num_simple * training_ratio)
num_validation_simple = int(num_simple * validation_ratio)
num_test_simple = num_simple - num_training_simple - num_validation_simple  # Remaining for test

# Split `simple` items
simple_training = simple_keys[:num_training_simple]
remaining_simple = simple_keys[num_training_simple:]
validation_simple = remaining_simple[:num_validation_simple]
test_simple = remaining_simple[num_validation_simple:]

In [14]:
# Step 6: Combine splits to form datasets
training = {**{k: complex_items[k] for k in complex_training}}
validation = {**{k: complex_items[k] for k in complex_validation}}
test = {**{k: complex_items[k] for k in complex_test}}

In [15]:
# Step 7: Output statistics and save datasets
print(f"Training dataset: {len(training)} entries")
print(f"Validation dataset: {len(validation)} entries")
print(f"Test dataset: {len(test)} entries")

Training dataset: 860 entries
Validation dataset: 286 entries
Test dataset: 288 entries


In [16]:
# step 2: Generate fine-tuning datasets: Separate `complex` and `simple` entries
complex_training_items = {k: v for k, v in training.items() if v['complexity'] == 'complex'}
simple_training_items = {k: v for k, v in training.items() if v['complexity'] == 'simple'}

# Step 3: Shuffle keys for randomness
complex_training_keys = list(complex_training_items.keys())
simple_training_keys = list(simple_training_items.keys())
random.shuffle(complex_training_keys)
random.shuffle(simple_training_keys)

In [17]:
# step 2: Generate fine-tuning datasets: Separate `complex` and `simple` entries
complex_validation_items = {k: v for k, v in validation.items() if v['complexity'] == 'complex'}
simple_validation_items = {k: v for k, v in validation.items() if v['complexity'] == 'simple'}

# Step 3: Shuffle keys for randomness
complex_validation_keys = list(complex_validation_items.keys())
simple_validation_keys = list(simple_validation_items.keys())
random.shuffle(complex_validation_keys)
random.shuffle(simple_validation_keys)

In [18]:
# step 2: Generate fine-tuning datasets: Separate `complex` and `simple` entries
complex_test_items = {k: v for k, v in test.items() if v['complexity'] == 'complex'}
simple_test_items = {k: v for k, v in test.items() if v['complexity'] == 'simple'}

# Step 3: Shuffle keys for randomness
complex_test_keys = list(complex_test_items.keys())
simple_test_keys = list(simple_test_items.keys())
random.shuffle(complex_test_keys)
random.shuffle(simple_test_keys)

In [19]:
# Check distribution of complex and simple elements between datasets
training_complex = sum(1 for v in training.values() if v.get("complexity") == "complex")
validation_complex = sum(1 for v in validation.values() if v.get("complexity") == "complex")
test_complex = sum(1 for v in test.values() if v.get("complexity") == "complex")

print(f"Complex in training: {training_complex}")
print(f"Complex in validation: {validation_complex}")
print(f"Complex in test: {test_complex}")

Complex in training: 860
Complex in validation: 286
Complex in test: 288


In [22]:
# Optional: Save datasets to JSON files with complexity
with open("data/fine_tuning_datasets/trial_4/training_with_complexity_complex.json", "w", encoding='utf-8') as f:
    json.dump(training, f, ensure_ascii=False, indent=4)
with open("data/fine_tuning_datasets/trial_4/validation_with_complexity_complex.json", "w", encoding='utf-8') as f:
    json.dump(validation, f, ensure_ascii=False, indent=4)
with open("data/fine_tuning_datasets/trial_4/test_with_complexity_complex.json", "w", encoding='utf-8') as f:
    json.dump(test, f, ensure_ascii=False, indent=4)

In [23]:
# Remove the complexity field
# Save dataset without complexity

training_no_complexity = remove_complexity_field(training)
validation_no_complexity = remove_complexity_field(validation)
test_no_complexity = remove_complexity_field(test)

In [24]:
print(f"Training dataset: {len(training_no_complexity)} entries")
print(f"Validation dataset: {len(validation_no_complexity)} entries")
print(f"Test dataset: {len(test_no_complexity)} entries")

Training dataset: 860 entries
Validation dataset: 286 entries
Test dataset: 288 entries


In [25]:
# Save to json and jsonl files

# training_large
prepare_finetuning_data(SYSTEM_PROMPT, training_no_complexity, "data/fine_tuning_datasets/trial_4/training_complex_large.json", "data/fine_tuning_datasets/trial_4/training_complex_large.jsonl")

# validation_large
prepare_finetuning_data(SYSTEM_PROMPT, validation_no_complexity, "data/fine_tuning_datasets/trial_4/validation_complex_large.json", "data/fine_tuning_datasets/trial_4/validation_complex_large.jsonl")

# test_large
prepare_finetuning_data(SYSTEM_PROMPT, test_no_complexity, "data/fine_tuning_datasets/trial_4/test_complex_large.json", "data/fine_tuning_datasets/trial_4/test_complex_large.jsonl")

#### Dataset 200 elements

In [28]:
# Calculate how many lines for `complex` and `simple` in each small dataset
training_complex_count = int(TOTAL_LINES_SMALL * 100)
validation_complex_count = int(TOTAL_LINES_SMALL * 100)
test_complex_count = int(TOTAL_LINES_SMALL * 100)

training_simple_count = TOTAL_LINES_SMALL - training_complex_count
validation_simple_count = TOTAL_LINES_SMALL - validation_complex_count
test_simple_count = TOTAL_LINES_SMALL - test_complex_count

# Step 2: Select `complex` entries for small datasets
training_complex_small = complex_training_keys[:training_complex_count]
validation_complex_small = complex_validation_keys[:validation_complex_count]
test_complex_small = complex_test_keys[:test_complex_count]

# Step 3: Select `simple` entries for small datasets
training_simple_small = simple_training_keys[:training_simple_count]
validation_simple_small = simple_validation_keys[:validation_simple_count]
test_simple_small = simple_test_keys[:test_simple_count]

# Step 4: Combine `complex` and `simple` entries for small datasets
training_small = {**{k: complex_items[k] for k in training_complex_small}}
validation_small = {**{k: complex_items[k] for k in validation_complex_small}}
test_small = {**{k: complex_items[k] for k in test_complex_small}}

# Step 5: Verify and save the small datasets

training_small_no_complexity = remove_complexity_field(training_small)
validation_small_no_complexity = remove_complexity_field(validation_small)
test_small_no_complexity = remove_complexity_field(test_small)

In [29]:
print(f"Training dataset: {len(training_small)} entries")
print(f"Validation dataset: {len(validation_small)} entries")
print(f"Test dataset: {len(test_small)} entries")

Training dataset: 860 entries
Validation dataset: 286 entries
Test dataset: 288 entries


In [50]:
# Save the small datasets to JSON files (with complexity)
with open("data/fine_tuning_datasets/trial_2/training_small_with_complexity_simple.json", "w", encoding='utf-8') as f:
    json.dump(training_small, f, ensure_ascii=False, indent=4)
with open("data/fine_tuning_datasets/trial_2/validation_small_with_complexity_simple.json", "w", encoding='utf-8') as f:
    json.dump(validation_small, f, ensure_ascii=False, indent=4)
with open("data/fine_tuning_datasets/trial_2/test_small_with_complexity_simple.json", "w", encoding='utf-8') as f:
    json.dump(test_small, f, ensure_ascii=False, indent=4)

In [51]:
# Save to json and jsonl files for fine-tuning

# training_large
prepare_finetuning_data(SYSTEM_PROMPT, training_small_no_complexity, "data/fine_tuning_datasets/trial_2/training_simple_small.json", "data/fine_tuning_datasets/trial_2/training_simple_small.jsonl")

# validation_large
prepare_finetuning_data(SYSTEM_PROMPT, validation_small_no_complexity, "data/fine_tuning_datasets/trial_2/validation_simple_small.json", "data/fine_tuning_datasets/trial_2/validation_simple_small.jsonl")

# test_large
prepare_finetuning_data(SYSTEM_PROMPT, test_small_no_complexity, "data/fine_tuning_datasets/trial_2/test_simple_small.json", "data/fine_tuning_datasets/trial_2/test_simple_small.jsonl")

#### Dataset 1000 elements

In [30]:
# Calculate how many lines for `complex` and `simple` in each medium dataset
training_complex_count = int(TOTAL_LINES_MEDIUM * 100)
validation_complex_count = int(TOTAL_LINES_MEDIUM * 100)
test_complex_count = int(TOTAL_LINES_MEDIUM * 100)

training_simple_count = TOTAL_LINES_MEDIUM - training_complex_count
validation_simple_count = TOTAL_LINES_MEDIUM - validation_complex_count
test_simple_count = TOTAL_LINES_MEDIUM - test_complex_count


# Step 2: Select `complex` entries for medium datasets
training_complex_medium = complex_training_keys[:training_complex_count]
validation_complex_medium = complex_validation_keys[:validation_complex_count]
test_complex_medium = complex_test_keys[:test_complex_count]

# Step 3: Select `simple` entries for medium datasets
training_simple_medium = simple_training_keys[:training_simple_count]
validation_simple_medium = simple_validation_keys[:validation_simple_count]
test_simple_medium = simple_test_keys[:test_simple_count]

# Step 4: Combine `complex` and `simple` entries for medium datasets
training_medium = {**{k: complex_items[k] for k in training_complex_medium},
                  **{k: simple_items[k] for k in training_simple_medium}}
validation_medium = {**{k: complex_items[k] for k in validation_complex_medium},
                    **{k: simple_items[k] for k in validation_simple_medium}}
test_medium = {**{k: complex_items[k] for k in test_complex_medium},
              **{k: simple_items[k] for k in test_simple_medium}}

# Step 5: Verify and save the medium datasets

training_medium_no_complexity = remove_complexity_field(training_medium)
validation_medium_no_complexity = remove_complexity_field(validation_medium)
test_medium_no_complexity = remove_complexity_field(test_medium)

In [31]:
print(f"Training dataset: {len(training_medium)} entries")
print(f"Validation dataset: {len(validation_medium)} entries")
print(f"Test dataset: {len(test_medium)} entries")

Training dataset: 860 entries
Validation dataset: 286 entries
Test dataset: 288 entries


In [54]:
# Save the medium datasets to JSON files (with complexity)
with open("data/fine_tuning_datasets/trial_2/training_medium_with_complexity_simple.json", "w", encoding='utf-8') as f:
    json.dump(training_medium, f, ensure_ascii=False, indent=4)
with open("data/fine_tuning_datasets/trial_2/validation_medium_with_complexity_simple.json", "w", encoding='utf-8') as f:
    json.dump(validation_medium, f, ensure_ascii=False, indent=4)
with open("data/fine_tuning_datasets/trial_2/test_medium_with_complexity_simple.json", "w", encoding='utf-8') as f:
    json.dump(test_medium, f, ensure_ascii=False, indent=4)

In [55]:
# Save to json and jsonl files for fine-tuning

# training_large
prepare_finetuning_data(SYSTEM_PROMPT, training_medium_no_complexity, "data/fine_tuning_datasets/trial_2/training_simple_medium.json", "data/fine_tuning_datasets/trial_2/training_simple_medium.jsonl")

# validation_large
prepare_finetuning_data(SYSTEM_PROMPT, validation_medium_no_complexity, "data/fine_tuning_datasets/trial_2/validation_simple_medium.json", "data/fine_tuning_datasets/trial_2/validation_simple_medium.jsonl")

# test_large
prepare_finetuning_data(SYSTEM_PROMPT, test_medium_no_complexity, "data/fine_tuning_datasets/trial_2/test_simple_medium.json", "data/fine_tuning_datasets/trial_2/test_simple_medium.jsonl")

In [56]:
# Check intersections between variables

datasets = {
    "training_small": training_small,
    "training_medium": training_medium,
    "training": training,
    "validation_small": validation_small,
    "validation_medium": validation_medium,
    "validation": validation,
    "test_small": test_small,
    "test_medium": test_medium,
    "test": test,
               }

In [57]:
for name1, dataset1 in datasets.items():
    for name2, dataset2 in datasets.items():
        if name1 == name2:
            continue
        keys1 = set(dataset1.keys())
        keys2 = set(dataset2.keys())
        intersection = len(keys1 & keys2)
        if intersection != 0:
            print(f"Overlap between {name1} and {name2}: {intersection}")

Overlap between training_small and training_medium: 200
Overlap between training_small and training: 200
Overlap between training_medium and training_small: 200
Overlap between training_medium and training: 1000
Overlap between training and training_small: 200
Overlap between training and training_medium: 1000
Overlap between validation_small and validation_medium: 200
Overlap between validation_small and validation: 200
Overlap between validation_medium and validation_small: 200
Overlap between validation_medium and validation: 1000
Overlap between validation and validation_small: 200
Overlap between validation and validation_medium: 1000
Overlap between test_small and test_medium: 200
Overlap between test_small and test: 200
Overlap between test_medium and test_small: 200
Overlap between test_medium and test: 1000
Overlap between test and test_small: 200
Overlap between test and test_medium: 1000


In [227]:
# Check the amount of complex and simple in each dataset

In [58]:
for name, dataset in datasets.items():
    complexities = [item['complexity'] for item in dataset.values()]

    # Count occurrences of each complexity type
    total = len(complexities)
    complex_count = complexities.count("complex")
    simple_count = complexities.count("simple")
    
    # Calculate percentages
    complex_percentage = (complex_count / total) * 100
    simple_percentage = (simple_count / total) * 100
    
    # Print results
    print(f"{name}")
    print(f"Total Entries: {total}")
    print(f"Complex: {complex_count} ({complex_percentage:.2f}%)")
    print(f"Simple: {simple_count} ({simple_percentage:.2f}%)")
    print("")

training_small
Total Entries: 200
Complex: 0 (0.00%)
Simple: 200 (100.00%)

training_medium
Total Entries: 1000
Complex: 0 (0.00%)
Simple: 1000 (100.00%)

training
Total Entries: 6990
Complex: 0 (0.00%)
Simple: 6990 (100.00%)

validation_small
Total Entries: 200
Complex: 0 (0.00%)
Simple: 200 (100.00%)

validation_medium
Total Entries: 1000
Complex: 0 (0.00%)
Simple: 1000 (100.00%)

validation
Total Entries: 2330
Complex: 0 (0.00%)
Simple: 2330 (100.00%)

test_small
Total Entries: 200
Complex: 0 (0.00%)
Simple: 200 (100.00%)

test_medium
Total Entries: 1000
Complex: 0 (0.00%)
Simple: 1000 (100.00%)

test
Total Entries: 2330
Complex: 0 (0.00%)
Simple: 2330 (100.00%)

