In [17]:
import re
import json
import os

def create_comprehensive_training_data(cobol_file_path, output_file_path):
    """Extract categorized COBOL snippets from one source file into a JSON file.

    Each pattern in the table below is searched over the whole file. Every hit
    becomes a dict with the keys ``type``, ``description``, ``code`` and
    ``comments``. ``code`` is the matched text plus up to two following lines
    of context. Duplicate entries (same type, code and description) are
    dropped, and the list is written to ``output_file_path`` as indented JSON.

    Args:
        cobol_file_path: Path of the COBOL source; read as latin-1 text.
        output_file_path: Path of the JSON file to write (overwritten).
    """
    # Load the COBOL file content
    with open(cobol_file_path, 'r', encoding='latin1', errors='ignore') as file:
        file_contents = file.read()

    # Group 1 of every pattern is the primary construct; the trailing
    # non-capturing group adds up to two lines of context after it.
    patterns = {
        "metadata_comments": re.compile(r'(\*AUTHOR.*|UPDATE\s+[A-Za-z0-9 ]{3,})(?:\n.*){0,2}', re.IGNORECASE),
        "module_interaction": re.compile(r'(CALLS?\s+THE\s+[A-Z0-9_-]+\s+MODULES?)(?:\n.*){0,2}', re.IGNORECASE),
        "responsibility_documentation": re.compile(r'(^\*.*RESPONSIBILITY.*)(?:\n.*){0,2}', re.MULTILINE),
        "procedure_outline": re.compile(r'(\*\s+THE PROGRAM WILL:.*\n(?:\*\s+\d+\..*)+)(?:\n.*){0,2}', re.MULTILINE),
        "code_snippet_with_comments": re.compile(r'((\*.*\n)+.*\n(?:.*\n){0,2})', re.MULTILINE),
        "error_pattern": re.compile(r'(\b(INVALID|NOT NUMERIC|ERROR)[A-Za-z0-9 ,.]*)(?:\n.*){0,2}', re.IGNORECASE),
        "data_structure": re.compile(r'(^\s*\d+\s+01\s+[A-Z0-9-]{3,})(?:\n.*){0,2}', re.MULTILINE),
        "plain_language_documentation": re.compile(r'(^\s*\*\s+[A-Za-z0-9 ,]+)(?:\n.*){0,2}', re.MULTILINE),
        "initialization_pattern": re.compile(r'(MOVE\s+[\'"A-Za-z0-9 ]+\s+TO\s+[A-Z0-9-]+)(?:\n.*){0,2}', re.MULTILINE),
        "bulk_initialization": re.compile(r'(MOVE\s+ALL\s+[\'"0]+\s+TO\s+([A-Z0-9-]+\s*)+)(?:\n.*){0,2}', re.MULTILINE),
        "control_flow_with_thru": re.compile(r'(PERFORM\s+[A-Z0-9-]+\s+THRU\s+[A-Z0-9-]+)(?:\n.*){0,2}', re.MULTILINE),
        "conditional_logic": re.compile(r'(^\s*\d+\s+IF\s+.*)(?:\n.*){0,2}', re.MULTILINE),
        "goto_statement": re.compile(r'(^\s*\d+\s+GOTO\s+[A-Z0-9-]+)(?:\n.*){0,2}', re.MULTILINE),
        "perform_statement": re.compile(r'(^\s*\d+\s+PERFORM\s+[A-Z0-9-]+)(?:\n.*){0,2}', re.MULTILINE),
        "procedure_call": re.compile(r'(^\s*\d+\s+CALL\s+[A-Z0-9_-]+)(?:\n.*){0,2}', re.MULTILINE),
        "section_header": re.compile(r'(^\s*\d+\s+[A-Z0-9-]+-\w+)(?:\n.*){0,2}', re.MULTILINE)
    }

    # Fixed explanatory text attached to every entry of a category.
    category_comments = {
        "metadata_comments": "Metadata and author information about the program.",
        "module_interaction": "Indicates interaction with external modules for modular functionality.",
        "responsibility_documentation": "Documentation of responsibility, usually at the beginning of the program, for maintenance and accuracy.",
        "procedure_outline": "Step-by-step procedural outline describing the program’s tasks.",
        "code_snippet_with_comments": "Example code snippet with accompanying comments.",
        "error_pattern": "Error or validation pattern to ensure data integrity.",
        "data_structure": "Definition of a data structure within the Data Division.",
        "plain_language_documentation": "Plain language documentation or setup instruction.",
        "initialization_pattern": "Sets variable to an initial value to establish default states.",
        "bulk_initialization": "Initializes multiple variables at once to ensure a clean state for processing.",
        "control_flow_with_thru": "Performs a range of steps in sequence using the THRU construct.",
        "conditional_logic": "Executes code based on specific conditions, such as flag checks or value validation.",
        "goto_statement": "GOTO statement for branching control flow to a specific section.",
        "perform_statement": "PERFORM statement to invoke a specific routine or section.",
        "procedure_call": "Calls an external procedure or subroutine, typically for modular functionality.",
        "section_header": "Section header, indicating the start of a new code block or routine.",
    }

    # Matches text whose first line consists only of '*', '|' or '-' characters.
    exclusion_patterns = re.compile(r'^\s*[\*|-]{2,}\s*$', re.MULTILINE)
    training_data = []
    seen_entries = set()

    # Extract and structure data for each pattern
    for category, pattern in patterns.items():
        description = f"{category.replace('_', ' ').capitalize()} example"
        # finditer() + group(0) keeps the trailing context lines. findall()
        # returns only the capturing groups, which dropped that context.
        for match in pattern.finditer(file_contents):
            primary = match.group(1)
            if exclusion_patterns.match(primary):
                continue
            # A hit that is nothing but the word ERROR carries no information.
            if primary.strip().upper() == "ERROR":
                continue

            entry = {
                "type": category,
                "description": description,
                "code": match.group(0).strip(),
                "comments": category_comments.get(category, "")
            }

            entry_key = (entry["type"], entry["code"], entry["description"])
            if entry_key not in seen_entries:
                seen_entries.add(entry_key)
                training_data.append(entry)

    with open(output_file_path, 'w') as output_file:
        json.dump(training_data, output_file, indent=4)

    print(f"Comprehensive training data saved to {output_file_path}")

def process_all_files_in_directory(
    cobol_directory='/Users/kjevaji/Code/langchain-examples/_cobol',
    output_directory='/Users/kjevaji/Code/langchain-examples/output',
):
    """Run the training-data extraction on every file of a directory.

    One output file is written per input file, under the same file name, inside
    ``output_directory`` (created when missing).

    Args:
        cobol_directory: Directory holding the COBOL sources.
        output_directory: Directory receiving the generated JSON files.
    """
    os.makedirs(output_directory, exist_ok=True)
    for cobol_file_name in os.listdir(cobol_directory):
        cobol_file_path = os.path.join(cobol_directory, cobol_file_name)
        # os.listdir() also returns sub-directories; opening one would raise
        # IsADirectoryError, which the handler below does not cover.
        if not os.path.isfile(cobol_file_path):
            continue
        output_file_path = os.path.join(output_directory, cobol_file_name)
        try:
            create_comprehensive_training_data(cobol_file_path, output_file_path)
        except UnicodeDecodeError:
            print(f"Skipping file due to encoding error: {cobol_file_name}")

# Run the batch extraction with the function's built-in directory paths when this cell executes.
process_all_files_in_directory()


Comprehensive training data saved to /Users/kjevaji/Code/langchain-examples/output/PPCAL119
Comprehensive training data saved to /Users/kjevaji/Code/langchain-examples/output/PPCAL14B
Comprehensive training data saved to /Users/kjevaji/Code/langchain-examples/output/PPDRV215
Comprehensive training data saved to /Users/kjevaji/Code/langchain-examples/output/PPCAL156
Comprehensive training data saved to /Users/kjevaji/Code/langchain-examples/output/PPCAL135
Comprehensive training data saved to /Users/kjevaji/Code/langchain-examples/output/PPCAL125
Comprehensive training data saved to /Users/kjevaji/Code/langchain-examples/output/PPCAL163


In [15]:
# Single-file run: extract training data for one COBOL program.
cobol_file_name = 'PPCAL163'
# Replace with the path to your COBOL source file
cobol_file_path = f'/Users/kjevaji/Code/langchain-examples/_cobol/{cobol_file_name}'
# Output path for the training data JSON (same file name as the source, in the output folder)
output_file_path = f'/Users/kjevaji/Code/langchain-examples/output/{cobol_file_name}'
# Writes the JSON file and prints the path it was saved to.
create_comprehensive_training_data(cobol_file_path, output_file_path)


Comprehensive training data saved to /Users/kjevaji/Code/langchain-examples/output/PPCAL163


In [9]:
import os

# Define the directories
cobol_directory = '/Users/kjevaji/Code/langchain-examples/_cobol'
output_directory = '/Users/kjevaji/Code/langchain-examples/output'

# Ensure output directory exists
os.makedirs(output_directory, exist_ok=True)

# Loop over all files in the COBOL directory
for cobol_file_name in os.listdir(cobol_directory):
    cobol_file_path = os.path.join(cobol_directory, cobol_file_name)
    # os.listdir() also returns sub-directories; opening one would raise
    # IsADirectoryError, which the handler below does not cover.
    if not os.path.isfile(cobol_file_path):
        continue
    output_file_path = os.path.join(output_directory, cobol_file_name)

    # Run the function on each file
    try:
        create_comprehensive_training_data(cobol_file_path, output_file_path)
    except UnicodeDecodeError:
        print(f"Skipping file due to encoding error: {cobol_file_name}")


In [18]:
import re
import json
import os
import random

def create_comprehensive_training_data(cobol_file_path, output_file_path):
    """Extract categorized COBOL snippets from one source file into a JSON file.

    Each category pattern is searched over the whole file. Every hit becomes a
    dict with the keys ``type``, ``description``, ``code`` and ``comments``.
    ``code`` is the matched construct followed by some trailing context; the
    amount of context is drawn at random (2 to 7 line ends) once per category
    per call. Duplicate entries are dropped and the list is written to
    ``output_file_path`` as indented JSON.

    Args:
        cobol_file_path: Path of the COBOL source; read as latin-1 text.
        output_file_path: Path of the JSON file to write (overwritten).
    """
    # Load the COBOL file content
    with open(cobol_file_path, 'r', encoding='latin1', errors='ignore') as file:
        file_contents = file.read()

    def extended_pattern(base_pattern):
        """Compile ``base_pattern`` followed by an optional block of context."""
        context_line_ends = random.randint(2, 7)
        # The base pattern must be grouped: several of them contain a top-level
        # '|', and without the group the context suffix would bind only to the
        # last alternative. The first '.*\n' of the context consumes the rest
        # of the line the construct sits on.
        return re.compile(
            r'(?P<primary>' + base_pattern + r')'
            + r'(?P<context>(?:.*\n){' + str(context_line_ends) + r'})?',
            re.IGNORECASE | re.MULTILINE
        )

    patterns = {
        "metadata_comments": extended_pattern(r'\*AUTHOR.*|UPDATE\s+[A-Za-z0-9 ]{3,}'),
        "module_interaction": extended_pattern(r'CALLS?\s+THE\s+[A-Z0-9_-]+\s+MODULES?'),
        "responsibility_documentation": extended_pattern(r'^\*.*RESPONSIBILITY.*'),
        "procedure_outline": extended_pattern(r'\*\s+THE PROGRAM WILL:.*\n(\*\s+\d+\.\s+[A-Za-z0-9 ,]+.*\n)+'),
        "code_snippet_with_comments": extended_pattern(r'(\*.*\n)+.*\n'),
        "error_pattern": extended_pattern(r'\b(INVALID|NOT NUMERIC|ERROR)[A-Za-z0-9 ,.]*'),
        "data_structure": extended_pattern(r'^\s*\d+\s+01\s+[A-Z0-9-]{3,}'),
        "plain_language_documentation": extended_pattern(r'^\s*\*\s+[A-Za-z0-9 ,]+'),
        "initialization_pattern": extended_pattern(r'MOVE\s+[\'"A-Za-z0-9 ]+\s+TO\s+[A-Z0-9-]+'),
        "bulk_initialization": extended_pattern(r'MOVE\s+ALL\s+[\'"0]+\s+TO\s+([A-Z0-9-]+\s*)+'),
        "control_flow_with_thru": extended_pattern(r'PERFORM\s+[A-Z0-9-]+\s+THRU\s+[A-Z0-9-]+'),
        "conditional_logic": extended_pattern(r'^\s*\d+\s+IF\s+.*'),
        "goto_statement": extended_pattern(r'^\s*\d+\s+GOTO\s+[A-Z0-9-]+'),
        "perform_statement": extended_pattern(r'^\s*\d+\s+PERFORM\s+[A-Z0-9-]+'),
        "procedure_call": extended_pattern(r'^\s*\d+\s+CALL\s+[A-Z0-9_-]+'),
        "section_header": extended_pattern(r'^\s*\d+\s+[A-Z0-9-]+-\w+')
    }

    # Fixed explanatory text attached to every entry of a category.
    category_comments = {
        "metadata_comments": "Metadata and author information about the program.",
        "module_interaction": "Indicates interaction with external modules for modular functionality.",
        "responsibility_documentation": "Documentation of responsibility, usually at the beginning of the program, for maintenance and accuracy.",
        "procedure_outline": "Step-by-step procedural outline describing the program’s tasks.",
        "code_snippet_with_comments": "Example code snippet with accompanying comments.",
        "error_pattern": "Error or validation pattern to ensure data integrity.",
        "data_structure": "Definition of a data structure within the Data Division.",
        "plain_language_documentation": "Plain language documentation or setup instruction.",
        "initialization_pattern": "Sets variable to an initial value to establish default states.",
        "bulk_initialization": "Initializes multiple variables at once to ensure a clean state for processing.",
        "control_flow_with_thru": "Performs a range of steps in sequence using the THRU construct.",
        "conditional_logic": "Executes code based on specific conditions, such as flag checks or value validation.",
        "goto_statement": "GOTO statement for branching control flow to a specific section.",
        "perform_statement": "PERFORM statement to invoke a specific routine or section.",
        "procedure_call": "Calls an external procedure or subroutine, typically for modular functionality.",
        "section_header": "Section header, indicating the start of a new code block or routine.",
    }

    # Matches text whose first line consists only of '*', '|' or '-' characters.
    exclusion_patterns = re.compile(r'^\s*[\*|-]{2,}\s*$', re.MULTILINE)
    training_data = []
    seen_entries = set()

    # Extract and structure data for each pattern
    for category, pattern in patterns.items():
        description = f"{category.replace('_', ' ').capitalize()} example"
        for match in pattern.finditer(file_contents):
            match_text = match.group().strip()
            if exclusion_patterns.match(match_text):
                continue
            # Judge the construct itself: once context is appended the whole
            # text is practically never equal to the bare word ERROR.
            if match.group("primary").strip().upper() == "ERROR":
                continue

            entry = {
                "type": category,
                "description": description,
                "code": match_text,
                "comments": category_comments.get(category, "")
            }

            entry_key = (entry["type"], entry["code"], entry["description"])
            if entry_key not in seen_entries:
                seen_entries.add(entry_key)
                training_data.append(entry)

    with open(output_file_path, 'w') as output_file:
        json.dump(training_data, output_file, indent=4)

    print(f"Comprehensive training data saved to {output_file_path}")

def process_all_files_in_directory(
    cobol_directory='/Users/kjevaji/Code/langchain-examples/_cobol',
    output_directory='/Users/kjevaji/Code/langchain-examples/output',
):
    """Run the training-data extraction on every file of a directory.

    One output file is written per input file, under the same file name, inside
    ``output_directory`` (created when missing).

    Args:
        cobol_directory: Directory holding the COBOL sources.
        output_directory: Directory receiving the generated JSON files.
    """
    os.makedirs(output_directory, exist_ok=True)
    for cobol_file_name in os.listdir(cobol_directory):
        cobol_file_path = os.path.join(cobol_directory, cobol_file_name)
        # os.listdir() also returns sub-directories; opening one would raise
        # IsADirectoryError, which the handler below does not cover.
        if not os.path.isfile(cobol_file_path):
            continue
        output_file_path = os.path.join(output_directory, cobol_file_name)
        try:
            create_comprehensive_training_data(cobol_file_path, output_file_path)
        except UnicodeDecodeError:
            print(f"Skipping file due to encoding error: {cobol_file_name}")

# Run the batch extraction with the function's built-in directory paths when this cell executes.
process_all_files_in_directory()


Comprehensive training data saved to /Users/kjevaji/Code/langchain-examples/output/PPCAL119
Comprehensive training data saved to /Users/kjevaji/Code/langchain-examples/output/PPCAL14B
Comprehensive training data saved to /Users/kjevaji/Code/langchain-examples/output/PPDRV215
Comprehensive training data saved to /Users/kjevaji/Code/langchain-examples/output/PPCAL156
Comprehensive training data saved to /Users/kjevaji/Code/langchain-examples/output/PPCAL135
Comprehensive training data saved to /Users/kjevaji/Code/langchain-examples/output/PPCAL125
Comprehensive training data saved to /Users/kjevaji/Code/langchain-examples/output/PPCAL163


In [19]:
import re
import json
import os
import random

def create_comprehensive_training_data(cobol_file_path, output_file_path):
    """Extract categorized COBOL snippets from one source file into a JSON file.

    Each category pattern is searched over the whole file. Every hit becomes a
    dict with the keys ``type``, ``description``, ``code`` and ``comments``.
    ``code`` is one contiguous slice of the source: it starts where the match
    starts, runs to the end of the line on which the match ends, and then
    continues over a random number (2 to 7, drawn per match) of following
    lines. Duplicate entries are dropped and the list is written to
    ``output_file_path`` as indented JSON.

    Args:
        cobol_file_path: Path of the COBOL source; read as latin-1 text.
        output_file_path: Path of the JSON file to write (overwritten).
    """
    # Load the COBOL file content
    with open(cobol_file_path, 'r', encoding='latin1', errors='ignore') as file:
        file_contents = file.read()

    # Base pattern definitions without extra lines
    base_patterns = {
        "metadata_comments": r'\*AUTHOR.*|UPDATE\s+[A-Za-z0-9 ]{3,}',
        "module_interaction": r'CALLS?\s+THE\s+[A-Z0-9_-]+\s+MODULES?',
        "responsibility_documentation": r'^\*.*RESPONSIBILITY.*',
        "procedure_outline": r'\*\s+THE PROGRAM WILL:.*\n(\*\s+\d+\.\s+[A-Za-z0-9 ,]+.*\n)+',
        "code_snippet_with_comments": r'(\*.*\n)+.*\n',
        "error_pattern": r'\b(INVALID|NOT NUMERIC|ERROR)[A-Za-z0-9 ,.]*',
        "data_structure": r'^\s*\d+\s+01\s+[A-Z0-9-]{3,}',
        "plain_language_documentation": r'^\s*\*\s+[A-Za-z0-9 ,]+',
        "initialization_pattern": r'MOVE\s+[\'"A-Za-z0-9 ]+\s+TO\s+[A-Z0-9-]+',
        "bulk_initialization": r'MOVE\s+ALL\s+[\'"0]+\s+TO\s+([A-Z0-9-]+\s*)+',
        "control_flow_with_thru": r'PERFORM\s+[A-Z0-9-]+\s+THRU\s+[A-Z0-9-]+',
        "conditional_logic": r'^\s*\d+\s+IF\s+.*',
        "goto_statement": r'^\s*\d+\s+GOTO\s+[A-Z0-9-]+',
        "perform_statement": r'^\s*\d+\s+PERFORM\s+[A-Z0-9-]+',
        "procedure_call": r'^\s*\d+\s+CALL\s+[A-Z0-9_-]+',
        "section_header": r'^\s*\d+\s+[A-Z0-9-]+-\w+'
    }

    # Fixed explanatory text attached to every entry of a category.
    category_comments = {
        "metadata_comments": "Metadata and author information about the program.",
        "module_interaction": "Indicates interaction with external modules for modular functionality.",
        "responsibility_documentation": "Documentation of responsibility, usually at the beginning of the program, for maintenance and accuracy.",
        "procedure_outline": "Step-by-step procedural outline describing the program’s tasks.",
        "code_snippet_with_comments": "Example code snippet with accompanying comments.",
        "error_pattern": "Error or validation pattern to ensure data integrity.",
        "data_structure": "Definition of a data structure within the Data Division.",
        "plain_language_documentation": "Plain language documentation or setup instruction.",
        "initialization_pattern": "Sets variable to an initial value to establish default states.",
        "bulk_initialization": "Initializes multiple variables at once to ensure a clean state for processing.",
        "control_flow_with_thru": "Performs a range of steps in sequence using the THRU construct.",
        "conditional_logic": "Executes code based on specific conditions, such as flag checks or value validation.",
        "goto_statement": "GOTO statement for branching control flow to a specific section.",
        "perform_statement": "PERFORM statement to invoke a specific routine or section.",
        "procedure_call": "Calls an external procedure or subroutine, typically for modular functionality.",
        "section_header": "Section header, indicating the start of a new code block or routine.",
    }

    def next_line_start(position):
        """Return the index just past the next newline at or after ``position``."""
        newline_at = file_contents.find("\n", position)
        return len(file_contents) if newline_at == -1 else newline_at + 1

    def snippet_with_context(start, end, line_count):
        """Return the source from ``start`` through ``line_count`` lines after the match."""
        # Finish the line the match ends on, so a source line is never cut in
        # two. A match that already ends on a newline needs no completion.
        if not (end > start and file_contents[end - 1] == "\n"):
            end = next_line_start(end)
        for _ in range(line_count):
            end = next_line_start(end)
        return file_contents[start:end].strip()

    # Matches text whose first line consists only of '*', '|' or '-' characters.
    exclusion_patterns = re.compile(r'^\s*[\*|-]{2,}\s*$', re.MULTILINE)
    training_data = []
    seen_entries = set()

    # Extract and structure data for each pattern
    for category, base_pattern in base_patterns.items():
        pattern = re.compile(base_pattern, re.IGNORECASE | re.MULTILINE)
        description = f"{category.replace('_', ' ').capitalize()} example"

        for match in pattern.finditer(file_contents):
            # Generate a random number of lines (2-7) to include after each match
            extra_lines = random.randint(2, 7)

            # Judge the construct itself: once context is appended the whole
            # text is practically never equal to the bare word ERROR.
            if match.group().strip().upper() == "ERROR":
                continue

            full_match_text = snippet_with_context(match.start(), match.end(), extra_lines)

            # Skip entries matching edge cases
            if exclusion_patterns.match(full_match_text):
                continue

            entry = {
                "type": category,
                "description": description,
                "code": full_match_text,
                "comments": category_comments.get(category, "")
            }

            entry_key = (entry["type"], entry["code"], entry["description"])
            if entry_key not in seen_entries:
                seen_entries.add(entry_key)
                training_data.append(entry)

    with open(output_file_path, 'w') as output_file:
        json.dump(training_data, output_file, indent=4)

    print(f"Comprehensive training data saved to {output_file_path}")

def process_all_files_in_directory(
    cobol_directory='/Users/kjevaji/Code/langchain-examples/_cobol',
    output_directory='/Users/kjevaji/Code/langchain-examples/output',
):
    """Run the training-data extraction on every file of a directory.

    One output file is written per input file, under the same file name, inside
    ``output_directory`` (created when missing).

    Args:
        cobol_directory: Directory holding the COBOL sources.
        output_directory: Directory receiving the generated JSON files.
    """
    os.makedirs(output_directory, exist_ok=True)
    for cobol_file_name in os.listdir(cobol_directory):
        cobol_file_path = os.path.join(cobol_directory, cobol_file_name)
        # os.listdir() also returns sub-directories; opening one would raise
        # IsADirectoryError, which the handler below does not cover.
        if not os.path.isfile(cobol_file_path):
            continue
        output_file_path = os.path.join(output_directory, cobol_file_name)
        try:
            create_comprehensive_training_data(cobol_file_path, output_file_path)
        except UnicodeDecodeError:
            print(f"Skipping file due to encoding error: {cobol_file_name}")

# Run the batch extraction with the function's built-in directory paths when this cell executes.
process_all_files_in_directory()


Comprehensive training data saved to /Users/kjevaji/Code/langchain-examples/output/PPCAL119
Comprehensive training data saved to /Users/kjevaji/Code/langchain-examples/output/PPCAL14B
Comprehensive training data saved to /Users/kjevaji/Code/langchain-examples/output/PPDRV215
Comprehensive training data saved to /Users/kjevaji/Code/langchain-examples/output/PPCAL156
Comprehensive training data saved to /Users/kjevaji/Code/langchain-examples/output/PPCAL135
Comprehensive training data saved to /Users/kjevaji/Code/langchain-examples/output/PPCAL125
Comprehensive training data saved to /Users/kjevaji/Code/langchain-examples/output/PPCAL163


In [21]:
import re
import json
import os
import random

def create_comprehensive_training_data(cobol_file_path):
    """Extract categorized COBOL snippets from one source file.

    Each category pattern is searched over the whole file. Every hit becomes a
    dict with the keys ``type``, ``description``, ``code`` and ``comments``,
    where ``code`` is the stripped match text with no trailing context.
    Duplicate entries (same type, code and description) are dropped.

    Args:
        cobol_file_path: Path of the COBOL source; read as latin-1 text.

    Returns:
        The list of entry dicts, grouped by category in table order.
    """
    # Load the COBOL file content
    with open(cobol_file_path, 'r', encoding='latin1', errors='ignore') as file:
        file_contents = file.read()

    # Base pattern definitions without extra lines
    base_patterns = {
        "metadata_comments": r'\*AUTHOR.*|UPDATE\s+[A-Za-z0-9 ]{3,}',
        "module_interaction": r'CALLS?\s+THE\s+[A-Z0-9_-]+\s+MODULES?',
        "responsibility_documentation": r'^\*.*RESPONSIBILITY.*',
        "procedure_outline": r'\*\s+THE PROGRAM WILL:.*\n(\*\s+\d+\.\s+[A-Za-z0-9 ,]+.*\n)+',
        "code_snippet_with_comments": r'(\*.*\n)+.*\n',
        "error_pattern": r'\b(INVALID|NOT NUMERIC|ERROR)[A-Za-z0-9 ,.]*',
        "data_structure": r'^\s*\d+\s+01\s+[A-Z0-9-]{3,}',
        "plain_language_documentation": r'^\s*\*\s+[A-Za-z0-9 ,]+',
        "initialization_pattern": r'MOVE\s+[\'"A-Za-z0-9 ]+\s+TO\s+[A-Z0-9-]+',
        "bulk_initialization": r'MOVE\s+ALL\s+[\'"0]+\s+TO\s+([A-Z0-9-]+\s*)+',
        "control_flow_with_thru": r'PERFORM\s+[A-Z0-9-]+\s+THRU\s+[A-Z0-9-]+',
        "conditional_logic": r'^\s*\d+\s+IF\s+.*',
        "goto_statement": r'^\s*\d+\s+GOTO\s+[A-Z0-9-]+',
        "perform_statement": r'^\s*\d+\s+PERFORM\s+[A-Z0-9-]+',
        "procedure_call": r'^\s*\d+\s+CALL\s+[A-Z0-9_-]+',
        "section_header": r'^\s*\d+\s+[A-Z0-9-]+-\w+'
    }

    # Fixed explanatory text attached to every entry of a category.
    category_comments = {
        "metadata_comments": "Metadata and author information about the program.",
        "module_interaction": "Indicates interaction with external modules for modular functionality.",
        "responsibility_documentation": "Documentation of responsibility, usually at the beginning of the program, for maintenance and accuracy.",
        "procedure_outline": "Step-by-step procedural outline describing the program’s tasks.",
        "code_snippet_with_comments": "Example code snippet with accompanying comments.",
        "error_pattern": "Error or validation pattern to ensure data integrity.",
        "data_structure": "Definition of a data structure within the Data Division.",
        "plain_language_documentation": "Plain language documentation or setup instruction.",
        "initialization_pattern": "Sets variable to an initial value to establish default states.",
        "bulk_initialization": "Initializes multiple variables at once to ensure a clean state for processing.",
        "control_flow_with_thru": "Performs a range of steps in sequence using the THRU construct.",
        "conditional_logic": "Executes code based on specific conditions, such as flag checks or value validation.",
        "goto_statement": "GOTO statement for branching control flow to a specific section.",
        "perform_statement": "PERFORM statement to invoke a specific routine or section.",
        "procedure_call": "Calls an external procedure or subroutine, typically for modular functionality.",
        "section_header": "Section header, indicating the start of a new code block or routine.",
    }

    # Matches text whose first line consists only of '*', '|' or '-' characters.
    exclusion_patterns = re.compile(r'^\s*[\*|-]{2,}\s*$', re.MULTILINE)
    training_data = []
    seen_entries = set()

    # Extract and structure data for each pattern
    for category, base_pattern in base_patterns.items():
        pattern = re.compile(base_pattern, re.IGNORECASE | re.MULTILINE)
        description = f"{category.replace('_', ' ').capitalize()} example"

        for match in pattern.finditer(file_contents):
            # No trailing context is appended in this variant, so the entry is
            # just the stripped match; the file tail is not sliced or split.
            full_match_text = match.group().strip()

            # Skip entries matching edge cases
            if exclusion_patterns.match(full_match_text):
                continue

            # A hit that is nothing but the word ERROR carries no information.
            if full_match_text.upper() == "ERROR":
                continue

            entry = {
                "type": category,
                "description": description,
                "code": full_match_text,
                "comments": category_comments.get(category, "")
            }

            entry_key = (entry["type"], entry["code"], entry["description"])
            if entry_key not in seen_entries:
                seen_entries.add(entry_key)
                training_data.append(entry)

    return training_data

def process_all_files_in_directory(
    cobol_directory='/Users/kjevaji/Code/langchain-examples/_cobol',
    output_file_path='/Users/kjevaji/Code/langchain-examples/output/output.json',
):
    """Collect training data from every file of a directory into one JSON file.

    Args:
        cobol_directory: Directory holding the COBOL sources.
        output_file_path: Path of the combined JSON file to write (overwritten).
    """
    all_training_data = []

    for cobol_file_name in os.listdir(cobol_directory):
        cobol_file_path = os.path.join(cobol_directory, cobol_file_name)
        # os.listdir() also returns sub-directories; opening one would raise
        # IsADirectoryError, which the handler below does not cover.
        if not os.path.isfile(cobol_file_path):
            continue
        try:
            all_training_data.extend(create_comprehensive_training_data(cobol_file_path))
        except UnicodeDecodeError:
            print(f"Skipping file due to encoding error: {cobol_file_name}")

    # The output folder is not guaranteed to exist yet.
    output_directory = os.path.dirname(output_file_path)
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    # Write all collected training data to a single output file
    with open(output_file_path, 'w') as output_file:
        json.dump(all_training_data, output_file, indent=4)

    print(f"Comprehensive training data saved to {output_file_path}")

# Run the combined extraction with the function's built-in paths when this cell executes.
process_all_files_in_directory()


Comprehensive training data saved to /Users/kjevaji/Code/langchain-examples/output/output.json
