In [1]:
!pip install openai pandas



In [None]:
!pip install boto3

In [None]:
import boto3
import json

# Module-level Bedrock runtime client, reused by every Bedrock call in this notebook.
# The region is hard-coded; change it if needed.
bedrock = boto3.client("bedrock-runtime", region_name="us-east-1")

In [None]:
def call_claude_bedrock(prompt, model_id="anthropic.claude-3-sonnet-20240229-v1:0", max_tokens=4000, temperature=0.3):
    """Send one user prompt to an Anthropic Claude model on Amazon Bedrock.

    The request uses the Anthropic Messages API format. Claude 3 models on
    Bedrock do not accept the legacy text-completion payload
    (``prompt`` / ``max_tokens_to_sample``).

    Args:
        prompt: Text sent as a single ``user`` message.
        model_id: Bedrock model identifier.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.

    Returns:
        The concatenated text blocks of the reply, stripped of surrounding
        whitespace. An empty string if the reply carries no text block.

    Raises:
        Whatever ``bedrock.invoke_model`` raises (credentials, throttling,
        validation errors); nothing is caught here.
    """
    body = {
        # Version marker the Bedrock Messages API requires.
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": max_tokens,
        "temperature": temperature,
        "messages": [{"role": "user", "content": prompt}],
    }
    response = bedrock.invoke_model(
        modelId=model_id,
        body=json.dumps(body),
        contentType="application/json",
        accept="application/json",
    )
    result = json.loads(response["body"].read())
    # The reply body is a list of content blocks; only "text" blocks carry the answer.
    text_parts = [
        part.get("text", "")
        for part in result.get("content", [])
        if part.get("type") == "text"
    ]
    return "".join(text_parts).strip()

In [None]:
USE_CLAUDE_BEDROCK = True

def call_model(prompt):
    """Route ``prompt`` to the backend selected by ``USE_CLAUDE_BEDROCK``.

    A truthy flag sends the prompt to ``call_claude_bedrock``; otherwise it
    goes to ``call_openai``. The chosen function's return value is returned
    as-is.

    NOTE(review): ``call_openai`` must be defined before the OpenAI branch is
    taken. A later cell in this notebook also defines ``call_model``;
    whichever definition was executed last is the one that gets called.
    """
    # Only the selected name is looked up, so the unused backend need not exist.
    backend = call_claude_bedrock if USE_CLAUDE_BEDROCK else call_openai
    return backend(prompt)

In [6]:
# Read the OpenAI key from Colab's `userdata` store instead of writing it into the notebook.
from google.colab import userdata

OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")

In [7]:
import openai
from openai import OpenAI

# Default chat model for OpenAI requests.
model_name = "gpt-4o"

# Build the client from the key loaded in the previous cell.
# NOTE: this rebinds the name `openai` from the imported module to a client instance.
openai = OpenAI(api_key=OPENAI_API_KEY)


In [8]:
def call_openai(prompt: str, model=model_name):
    """Send ``prompt`` as a single user message to the OpenAI chat completions API.

    Args:
        prompt: Text of the user message.
        model: OpenAI model name; defaults to ``model_name``.

    Returns:
        The content of the first choice's message.
    """
    response = openai.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=4000,
        temperature=0.3,
    )
    return response.choices[0].message.content


def call_model(prompt: str, model=model_name):
    """Send ``prompt`` to the configured LLM backend and return the reply text.

    This definition used to replace the earlier flag-based dispatcher
    outright, so ``USE_CLAUDE_BEDROCK`` was ignored and the ``call_openai``
    name that dispatcher relies on did not exist. Now the flag is honoured
    when it has been defined, and OpenAI remains the fallback when it has not
    (for example when the Bedrock cells were never run).

    Args:
        prompt: Text of the user message.
        model: OpenAI model name. Only used on the OpenAI path; the Bedrock
            path uses ``call_claude_bedrock``'s own default model.

    Returns:
        The reply text from the selected backend.
    """
    # globals().get keeps this cell usable on a kernel where the flag was never set.
    if globals().get("USE_CLAUDE_BEDROCK", False):
        return call_claude_bedrock(prompt)
    return call_openai(prompt, model=model)

In [9]:

from string import Template

# Language the copybooks are translated into. Options: Java, .NET, Python
target_language = "Java"

# Chain-of-thought prompt. Placeholders filled in by build_prompt():
#   $target_class - description of the artefact to generate
#   $cobol_code   - raw copybook text
#   $antlr_json   - ANTLR parser output for the same copybook
cot_prompt_template = Template("""
You are a COBOL modernization assistant.

You will be given:
1. A COBOL copybook (raw code)
2. A JSON file generated by an ANTLR parser describing the hierarchical structure of the copybook

Your task is to translate the data structure into a clean, idiomatic $target_class.
Ensure feature parity:
- Represent every COBOL field in $target_class format
- Model REDEFINES, OCCURS, and nested GROUPS appropriately
- Preserve data types, structure, and any redefined elements
- Retain all metadata (PIC, COMP, SIGN, etc.) in comments or annotations
- If fidelity is lost, insert a TODO or explanation inline

COBOL Source:
\n$cobol_code\n

ANTLR JSON:
\n$antlr_json\n

Start by reasoning step by step and then output the full $target_class.
""")

def build_prompt(cobol_code, antlr_json, language):
    """Fill the chain-of-thought template for one copybook.

    Args:
        cobol_code: Raw copybook source text.
        antlr_json: ANTLR JSON text describing the same copybook.
        language: One of "Java", "Python" or ".NET"; any other value is
            treated like "Java".

    Returns:
        The prompt string with all three placeholders substituted.
    """
    descriptions = {
        "Java": "Java class (POJO)",
        "Python": "Python class (dataclass)",
        ".NET": "C# class (POCO)",
    }
    # Unknown languages fall back to the Java wording.
    target_description = descriptions.get(language, descriptions["Java"])
    values = {
        "target_class": target_description,
        "cobol_code": cobol_code,
        "antlr_json": antlr_json,
    }
    return cot_prompt_template.substitute(values)


In [13]:
import json
import re
import os
import pandas as pd

def normalize_name(name):
    """Return ``name`` lower-cased with every hyphen and underscore removed.

    This gives COBOL names (``CUST-ID``) and Java names (``custId``) a common
    form for comparison.
    """
    without_separators = str.maketrans("", "", "-_")
    return name.lower().translate(without_separators)

def extract_fields_from_json(json_str):
    """Flatten the ANTLR JSON description of a copybook into a list of fields.

    The JSON is walked depth-first. Every object that has both a ``"level"``
    and a ``"name"`` key is recorded (parents before their children), and the
    walk continues into its ``"children"`` value.

    NOTE(review): the schema is assumed, not verified against the parser:
    nodes shaped like ``{"name", "level", "picClause", "redefines", "occurs",
    "children"}``. Optional keys are read with ``.get`` and default to None.

    Args:
        json_str: JSON text; the root may be a single node or a list of nodes.

    Returns:
        A list of dicts with keys ``original_name``, ``normalized_name``,
        ``type``, ``redefines`` and ``occurs``.

    Raises:
        json.JSONDecodeError: if ``json_str`` is not valid JSON.
    """

    def find_fields(node):
        # A list (the root array or a "children" array): visit each element in order.
        if isinstance(node, list):
            collected = []
            for item in node:
                collected.extend(find_fields(item))
            return collected

        # Scalars and null (e.g. "children": null) carry no fields.
        if not isinstance(node, dict):
            return []

        collected = []
        if "level" in node and "name" in node:
            # Looks like a field or group item.
            collected.append({
                "original_name": node["name"],
                "normalized_name": normalize_name(node["name"]),
                "type": node.get("picClause"),  # assumed to hold the PIC/usage text
                "redefines": node.get("redefines"),
                "occurs": node.get("occurs"),
            })
        collected.extend(find_fields(node.get("children")))
        return collected

    return find_fields(json.loads(json_str))


def extract_java_field_data(java_code_str):
    """Find field declarations in Java source text with a regular expression.

    Recognised form: optional modifiers, a type (optionally package-qualified,
    generic and/or an array), a name, an optional initializer, then ``;``.

    Two kinds of match are discarded:
      * statements whose "type" is a Java keyword, e.g. ``return name;``.
        Without this, a getter body would be recorded as a field of type
        ``return`` and could overwrite the real field's type downstream;
      * ``type name = value;`` with no modifier, which is far more likely a
        local variable than a field.

    This is a heuristic, not a parser: declarations inside comments or prose
    are still picked up.

    Args:
        java_code_str: Java source (or any text containing it).

    Returns:
        A list of dicts with keys ``normalized_name`` and ``java_type``, in
        order of appearance.
    """
    modifier = r'(?:public|private|protected|static|final|transient|volatile)'
    declaration = re.compile(
        r'(?<![\w.])'                                          # start at the beginning of a word
        r'((?:' + modifier + r'\s+)*)'                         # 1: modifiers (may be empty)
        r'([\w.]+(?:<[\w\s,.?<>\[\]]*>)?(?:\s*\[\s*\])*)'      # 2: type, generics, array suffix
        r'\s+(\w+)\s*'                                         # 3: field name
        r'(=[^;]*)?;'                                          # 4: optional initializer
    )
    # Words that can sit in the "type" position of a statement but are never a type.
    not_a_type = {
        "return", "throw", "throws", "new", "package", "import", "else", "case",
        "goto", "assert", "instanceof", "class", "interface", "enum", "extends",
        "implements", "void", "break", "continue", "yield", "this", "super",
        "if", "for", "while", "do", "switch", "try", "catch", "finally", "default",
        "public", "private", "protected", "static", "final", "transient",
        "volatile", "abstract", "synchronized", "native",
    }

    fields = []
    for modifiers, java_type, name, initializer in declaration.findall(java_code_str):
        if java_type in not_a_type:
            continue
        if initializer and not modifiers:
            continue
        fields.append({
            "normalized_name": normalize_name(name),  # variable name, normalised for matching
            "java_type": " ".join(java_type.split()),  # collapse whitespace inside generics
        })
    return fields

def infer_expected_java_type(cobol_type):
    """Map a COBOL PIC/usage clause to the Java type expected for it.

    Rules, first match wins:
      1. COMP-1 / COMP-2 (floating point)             -> "float" / "double"
      2. COMP-3, PACKED-DECIMAL, or a ``V`` in the
         picture (implied decimal point)              -> "BigDecimal"
      3. other COMP* / BINARY usage                   -> integer type by digit count
      4. ``X`` in the picture, or an all-``A`` picture -> "String"
      5. ``9`` in the picture                         -> integer type by digit count
      6. anything else                                -> "Unknown"

    Integer type by digit count: up to 9 digits "int", up to 18 "long",
    more "BigInteger". With no digits visible (usage only) it stays "int".

    When the text contains a ``PIC``/``PICTURE`` keyword, only the picture
    string after it is inspected for V, X, A and 9, so that words such as
    ``VALUE`` do not trigger the decimal rule. Without the keyword the whole
    text is treated as the picture.

    Args:
        cobol_type: Clause text such as ``"PIC S9(7)V99 COMP-3"``, or None.

    Returns:
        A Java type name, or "Unknown".
    """
    if cobol_type is None:
        return "Unknown"
    clause = cobol_type.upper()

    pic_match = re.search(r'\bPIC(?:TURE)?\s+(?:IS\s+)?(\S+)', clause)
    picture = pic_match.group(1).rstrip(".") if pic_match else clause.strip()

    def integer_type():
        # Each 9 counts once; 9(n) counts n times.
        digits = sum(
            int(repeat) if repeat else 1
            for repeat in re.findall(r'9(?:\((\d+)\))?', picture)
        )
        if digits > 18:
            return "BigInteger"
        if digits > 9:
            return "long"
        return "int"

    floating = re.search(r'\bCOMP(?:UTATIONAL)?-([12])(?!\d)', clause)
    if floating:
        return "float" if floating.group(1) == "1" else "double"

    if ("COMP-3" in clause or "COMPUTATIONAL-3" in clause
            or "PACKED-DECIMAL" in clause or "V" in picture):
        return "BigDecimal"

    if "COMP" in clause or "BINARY" in clause:
        return integer_type()

    if "X" in picture or re.fullmatch(r'(?:A(?:\(\d+\))?)+', picture):
        return "String"

    if "9" in picture:
        return integer_type()

    return "Unknown"


def validate_parity(cobol_fields, java_fields):
    """Compare COBOL fields with the fields found in generated Java code.

    For each COBOL field the report records which Java field it was matched
    to, whether the Java type agrees with the expected one, and whether
    OCCURS / REDEFINES need attention.

    Matching: an exact match on the normalised name is preferred. Only when
    there is none does it fall back to the first Java name that contains the
    COBOL name. An empty COBOL name never matches.

    Type check: the expected type (or its boxed form, e.g. ``Integer`` for
    ``int``) must appear as a whole identifier in the Java type, so
    ``List<String>`` satisfies ``String`` but ``Point`` does not satisfy
    ``int``.

    OCCURS check: satisfied by a ``List``-like type or an array type.

    Args:
        cobol_fields: Dicts with keys ``original_name``, ``normalized_name``,
            ``type``, ``redefines`` and ``occurs``.
        java_fields: Dicts with keys ``normalized_name`` and ``java_type``.
            If a name appears more than once, the last entry wins.

    Returns:
        A pandas DataFrame with one row per COBOL field. The columns are
        always present, even when there are no rows.
    """
    columns = [
        "COBOL Field",
        "COBOL Type",
        "OCCURS",
        "REDEFINES",
        "Mapped Java Field",
        "Java Type",
        "Expected Java Type",
        "Type Match",
        "OCCURS Used in Java",
        "REDEFINES Documented",
        "Suggested Java Field",
    ]
    boxed_names = {"int": "Integer", "long": "Long", "float": "Float", "double": "Double"}
    java_map = {f["normalized_name"]: f["java_type"] for f in java_fields}

    report = []
    for f in cobol_fields:
        norm = f["normalized_name"]
        if norm in java_map:
            matched = norm
        else:
            # Fallback: the COBOL name is embedded in a longer Java name.
            matched = next((j_norm for j_norm in java_map if norm and norm in j_norm), None)
        match = matched is not None

        expected = infer_expected_java_type(f["type"])
        actual = java_map.get(matched, "❌")
        actual_tokens = set(re.findall(r"\w+", actual))
        if expected == "Unknown":
            type_match = "⚠️ Check Needed"
        elif expected in actual_tokens or boxed_names.get(expected) in actual_tokens:
            type_match = "✅"
        else:
            type_match = "❌"

        if not f["occurs"]:
            occurs_ok = "N/A"
        elif "List" in actual or "[" in actual:
            occurs_ok = "✅"
        else:
            occurs_ok = "❌"

        redefine_note = "⚠️ Manual Check Needed" if f["redefines"] else "N/A"

        suggestion = ""
        if not match:
            suggestion = f"private {expected} {norm};" if expected != "Unknown" else f"// TODO: Map COBOL field {f['original_name']} ({f['type']}) to Java"
        elif type_match == "❌":
            suggestion = f"// ⚠️ Consider changing type to: {expected}" if expected != "Unknown" else ""

        report.append({
            "COBOL Field": f["original_name"],
            "COBOL Type": f["type"],
            "OCCURS": f["occurs"],
            "REDEFINES": f["redefines"],
            "Mapped Java Field": matched if match else "❌ Not Found",
            "Java Type": actual,
            "Expected Java Type": expected,
            "Type Match": type_match,
            "OCCURS Used in Java": occurs_ok,
            "REDEFINES Documented": redefine_note,
            "Suggested Java Field": suggestion,
        })
    # Explicit columns keep the header row even when the report is empty.
    return pd.DataFrame(report, columns=columns)

In [14]:
# Batch conversion: for every <name>.cob in input_dir that has a matching <name>.json,
# ask the model for a translation, save it, and write a field-parity report.
input_dir = "./batch_input"
output_dir = "./batch_output"
os.makedirs(output_dir, exist_ok=True)

# Loop-invariant: extension of the generated source file.
ext = {
    "Java": ".java",
    ".NET": ".cs",
    "Python": ".py"
}.get(target_language, ".java")


def _extract_code_block(model_output):
    """Return the longest Markdown-fenced block in the model reply.

    The prompt asks the model to reason first and then print the class, so the
    raw reply is not valid source code. An unterminated final fence (a reply
    cut off by the token limit) is accepted. If the reply has no fence at all
    it is returned unchanged.
    """
    blocks = re.findall(r"```[^\n`]*\n(.*?)(?:```|\Z)", model_output, flags=re.DOTALL)
    if not blocks:
        return model_output
    return max(blocks, key=len).strip() + "\n"


# sorted() makes the processing order reproducible between runs.
for filename in sorted(os.listdir(input_dir)):
    base, suffix = os.path.splitext(filename)
    if suffix != ".cob":
        continue
    cobol_path = os.path.join(input_dir, filename)
    json_path = os.path.join(input_dir, f"{base}.json")

    try:
        with open(cobol_path, "r", encoding="utf-8", errors="replace") as f:
            cobol_code = f.read()
        with open(json_path, "r", encoding="utf-8") as f:
            antlr_json = f.read()
    except (OSError, UnicodeDecodeError) as exc:
        print(f"Skipping {base}: missing or unreadable files ({exc})")
        continue

    # Parse the JSON before calling the model so a malformed file costs no API call.
    try:
        cobol_fields = extract_fields_from_json(antlr_json)
    except ValueError as exc:  # json.JSONDecodeError is a ValueError
        print(f"Skipping {base}: invalid ANTLR JSON ({exc})")
        continue

    prompt = build_prompt(cobol_code, antlr_json, target_language)
    model_output = call_model(prompt) or ""
    generated_code = _extract_code_block(model_output)

    out_file = os.path.join(output_dir, f"{base}{ext}")
    with open(out_file, "w", encoding="utf-8") as f:
        f.write(generated_code)
    # Keep the full reply (reasoning included) next to the extracted source.
    with open(os.path.join(output_dir, f"{base}_response.md"), "w", encoding="utf-8") as f:
        f.write(model_output)

    # NOTE(review): the parity check parses Java syntax only; for other target
    # languages the report will show every field as not found.
    java_fields = extract_java_field_data(generated_code)
    report_df = validate_parity(cobol_fields, java_fields)

    report_df.to_csv(os.path.join(output_dir, f"{base}_parity.csv"), index=False)
    try:
        report_df.to_excel(os.path.join(output_dir, f"{base}_parity.xlsx"), index=False)
    except ImportError as exc:
        # pandas needs an Excel engine (openpyxl) for .xlsx; the CSV is already written.
        print(f"Skipping Excel report for {base}: {exc}")

    print(f"✅ Processed: {base} → {out_file}")

✅ Processed: data → ./batch_output/data.java
✅ Processed: perform_test → ./batch_output/perform_test.java
✅ Processed: operations → ./batch_output/operations.java
✅ Processed: file_io_test → ./batch_output/file_io_test.java
✅ Processed: main → ./batch_output/main.java
