In [None]:
!pip install openai pandas boto3 xai-sdk



In [None]:
import boto3

# Optional: manually define keys (not recommended in notebooks)
# boto3.setup_default_session(
#     aws_access_key_id='YOUR_ACCESS_KEY',
#     aws_secret_access_key='YOUR_SECRET_KEY',
#     region_name='us-east-1'  # Or your preferred region
# )

# Bedrock runtime client used by the Claude helper; change region_name if needed.
bedrock = boto3.client("bedrock-runtime", region_name="us-east-1")

In [None]:
# Used to securely store your API key
from google.colab import userdata

# Provider API keys, read from the Colab secrets store via `userdata`.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
XAI_API_KEY = userdata.get('XAI_API_KEY')  # checked by call_grok before it builds a client

In [None]:
import openai
from openai import OpenAI

# Build the OpenAI client from the key loaded from Colab secrets.
# NOTE: this assignment rebinds the name `openai`, shadowing the module imported
# above; from here on `openai` is the client instance that call_openai uses.
#openai.api_key = "your-openai-api-key"
openai = OpenAI(api_key=OPENAI_API_KEY)
model_name = "gpt-4o"  # default model argument of call_openai


In [None]:
from xai_sdk import Client
from xai_sdk.chat import user  # Import for message roles

def call_grok(prompt: str, model="grok-4", max_tokens=4000, temperature=0.3):
    """Send a single user prompt to an xAI Grok model and return the reply text.

    Args:
        prompt: Text of the user message.
        model: xAI model name.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.

    Returns:
        The ``content`` attribute of the sampled response.

    Raises:
        ValueError: If ``XAI_API_KEY`` is empty or unset.
    """
    if not XAI_API_KEY:
        raise ValueError("XAI_API_KEY not set in Colab secrets.")

    client = Client(api_key=XAI_API_KEY)

    # Sampling options are configured when the chat is created; sample()
    # itself takes no generation parameters in the xAI SDK.
    chat = client.chat.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
    )

    # No system prompt is used; the user prompt is the whole conversation.
    chat.append(user(prompt))

    response = chat.sample()
    return response.content

In [None]:
def call_openai(prompt: str, model=model_name):
    """Ask the OpenAI chat-completions endpoint for a reply to *prompt*.

    The prompt is sent as a single user message with a 4000-token cap and a
    temperature of 0.3; the text of the first returned choice is handed back.
    """
    conversation = [{"role": "user", "content": prompt}]
    sampling_options = {"max_tokens": 4000, "temperature": 0.3}

    completion = openai.chat.completions.create(
        model=model,
        messages=conversation,
        **sampling_options,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
from string import Template
import yaml

# Prompt sent to the LLM for each COBOL program. build_prompt fills these
# placeholders: $modern_stack_yaml, $target_class, $language_code,
# $cobol_code, $antlr_json and $callgraph_md.
# The literal is not a raw string, so every "\n" below is a real newline.
# NOTE(review): the requirement bullets name Java-specific APIs (BigDecimal,
# Scanner, JDBC, Mockito) although build_prompt also accepts "Python" and
# ".NET", and several bullets repeat earlier ones - confirm this is intended.
modernization_prompt_template = Template("""
You are a COBOL modernization assistant.

You will be given:
1. A COBOL source file (.cob) containing logic, I/O operations, and business rules
2. A structured JSON file parsed by an ANTLR-based tool describing:
   - Statement types (IF, PERFORM, CALL, DISPLAY, etc.)
   - Line numbers and variable declarations (working-storage)
   - Call graph and procedural structure
3. A call graph in Markdown (.md) indicating external modules (e.g., CALL 'DATAPROGRAM')
4. A modernization map in YAML describing how COBOL technologies (e.g., DB2, MQ, VSAM) map to modern equivalents (e.g., JDBC, Kafka, JSON)

You should use this modernization mapping to guide your translation decisions:
$modern_stack_yaml

Your task is to:
- Translate the full COBOL program into a clean, modular $target_class
- Use the .cob file as the source of truth for string literals and logic syntax
- Use the JSON to guide nesting, control flow, and variable declarations
- Use the call graph to modularize external calls (e.g., create a BankAccount or DataProgram class)

### Requirements:
- Encapsulate global data like `FINAL-BALANCE` using class fields + getter/setter methods
- Replace any external `CALL 'DATAPROGRAM' USING 'READ'` or `'WRITE'` with appropriate class methods
  - Implement DB2-based calls using PostgreSQL JDBC (URL: jdbc:postgresql://localhost:5432/bankdb, user: 'postgres', pass: 'pass', table: 'account' with 'balance' NUMERIC column)
- Use idiomatic $language_code with appropriate control structures and data types
  - ALWAYS use java.math.BigDecimal for COBOL PIC with V or decimals (e.g., 9(6)V99) to avoid precision loss; int/long for pure numerics
  - For flags (PIC X VALUE 'Y'/'N' or 'YES'/'NO'), use boolean
- Add inline comments that trace back to COBOL logic where helpful
- For inputs (ACCEPT/DISPLAY): ALWAYS use java.util.Scanner(System.in); NEVER hard-code/simulate values
- MUST use java.math.BigDecimal for ALL COBOL PIC 9 with V/decimals or where precision matters (e.g., balance); int/long for small/whole numerics.
- For flags (PIC X VALUE 'Y'/'N'/ 'YES'), use boolean (e.g., true/false).
- Inputs: ALWAYS use java.util.Scanner(System.in).nextInt/nextBigDecimal/nextLine; NO hard-codes or simulations.
- DB: PostgreSQL JDBC ONLY, with full implementation (no TODOs): URL jdbc:postgresql://localhost:5432/bankdb, user 'postgres', pass 'pass', table 'account' with 'balance' NUMERIC.
- Tests: Full coverage without TODOs; use Mockito for mocks, ByteArrayInputStream/System.setIn for input simulation.

### Additional Requirement:
- Generate a unit test class (e.g., `OperationsTest.java`) using a modern test framework
  - Use JUnit (Java), pytest (Python), or xUnit (C#)
  - Cover edge cases like insufficient balance, credit update, and balance inquiry
  - Use mocks or stubs for external calls (e.g., mock DB read/write with Mockito)
  - Ensure all business rules are tested in isolation; use java.io.ByteArrayInputStream/System.setIn for input mocks
  - Fully implement tests without TODOs

### Final Output:
- Business logic class (e.g., `Operations.java`)
- Supporting class for persistence or external calls (e.g., `BankAccount.java`)
- Full unit test class with coverage of key flows (e.g., `OperationsTest.java`)
- Ensure the code is idiomatic, testable, and works in a modern $language_code environment
- If fidelity is lost, include `// TODO` with justification

COBOL Source (.cob):
\n$cobol_code\n

Structured JSON (.json):
\n$antlr_json\n

Call Graph (.md):
\n$callgraph_md\n

Start by reasoning step by step, then output the business logic classes and unit tests.
""")

def build_prompt(cobol_code, antlr_json, callgraph_md, language, modernization_yaml_path="modernization_map.yaml"):
    """Fill the modernization prompt template for one COBOL program.

    Args:
        cobol_code: COBOL source text.
        antlr_json: Text of the ANTLR-derived JSON description.
        callgraph_md: Markdown call-graph text.
        language: Target language label ("Java", "Python" or ".NET"); any
            other value falls back to the Java target description.
        modernization_yaml_path: Path of the YAML modernization map.

    Returns:
        The fully substituted prompt string.

    If the YAML map cannot be read or parsed, the prompt still gets built: a
    ``# YAML loading failed: ...`` line takes the map's place and a warning is
    printed so the degraded prompt does not go unnoticed.
    """
    try:
        # YAML is read as UTF-8 rather than the platform default encoding.
        with open(modernization_yaml_path, "r", encoding="utf-8") as f:
            modernization_map = yaml.safe_load(f)
        # Re-dump in block style so the map is embedded in a uniform layout.
        modern_stack_yaml = yaml.dump(modernization_map, default_flow_style=False)
    except (OSError, ValueError, yaml.YAMLError) as e:
        # OSError: missing/unreadable file; ValueError: undecodable bytes;
        # YAMLError: malformed YAML.
        print(f"Warning: could not load modernization map '{modernization_yaml_path}': {e}")
        modern_stack_yaml = "# YAML loading failed: " + str(e)

    language_map = {
        "Java": "Java application with JDBC and modular classes",
        "Python": "Python module using classes and file I/O",
        ".NET": "C# console app with POCO and Entity Framework stubs"
    }

    return modernization_prompt_template.substitute(
        target_class=language_map.get(language, "Java application with JDBC and modular classes"),
        modern_stack_yaml=modern_stack_yaml,
        cobol_code=cobol_code,
        antlr_json=antlr_json,
        callgraph_md=callgraph_md,
        language_code=language
    )

In [None]:
import json

def call_claude_bedrock(prompt, model_id="anthropic.claude-3-sonnet-20240229", max_tokens=4000, temperature=0.3):
    """Send a single user prompt to an Anthropic Claude model on Amazon Bedrock.

    The request uses the Anthropic Messages format, which Claude 3 models on
    Bedrock require; the legacy ``prompt`` / ``max_tokens_to_sample`` body and
    its ``completion`` response field are not accepted by those models.

    Args:
        prompt: Text of the user message.
        model_id: Bedrock model identifier.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.

    Returns:
        The concatenated text blocks of the reply, stripped of surrounding
        whitespace.

    NOTE(review): Bedrock model IDs for Claude 3 are normally published with a
    version suffix (e.g. ``-v1:0``) - confirm the default ``model_id`` resolves
    in your account/region.
    """
    body = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": max_tokens,
        "temperature": temperature,
        "messages": [
            {"role": "user", "content": [{"type": "text", "text": prompt}]}
        ],
    }

    response = bedrock.invoke_model(
        modelId=model_id,
        body=json.dumps(body),
        contentType="application/json",
        accept="application/json"
    )

    result = json.loads(response['body'].read())
    # The reply is a list of content blocks; only the text blocks are kept.
    text_parts = [
        block.get("text", "")
        for block in result.get("content", [])
        if block.get("type") == "text"
    ]
    return "".join(text_parts).strip()

In [None]:
# Backend selection flags; with both False the OpenAI backend is used.
USE_CLAUDE_BEDROCK = False
USE_GROK = False  # Set to True to use Grok 4


def call_model(prompt):
    """Route *prompt* to the backend selected by the module-level flags.

    USE_GROK is consulted first, then USE_CLAUDE_BEDROCK; if neither is set
    the prompt goes to OpenAI. The chosen backend's return value is returned.
    """
    if USE_GROK:
        backend = call_grok
    elif USE_CLAUDE_BEDROCK:
        backend = call_claude_bedrock
    else:
        backend = call_openai
    return backend(prompt)

In [None]:
import json
import re
import pandas as pd

def normalize_name(name):
    """Return *name* lower-cased with every hyphen and underscore removed."""
    return "".join(ch for ch in name.lower() if ch not in "-_")

def extract_fields_from_json_llm(antlr_json, model_call_func=call_model):
    """Ask an LLM to list the working-storage variables found in *antlr_json*.

    Args:
        antlr_json: Text of the ANTLR-derived JSON for one COBOL program.
        model_call_func: Callable taking a prompt string and returning the
            model's reply text.

    Returns:
        A list of dicts, each carrying the keys ``original_name``,
        ``normalized_name``, ``type``, ``redefines`` and ``occurs``. Keys the
        model left out are filled with neutral defaults. An empty list is
        returned (after printing the reason) when the reply holds no usable
        JSON array.
    """
    # The example braces are doubled: inside an f-string a bare {...} would be
    # evaluated as the Ellipsis literal and rendered as "Ellipsis".
    prompt = f"""
    You are a COBOL data extractor. Given this JSON from an ANTLR parse of a COBOL program, extract ALL working-storage variables (including groups) as a JSON list of objects.
    Each object MUST have EXACTLY these keys:
    - "original_name": str - the variable name
    - "normalized_name": str - lowercase name without hyphens or underscores
    - "type": str or null - the PIC clause (e.g., "9(6)V99"), null for groups without PIC
    - "redefines": bool - true if REDEFINES present, else false
    - "occurs": int or null - OCCURS count if present, else null

    Handle nested/hierarchical structures recursively (e.g., traverse "children" or "dictionary").
    Output ONLY the JSON array (e.g., [{{...}}, {{...}}])—no explanations or code.

    JSON:
    {antlr_json}
    """
    response = model_call_func(prompt)
    try:
        # Keep only the outermost [...] span so surrounding chatter is ignored.
        json_start = response.find('[')
        json_end = response.rfind(']') + 1
        if json_start == -1 or json_end <= json_start:
            raise ValueError("No JSON array found in response")
        fields = json.loads(response[json_start:json_end])
        if not isinstance(fields, list) or not all(isinstance(f, dict) for f in fields):
            raise ValueError("Invalid JSON response format")
    except (AttributeError, TypeError, ValueError) as e:
        # AttributeError/TypeError: reply was not a string;
        # ValueError: no array, malformed JSON, or wrong shape.
        print(f"LLM extraction error: {e} - Falling back to empty list")
        return []

    # Fill in any key the model omitted so later code can index the dicts.
    for field in fields:
        field.setdefault("original_name", "")
        field.setdefault("normalized_name", normalize_name(str(field["original_name"])))
        field.setdefault("type", None)
        field.setdefault("redefines", False)
        field.setdefault("occurs", None)
    return fields

def extract_java_field_data(java_code_str):
    """Collect variable declarations that appear in a blob of Java source text.

    Args:
        java_code_str: Model output containing Java code (other text may be
            mixed in).

    Returns:
        A list of ``{"normalized_name": ..., "java_type": ...}`` dicts, one per
        declaration of the form ``[modifiers] Type name [= initializer];``.
    """
    # Drop return statements: `return balance;` would otherwise look like a
    # declaration of type "return". The word boundary keeps identifiers such
    # as `returnCode` in play.
    lines = [line for line in java_code_str.splitlines() if not re.search(r'\breturn\b', line)]
    cleaned_code = '\n'.join(lines)

    # - any number of leading modifiers (e.g. `private static final`)
    # - dotted / generic / array type names
    # - optional initializer, which may span lines (scoped (?s:...) flag)
    # - optional trailing // comment, confined to its own line so it cannot
    #   consume the declarations that follow
    pattern = re.compile(
        r'^\s*(?:(?:public|private|protected|static|final|transient|volatile)\s+)*'
        r'([\w\.<>\[\]]+)\s+(\w+)\s*(?:=(?s:.*?))?;\s*(?://[^\n]*)?$',
        re.MULTILINE,
    )
    return [
        {"normalized_name": normalize_name(name), "java_type": java_type}
        for java_type, name in pattern.findall(cleaned_code)
    ]

def _count_pic_digits(picture):
    """Return how many digit positions the 9-symbols of *picture* describe."""
    return sum(
        int(repeat) if repeat else 1
        for repeat in re.findall(r'9(?:\((\d+)\))?', picture)
    )


def infer_expected_java_type(cobol_type):
    """Map a COBOL PIC/usage description to the Java type expected for it.

    Args:
        cobol_type: PIC clause text (optionally with USAGE and VALUE parts),
            or None for a group item without a picture.

    Returns:
        One of "N/A (Group Level)", "BigDecimal", "long", "int", "boolean",
        "String" or "Unknown".
    """
    if cobol_type is None:
        return "N/A (Group Level)"
    cobol_type_upper = str(cobol_type).upper()

    # Only the text before any VALUE clause describes the picture/usage.
    # Without this split the "V" of "VALUE" reads as an implied decimal point.
    value_at = cobol_type_upper.find("VALUE")
    picture = cobol_type_upper if value_at == -1 else cobol_type_upper[:value_at]

    if "V" in picture or "COMP-3" in picture:
        return "BigDecimal"
    if "COMP" in picture:
        return "long" if _count_pic_digits(picture) > 9 else "int"
    if "X" in picture:
        # Flag-style initial values are looked up in the whole clause.
        if any(flag in cobol_type_upper for flag in ("'Y'", "'N'", "'YES'", "'NO'")):
            return "boolean"
        return "String"
    if "9" in picture:
        # More than nine decimal digits can exceed the range of a Java int.
        return "long" if _count_pic_digits(picture) > 9 else "int"
    return "Unknown"

def _java_types_compatible(expected, actual):
    """Return True when Java type *actual* is acceptable for *expected*."""
    boxed = {"Integer": "int", "Long": "long", "Boolean": "boolean", "Double": "double"}
    # Compare on the simple name so java.math.BigDecimal equals BigDecimal,
    # and treat boxed primitives like their primitive counterparts.
    simple = actual.rsplit(".", 1)[-1]
    simple = boxed.get(simple, simple)
    if expected == simple:
        return True
    # Types within one family are accepted as interchangeable.
    for family in (("BigDecimal", "double"), ("int", "long")):
        if expected in family and simple in family:
            return True
    return False


def validate_parity(cobol_fields, java_fields):
    """Compare COBOL working-storage fields with the fields found in Java code.

    Args:
        cobol_fields: Dicts with ``original_name``, ``normalized_name``,
            ``type``, ``occurs`` and ``redefines`` (missing keys are tolerated).
        java_fields: Dicts with ``normalized_name`` and ``java_type``.

    Returns:
        A pandas DataFrame with one row per COBOL field plus, when there is at
        least one field, a final "Summary" row counting type matches among the
        non-group fields.
    """
    group_level = "N/A (Group Level)"
    report = []
    java_map = {f["normalized_name"]: f["java_type"] for f in java_fields}
    for f in cobol_fields:
        original_name = f.get("original_name", "")
        norm = f.get("normalized_name") or normalize_name(str(original_name))
        cobol_type = f.get("type")

        # Prefer an exact name match; fall back to a substring match (e.g.
        # "balance" inside "storagebalance"). An empty name matches nothing.
        if norm in java_map:
            matched = norm
        else:
            matched = next((j_norm for j_norm in java_map if norm and norm in j_norm), None)
        match = matched is not None

        expected = infer_expected_java_type(cobol_type)
        actual = java_map.get(matched, "❌")
        if _java_types_compatible(expected, actual):
            type_match = "✅"
        elif expected == group_level:
            type_match = "⚠️"
        else:
            type_match = "❌"

        suggestion = ""
        if not match:
            suggestion = f"private {expected} {norm};" if expected not in ["Unknown", "N/A (Group Level)"] else f"// TODO: Map {original_name} ({cobol_type}) - Group? Use class/record."
        elif type_match == "❌":
            suggestion = f"// ⚠️ Change to {expected} (type/precision mismatch)"

        report.append({
            "COBOL Field": original_name,
            "COBOL Type": cobol_type,
            "OCCURS": f.get("occurs"),
            "REDEFINES": f.get("redefines"),
            "Mapped Java Field": matched if match else "❌ Not Found",
            "Java Type": actual,
            "Expected Java Type": expected,
            "Type Match": type_match,
            "OCCURS Used in Java": "N/A",
            "REDEFINES Documented": "N/A",
            "Suggested Java Field": suggestion
        })
    report_df = pd.DataFrame(report)
    if not report_df.empty:
        # Group items have no Java type of their own, so they are left out of
        # the denominator, as the "(excl. groups)" label states.
        scored = report_df[report_df["Expected Java Type"] != group_level]
        matched_count = len(scored[scored["Type Match"] == "✅"])
        summary = {"COBOL Field": "Summary", "Type Match": f"{matched_count} ✅ / {len(scored)} (excl. groups)"}
        report_df = pd.concat([report_df, pd.DataFrame([summary])], ignore_index=True)
    return report_df

In [None]:
import os
# Batch driver: for every <name>.cob in input_dir that has a matching
# <name>.json and callgraph_<name>.md, generate modernized code and a
# field-parity report (xlsx + csv) in output_dir.
input_dir = "./batch_input"
output_dir = "./batch_output"
os.makedirs(output_dir, exist_ok=True)

target_language = "Java"  # Target language handed to build_prompt

# File extension of the generated source for the chosen target language.
ext = {
    "Java": ".java",
    ".NET": ".cs",
    "Python": ".py"
}.get(target_language, ".java")

# List the directory once; file_map gives case-insensitive name lookup.
input_names = os.listdir(input_dir)
file_map = {name.lower(): name for name in input_names}

for filename in input_names:
    if not filename.lower().endswith(".cob"):
        continue

    base = filename[:-4]  # removes .cob (preserves original casing)
    base_lower = base.lower()  # normalized for matching

    # Companion files every program needs, by lower-cased name.
    wanted = (f"{base_lower}.cob", f"{base_lower}.json", f"callgraph_{base_lower}.md")

    # A name absent from file_map must count as missing. Joining input_dir
    # with "" would yield the directory itself, which exists, so each entry
    # is checked with isfile on its real name instead.
    missing = [
        name for name in wanted
        if not os.path.isfile(os.path.join(input_dir, file_map.get(name, name)))
    ]
    if missing:
        print(f"Skipping {base}: missing input files ({', '.join(missing)})")
        continue

    cobol_path, json_path, call_graph_path = (
        os.path.join(input_dir, file_map[name]) for name in wanted
    )

    with open(cobol_path, "r") as f:
        cobol_code = f.read()
    with open(json_path, "r") as f:
        antlr_json = f.read()
    with open(call_graph_path, "r") as f:
        callgraph_md = f.read()

    prompt = build_prompt(cobol_code, antlr_json, callgraph_md, target_language)
    model_output = call_model(prompt)  # backend chosen by call_model's flags

    out_file = os.path.join(output_dir, f"{base}{ext}")
    # Model output may contain non-ASCII characters; write it as UTF-8 so the
    # result does not depend on the platform default encoding.
    with open(out_file, "w", encoding="utf-8") as f:
        f.write(model_output)

    cobol_fields = extract_fields_from_json_llm(antlr_json)
    java_fields = extract_java_field_data(model_output)
    report_df = validate_parity(cobol_fields, java_fields)

    if not cobol_fields:
        print(f"Warning: No COBOL fields extracted from JSON for {base}")
    if not java_fields:
        print(f"Warning: No Java fields extracted from output for {base}")

    report_df.to_excel(os.path.join(output_dir, f"{base}_parity.xlsx"), index=False)
    report_df.to_csv(os.path.join(output_dir, f"{base}_parity.csv"), index=False, encoding='utf-8')

    print(f"✅ Processed: {base} → {out_file}")

✅ Processed: data → ./batch_output/data.java
✅ Processed: main → ./batch_output/main.java
✅ Processed: operations → ./batch_output/operations.java


In [None]:
import os
import pandas as pd

# Dump every generated .java file and every parity report found in output_dir.
output_dir = "./batch_output"

for filename in os.listdir(output_dir):
    lowered = filename.lower()
    is_java = lowered.endswith(".java")
    is_csv = not is_java and lowered.endswith("_parity.csv")
    is_xlsx = not is_java and not is_csv and lowered.endswith("_parity.xlsx")

    # Anything else in the directory is ignored.
    if not (is_java or is_csv or is_xlsx):
        continue

    divider = "-" * (len(filename) + 18)  # separator sized to the header line
    print(f"\n--- Contents of {filename} ---")

    if is_java:
        # Generated source is printed verbatim.
        with open(os.path.join(output_dir, filename), "r") as f:
            print(f.read())
    elif is_csv:
        # Reports are rendered as tables; a read failure is reported, not raised.
        try:
            df_csv = pd.read_csv(os.path.join(output_dir, filename))
            display(df_csv)
        except Exception as e:
            print(f"Could not read CSV file {filename}: {e}")
    else:
        try:
            df_excel = pd.read_excel(os.path.join(output_dir, filename))
            display(df_excel)
        except Exception as e:
            print(f"Could not read Excel file {filename}: {e}")

    print(divider)


--- Contents of main_parity.csv ---


Unnamed: 0,COBOL Field,COBOL Type,OCCURS,REDEFINES,Mapped Java Field,Java Type,Expected Java Type,Type Match,OCCURS Used in Java,REDEFINES Documented,Suggested Java Field
0,USER-CHOICE,9,,False,userchoice,int,int,✅,,,
1,CONTINUE-FLAG,X(3),,False,continueflag,boolean,String,❌,,,// ⚠️ Change to String (type/precision mismatch)
2,Summary,,,,,,,1 ✅ / 2 (excl. groups),,,


---------------------------------

--- Contents of main_parity.xlsx ---


Unnamed: 0,COBOL Field,COBOL Type,OCCURS,REDEFINES,Mapped Java Field,Java Type,Expected Java Type,Type Match,OCCURS Used in Java,REDEFINES Documented,Suggested Java Field
0,USER-CHOICE,9,,0.0,userchoice,int,int,✅,,,
1,CONTINUE-FLAG,X(3),,0.0,continueflag,boolean,String,❌,,,// ⚠️ Change to String (type/precision mismatch)
2,Summary,,,,,,,1 ✅ / 2 (excl. groups),,,


----------------------------------

--- Contents of data.java ---
To modernize the provided COBOL program into a Java application, we need to follow a structured approach. We'll create a Java class to handle the business logic and another class to manage database interactions. Additionally, we'll implement unit tests to ensure the functionality is preserved and correctly translated.

### Step-by-Step Translation

1. **Identify the Components:**
   - **COBOL Variables:**
     - `STORAGE-BALANCE`: A numeric value with two decimal places, initialized to 1000.00.
     - `OPERATION-TYPE`: A string that determines the operation ('READ' or 'WRITE').
     - `PASSED-OPERATION`: A string passed to the program.
     - `BALANCE`: A numeric value with two decimal places.

2. **Translate Data Structures:**
   - Use `java.math.BigDecimal` for `STORAGE-BALANCE` and `BALANCE` to maintain precision.
   - Use `String` for `OPERATION-TYPE` and `PASSED-OPERATION`.

3. **Translate Logic:**
   - Implement th

Unnamed: 0,COBOL Field,COBOL Type,OCCURS,REDEFINES,Mapped Java Field,Java Type,Expected Java Type,Type Match,OCCURS Used in Java,REDEFINES Documented,Suggested Java Field
0,PASSED-OPERATION,PICX(6),,0.0,❌ Not Found,❌,String,❌,,,private String passedoperation;
1,AMOUNT,PIC9(6)V99,,0.0,amount,BigDecimal,BigDecimal,✅,,,
2,FINAL-BALANCE,PIC9(6)V99,,0.0,finalbalance,BigDecimal,BigDecimal,✅,,,
3,OPERATION-TYPE,PICX(6),,0.0,operationtype,String,String,✅,,,
4,Summary,,,,,,,3 ✅ / 4 (excl. groups),,,


----------------------------------------

--- Contents of main.java ---
To modernize the COBOL program into a Java application, we need to follow a structured approach. We'll create a main class to handle the user interaction and control flow, and a separate class to handle operations related to the account management. We'll also set up a PostgreSQL database connection using JDBC for operations that would have been handled by DB2 in COBOL.

### Step-by-Step Translation

1. **Main Class (`MainProgram.java`)**:
   - This class will handle user input and control flow, similar to the COBOL `PERFORM UNTIL` loop.
   - We'll use `Scanner` for input and `System.out.println` for output.

2. **Operations Class (`Operations.java`)**:
   - This class will encapsulate the logic for viewing, crediting, and debiting the account.
   - We'll use `java.math.BigDecimal` for handling monetary values to maintain precision.
   - Database operations will be handled using JDBC.

3. **Database Connection**:
  

Unnamed: 0,COBOL Field,COBOL Type,OCCURS,REDEFINES,Mapped Java Field,Java Type,Expected Java Type,Type Match,OCCURS Used in Java,REDEFINES Documented,Suggested Java Field
0,STORAGE-BALANCE,9(6)V99,,0.0,storagebalance,BigDecimal,BigDecimal,✅,,,
1,PASSED-OPERATION,X(6),,0.0,❌ Not Found,❌,String,❌,,,private String passedoperation;
2,OPERATION-TYPE,X(6),,0.0,operationtype,String,String,✅,,,
3,BALANCE,9(6)V99,,0.0,storagebalance,BigDecimal,BigDecimal,✅,,,
4,Summary,,,,,,,3 ✅ / 4 (excl. groups),,,


----------------------------------

--- Contents of data_parity.csv ---


Unnamed: 0,COBOL Field,COBOL Type,OCCURS,REDEFINES,Mapped Java Field,Java Type,Expected Java Type,Type Match,OCCURS Used in Java,REDEFINES Documented,Suggested Java Field
0,STORAGE-BALANCE,9(6)V99,,False,storagebalance,BigDecimal,BigDecimal,✅,,,
1,PASSED-OPERATION,X(6),,False,❌ Not Found,❌,String,❌,,,private String passedoperation;
2,OPERATION-TYPE,X(6),,False,operationtype,String,String,✅,,,
3,BALANCE,9(6)V99,,False,storagebalance,BigDecimal,BigDecimal,✅,,,
4,Summary,,,,,,,3 ✅ / 4 (excl. groups),,,


---------------------------------

--- Contents of operations.java ---
To modernize the COBOL program into a Java application, we need to follow a structured approach. We'll create two main Java classes: `Operations` for business logic and `BankAccount` for handling database interactions. We'll also create a unit test class `OperationsTest` to ensure our logic is correct.

### Step-by-Step Translation

1. **Data Handling**: Convert COBOL data types to Java equivalents.
   - `OPERATION-TYPE` and `PASSED-OPERATION` are strings.
   - `AMOUNT` and `FINAL-BALANCE` are `BigDecimal` for precision.

2. **Control Flow**: Use Java's `if-else` statements to replicate COBOL's conditional logic.

3. **External Calls**: Replace COBOL `CALL` with Java method calls.
   - We'll create a `BankAccount` class to handle database operations using JDBC.

4. **Input/Output**: Use `Scanner` for input and `System.out.println` for output.

5. **Database Operations**: Implement JDBC to interact with a PostgreSQL 

Unnamed: 0,COBOL Field,COBOL Type,OCCURS,REDEFINES,Mapped Java Field,Java Type,Expected Java Type,Type Match,OCCURS Used in Java,REDEFINES Documented,Suggested Java Field
0,PASSED-OPERATION,PICX(6),,False,❌ Not Found,❌,String,❌,,,private String passedoperation;
1,AMOUNT,PIC9(6)V99,,False,amount,BigDecimal,BigDecimal,✅,,,
2,FINAL-BALANCE,PIC9(6)V99,,False,finalbalance,BigDecimal,BigDecimal,✅,,,
3,OPERATION-TYPE,PICX(6),,False,operationtype,String,String,✅,,,
4,Summary,,,,,,,3 ✅ / 4 (excl. groups),,,


---------------------------------------
