In [1]:
from openai import OpenAI
from typing import Any, Dict, List
import difflib
import subprocess
import tempfile
import os
import re
import pandas as pd

In [7]:
import os
from openai import OpenAI
from typing import Any, Dict, List
import difflib
import subprocess
import tempfile
import re
import pandas as pd

# Asegurar que la clave de API esté disponible desde la variable de entorno
if "OPENAI_API_KEY" not in os.environ:
    raise EnvironmentError("OPENAI_API_KEY not set in environment variables.")

class SASParser:
    def parse(self, sas_code: str) -> Dict[str, Any]:
        """
        Analiza código SAS y extrae una representación intermedia detallada.
        Detecta estructuras comunes: DATA, SET, WHERE, KEEP, DROP, IF, PROC, MACRO.
        """
        blocks = []
        current_block = {}
        macros = []
        in_macro = False
        macro_block = {}
        lines = sas_code.strip().splitlines()

        for line in lines:
            line = line.strip()
            if line.lower().startswith('%macro'):
                in_macro = True
                macro_name = line.split()[1].split('(')[0]
                macro_block = {
                    "type": "macro",
                    "name": macro_name,
                    "definition": []
                }
            elif in_macro and line.lower().startswith('%mend'):
                in_macro = False
                macros.append(macro_block)
                macro_block = {}
            elif in_macro:
                macro_block["definition"].append(line)
            elif line.lower().startswith("data"):
                current_block = {
                    "type": "data_step",
                    "output_table": line.split()[1].strip(';'),
                    "statements": []
                }
            elif line.lower().startswith("set"):
                current_block["input_table"] = line.split()[1].strip(';')
            elif line.lower().startswith("where"):
                current_block["filter"] = line[len("where"):].strip(' ;')
            elif line.lower().startswith("keep"):
                current_block["keep"] = line[len("keep"):].strip(' ;')
            elif line.lower().startswith("drop"):
                current_block["drop"] = line[len("drop"):].strip(' ;')
            elif line.lower().startswith("if"):
                current_block.setdefault("conditions", []).append(line.strip(' ;'))
            elif line.lower().startswith("proc"):
                current_block["proc"] = line.strip(' ;')
            elif line.lower().startswith("run"):
                blocks.append(current_block)
                current_block = {}
            elif line:
                current_block.setdefault("statements", []).append(line.strip(' ;'))

        return {
            "language": "SAS",
            "code": sas_code,
            "purpose": "data analysis / statistics / reporting",
            "structure": blocks,
            "macros": macros
        }

class IntermediateRepresentation:
    def __init__(self, parsed_data: Dict[str, Any]):
        self.language = parsed_data.get("language")
        self.code = parsed_data.get("code")
        self.purpose = parsed_data.get("purpose")
        self.structure = parsed_data.get("structure")
        self.macros = parsed_data.get("macros")

    def describe(self) -> str:
        return f"IR with {len(self.structure)} block(s) and {len(self.macros)} macro(s):\nStructure: {self.structure}\nMacros: {self.macros}"

class CodeGenerator:
    def __init__(self, target_language: str):
        self.target_language = target_language
        self.client = OpenAI()

    def generate(self, ir: IntermediateRepresentation) -> str:
        prompt = f"""
You are a professional developer with deep knowledge of SAS and {self.target_language}.
Translate the following SAS code to full, idiomatic, and functional {self.target_language} code.

- Do NOT simplify or use placeholders like 'pass'.
- Translate all operations and logic as completely as possible.
- Use equivalent libraries (e.g. pandas for Python, dplyr for R) when needed.
- Assume access to datasets like 'sashelp.class'.
- If macros exist, implement equivalent functions or preprocessing logic.

SAS code:
{ir.code}

Structure:
{ir.structure}

Macros:
{ir.macros}

Now translate the full logic to {self.target_language}:
"""
        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3
        )
        raw_code = response.choices[0].message.content.strip()

        cleaned = re.sub(r"(?i)^.*?```(?:python|r)?\\n", "", raw_code)
        cleaned = re.sub(r"```.*$", "", cleaned, flags=re.DOTALL)
        cleaned = re.sub(r"^\s*(Python|R) code:\s*", "", cleaned, flags=re.IGNORECASE)

        return cleaned.strip()

class CodeValidator:
    def validate(self, original_code: str, translated_code: str) -> str:
        diff = difflib.unified_diff(
            original_code.splitlines(),
            translated_code.splitlines(),
            fromfile='original',
            tofile='translated',
            lineterm=''
        )
        return '\n'.join(diff)

    def run_translated_code(self, code: str, language: str) -> str:
        suffix = '.py' if language.lower() == 'python' else '.r'
        with tempfile.NamedTemporaryFile(mode='w+', suffix=suffix, delete=False) as temp_file:
            temp_file.write(code)
            temp_file.flush()
            try:
                if language.lower() == 'python':
                    result = subprocess.run(['python3', temp_file.name], capture_output=True, text=True)
                elif language.lower() == 'r':
                    result = subprocess.run(['Rscript', temp_file.name], capture_output=True, text=True)
                else:
                    return f"Unsupported language: {language}"
                return result.stdout + result.stderr
            finally:
                os.remove(temp_file.name)

    def validate_functional_equivalence(self, df_expected: pd.DataFrame, df_actual: pd.DataFrame) -> str:
        try:
            pd.testing.assert_frame_equal(df_expected.reset_index(drop=True), df_actual.reset_index(drop=True))
            return "✅ DataFrames are functionally equivalent."
        except AssertionError as e:
            return f"❌ Functional difference detected:\n{str(e)}"

# --- Ejemplo de uso ---

def run_translation(sas_code: str, target_lang: str):
    parser = SASParser()
    parsed = parser.parse(sas_code)
    ir = IntermediateRepresentation(parsed)
    print("\n[IR Summary]\n", ir.describe())

    validator = CodeValidator()
    print(f"\nTranslating to {target_lang} with OpenAI...")
    generator = CodeGenerator(target_lang)
    translated_code = generator.generate(ir)
    print("\nTranslated code:\n", translated_code)

    diff_report = validator.validate(sas_code, translated_code)
    print("\nDiff report:\n", diff_report)

    print(f"\nRunning {target_lang} code...")
    output = validator.run_translated_code(translated_code, target_lang)
    print("\nExecution output:\n", output)

if __name__ == "__main__":
    sas_code = """
    %macro example();
      data filtered;
        set sashelp.class;
        where age > 13;
        keep name age height;
      run;
    %mend example;
    %example;
    """
    run_translation(sas_code, target_lang="Python")



[IR Summary]
 IR with 0 block(s) and 1 macro(s):
Structure: []
Macros: [{'type': 'macro', 'name': 'example', 'definition': ['data filtered;', 'set sashelp.class;', 'where age > 13;', 'keep name age height;', 'run;']}]

Translating to Python with OpenAI...

Translated code:
 In Python, we can use the pandas library to perform similar operations as in the SAS code. Here is the equivalent Python code:

Diff report:
 --- original
+++ translated
@@ -1,10 +1 @@
-
-    %macro example();
-      data filtered;
-        set sashelp.class;
-        where age > 13;
-        keep name age height;
-      run;
-    %mend example;
-    %example;
-    
+In Python, we can use the pandas library to perform similar operations as in the SAS code. Here is the equivalent Python code:

Running Python code...

Execution output:
   File "/var/folders/j4/p802rxx54qz8sky8x8dn1cbc0000gn/T/tmpn94fxb1r.py", line 1
    In Python, we can use the pandas library to perform similar operations as in the SAS code. Here is

In [5]:

class SASParser:
    def parse(self, sas_code: str) -> Dict[str, Any]:
        """
        Analiza código SAS y extrae una representación intermedia detallada.
        Detecta estructuras comunes: DATA, SET, WHERE, KEEP, DROP, IF, PROC, MACRO.
        """
        blocks = []
        current_block = {}
        macros = []
        in_macro = False
        macro_block = {}
        lines = sas_code.strip().splitlines()

        for line in lines:
            line = line.strip()
            if line.lower().startswith('%macro'):
                in_macro = True
                macro_name = line.split()[1].split('(')[0]
                macro_block = {
                    "type": "macro",
                    "name": macro_name,
                    "definition": []
                }
            elif in_macro and line.lower().startswith('%mend'):
                in_macro = False
                macros.append(macro_block)
                macro_block = {}
            elif in_macro:
                macro_block["definition"].append(line)
            elif line.lower().startswith("data"):
                current_block = {
                    "type": "data_step",
                    "output_table": line.split()[1].strip(';'),
                    "statements": []
                }
            elif line.lower().startswith("set"):
                current_block["input_table"] = line.split()[1].strip(';')
            elif line.lower().startswith("where"):
                current_block["filter"] = line[len("where"):].strip(' ;')
            elif line.lower().startswith("keep"):
                current_block["keep"] = line[len("keep"):].strip(' ;')
            elif line.lower().startswith("drop"):
                current_block["drop"] = line[len("drop"):].strip(' ;')
            elif line.lower().startswith("if"):
                current_block.setdefault("conditions", []).append(line.strip(' ;'))
            elif line.lower().startswith("proc"):
                current_block["proc"] = line.strip(' ;')
            elif line.lower().startswith("run"):
                blocks.append(current_block)
                current_block = {}
            elif line:
                current_block.setdefault("statements", []).append(line.strip(' ;'))

        return {
            "language": "SAS",
            "code": sas_code,
            "purpose": "data analysis / statistics / reporting",
            "structure": blocks,
            "macros": macros
        }

class IntermediateRepresentation:
    def __init__(self, parsed_data: Dict[str, Any]):
        self.language = parsed_data.get("language")
        self.code = parsed_data.get("code")
        self.purpose = parsed_data.get("purpose")
        self.structure = parsed_data.get("structure")
        self.macros = parsed_data.get("macros")

    def describe(self) -> str:
        return f"IR with {len(self.structure)} block(s) and {len(self.macros)} macro(s):\nStructure: {self.structure}\nMacros: {self.macros}"

class CodeGenerator:
    def __init__(self, target_language: str):
        self.target_language = target_language
        self.client = OpenAI()

    def generate(self, ir: IntermediateRepresentation) -> str:
        prompt = f"""
You are a professional developer with deep knowledge of SAS and {self.target_language}.
Translate the following SAS code to full, idiomatic, and functional {self.target_language} code.

- Do NOT simplify or use placeholders like 'pass'.
- Translate all operations and logic as completely as possible.
- Use equivalent libraries (e.g. pandas for Python, dplyr for R) when needed.
- Assume access to datasets like 'sashelp.class'.
- If macros exist, implement equivalent functions or preprocessing logic.


SAS code:
{ir.code}

Structure:
{ir.structure}

Macros:
{ir.macros}

Now translate the full logic to {self.target_language}:
"""
        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3
        )
        raw_code = response.choices[0].message.content.strip()

        cleaned = re.sub(r"(?i)^.*?```(?:python|r)?\\n", "", raw_code)
        cleaned = re.sub(r"```.*$", "", cleaned, flags=re.DOTALL)
        cleaned = re.sub(r"^\s*(Python|R) code:\s*", "", cleaned, flags=re.IGNORECASE)

        return cleaned.strip()

class CodeValidator:
    def validate(self, original_code: str, translated_code: str) -> str:
        diff = difflib.unified_diff(
            original_code.splitlines(),
            translated_code.splitlines(),
            fromfile='original',
            tofile='translated',
            lineterm=''
        )
        return '\n'.join(diff)

    def run_translated_code(self, code: str, language: str) -> str:
        suffix = '.py' if language.lower() == 'python' else '.r'
        with tempfile.NamedTemporaryFile(mode='w+', suffix=suffix, delete=False) as temp_file:
            temp_file.write(code)
            temp_file.flush()
            try:
                if language.lower() == 'python':
                    result = subprocess.run(['python3', temp_file.name], capture_output=True, text=True)
                elif language.lower() == 'r':
                    result = subprocess.run(['Rscript', temp_file.name], capture_output=True, text=True)
                else:
                    return f"Unsupported language: {language}"
                return result.stdout + result.stderr
            finally:
                os.remove(temp_file.name)

    def validate_functional_equivalence(self, df_expected: pd.DataFrame, df_actual: pd.DataFrame) -> str:
        try:
            pd.testing.assert_frame_equal(df_expected.reset_index(drop=True), df_actual.reset_index(drop=True))
            return "✅ DataFrames are functionally equivalent."
        except AssertionError as e:
            return f"❌ Functional difference detected:\n{str(e)}"

# --- Ejemplo de uso ---

def run_translation(sas_code: str, target_lang: str):
    parser = SASParser()
    parsed = parser.parse(sas_code)
    ir = IntermediateRepresentation(parsed)
    print("\n[IR Summary]\n", ir.describe())

    validator = CodeValidator()
    print(f"\nTranslating to {target_lang} with OpenAI...")
    generator = CodeGenerator(target_lang)
    translated_code = generator.generate(ir)
    print("\nTranslated code:\n", translated_code)

    diff_report = validator.validate(sas_code, translated_code)
    print("\nDiff report:\n", diff_report)

    print(f"\nRunning {target_lang} code...")
    output = validator.run_translated_code(translated_code, target_lang)
    print("\nExecution output:\n", output)

if __name__ == "__main__":
    sas_code = """
    %macro example();
      data filtered;
        set sashelp.class;
        where age > 13;
        keep name age height;
      run;
    %mend example;
    %example;
    """
    run_translation(sas_code, target_lang="Python")



[IR Summary]
 IR with 0 block(s) and 1 macro(s):
Structure: []
Macros: [{'type': 'macro', 'name': 'example', 'definition': ['data filtered;', 'set sashelp.class;', 'where age > 13;', 'keep name age height;', 'run;']}]

Translating to Python with OpenAI...

Translated code:
 

Diff report:
 --- original
+++ translated
@@ -1,10 +0,0 @@
-
-    %macro example();
-      data filtered;
-        set sashelp.class;
-        where age > 13;
-        keep name age height;
-      run;
-    %mend example;
-    %example;
-    

Running Python code...

Execution output:
 
