In [1]:
import re

def extract_pixi_home(script_lines) -> str:
    """
    Find the value exported as PIXI_HOME in a shell script.

    Every line is cleaned first: a trailing comment, a leading YAML list
    marker ("- ") and one pair of quotes wrapping the whole command are
    removed. Lines of the form ``export NAME=VALUE`` are then recorded in
    order, and references to variables exported earlier in the same script
    ($NAME or ${NAME}) are substituted into later values. References to names
    the script never exports (e.g. $HOME) are left in the result unchanged.

    Args:
        script_lines: Iterable of script lines, or a single string holding
            the whole script (it is split on line boundaries).

    Returns:
        The resolved PIXI_HOME value.

    Raises:
        ValueError: If the script exports no non-empty PIXI_HOME.
    """
    # Accept the whole script as one string; iterating a str directly would
    # walk it character by character and never match anything.
    if isinstance(script_lines, str):
        script_lines = script_lines.splitlines()

    # export VAR_NAME=VALUE  ->  (Group 1) name, (Group 2) raw value
    export_pattern = re.compile(r'^\s*export\s+([a-zA-Z_][a-zA-Z0-9_]*)=(.*)')

    # ${VAR} (Group 1) or $VAR (Group 2). The braces must be balanced so that
    # a literal "}" after $VAR is kept and forms such as ${VAR:-default} are
    # left alone instead of being half-substituted.
    var_sub_pattern = re.compile(
        r'\$(?:\{([a-zA-Z_][a-zA-Z0-9_]*)\}|([a-zA-Z_][a-zA-Z0-9_]*))'
    )

    # As in the shell, "#" only opens a comment at the start of the line or
    # after whitespace; a "#" inside a word (e.g. /data/proj#1) is data.
    # Not quote-aware: a " #" inside a quoted value is still cut off.
    comment_pattern = re.compile(r'(?:^|\s)#.*')

    env_vars = {}

    def strip_quotes(text):
        """Drop one pair of matching quotes wrapping the whole text."""
        if len(text) >= 2 and text[0] == text[-1] and text[0] in "'\"":
            return text[1:-1]
        return text

    def resolve_match(m):
        """Return the known value of the variable, else the reference as-is."""
        var_name = m.group(1) or m.group(2)
        return env_vars.get(var_name, m.group(0))

    for line in script_lines:
        # 1. Remove the comment and surrounding whitespace
        line = comment_pattern.sub('', line).strip()

        # 2. Undo YAML formatting artifacts such as "- 'export ...'"
        if line.startswith("- "):
            line = line[2:].strip()
        line = strip_quotes(line).strip()

        if not line:
            continue

        # 3. Only export statements carry state we track
        match = export_pattern.match(line)
        if not match:
            continue
        key, raw_value = match.groups()

        # 4. Unquote the value, then substitute variables exported so far.
        # Later exports of the same name overwrite earlier ones.
        value = strip_quotes(raw_value.strip())
        env_vars[key] = var_sub_pattern.sub(resolve_match, value)

    # 5. Retrieve result (an empty value counts as missing)
    pixi_home = env_vars.get("PIXI_HOME")

    if not pixi_home:
        raise ValueError("PIXI_HOME not found in cluster configuration script")

    return pixi_home

In [2]:
# Sample cluster configuration: the `script:` section of the YAML file, kept
# as raw text so the parser is exercised on list markers, quotes and comments.
raw_yaml_script = """
  script:
    - 'module load ML-bundle/25.04'
    - 'export PROJECT_HOME_PATH=$PLG_GROUPS_STORAGE/plggllmeffi/nano'
    - 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache'
    
    # hydra errors
    - 'export HYDRA_FULL_ERROR=1'
    
    # export pixi variables
    - 'export PIXI_HOME=$PROJECT_HOME_PATH/pixi'
    - 'export PATH="$HOME/.pixi/bin:$PATH"'
    - 'export XDG_DATA_HOME="$PROJECT_HOME_PATH/data"'
    - 'export XDG_CACHE_HOME="$PROJECT_HOME_PATH/cache"'
    - 'export XDG_STATE_HOME="$PROJECT_HOME_PATH/state"'
    
    # activate pixi
    - 'cd "$PIXI_HOME"'
    - 'eval "$(pixi shell-hook)"'
    - 'cd -'
"""

# One entry per line of the stripped text; blank, comment and non-export
# lines are passed through and skipped by extract_pixi_home itself.
script_lines = raw_yaml_script.strip().splitlines()

# Run and print
print(f"Input lines processed: {len(script_lines)}")
result = extract_pixi_home(script_lines)
print(f"Result: {result}")

Input lines processed: 19
Result: $PLG_GROUPS_STORAGE/plggllmeffi/nano/pixi
