In [5]:
import stim
import numpy as np

# --- Define environment ---
class QLDPCEnv:
    def __init__(self, p=0.05):
        self.p = p
        self.num_data = 12
        self.num_ancillas = 6
        self.reset()

        # Define stabilizers: list of data qubit indices per ancilla
        self.stabilizers = [
            [0, 1, 2],    # Ancilla 12
            [2, 3, 4],    # Ancilla 13
            [1, 4, 5],    # Ancilla 14
            [5, 6, 7],    # Ancilla 15
            [7, 8, 9],    # Ancilla 16
            [9, 10, 11]   # Ancilla 17
        ]

        # Ancilla qubit numbers
        self.ancillas = [12, 13, 14, 15, 16, 17]

    def reset(self):
        # Initialize Stim circuit for first round
        self.circuit = stim.Circuit(f"""
            H 0 1 2 3 4 5 6 7 8 9 10 11
        """)
        self.state = np.zeros(self.num_ancillas)
        return self.state

    def step(self, action):
        """
        action: binary vector of length num_ancillas indicating which stabilizers to measure
        """
        # Build circuit dynamically based on action
        for i, measure in enumerate(action):
            if measure:
                data_qubits = self.stabilizers[i]
                ancilla = self.ancillas[i]

                # Apply CX gates from data qubits to ancilla
                for q in data_qubits:
                    self.circuit.append_operation("CX", [q, ancilla])

                # Measure ancilla
                self.circuit.append_operation("M", [ancilla])

                # Detector referencing the most recent measurement
                self.circuit.append_operation("DETECTOR", ["rec[-1]"])

        # Apply Z-biased noise on all data qubits
        self.circuit.append_operation("Z_ERROR", list(range(self.num_data)), [self.p])  # <-- FIXED

        # Run simulator for this round
        sampler = self.circuit.compile_detector_sampler()
        det = sampler.sample(1)  # single-shot
        # Only last round of detectors
        self.state = det[0, -self.num_ancillas:]
        # update state

        # Reward: +1 for no detector triggered, -0.1 per stabilizer measured
        reward = 1 - 0.1*np.sum(action) - np.any(det)

        done = False  # could define episode length
        return self.state, reward, done



# --- RL skeleton ---
env = QLDPCEnv(p=0.05)
state = env.reset()

for t in range(10):  # 10 cycles
    # Example: random policy
    action = np.random.choice([0,1], size=env.num_ancillas)
    next_state, reward, done = env.step(action)
    print(f"Step {t}, reward={reward}, detectors={next_state}")
    state = next_state


Step 0, reward=0.8, detectors=[False False]
Step 1, reward=-0.30000000000000004, detectors=[False  True False False False]
Step 2, reward=-0.30000000000000004, detectors=[False False False  True False  True]
Step 3, reward=-0.09999999999999998, detectors=[False  True False False False False]
Step 4, reward=-0.30000000000000004, detectors=[False False False False  True False]
Step 5, reward=-0.30000000000000004, detectors=[ True  True False False False  True]
Step 6, reward=-0.30000000000000004, detectors=[False False  True  True False False]
Step 7, reward=-0.4, detectors=[ True False  True False False False]
Step 8, reward=-0.09999999999999998, detectors=[False  True False False False False]
Step 9, reward=-0.09999999999999998, detectors=[False False False False False False]


In [12]:
import stim
import numpy as np

# --- Define environment ---
class QLDPCEnv:
    def __init__(self, p=0.05, max_cycles=10):
        self.p = p
        self.num_data = 12
        self.num_ancillas = 6
        self.max_cycles = max_cycles

        # Define stabilizers: list of data qubit indices per ancilla
        self.stabilizers = [
            [0, 1, 2],    # Ancilla 12
            [2, 3, 4],    # Ancilla 13
            [1, 4, 5],    # Ancilla 14
            [5, 6, 7],    # Ancilla 15
            [7, 8, 9],    # Ancilla 16
            [9, 10, 11]   # Ancilla 17
        ]

        # Ancilla qubit numbers
        self.ancillas = [12, 13, 14, 15, 16, 17]

        # Define logical X operator as X0 X3 X6 X9
        self.logical_x = [0, 3, 6, 9]

        self.reset()

    def reset(self):
        # Initialize Stim circuit for first cycle
        self.circuit = stim.Circuit()
        # Prepare all data qubits in |+> for X logical
        self.circuit.append_operation("H", list(range(self.num_data)))

        # No previous detector info
        self.state = np.zeros(self.num_ancillas, dtype=bool)
        self.current_cycle = 0
        self.done = False
        return self.state

    def step(self, action):
        """
        action: binary vector of length num_ancillas indicating which stabilizers to measure
        """
        if self.done:
            raise ValueError("Environment is done. Call reset() before next step.")

        # --- Stabilizer measurements ---
        for i, measure in enumerate(action):
            if measure:
                data_qubits = self.stabilizers[i]
                ancilla = self.ancillas[i]

                # CX gates from data qubits to ancilla
                for q in data_qubits:
                    self.circuit.append_operation("CX", [q, ancilla])

                # Measure ancilla and add detector
                self.circuit.append_operation("M", [ancilla])
                self.circuit.append_operation("DETECTOR", [stim.target_rec(-1)])

        # --- Z-biased noise on all data qubits ---
        for q in range(self.num_data):
            self.circuit.append_operation("Z_ERROR", [q], [self.p])

        # --- Logical X measurement ---
        self.circuit.append_operation("M", self.logical_x)

        # Reference last n measurement records for logical observable
        n = len(self.logical_x)
        obs_targets = [stim.target_rec(-n + i) for i in range(n)]
        self.circuit.append_operation("OBSERVABLE_INCLUDE", obs_targets)

        # --- Compile and sample ---
        sampler = self.circuit.compile_detector_sampler()
        sample = sampler.sample(1)  # shape: (shots, num_detectors + num_observables)

        num_detectors = sampler.num_detectors
        num_observables = sampler.num_observables

        # --- Last-cycle detector state ---
        if len(sample[0]) >= self.num_ancillas:
            self.state = sample[0, -self.num_ancillas:]
        else:
            self.state = np.zeros(self.num_ancillas, dtype=bool)

        # --- Reward: logical X unchanged minus stabilizer cost ---
        logical_flip = sample[0, num_detectors]  # True if logical flipped
        stabilizer_cost = 0.1 * np.sum(action)
        reward = 1.0 - stabilizer_cost - float(logical_flip)

        # --- Update cycle ---
        self.current_cycle += 1
        if self.current_cycle >= self.max_cycles:
            self.done = True

        return self.state, reward, self.done


# --- Example usage with random policy ---
env = QLDPCEnv(p=0.05, max_cycles=10)
state = env.reset()

for t in range(10):
    # Random stabilizer policy
    action = np.random.choice([0, 1], size=env.num_ancillas)
    next_state, reward, done = env.step(action)
    print(f"Step {t}, reward={reward:.3f}, detectors={next_state}")
    state = next_state
    if done:
        break


TypeError: compile_detector_sampler(): incompatible function arguments. The following argument types are supported:
    1. (self: stim._stim_polyfill.Circuit, *, seed: object = None) -> stim._stim_polyfill.CompiledDetectorSampler

Invoked with: stim.Circuit('''
    H 0 1 2 3 4 5 6 7 8 9 10 11
    CX 0 12 1 12 2 12
    M 12
    DETECTOR rec[-1]
    CX 7 16 8 16 9 16
    M 16
    DETECTOR rec[-1]
    CX 9 17 10 17 11 17
    M 17
    DETECTOR rec[-1]
    Z_ERROR(0.05) 0 1 2 3 4 5 6 7 8 9 10 11
    M 0 3 6 9
    OBSERVABLE_INCLUDE(0) rec[-4] rec[-3] rec[-2] rec[-1]
'''); kwargs: add_observables=True