# In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# 1. Environment and Parameters
# Discount factor: multiplies the predicted next-state value in the Bellman
# backup (reward + gamma * V(s_next)).
gamma = 0.95
# NOTE(review): not referenced anywhere in the visible code (the training call
# passes its own m_samples) — confirm whether this is still needed.
num_samples = 1000
# States represented by [pos_x, pos_y, vel_x, vel_y]

# 2. Simulate Dynamics (Model of the MDP) 
def get_next_state(s, a, noise_std=0.01):
    """Sample the successor state of the simplified point-mass dynamics.

    The position advances by the current velocity, the chosen action
    optionally accelerates one velocity component by 0.1, and zero-mean
    Gaussian noise is added to every component.

    Args:
        s: Current state ``[pos_x, pos_y, vel_x, vel_y]``. Any length-4
            array-like (list, tuple, or ndarray of any numeric dtype).
        a: Action. 0 = stay, 1 = accelerate along x, 2 = accelerate along y.
            Any other value leaves the velocity unchanged.
        noise_std: Standard deviation of the additive Gaussian noise drawn
            from the global NumPy random state. Pass 0 for deterministic
            dynamics.

    Returns:
        A new float ndarray of shape (4,). ``s`` itself is not modified.
    """
    # Work on a float copy. With an integer array the element-wise `+= 0.1`
    # would be cast back to the integer dtype and the acceleration silently
    # lost; a plain list would not support the slice arithmetic at all.
    new_s = np.array(s, dtype=float)
    new_s[0:2] += new_s[2:4]  # position update uses the pre-action velocity

    # Actions are mutually exclusive, so at most one branch applies.
    if a == 1:
        new_s[2] += 0.1
    elif a == 2:
        new_s[3] += 0.1

    return new_s + np.random.normal(0, noise_std, size=4)  # noise epsilon

# 3. Fitted Value Iteration 
def fitted_value_iteration(m_samples=100, iterations=10):
    """Approximate the value function with fitted value iteration.

    Draws ``m_samples`` random states uniformly from [-5, 5]^4, then
    alternates between (a) computing Bellman targets
    ``max_a [R(s) + gamma * V(s')]`` with the current model and
    (b) refitting a linear regression on those targets.

    Relies on the module-level ``gamma`` (discount factor) and
    ``get_next_state`` (stochastic dynamics model).

    Args:
        m_samples: Number of states sampled once up front; must be >= 1.
        iterations: Number of Bellman backup + refit rounds. With 0 the
            returned model represents the initial guess V(s) = 0.

    Returns:
        A fitted ``LinearRegression`` mapping a state to its estimated
        value. The model always reflects the most recent backup.

    Raises:
        ValueError: If ``m_samples`` is less than 1.
    """
    if m_samples < 1:
        raise ValueError("m_samples must be at least 1")

    actions = (0, 1, 2)

    # Sample random states
    states = np.random.uniform(-5, 5, size=(m_samples, 4))

    # R is simplified here (negative distance of the position to the origin).
    # It depends only on the current state, so compute it once up front.
    rewards = -np.linalg.norm(states[:, 0:2], axis=1)

    # Initialize V(s) = 0 so the model is usable even when iterations == 0.
    V_model = LinearRegression()
    V_model.fit(states, np.zeros(m_samples))

    for _ in range(iterations):
        # One sampled successor per (state, action) pair serves as a
        # single-sample estimate of the expectation over next states.
        # State-major / action-minor order keeps the row layout compatible
        # with the reshape below.
        next_states = np.array(
            [get_next_state(s, a) for s in states for a in actions]
        )

        # A single batched predict call replaces one call per pair.
        v_next = V_model.predict(next_states).reshape(m_samples, len(actions))

        # Bellman backup: max_a (R + gamma * V(s_next))
        y = (rewards[:, np.newaxis] + gamma * v_next).max(axis=1)

        # Refit immediately so the returned model includes this backup;
        # fitting only at the top of the loop would drop the final targets.
        V_model.fit(states, y)

    return V_model

# 4. Training
# Runs at module level: fits the value model on 200 sampled states for
# 5 iterations, then prints the learned linear coefficients. Only coef_ is
# shown; the regression's intercept term is not part of this output.
model = fitted_value_iteration(m_samples=200, iterations=5)
print("Fitted Value Function (Model Weights):", model.coef_)