<a href="https://colab.research.google.com/github/nosadchiy/public/blob/main/RustMDPQuadratic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import numpy as np
from scipy.optimize import minimize

##################################################
# MDP class with quadratic usage cost
##################################################

class ReplacementMDP:
    """
    Toy machine-replacement MDP with two actions and a quadratic usage cost.

    Per-period costs:
      keep    (a=0): alpha_1 * s + alpha_2 * s^2
      replace (a=1): cost_replace

    States are the integers 0 .. S-1. Transitions are deterministic.
    """

    def __init__(self,
                 S=5,                  # number of discrete states
                 beta=0.95,            # discount factor
                 cost_replace=10.0,    # replacement cost
                 alpha_1=1.0,          # linear component of usage cost
                 alpha_2=0.1           # quadratic component of usage cost
                 ):
        self.S, self.beta = S, beta
        self.cost_replace = cost_replace
        self.alpha_1, self.alpha_2 = alpha_1, alpha_2

    def rewards(self, s, a):
        """
        Return the one-period reward (the negated cost) of action `a` in state `s`.

        a == 0 is "continue"; any other value of `a` is treated as "replace".
        """
        if a == 0:
            return -(self.alpha_1 * s + self.alpha_2 * (s**2))
        return -self.cost_replace

    def transition_probs(self, s, a):
        """
        Return a length-S probability vector over next states.

        All mass sits on a single state: min(s+1, S-1) when continuing (a == 0),
        and state 0 after a replacement (any other `a`).
        """
        landing = min(s + 1, self.S - 1) if a == 0 else 0
        dist = np.zeros(self.S)
        dist[landing] = 1.0
        return dist

##################################################
# Value iteration for this new MDP
##################################################

def solve_value_function(mdp, tol=1e-8, max_iter=1000):
    """
    Solve a two-action MDP by value iteration.

    Parameters
    ----------
    mdp : object exposing `S` (number of states), `beta` (discount factor),
        `rewards(s, a)` and `transition_probs(s, a)` (length-S vector).
    tol : stop once the sup-norm change in V between sweeps falls below this.
    max_iter : maximum number of Bellman sweeps.

    Returns
    -------
    (V, Q) : V has shape (S,), Q has shape (S, 2). Q is the action-value
        table computed from the previous sweep's V, and V = max_a Q[:, a].
        If `max_iter <= 0` no sweep is run and both are all zeros.

    Note: if the loop exhausts `max_iter` without meeting `tol`, the last
    iterate is returned without any signal; callers needing a guarantee
    should raise `max_iter` or loosen `tol`.
    """
    S = mdp.S

    # Rewards and transition vectors do not depend on V, so evaluate them
    # once instead of on every sweep.
    R = np.empty((S, 2))
    P = np.empty((S, 2, S))
    for s in range(S):
        for a in (0, 1):
            R[s, a] = mdp.rewards(s, a)
            P[s, a] = mdp.transition_probs(s, a)

    V = np.zeros(S)
    # Defined up front so the return below is valid even when the loop body
    # never runs (max_iter <= 0).
    Q = np.zeros((S, 2))

    for _ in range(max_iter):
        V_old = V
        # (S, 2, S) @ (S,) -> (S, 2): expected continuation value per (s, a).
        Q = R + mdp.beta * (P @ V_old)
        V = np.max(Q, axis=1)
        if np.max(np.abs(V - V_old)) < tol:
            break
    return V, Q

def choice_probabilities(Q, mu=1.0):
    """
    Convert action values into logit (softmax) choice probabilities.

    Parameters
    ----------
    Q : array-like of shape (n_states, n_actions) holding action values.
    mu : scale parameter; each row of Q is divided by `mu` before the softmax.

    Returns
    -------
    Array of the same shape whose rows sum to 1.
    """
    scaled = np.asarray(Q, dtype=float) / mu
    # Softmax is invariant to a per-row shift. Subtracting the row maximum
    # keeps every exponent <= 0, which prevents overflow for large Q and the
    # 0/0 = NaN that arises when every entry of a row underflows.
    scaled = scaled - np.max(scaled, axis=1, keepdims=True)
    exp_Q = np.exp(scaled)
    denom = np.sum(exp_Q, axis=1, keepdims=True)
    return exp_Q / denom

##################################################
# Log-likelihood with the new cost structure
##################################################

def log_likelihood(theta, data, S=5, beta=0.95, mu=1.0):
    """
    Negative log-likelihood of observed (state, action) pairs.

    theta = [cost_replace, alpha_1, alpha_2]
    data  = iterable of (s_obs, a_obs) pairs used to index the choice
            probability table.

    Returns the NEGATIVE log-likelihood (suitable for a minimiser), or the
    constant penalty 1e8 when cost_replace <= 0 or alpha_2 < 0.
    """
    replace_cost, lin_coef, quad_coef = theta

    # Out-of-range parameters get a flat penalty instead of a model solve.
    if replace_cost <= 0 or quad_coef < 0:
        return 1e8

    model = ReplacementMDP(S=S, beta=beta, cost_replace=replace_cost,
                           alpha_1=lin_coef, alpha_2=quad_coef)

    # Only the action-value table is needed; the value function is discarded.
    q_table = solve_value_function(model)[1]
    probs = choice_probabilities(q_table, mu=mu)

    total = 0.0
    for state, action in data:
        # The 1e-12 offset keeps the log finite if a probability is exactly 0.
        total += np.log(probs[state, action] + 1e-12)

    return -total

##################################################
# Simulate data with a quadratic cost
##################################################

def simulate_data(mdp, n=1000, seed=42, mu=1.0):
    """
    Simulate one trajectory of `n` (state, action) observations from `mdp`.

    The global NumPy RNG is seeded with `seed`. The path starts in state 0;
    each action is drawn from the logit choice probabilities at the current
    state, and the next state is drawn from `mdp.transition_probs`.

    Returns a list of (state, action) tuples of length `n`.
    """
    np.random.seed(seed)
    policy = choice_probabilities(solve_value_function(mdp)[1], mu=mu)

    history = []
    state = 0
    for _step in range(n):
        action = np.random.choice([0, 1], p=policy[state])
        history.append((state, action))
        state = np.random.choice(mdp.S, p=mdp.transition_probs(state, action))
    return history

##################################################
# Example: "true" parameters and estimation
##################################################

def estimate_parameters(data, S=5, beta=0.95, mu=1.0):
    """
    Fit [cost_replace, alpha_1, alpha_2] by minimising the negative
    log-likelihood of `data` with bounded L-BFGS-B.

    Returns the result object produced by `scipy.optimize.minimize`.
    """
    # Starting point, ordered as [cost_replace, alpha_1, alpha_2].
    start = np.array([8.0, 1.0, 0.05])
    # cost_replace is kept strictly positive; both alphas are non-negative.
    box = [(1e-3, None), (0.0, None), (0.0, None)]

    return minimize(
        lambda theta: log_likelihood(theta, data, S=S, beta=beta, mu=mu),
        start,
        method='L-BFGS-B',
        bounds=box,
    )

if __name__ == "__main__":
    # Ground-truth parameters used to generate the synthetic sample.
    true_cost_replace, true_alpha_1, true_alpha_2 = 12.0, 1, 0.2
    beta, mu, S = 0.95, 1.0, 10

    # Data-generating model with the quadratic usage cost.
    true_mdp = ReplacementMDP(
        S=S,
        beta=beta,
        cost_replace=true_cost_replace,
        alpha_1=true_alpha_1,
        alpha_2=true_alpha_2,
    )

    # Draw one simulated trajectory of 2000 observations.
    data = simulate_data(true_mdp, n=2000, seed=123, mu=mu)

    # Recover the parameters from the simulated sample and report the fit.
    est_result = estimate_parameters(data, S=S, beta=beta, mu=mu)
    print("Estimation results:")
    print("  success:", est_result.success)
    print("  estimated parameters (cost_replace, alpha_1, alpha_2):", est_result.x)
    print("  negative log-likelihood:", est_result.fun)
    print("  message:", est_result.message)


Estimation results:
  success: True
  estimated parameters (cost_replace, alpha_1, alpha_2): [14.46547548  1.88793709  0.02108606]
  negative log-likelihood: 553.6267512119392
  message: CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH
