In [1]:
# -*- coding: utf-8 -*-
"""
Created on Thu Jul  2 17:52:55 2020

@author: lusun
"""
import numpy as np
import time
from scipy.optimize import minimize, Bounds, LinearConstraint

from scipy.special import logsumexp,softmax
#from scipy import interpolate
import warnings
warnings.filterwarnings('ignore')

class DynamicFunction:
    """Belief-space model of a 2-state / 3-action / 2-observation POMDP.

    The belief is summarised by a scalar ``x`` (weight on state 0, with
    ``1 - x`` on state 1) and discretised on the grid ``self.xSpace``.
    Two agents share the transition matrix and rewards but differ in the
    observation matrix they use for belief updates: agent ``1`` uses
    ``obsTrueMat`` and any other agent id uses ``obsMat``.
    """

    def __init__(self, transMat, obsMat,obsTrueMat, Reward):
        """Store the model primitives and the solver settings.

        Parameters
        ----------
        transMat : array indexed as ``[action, state, next_state]``.
        obsMat : array indexed as ``[next_state, observation]``; used for
            belief updates of every agent other than agent 1.
        obsTrueMat : same layout as ``obsMat``; used for agent 1 and for
            simulating states/observations in ``GenerateData``.
        Reward : array indexed as ``[state, action]``.
        """
        self.aDim = 3                                # number of actions
        self.sDim = 2                                # number of hidden states
        self.zDim = 2                                # number of observations
        self.bs = 21                                 # number of belief grid points
        self.xSpace = np.linspace(0,1,self.bs)       # belief grid on [0, 1]
        self.error = 1e-6                            # sup-norm stopping tolerance for funcQstar
        self.beta = 0.85                             # weight on the continuation value
        self.rho = 1                                 # scale used in logsumexp / softmax
        self.transMat = transMat
        self.obsMat = obsMat
        self.obsTrueMat = obsTrueMat
        self.thetaOne = Reward

    def funcReward(self,x,action):
        """Return the belief-weighted reward of ``action`` at belief ``x``."""
        belief = [x,1-x]
        return np.dot(belief,self.thetaOne[:,action])

    def dynamicMat(self,observationMat):
        """Build the joint next-state/observation tensor.

        Returns an array ``D`` of shape ``(sDim, aDim, sDim, zDim)`` with
        ``D[s, a, ss, z] = transMat[a, s, ss] * observationMat[ss, z]``.
        """
        # Restrict to the model dimensions, mirroring explicit index ranges.
        trans = np.asarray(self.transMat, dtype=float)[:self.aDim, :self.sDim, :self.sDim]
        obs = np.asarray(observationMat, dtype=float)[:self.sDim, :self.zDim]
        # (s, a, ss, 1) * (1, 1, ss, z) -> (s, a, ss, z)
        return trans.transpose(1, 0, 2)[:, :, :, np.newaxis] * obs[np.newaxis, np.newaxis, :, :]

    def funcLambdaSigma(self,x,action,observation,agent):
        """Update belief ``x`` after taking ``action`` and seeing ``observation``.

        Parameters
        ----------
        x : scalar belief weight on state 0.
        action, observation : integer indices.
        agent : ``1`` selects ``obsTrueMat``; anything else selects ``obsMat``.

        Returns
        -------
        (posterior, sigma) : ``posterior`` is the normalised belief over next
            states and ``sigma`` is the normalising constant (the sum of the
            unnormalised posterior).  When ``sigma`` is zero the posterior is
            NaN (0/0); callers index the belief grid with ``np.argmin``, which
            then yields index 0.
        """
        belief = [x,1-x]
        if agent==1:
            thetaTwo = self.dynamicMat(self.obsTrueMat)
        else:
            thetaTwo = self.dynamicMat(self.obsMat)
        lambdTep = np.dot(belief,thetaTwo[:,action,:,observation])
        sigma = sum(lambdTep)
        return lambdTep/sigma,sigma

    def funcQstar(self,agent):
        """Iterate the soft Bellman operator on the belief grid until convergence.

        Each sweep computes, for every grid belief ``x`` and action ``a``,
        ``Q0[x, a] + beta * rho * sum_z sigma(x, a, z) * logsumexp(Q[x', :] / rho)``
        where ``x'`` is the grid point nearest to the updated belief.

        Returns
        -------
        (Q, index) : the final ``(len(xSpace), aDim)`` table and the number of
            sweeps performed.
        """
        nGrid = len(self.xSpace)
        Q0 = np.array([[self.funcReward(x,action)for action in range(self.aDim)]for x in self.xSpace])

        # The belief transitions do not depend on Q, so compute them once
        # instead of on every sweep.
        nextIndex = np.zeros((nGrid, self.aDim, self.zDim), dtype=int)
        obsProb = np.zeros((nGrid, self.aDim, self.zDim))
        for i,x in enumerate(self.xSpace):
            for a in range(self.aDim):
                for z in range(self.zDim):
                    lambd,sigma = self.funcLambdaSigma(x,a,z,agent)
                    nextIndex[i,a,z] = np.argmin(abs(self.xSpace-lambd[0]))
                    obsProb[i,a,z] = sigma

        Qold = np.zeros(np.shape(Q0))
        Qnew = Q0
        index = 0
        while (abs(Qnew-Qold).max() >self.error):
            Qold = Qnew
            # Soft value of every grid belief under the previous table.
            softValue = logsumexp(Qold/self.rho, axis=1)
            continuation = np.sum(obsProb * softValue[nextIndex], axis=2)
            Qnew = Q0 + self.beta *self.rho *continuation
            index += 1
        return Qnew,index

    def funcPax(self,x,Qstar):
        """Return the softmax action probabilities at the grid point nearest ``x``."""
        x_index = np.argmin(abs(self.xSpace-x))
        return softmax(Qstar[x_index,:]/self.rho)

    def GenerateData(self,beliefSize,sampleSize,timeLength,sampleRatio,seed=None):
        """Simulate action/observation paths for both agents.

        Initial beliefs are ``beliefSize`` equally spaced points on [0, 1],
        each repeated ``sampleSize`` times.  The first
        ``int(beliefSize * sampleSize * sampleRatio)`` paths are generated by
        agent 1 and the remainder by agent 2.  States and observations are
        always drawn from the dynamics built with ``obsTrueMat``; each agent
        updates its belief with its own observation matrix.

        Parameters
        ----------
        beliefSize, sampleSize, timeLength : non-negative integers.
        sampleRatio : fraction in [0, 1] of paths assigned to agent 1.
        seed : optional seed.  ``None`` (default) draws from the global
            ``np.random`` state; otherwise a private
            ``np.random.RandomState(seed)`` is used so the sample is
            reproducible.

        Returns
        -------
        (beliefIni, aData1, zData1, xData1, sData1,
         aData2, zData2, xData2, sData2)

        Raises
        ------
        ValueError : if ``sampleRatio`` is outside [0, 1].
        """
        if not 0 <= sampleRatio <= 1:
            raise ValueError('sampleRatio must lie in [0, 1]')

        rng = np.random if seed is None else np.random.RandomState(seed)

        QStar1,_ = self.funcQstar(1)
        QStar2,_ = self.funcQstar(2)
        trueDynamics = self.dynamicMat(self.obsTrueMat)
        jointDim = self.sDim*self.zDim

        def OnePath(x0,agent):
            """Simulate one path; returns (beliefs, states, actions, observations)."""
            QStar = QStar1 if agent == 1 else QStar2
            xTep = [x0]
            sTep = [rng.choice(np.arange(self.sDim),p=[x0,1-x0])]
            aTep = []
            zTep = []
            for _ in range(timeLength):
                xt = xTep[-1]
                at = rng.choice(np.arange(self.aDim),p=self.funcPax(xt,QStar))
                aTep.append(at)

                # Draw (next state, observation) jointly from the flattened
                # true dynamics, then decode the flat index.
                jointProb = trueDynamics[sTep[-1],at,:,:].ravel()
                res = rng.choice(np.arange(jointDim),p=jointProb)
                sTep.append(res//self.zDim)
                zT = res%self.zDim
                zTep.append(zT)

                posterior,_ = self.funcLambdaSigma(xt,at,zT,agent)
                xTep.append(posterior[0])
            return xTep,sTep,aTep,zTep

        def simulate(agent,initialBeliefs):
            """Simulate one path per initial belief and stack the results."""
            count = len(initialBeliefs)
            xData = np.zeros([count,timeLength+1])
            sData = np.zeros([count,timeLength+1],dtype = int)
            aData = np.zeros([count,timeLength],dtype = int)
            zData = np.zeros([count,timeLength],dtype = int)
            for row,x0 in enumerate(initialBeliefs):
                xData[row,:],sData[row,:],aData[row,:],zData[row,:] = OnePath(x0,agent)
            return aData,zData,xData,sData

        totalSample = beliefSize * sampleSize
        beliefIni = [ele for ele in np.linspace(0,1,beliefSize) for _ in range(sampleSize)]
        agent1Samp = int (totalSample * sampleRatio)

        # Agent 1 is simulated first so the random draws follow a fixed order.
        aData1,zData1,xData1,sData1 = simulate(1,beliefIni[0:agent1Samp])
        aData2,zData2,xData2,sData2 = simulate(2,beliefIni[agent1Samp:totalSample])

        return beliefIni,aData1,zData1,xData1,sData1,aData2,zData2,xData2,sData2

##############################RewardFunction########################################
def LoglikeReward(thetaOne,transMat,obsMat,obsTrueMat,beliefIni,aData1,zData1,aData2,zData2,ratio):
    """Negative log-likelihood of the observed actions for a candidate reward.

    For every path the belief starts at the recorded initial belief and is
    updated with the agent's own observation matrix after each
    (action, observation) pair; the log of the softmax probability of each
    observed action is accumulated.

    Parameters
    ----------
    thetaOne : flat sequence of ``sDim * aDim`` reward entries, reshaped
        row-major to ``[sDim, aDim]``.
    transMat, obsMat, obsTrueMat : forwarded to ``DynamicFunction``.
    beliefIni : initial beliefs; the first ``len(aData1)`` entries belong to
        agent 1, the following ones to agent 2.
    aData1, zData1 : per-path actions / observations for agent 1.
    aData2, zData2 : per-path actions / observations for agent 2.
    ratio : accepted for interface compatibility; not read here.

    Returns
    -------
    The negated sum of log action probabilities (0 when both data sets are
    empty).
    """
    aDim = 3
    sDim = 2

    reward = np.reshape(thetaOne,[sDim,aDim])
    agent = DynamicFunction(transMat, obsMat,obsTrueMat, reward)

    totalSample = len(beliefIni)
    agent1Samp = len(aData1)
    datasets = [
        (1, beliefIni[0:agent1Samp], aData1, zData1),
        (2, beliefIni[agent1Samp:totalSample], aData2, zData2),
    ]

    loglike = 0
    for agentId, beliefs, actions, observations in datasets:
        # An agent without paths contributes nothing; skipping it also avoids
        # solving for a Q-table that would never be used and avoids indexing
        # into an empty data set.
        if len(actions) == 0:
            continue
        QStar,_ = agent.funcQstar(agentId)
        for ss in range(len(actions)):
            xOld = beliefs[ss]
            for tl in range(len(actions[ss])):
                aOld = actions[ss][tl]
                loglike += np.log(agent.funcPax(xOld,QStar)[aOld])
                # Advance the belief with the agent's own observation model.
                xTep,_ = agent.funcLambdaSigma(xOld,aOld,observations[ss][tl],agentId)
                xOld = xTep[0]
    return -loglike

def EstimatorRd(transMat,obsMat,obsTrueMat,beliefIni,aData1,zData1,aData2,zData2,ratio):
    """Estimate the reward matrix by minimising ``LoglikeReward``.

    Only four of the six reward entries are free: in the row-major
    ``[sDim, aDim]`` layout, entry ``[0, 1]`` is held at 4 and entry
    ``[1, 1]`` at 0.  The free entries start at 1e-06, are bounded below by
    0, and are fitted with SciPy's ``trust-constr`` method (verbose level 3).

    Returns
    -------
    (rewardMatrix, objective) : the ``[sDim, aDim]`` matrix rebuilt from the
        optimiser's solution and the objective value at that solution.
    """
    sDim, aDim = 2, 3
    freeCount = sDim * aDim - 2

    def withFixedEntries(free):
        # Insert the two pinned entries between the free parameters.
        return [free[0], 4, free[1], free[2], 0, free[3]]

    def ObjectFunction(thetaOne):
        return LoglikeReward(np.array(withFixedEntries(thetaOne)),
                             transMat, obsMat, obsTrueMat,
                             beliefIni, aData1, zData1, aData2, zData2, ratio)

    startPoint = [1e-06] * freeCount
    lowerBounded = [(0, None)] * freeCount

    res1 = minimize(
        ObjectFunction,
        x0=startPoint,
        bounds=lowerBounded,
        method='trust-constr',
        options={'verbose': 3},
    )
    return np.reshape(withFixedEntries(res1.x), [sDim, aDim]), res1.fun

In [2]:
if __name__ == '__main__':
    # Seed the global NumPy generator so the simulated sample (and therefore
    # every estimate computed from it) is the same on each
    # Restart Kernel -> Run All.
    SEED = 0
    np.random.seed(SEED)

    # Reward, indexed [state, action].
    reward = np.array([
        [7, 4, 0],
        [3, 0, 7]
    ])
    # Transition probabilities, indexed [action, state, next_state].
    transition = np.array([
        [
            [0.8, 0.2],
            [0, 1]
        ],
        [
            [0.9, 0.1],
            [0, 1]
        ],
        [
            [1, 0],
            [0.4, 0.6]
        ]
    ])
    # Observation probabilities, indexed [next_state, observation].
    observation1 = np.array([
        [0.9, 0.1],
        [0.1, 0.9]
    ])
    observation2 = np.array([
        [0.7, 0.3],
        [0.3, 0.7]
    ])

    # observation2 is passed as obsMat and observation1 as obsTrueMat.
    agent = DynamicFunction(transition ,observation2,observation1, reward)
    # Arguments: (beliefSize, sampleSize, timeLength, sampleRatio)
    beliefIni,aData1,zData1,xData1,sData1,aData2,zData2,xData2,sData2= agent.GenerateData(11,10,10,0.7)
    #print(samples)

In [3]:
# Fit the reward matrix on the simulated sample and record the wall-clock
# duration of the fit in `elapsed_time`.
t1 = time.time()
resReward, resTrueFun2 = EstimatorRd(
    transition, observation2, observation1,
    beliefIni, aData1, zData1, aData2, zData2,
    0.7,
)
elapsed_time = time.time() - t1
print(resReward)

| niter |f evals|CG iter|  obj func   |tr radius |   opt    |  c viol  | penalty  |barrier param|CG stop|
|-------|-------|-------|-------------|----------|----------|----------|----------|-------------|-------|
|   1   |   5   |   0   | +2.8942e+03 | 1.00e+00 | 2.62e+02 | 0.00e+00 | 1.00e+00 |  1.00e-01   |   0   |
|   2   |  10   |   1   | +2.4686e+03 | 7.00e+00 | 2.87e+02 | 0.00e+00 | 1.00e+00 |  1.00e-01   |   2   |
|   3   |  15   |   2   | +5.1871e+02 | 1.40e+01 | 1.00e+02 | 0.00e+00 | 1.00e+00 |  1.00e-01   |   2   |
|   4   |  20   |   6   | +4.5888e+02 | 1.40e+01 | 1.46e+01 | 0.00e+00 | 1.00e+00 |  1.00e-01   |   1   |
|   5   |  25   |  10   | +4.1747e+02 | 2.41e+01 | 4.66e+00 | 0.00e+00 | 1.00e+00 |  1.00e-01   |   1   |
|   6   |  30   |  14   | +4.1626e+02 | 2.41e+01 | 2.13e+00 | 0.00e+00 | 1.00e+00 |  1.00e-01   |   1   |
|   7   |  35   |  18   | +4.1572e+02 | 2.41e+01 | 1.35e+00 | 0.00e+00 | 1.00e+00 |  1.00e-01   |   1   |
|   8   |  40   |  21   | +4.1552e+02 | 2.41e+

In [None]:
# Second timed fit; overwrites `resReward`, `resTrueFun2` and `elapsed_time`.
# NOTE(review): the trailing 0.5 is forwarded as `ratio`, which LoglikeReward
# never reads, so this fit uses exactly the same data split as the 0.7 call.
t2 = time.time()
resReward, resTrueFun2 = EstimatorRd(
    transition, observation2, observation1,
    beliefIni, aData1, zData1, aData2, zData2,
    0.5,
)
elapsed_time = time.time() - t2

(420,)
420


In [None]:
# EstimatorRd already returns the complete 2x3 reward matrix (with the two
# pinned entries in place), so it is reported as-is.  Re-packing it here would
# copy the pinned value at [0][1] over the fitted entry [0][2], and would
# change the result again every time the cell is re-run.

# Objective evaluated at the true reward, for comparison with the optimum.
# The last argument is the `ratio` placeholder, which the likelihood ignores.
trueFun2 = LoglikeReward(np.reshape(reward, -1), transition, observation2, observation1,
                         beliefIni, aData1, zData1, aData2, zData2, 0.5)

print('True:',reward)
print('Esti:',resReward)
print('Eorr:',(resReward-reward),'\n',np.linalg.norm(resReward-reward))
print('TrueObj:',trueFun2,'\n','EstiObj:',resTrueFun2)