In [1]:
import os
import pandas as pd
import numpy as np
import random
import timeit
import sklearn
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler
import mdptoolbox, mdptoolbox.example

In [2]:
import os
import sys
from MDPModified import PolicyIteration, ValueIteration, QLearning

In [3]:
# Export the per-state policy/reward table to CSV (via pandas) and return the summed reward.
def writeDataFrame(R,policy,value,state,gamma,plot):
    """Tabulate the reward selected by ``policy`` in every state and save it as CSV.

    For each index ``i`` of ``policy`` the action ``j = int(policy[i])`` is
    looked up and ``R[j][i]`` is recorded, together with the running sum of
    those rewards and the running sum of ``gamma**i * R[j][i]``.

    Parameters
    ----------
    R : indexable as ``R[action][state]``. ``R[0]`` and ``R[1]`` are also
        written out as whole columns, so two actions are assumed.
    policy : sequence with one action per state; entries are passed to ``int``.
    value : per-state values, stored unchanged in the ``VALUE`` column.
    state : only used to build the output file name.
    gamma : base of the power that weights each reward in the ``Utility`` column.
    plot : string tag appended to the output file name.

    Returns
    -------
    The sum of ``R[int(policy[i])][i]`` over all ``i``.

    Side effects
    ------------
    Writes ``State_<state>_<plot>.csv`` (no index column) in the current
    working directory.
    """
    running_reward = 0
    running_utility = 0
    state_ids = []
    picked_rewards = []
    reward_totals = []
    utility_totals = []

    for idx in range(len(policy)):
        action = int(policy[idx])
        step_reward = R[action][idx]

        running_reward += step_reward
        # The exponent is the state index, exactly as in the accumulated sum above.
        running_utility += pow(gamma, idx) * step_reward

        state_ids.append(idx)
        picked_rewards.append(step_reward)
        reward_totals.append(running_reward)
        utility_totals.append(running_utility)

    # Column order is part of the exported file layout; keep it stable.
    columns = {
        'Reward': picked_rewards,
        'Cumulative reward': reward_totals,
        'Policy': policy,
        'REWARD(ACTION=0)': R[0],
        'REWARD(ACTION=1)': R[1],
        'VALUE': value,
        'Utility': utility_totals,
    }
    table = pd.DataFrame(index=state_ids, data=columns)

    target = "State_" + str(state) + "_" + plot + ".csv"
    table.to_csv(target, index=None, header=True)
    return running_reward

In [4]:
# Policy Iteration
def policy_iteration_states():
    """Run PolicyIteration on forest MDPs with 10, 100 and 400 states.

    For every size the transition/reward model comes from
    ``mdptoolbox.example.forest(S=size, r1=100, r2=50, p=0.1)`` and is solved
    with ``PolicyIteration`` (discount 0.9, ``eval_type=1``, ``epsilon=0.0001``,
    ``max_iter=1000``).

    Files written to the current working directory:
      * ``State_<size>_policy.csv``  - per-state table, via ``writeDataFrame``
      * ``Policy_delta_<size>.csv``  - second value returned by ``run()``
      * ``PolicyIteration.csv``      - one summary row per size

    The summary table is also printed. Nothing is returned.
    """
    forest_sizes = [10, 100, 400]
    discount = 0.9

    sizes_done = []
    elapsed = []
    n_iters = []
    reward_totals = []
    last_run_entries = []

    for size in forest_sizes:
        sizes_done.append(size)
        P, R = mdptoolbox.example.forest(S=size, r1=100, r2=50, p=0.1)
        solver = PolicyIteration(P, R, discount, eval_type=1, epsilon=0.0001, max_iter=1000)
        runs, delta = solver.run()

        elapsed.append(solver.time)
        n_iters.append(solver.iter)
        reward_totals.append(
            writeDataFrame(solver.R, solver.policy, solver.V, size, discount, 'policy')
        )
        # Keep only the final entry of `runs`
        # (presumably the last policy-evaluation iteration count - confirm in MDPModified).
        last_run_entries.append(runs[len(runs) - 1])
        pd.DataFrame(delta).to_csv(r'Policy_delta_' + str(size) + '.csv')

    summary = pd.DataFrame(
        index=sizes_done,
        data={
            'State': sizes_done,
            'Cumulative reward': reward_totals,
            'Iteration': n_iters,
            'Time': elapsed,
            'Vi Iters': last_run_entries,
        },
    )
    summary.to_csv(r'PolicyIteration.csv', index=None, header=True)
    print(summary)

In [5]:
# Run the policy-iteration experiment: writes its CSV files and prints the summary table.
policy_iteration_states()

Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  144
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyI

In [6]:
def value_iteration():
    """Run ValueIteration on forest MDPs with 10, 100 and 400 states.

    For every size the transition/reward model comes from
    ``mdptoolbox.example.forest(S=size, r1=100, r2=50, p=0.1)`` and is solved
    with ``ValueIteration`` (discount 0.9, ``epsilon=0.0001``, ``max_iter=1000``).

    Files written to the current working directory:
      * ``State_<size>_value.csv``  - per-state table, via ``writeDataFrame``
      * ``Value_delta_<size>.csv``  - whatever ``run()`` returns, as a DataFrame
      * ``ValueIteration.csv``      - one summary row per size

    The summary table is also printed. Nothing is returned.
    """
    forest_sizes = [10, 100, 400]
    discount = 0.9

    sizes_done = []
    elapsed = []
    n_iters = []
    reward_totals = []

    for size in forest_sizes:
        sizes_done.append(size)
        P, R = mdptoolbox.example.forest(S=size, r1=100, r2=50, p=0.1)
        solver = ValueIteration(P, R, discount, epsilon=0.0001, max_iter=1000)
        run_log = solver.run()

        elapsed.append(solver.time)
        n_iters.append(solver.iter)
        reward_totals.append(
            writeDataFrame(solver.R, solver.policy, solver.V, size, discount, 'value')
        )
        pd.DataFrame(run_log).to_csv(r'Value_delta_' + str(size) + '.csv')

    summary = pd.DataFrame(
        index=sizes_done,
        data={
            'State': sizes_done,
            'Cumulative reward': reward_totals,
            'Iteration': n_iters,
            'Time': elapsed,
        },
    )
    summary.to_csv(r'ValueIteration.csv', index=None, header=True)
    print(summary)

In [7]:
# Run the value-iteration experiment: writes its CSV files and prints the summary table.
value_iteration()

     State  Cumulative reward  Iteration      Time
10      10              100.0         11  0.000996
100    100              174.0         76  0.003989
400    400              474.0         76  0.004986


In [8]:
def QLearning_data(l= 0.2, e=0.9):
    """Run QLearning on forest MDPs with 10, 100 and 400 states and export the results.

    Parameters
    ----------
    l : forwarded to ``QLearning`` as ``lr``.
    e : forwarded to ``QLearning`` as ``epsilon``.

    For every size the model comes from
    ``mdptoolbox.example.forest(S=size, r1=100, r2=50, p=0.1)`` and is solved
    with discount 0.9. The learner's ``Q``, ``V`` and ``policy`` are printed.

    Files written to the current working directory (same names on every call,
    so earlier results are replaced):
      * ``State_<size>_QLearning.csv`` - per-state table, via ``writeDataFrame``
      * ``QLearning<size>.csv``        - whatever ``run()`` returns, as a DataFrame
      * ``QLearning.csv``              - one summary row per size

    The summary table is also printed. Nothing is returned.
    """
    forest_sizes = [10, 100, 400]
    discount = 0.9

    sizes_done = []
    elapsed = []
    reward_totals = []

    for size in forest_sizes:
        sizes_done.append(size)
        P, R = mdptoolbox.example.forest(S=size, r1=100, r2=50, p=0.1)

        learner = QLearning(P, R, discount, lr=l, epsilon=e)
        run_log = learner.run()
        for learned in (learner.Q, learner.V, learner.policy):
            print(learned)

        # A ValueIteration solver is built and run only so that its `R`
        # attribute can supply the reward table for the export below.
        reward_source = ValueIteration(P, R, discount, epsilon=0.0001, max_iter=1000)
        reward_source.run()

        elapsed.append(learner.time)
        reward_totals.append(
            writeDataFrame(reward_source.R, learner.policy, learner.V, size, discount, 'QLearning')
        )
        pd.DataFrame(run_log).to_csv(r'QLearning' + str(size) + '.csv')

    summary = pd.DataFrame(
        index=sizes_done,
        data={
            'State': sizes_done,
            'Cumulative reward': reward_totals,
            'Time': elapsed,
        },
    )
    summary.to_csv(r'QLearning.csv', index=None, header=True)
    print(summary)

In [9]:
# Uses the function defaults: l=0.2 (passed as lr) and e=0.9 (passed as epsilon).
QLearning_data()

[[  4.55409734   4.08390889]
 [  4.51813824   5.07415797]
 [  4.51266723   5.05667825]
 [  4.52201068   5.04037273]
 [  4.50413472   5.06880014]
 [  4.48211777   5.07788984]
 [  4.93301086   5.06343483]
 [ 18.9343435    4.76324718]
 [142.46285231   3.83775982]
 [278.69606263  52.84220776]]
(4.554097343407033, 5.074157970728546, 5.056678248668082, 5.040372732756568, 5.068800140994976, 5.0778898447756315, 5.063434827454406, 18.93434349536585, 142.46285231218593, 278.6960626251855)
(0, 1, 1, 1, 1, 1, 1, 0, 0, 0)
[[ 4.45120242  4.01816948]
 [ 4.46853565  5.01375577]
 [ 4.50725997  5.0116931 ]
 [ 4.521041    5.01900625]
 [ 4.46215529  5.03503372]
 [ 4.33187578  5.02967607]
 [ 2.02699993  4.92216294]
 [ 2.29928203  3.39616431]
 [ 0.          4.19190873]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          1.02521117]
 [ 0.          0.        ]
 [ 0.18130494  0.        ]
 [ 0.17874711  1.84139329]
 [ 0.          0.99303952]
 [ 0.18081714  0.        ]
 [ 0.          2.46212026]

In [10]:
# l=0.1 (passed as lr), e=0.9 (passed as epsilon).
# The output file names are fixed, so this run replaces the CSVs of the previous QLearning_data call.
QLearning_data(0.1,0.9)

[[ 12.24182506  10.41137444]
 [ 13.99278119  11.3557388 ]
 [ 16.33562935  11.28370964]
 [ 19.67980724  10.34976565]
 [ 27.15606366   9.17390066]
 [ 34.7208176    8.37963856]
 [ 66.2779863    5.57355732]
 [112.84923967   5.42832536]
 [186.0070359    5.53412399]
 [294.3935155   49.18439947]]
(12.24182505781392, 13.992781192179727, 16.335629345705556, 19.67980724448962, 27.1560636605293, 34.720817597079936, 66.27798630260476, 112.84923967362388, 186.0070358972907, 294.39351550212075)
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
[[4.52566681e+00 4.08261426e+00]
 [4.52852577e+00 5.07943430e+00]
 [4.55000478e+00 5.07879897e+00]
 [4.40962932e+00 5.07626794e+00]
 [4.27135998e+00 5.06946662e+00]
 [2.71033946e+00 4.94497478e+00]
 [1.62866819e-01 4.04965995e+00]
 [4.45352721e-01 1.37012059e+00]
 [8.60168851e-02 9.54968781e-01]
 [0.00000000e+00 1.73775671e+00]
 [0.00000000e+00 4.96888981e-01]
 [0.00000000e+00 0.00000000e+00]
 [1.26981328e-01 0.00000000e+00]
 [4.51735902e-02 9.58239185e-01]
 [0.00000000e+00 9.59

In [11]:
# l=0.1 (passed as lr), e=0.5 (passed as epsilon).
# The output file names are fixed, so this run replaces the CSVs of the previous QLearning_data call.
QLearning_data(0.1,0.5)

[[  4.43808524   3.9763538 ]
 [  4.47411296   4.98501294]
 [  4.48458063   4.98186707]
 [  4.52823906   4.9945341 ]
 [  4.41249031   5.03281021]
 [  3.60744777   4.97865661]
 [  5.32412191   4.58035353]
 [ 17.24163879   3.28345399]
 [ 65.19672067   3.68055987]
 [184.77083559  44.91464116]]
(4.438085243999019, 4.985012944318534, 4.981867074972558, 4.994534097960087, 5.0328102108512285, 4.97865661198034, 5.324121910460719, 17.241638789524185, 65.19672066729845, 184.77083559223723)
(0, 1, 1, 1, 1, 1, 0, 0, 0, 0)
[[4.50681898e+00 4.03677395e+00]
 [4.46367071e+00 5.03271531e+00]
 [4.47722450e+00 5.01868165e+00]
 [4.51329905e+00 5.00864584e+00]
 [4.32307491e+00 5.04164834e+00]
 [2.35803286e+00 4.93663168e+00]
 [1.62675569e+00 3.75402268e+00]
 [5.56022376e-01 2.05624600e+00]
 [4.10406435e-01 1.35312005e+00]
 [4.52906030e-02 5.02225551e-01]
 [3.27038756e-03 9.61421419e-01]
 [1.55316523e-01 0.00000000e+00]
 [3.63376396e-01 9.49746088e-01]
 [0.00000000e+00 5.08023157e-01]
 [0.00000000e+00 0.0000

In [12]:
# l=0.1 (passed as lr), e=0.1 (passed as epsilon).
# The output file names are fixed, so this run replaces the CSVs of the previous QLearning_data call.
QLearning_data(0.1,0.1)

[[ 4.41139382  4.00478618]
 [ 4.50112074  5.00967983]
 [ 4.45656716  5.00404667]
 [ 4.49036103  5.01118441]
 [ 4.43993451  5.00754538]
 [ 3.68906227  4.98746558]
 [ 4.17830046  4.6200339 ]
 [10.54617104  3.71116468]
 [26.60186639  3.071589  ]
 [85.95755878 44.9425481 ]]
(4.4113938243407365, 5.009679832421714, 5.004046670792694, 5.011184414721964, 5.0075453826607195, 4.9874655781418324, 4.620033904568747, 10.54617104104386, 26.601866393688493, 85.95755877883508)
(0, 1, 1, 1, 1, 1, 1, 0, 0, 0)
[[4.64687942e+00 4.17487643e+00]
 [4.55542335e+00 5.17022664e+00]
 [4.48686522e+00 5.15876230e+00]
 [4.54340654e+00 5.08221238e+00]
 [4.36922505e+00 5.09437166e+00]
 [2.00325503e+00 5.02665277e+00]
 [1.54397480e+00 3.28218315e+00]
 [3.59891633e-01 2.59564485e+00]
 [4.42996787e-01 1.70073439e+00]
 [0.00000000e+00 1.06123415e+00]
 [0.00000000e+00 4.97127522e-01]
 [0.00000000e+00 4.97134740e-01]
 [0.00000000e+00 0.00000000e+00]
 [4.05896091e-03 0.00000000e+00]
 [4.88754084e-01 0.00000000e+00]
 [0.0000

In [13]:
# Two runs with e=0.1 (passed as epsilon): first l=0.1, then l=0.9 (passed as lr).
# Both write the same file names, so only the CSVs of the second run remain on disk.
QLearning_data(0.1,0.1)
QLearning_data(0.9,0.1)

[[  4.48566736   4.00937804]
 [  4.41699942   5.01687871]
 [  4.53899348   5.02047508]
 [  4.51391349   5.04977522]
 [  4.22495937   5.04019247]
 [  5.17018688   4.78002355]
 [ 17.26314957   4.63503586]
 [ 44.18776825   3.00319901]
 [ 97.59115315   3.32364071]
 [194.23589327  49.59284482]]
(4.485667361200564, 5.0168787129189205, 5.0204750842545875, 5.049775224123192, 5.040192467134279, 5.170186878067842, 17.26314957086893, 44.187768253037376, 97.5911531481812, 194.23589327256792)
(0, 1, 1, 1, 1, 0, 0, 0, 0, 0)
[[4.56794299e+00 4.09544069e+00]
 [4.52725532e+00 5.08931144e+00]
 [4.51638999e+00 5.08436057e+00]
 [4.47511750e+00 5.06303487e+00]
 [4.24919070e+00 5.04314601e+00]
 [3.48922794e+00 4.89869827e+00]
 [1.53899602e+00 4.56306106e+00]
 [1.63964327e-01 2.06702346e+00]
 [4.10057106e-01 9.58855712e-01]
 [0.00000000e+00 9.62141884e-01]
 [0.00000000e+00 0.00000000e+00]
 [4.09858152e-01 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 4.70953144e-01]
 [0.00000000e+00 0.000

[[ 39.6858187   35.15693666]
 [ 33.01522286  35.32995735]
 [ 36.55836311  30.83718066]
 [ 54.62897392  35.17662198]
 [ 60.40127249  34.03172497]
 [125.99031284  26.65369756]
 [154.40920709  31.26283806]
 [141.86415401  61.4916594 ]
 [149.09179953  28.84819057]
 [240.21369446  85.76098307]]
(39.685818700906744, 35.329957352849725, 36.55836310555881, 54.62897391711917, 60.40127248726188, 125.99031284058124, 154.40920709149339, 141.86415400938296, 149.0917995346378, 240.2136944623193)
(0, 1, 0, 0, 0, 0, 0, 0, 0, 0)
[[ 4.73108425  4.23751312]
 [ 4.67776505  5.25690027]
 [ 4.71917539  5.24446414]
 [ 4.73041093  5.25395342]
 [ 4.73183789  5.25234469]
 [ 4.69553698  5.25779835]
 [ 4.30253221  5.21992293]
 [ 4.15273756  5.25779755]
 [ 4.24470954  0.        ]
 [ 3.72919733  5.2237645 ]
 [ 4.22028504  4.69497139]
 [ 4.21066612  5.20418971]
 [ 3.82652138  5.2475436 ]
 [ 3.82072432  4.72410048]
 [ 3.81840815  5.18399936]
 [ 0.          5.19591678]
 [ 0.          0.        ]
 [ 0.          4.733654

In [14]:
def QLearning_QTable(l, e, experiment=1):
    """Run QLearning on each forest size and export the learned values to Excel.

    Parameters
    ----------
    l : forwarded to ``QLearning`` as ``lr``.
    e : forwarded to ``QLearning`` as ``epsilon``.
    experiment : label embedded in the output file names so that repeated
        runs with the same ``l``/``e`` do not overwrite each other
        (default ``1``).

    For each size in ``[10, 100, 400]`` the model comes from
    ``mdptoolbox.example.forest(S=size, r1=100, r2=50, p=0.1)`` and is solved
    with discount 0.9. The learner's ``V`` attribute is written to
    ``V=<l>e=<e>_Exp<experiment>_<size>.xlsx`` in the current working
    directory. Nothing is returned.

    Notes
    -----
    * The previous version referenced the undefined names ``st`` and
      ``experiment`` and therefore raised ``NameError`` when called.
    * The file name is now built from the actual ``l`` and ``e`` instead of
      the hard-coded text ``V=0.1e=0.4``; for ``l=0.1, e=0.4`` the name is
      unchanged.
    * Despite the function name, it is ``V`` (not ``Q``) that is exported,
      as before.
    """
    states = [10, 100, 400]
    gamma = 0.9

    for s in states:
        P, R = mdptoolbox.example.forest(S=s, r1=100, r2=50, p=0.1)
        qi = QLearning(P, R, gamma, lr=l, epsilon=e)
        qi.run()

        values = pd.DataFrame(qi.V)
        file_name = ('V=' + str(l) + 'e=' + str(e) + '_Exp' + str(experiment)
                     + "_" + str(s) + '.xlsx')
        values.to_excel(file_name)
   
        

In [15]:
# l=0.1 (passed as lr), e=0.4 (passed as epsilon).
# NOTE(review): this calls QLearning_data, not the QLearning_QTable defined in the cell above - confirm that is intended.
QLearning_data(0.1,0.4)

[[  4.50481674   4.03563065]
 [  4.50777726   5.02263924]
 [  4.43913247   5.02553862]
 [  4.44461983   5.004326  ]
 [  4.47998455   5.00764102]
 [  5.0685455    5.00792858]
 [  8.91405772   4.5656152 ]
 [ 28.34986282   3.06273851]
 [ 83.04538933   4.27208037]
 [232.84606403  46.53428669]]
(4.5048167417853024, 5.022639239548159, 5.02553862229547, 5.004325997650926, 5.0076410178711175, 5.068545495934581, 8.914057723149408, 28.3498628156104, 83.04538932705954, 232.8460640286375)
(0, 1, 1, 1, 1, 0, 0, 0, 0, 0)
[[4.46545569e+00 3.99704251e+00]
 [4.40302423e+00 5.01936993e+00]
 [4.42644133e+00 5.02661935e+00]
 [4.48355080e+00 5.02513602e+00]
 [4.22630894e+00 5.02739288e+00]
 [3.13407190e+00 4.88839534e+00]
 [1.29979910e+00 3.90429569e+00]
 [5.42030165e-01 2.16197331e+00]
 [1.81455345e-01 2.62978383e+00]
 [1.88744224e-01 1.25794845e+00]
 [0.00000000e+00 1.17942839e+00]
 [4.09642783e-01 0.00000000e+00]
 [4.53738188e-02 0.00000000e+00]
 [4.06912176e-01 9.62756493e-01]
 [0.00000000e+00 0.000000

In [16]:
def compare_all():
    """Collect the run time of all three solvers for 10, 100 and 400 states.

    For every size the model comes from
    ``mdptoolbox.example.forest(S=size, r1=100, r2=50, p=0.1)``. With discount
    0.9 it is solved by ``ValueIteration`` (``epsilon=0.0001``,
    ``max_iter=1000``), ``PolicyIteration`` (``eval_type=1``, same epsilon and
    ``max_iter``) and ``QLearning`` (``lr=0.1``, ``epsilon=0.4``). Each solver's
    ``time`` attribute is recorded.

    The collected times are written to a single Excel file in the current
    working directory. Nothing is returned.
    """
    forest_sizes = [10, 100, 400]
    discount = 0.9

    timings = {
        'State': [],
        'Time Policy': [],
        'Time Value': [],
        'Time QLearning': [],
    }

    for size in forest_sizes:
        timings['State'].append(size)
        P, R = mdptoolbox.example.forest(S=size, r1=100, r2=50, p=0.1)

        # Build all three solvers first, then run them in the same order.
        value_solver = ValueIteration(P, R, discount, epsilon=0.0001, max_iter=1000)
        policy_solver = PolicyIteration(P, R, discount, eval_type=1, epsilon=0.0001, max_iter=1000)
        q_learner = QLearning(P, R, discount, lr=0.1, epsilon=0.4)

        value_solver.run()
        _policy_runs, _policy_delta = policy_solver.run()
        q_learner.run()

        timings['Time Value'].append(value_solver.time)
        timings['Time Policy'].append(policy_solver.time)
        timings['Time QLearning'].append(q_learner.time)

    comparison = pd.DataFrame(data=timings)
    # `size` still holds the last loop value here, so the file is named
    # after the final state count (ComparTime400.xlsx).
    comparison.to_excel(r'ComparTime' + str(size) + '.xlsx', index=None, header=True)

In [17]:
# Run all three solvers for every state size and write the timing comparison to Excel.
compare_all()

Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  144
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyIterative iterations:  103
Converged after _evalPolicyI