In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numba import jit
%matplotlib inline

In [None]:
# Load the logged transitions; later cells index the columns positionally
# (0 = state, 1 = action, 2 = reward, 3 = next state).
df = pd.read_csv(filepath_or_buffer="medium.csv")

In [None]:
class TDQ_Medium():

    """
    Tabular temporal-difference learner with eligibility traces, trained
    offline from a log of recorded transitions.

    The action-value table ``Q`` has shape ``(nstates, nactions)``. The TD
    target bootstraps from the action that was actually recorded in the next
    row of the data (a SARSA-style target), not from a max over actions.
    """

    def __init__(self, nactions, nstates, alpha, gamma,data,episode_length, ld):
        """
        nactions       -- number of actions (columns of Q)
        nstates        -- number of states (rows of Q)
        alpha          -- learning rate
        gamma          -- discount factor
        data           -- 2-D array indexed as data[row, col]; columns are read
                          as 0 = state, 1 = action (1-based), 2 = reward,
                          3 = next state
        episode_length -- number of consecutive rows that make up one episode
        ld             -- lambda, the eligibility-trace decay
        """
        self.Q = np.zeros((nstates, nactions)) # initial Q-values
        self.z = np.zeros((nstates, nactions)) # initial eligibility traces
        self.nactions = nactions
        self.nstates = nstates
        self.alpha = alpha # learning rate
        self.gamma = gamma # discount
        self.ld = ld # lambda
        self.data = data # input data containing all observed transitions
        self.episode_length = episode_length # length of each episode in the data

    def value(self, state, action):
        """
        Return the current Q-value for a given state and action.
        """
        return self.Q[state, action]

    def delta(self, pstate, paction, reward, state, action):
        """
        Return the TD error between the previous state-action pair and the
        current one:

            reward + gamma * Q[state, action] - Q[pstate, paction]
        """
        return reward + (self.gamma * self.value(state,action)) - self.value(pstate,paction)

    def train(self, pstate, paction, reward, state, action, delta = None):
        """
        Apply a single learning step for one transition.

        If ``delta`` is not supplied it is computed from the transition.
        The whole Q table is then moved along the eligibility traces and the
        traces are decayed by ``gamma * ld``.
        """
        if delta is None:
            delta = self.delta(pstate, paction, reward, state, action)

        # Accumulating trace: the visited pair's trace is incremented, not
        # reset to 1 (which would be a "replacing" trace).
        self.z[pstate,paction] += 1.0

        # update Q and z
        self.Q += self.alpha * delta * self.z
        self.z *= (self.gamma * self.ld)

    def learn(self, verbose = True):
        """
        Iterate over every complete episode in the data and train on its
        transitions in order.

        Episodes are consecutive groups of ``episode_length`` rows; trailing
        rows that do not fill a whole episode are ignored. When ``verbose``
        is true the index of each finished episode is printed.
        """
        episode_length = int(self.episode_length)

        # Each transition also reads the action from the following row, so at
        # most episode_length - 1 transitions fit in one episode. The original
        # implementation consumed 497 transitions for 499-row episodes, i.e.
        # episode_length - 2; that count is kept so existing results do not
        # change.
        # NOTE(review): the last usable transition of each episode is skipped
        # - confirm whether this is intentional.
        steps_per_episode = episode_length - 2
        n_episodes = int(len(self.data) / episode_length)

        # NOTE(review): the eligibility traces in self.z are not reset between
        # episodes, so traces carry over across episode boundaries - confirm
        # this is intended.
        for i in range(n_episodes):
            # First row of this episode; derived from episode_length rather
            # than a hard-coded stride so other episode lengths work.
            first_row = i * episode_length

            for step in range(steps_per_episode):
                row = first_row + step
                s = self.data[row, 0]
                a = self.data[row, 1] - 1          # actions are 1-based in the data
                r = self.data[row, 2]
                sp = self.data[row, 3]
                na = self.data[row + 1, 1] - 1     # action recorded in the next row
                self.train(s, a, r, sp, na)

            if verbose:
                print (i)

In [None]:
# Convert the loaded table to a plain integer NumPy array so rows and columns
# can be indexed positionally as data[row, col].
data = df.to_numpy().astype(int)

In [None]:
# Build the learner. The class defined in this file is TDQ_Medium; the
# previous call used the undefined name TDQ and raised NameError.
td = TDQ_Medium(
    nactions=7,
    nstates=50000,
    alpha=0.2,
    gamma=1,
    data=data,
    episode_length=499,
    ld=0.99,
)

In [None]:
# Train on every episode in the data, printing each episode index as it finishes.
td.learn(verbose=True)