## Cartpole game with different function approximators

In [None]:
# Load required modules

import random
import gym
import numpy as np
from collections import deque
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVC
from sklearn.linear_model import Ridge, SGDRegressor, BayesianRidge
from sklearn.neighbors import KNeighborsRegressor

We have two agent models. One uses linear function approximators and the other uses a neural network approximator for the Q-learning-based RL problem. For the linear approximators, we use the whole replay memory to train the agent. For the neural-network-based agent, however, we use random mini-batch updates drawn from the replay memory.

### Linear model agent

In [None]:

class Q_Linear_approximator:
    """Epsilon-greedy Q-learning agent backed by a fit/predict regressor.

    ``model`` must expose ``fit(X, y)`` and ``predict(X)`` where ``y`` and the
    prediction have one column per action (e.g. a scikit-learn
    ``MultiOutputRegressor``). Until the first call to :meth:`replay` the
    model is treated as unfitted and every Q-value is taken to be zero.

    States are expected as arrays of shape ``(1, state_size)``.
    """

    def __init__(self, state_size, action_size, model):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=1000)  # replay buffer; oldest entries drop out
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.05
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001  # stored for reference; not read by this class
        self.model = model
        self.isFit = False  # becomes True after the first successful fit

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return an action index chosen epsilon-greedily for ``state``.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest predicted Q-value. Before the
        model has been fitted all Q-values are zero, so the greedy choice is
        action 0.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        if self.isFit:
            act_values = self.model.predict(state)
        else:
            act_values = np.zeros((1, self.action_size))
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        """Refit the model on the entire replay memory, then decay epsilon.

        ``batch_size`` is accepted for interface parity with the neural
        network agent but is not used: every stored transition is trained on.
        Does nothing when the memory is empty.
        """
        if not self.memory:
            return  # nothing to learn from; fitting on empty data would fail

        # Sampling the full length yields a shuffled copy of the memory.
        transitions = random.sample(self.memory, len(self.memory))
        count = len(transitions)

        states = np.vstack([t[0] for t in transitions])
        actions = np.array([t[1] for t in transitions], dtype=int)
        rewards = np.array([t[2] for t in transitions], dtype=float)
        next_states = np.vstack([t[3] for t in transitions])
        dones = np.array([t[4] for t in transitions], dtype=bool)

        if self.isFit:
            # Two batched predictions instead of two predict() calls per
            # transition; the row-wise results are the same.
            targets = np.array(self.model.predict(states), dtype=float)
            best_next = np.max(self.model.predict(next_states), axis=1)
            # Terminal transitions have no future return to bootstrap from.
            returns = np.where(dones, rewards, rewards + self.gamma * best_next)
        else:
            # No model yet: current Q-values are zero and the target is the
            # immediate reward only.
            targets = np.zeros((count, self.action_size))
            returns = rewards

        # Only the Q-value of the action actually taken is moved to its target.
        targets[np.arange(count), actions] = returns

        self.model.fit(states, targets)
        self.isFit = True

        # Epsilon decay lets the agent increasingly stick to learned actions;
        # the clamp keeps it from dropping below epsilon_min.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min,
                               self.epsilon * self.epsilon_decay)

### Neural network model agent

In [None]:
class Q_NN_approximator:
    """Epsilon-greedy Q-learning agent backed by a Keras-style network.

    ``model`` must expose ``predict(x, verbose=...)`` and
    ``fit(x, y, epochs=..., verbose=...)`` and output one Q-value per action.
    States are expected as arrays of shape ``(1, state_size)``.
    """

    def __init__(self, state_size, action_size, model):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=100000)  # replay buffer; oldest entries drop out
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001  # stored for reference; not read by this class
        self.model = model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return an action index chosen epsilon-greedily for ``state``."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # verbose=0 keeps newer TensorFlow releases from printing a progress
        # bar on every single-sample prediction.
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        """Train on a random mini-batch from memory, then decay epsilon.

        At most ``batch_size`` transitions are drawn without replacement; if
        fewer are stored, all of them are used. Each sampled transition is
        fitted individually (one ``fit`` call per sample), so later targets in
        the batch are computed with the already-updated network. Does nothing
        when there is nothing to sample.
        """
        sample_size = min(batch_size, len(self.memory))
        if sample_size <= 0:
            return

        minibatch = random.sample(self.memory, sample_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                # Bootstrap from the best predicted Q-value of the next state.
                next_q = self.model.predict(next_state, verbose=0)
                target = reward + self.gamma * np.amax(next_q[0])
            # Only the Q-value of the action actually taken is moved.
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)

        # Decay exploration, clamped so it never drops below epsilon_min.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min,
                               self.epsilon * self.epsilon_decay)

In [None]:
def NN_model(state_size, action_size, learning_rate=0.001):
    """Build and compile the neural network used for deep Q-learning.

    The network has two hidden ReLU layers of 24 units each and a linear
    output layer with one unit per action. It is compiled with mean squared
    error loss and the Adam optimizer.

    Args:
        state_size: Number of input features (length of a state vector).
        action_size: Number of output units (one Q-value per action).
        learning_rate: Adam step size; defaults to 0.001.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(24, input_dim=state_size, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(action_size, activation='linear'))
    # `lr` is a deprecated alias that newer Keras releases reject;
    # `learning_rate` is the supported keyword.
    model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate))
    return model

In [None]:
# main learning function
#
# Runs EPISODES episodes of CartPole-v1 with the selected agent. Every step is
# stored in the agent's replay memory and, once the memory holds more than
# batch_size transitions, the agent is trained after each non-terminal step.

env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

#NN agent model is created and agent activated
NN_learning=NN_model(state_size, action_size)
agent = Q_NN_approximator(state_size, action_size, NN_learning)

#choose one of the models below and activate the agent
#model=MultiOutputRegressor(LGBMRegressor(n_estimators=100, n_jobs=-1))
#model=MultiOutputRegressor(XGBRegressor())
#model=MultiOutputRegressor(Ridge(alpha=0.1))
#model=MultiOutputRegressor(KNeighborsRegressor(n_neighbors=10))
#agent = Q_Linear_approximator(state_size, action_size, model)


filename="CartpoleQ_approximate.txt" # save the results to a text file
done = False
batch_size = 32
EPISODES = 100

try:
    for e in range(EPISODES):
        # Newer gym releases return (observation, info) from reset(); older
        # ones return the observation alone. Accept both.
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        state = np.reshape(state, [1, state_size])
        for time in range(500):
            action = agent.act(state)
            # Newer gym releases return a 5-tuple with separate terminated /
            # truncated flags; older ones return a 4-tuple with a single done.
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result
            # Penalize the step that ends the episode by negating its reward.
            reward = reward if not done else -reward
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}".format(e, EPISODES, time, agent.epsilon))
#                with open(filename, "a") as f:
#                    f.write("Simulation {}: Total score {}\n".format(e, time))
                break

            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
finally:
    # Release the environment's resources even if training is interrupted.
    env.close()

We discuss our results in our final report rather than here. Please refer to the STA_208_Project_Report.pdf file in the main directory of the repo.