In [3]:
import numpy as np
import numpy.random as npr

from SwingyMonkey import SwingyMonkey
import matplotlib.pyplot as plt
%matplotlib inline
from collections import defaultdict

In [4]:
class QLearner(object):
    '''
    This agent jumps randomly.
    '''

    def __init__(self, alpha, gamma):
        self.last_state    = None
        self.last_action   = None
        self.last_reward   = None
        self.total         = 0 
        self.height = 400 
        self.alpha = alpha
        self.gamma = gamma
        self.Q                   = defaultdict(lambda: [0,0])
        self.state_action_counts = defaultdict(lambda: [0,0])
        self.iteration = 1

    def reset(self):
        self.iteration += 1
        self.last_state  = None
        self.last_action = None
        self.last_reward = None
    
    def convert_state(self, state):
        binsize = 50
        dist_thresh    = 50
        speed_thresh   = 0
        
        monkey_tree_top_dist = (state['monkey']['top'] - state['tree']['top']) / binsize   
        #monkey_tree_bot_dist = (state['monkey']['bot'] - state['tree']['bot'])/ binsize

        monkey_near_top      = (self.height - state['monkey']['top']) < dist_thresh
        monkey_near_bottom   = state['monkey']['bot']                 < dist_thresh
        monkey_near_tree     = state['tree']['dist']                  < dist_thresh
        monkey_fast          = state['monkey']['vel']                 > speed_thresh
        return (monkey_near_top, monkey_near_bottom, monkey_near_tree, monkey_fast, monkey_tree_top_dist)
    
    def update_Q(self, state, action):
        last_state_key    = self.convert_state(self.last_state)
        current_state_key = self.convert_state(state)

        max_q = max(self.Q[current_state_key])
        self.Q[last_state_key][action] += self.compute_q(last_state_key, action, max_q)
    
    def compute_q(self, state_key, action, max_q):
        return (self.alpha / self.state_action_counts[state_key][action]) \
                                                * (self.last_reward \
                                                + self.gamma * max_q \
                                                - self.Q[state_key][action])
        
        
    def action_callback(self, state):
        # first time we don't update
        if self.last_state:
            self.update_Q(state, self.last_action)
        state_key = self.convert_state(state)
        # ϵ Greedy
        # it is common to take ϵ = 1/t
        epsilon = 1.0  / self.iteration
        # take a random action with probability epsilon
        if npr.random() < epsilon:
            new_action = npr.randint(0, 1)
        else:
            max_q  = max(self.Q[state_key])
            new_action =  self.Q[state_key].index(max_q)
        self.state_action_counts[state_key][new_action] += 1

        self.last_action = new_action
        self.last_state  = state

        return self.last_action

    def reward_callback(self, reward):
        self.last_reward = reward
        self.total = self.total + reward

In [1]:
def run_games(learner, hist, iters = 50, t_len = 1):
    '''
    Driver function to simulate learning by having the agent play a sequence of games.
    '''
    
    for ii in range(iters):
        # Make a new monkey object.
        swing = SwingyMonkey(sound=False,                  # Don't play sounds.
                             text="Epoch %d" % (ii),       # Display the epoch on screen.
                             tick_length = t_len,          # Make game ticks super fast.
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)

        # Loop until you hit something.
        while swing.game_loop():
            if swing.score == 50:
                break
        # Save score history.
        hist.append(swing.score)
        # Reset the state of the learner.
        learner.reset()
        
    return

In [2]:
def run(alpha, gamma):
    agent = QLearner(alpha, gamma)
    scores = []
    run_games(agent, scores) 
    np.save('scores' + '_' + str(alpha) + '_' + str(gamma), np.array(scores))
    return scores

In [65]:
scores = run(0.75, 0.75)

In [66]:
sum(scores)

102

### GridSearch

In [5]:
all_scores = dict()
alphas = [0.25, 0.5, 0.75]
gammas = [0.25, 0.5, 0.75]
for alpha in alphas:
    for gamma in gammas:
        scores = run(alpha, gamma)
        all_scores[(alpha, gamma)] = scores

In [6]:
import pandas as pd
score_df = pd.DataFrame(columns=['alpha','gamma','total','iter_top'])
for i,(k,v) in enumerate(all_scores.iteritems()):
    alpha = k[0]
    gamma = k[1]
    total =  sum(v)
    iter_top =  v.index(max(v))
    score_df.loc[i] = (alpha, gamma, total, iter_top)

In [7]:
score_df = score_df.sort_values('total', ascending=False)
score_df.to_csv('score_df2')

In [8]:
score_df

Unnamed: 0,alpha,gamma,total,iter_top
7,0.25,0.75,765,18
1,0.5,0.5,673,26
4,0.5,0.75,673,17
3,0.75,0.5,670,24
5,0.75,0.25,426,37
2,0.75,0.75,332,38
6,0.25,0.25,309,14
8,0.5,0.25,192,19
0,0.25,0.5,158,18
