In [18]:
%matplotlib inline


In [19]:
from trajectory_processor import data_initializer
from torch import dist
from critter_actor_critic_continuous import Agent
import numpy as np 


class Environment:
    """

    """
    def __init__(self, DATA_FILE_NAME, trajectory_length = 30, midpoints = 2, EPSILON_PERTURBATIONS = False):
        self.epsilon_pert = EPSILON_PERTURBATIONS
        self.data, self.index_lookup, self.subsequent, self.trajectory = data_initializer(DATA_FILE_NAME, trajectory_length = trajectory_length, midpoints = midpoints)
        if len(self.index_lookup) != len(self.data):
            print("Warning there are some conflicts in the data lookup")
            print("Attempting to correct this problem")
            raise("Just kidding, please edit the environment to fix this problem. I've removed it due to our data having nice qualities. Turns out I was wrong.")
            for x, y in self.data.items():
                y = tuple(y)
                if y in self.index_lookup:
                    if type(self.index_lookup[y]) is int:
                        self.index_lookup[y] = [self.index_lookup[y]]
                    self.index_lookup[y].append(x)
                else:
                    self.index_lookup[y] = x

    def make_start_state(self):
        """Sets implicit start_index and final_index for some number of trajectories"""
        self.start_index, self.final_index = self.trajectory()
        return self.data[self.start_index]

    def state_and_reward(self, current_state, picked_action):
        """
        Should take the current state and the action and return the new state and the reward.
        """
        #current_state, picked_action = current_state.detach().clone(), picked_action.detach().clone()
        lookup_able = tuple(current_state.detach().clone().numpy())
        index = self.index_lookup[lookup_able]
        next_index = self.subsequent[index]
        next_state = self.data[next_index]
        if next_index == self.final_index:
            returned_state = next_state
            done = True
        else:
            returned_state = next_state
            done = False
        #now calculate reward
        if self.epsilon_pert:
            guess = current_state + picked_action
        else:
            guess = picked_action
        reward = dist(guess, next_state)
        reward = reward.detach().numpy()
        return returned_state, np.log(1/(reward+1)), done



In [20]:
DATA_FILE_NAME = "trajectory_dict.pickle"

In [None]:
env = Environment(DATA_FILE_NAME)
agent = Agent(alpha=3e-3, beta=0.01, input_dims = 20)
n_games = 100000

fname = 'ACTOR_CRITIC_' + 'lunar_lander_' + \
        str(agent.alpha) + "_" + str(agent.beta) +\
        '_' + str(n_games) + 'games'

scores = []
for i in range(n_games):
    done = False
    observation = env.make_start_state()
    score = 0
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done = env.state_and_reward(observation ,action)
        score += -1/np.exp(reward)-1
        agent.learn(observation, reward, observation_, done)
        observation = observation_
    if i%200 == 0:
        agent.update_critic_target()
    scores.append(score)

    avg_score = np.mean(scores[-100:])
    print('episode ', i, 'score %.1f' % score,
            'average score %.1f' % avg_score, "    avg single score: ", score/30)



episode  0 score -239.3 average score -239.3     avg single score:  -7.976591984430949
episode  1 score -218.5 average score -228.9     avg single score:  -7.281810919443767
episode  2 score -244.4 average score -234.1     avg single score:  -8.146700692176818
episode  3 score -343.7 average score -261.5     avg single score:  -11.455325778325399
episode  4 score -219.0 average score -253.0     avg single score:  -7.300124406814575
episode  5 score -386.4 average score -275.2     avg single score:  -12.879901425043743
episode  6 score -326.5 average score -282.5     avg single score:  -10.881698870658875
episode  7 score -411.9 average score -298.7     avg single score:  -13.729179509480794
episode  8 score -231.8 average score -291.3     avg single score:  -7.728133074442545
episode  9 score -369.8 average score -299.1     avg single score:  -12.325749079386393
episode  10 score -346.5 average score -303.4     avg single score:  -11.551551119486492
episode  11 score -323.8 average sco

episode  93 score -363.9 average score -276.2     avg single score:  -12.129809625943501
episode  94 score -223.4 average score -275.7     avg single score:  -7.446920522054037
episode  95 score -298.0 average score -275.9     avg single score:  -9.93387790520986
episode  96 score -190.7 average score -275.0     avg single score:  -6.355033270517985
episode  97 score -194.6 average score -274.2     avg single score:  -6.485765854517619
episode  98 score -204.5 average score -273.5     avg single score:  -6.815554300944011
episode  99 score -193.8 average score -272.7     avg single score:  -6.459628280003866
episode  100 score -202.4 average score -272.3     avg single score:  -6.7470903396606445
episode  101 score -317.3 average score -273.3     avg single score:  -10.575813857714335
episode  102 score -195.6 average score -272.8     avg single score:  -6.519190224011739
episode  103 score -279.7 average score -272.2     avg single score:  -9.322274422645568
episode  104 score -305.2 

In [None]:
import matplotlib.pyplot as plt

x = [i+1 for i in range(n_games-100)]
plt.plot(scores)
plt.show()

In [None]:
y1 = scores
y2 = []
mean = lambda x: sum(x)/len(x)
for i in range(len(y1)):
    if i < 50:
        pass
    else:
        avg = mean(y1[(i-50):i])
        y2.append(avg)

In [None]:
plt.plot(y2)
plt.show()