agent.py
import torch
import torch.optim as optim
from torch.distributions import Categorical
import pickle
import os.path

from policy import Policy


class Agent():
    def __init__(self, number_of_frames, device, path=None):
        # PLE keycode for "flap"; None means "do nothing".
        self.UP = 119
        self.NONE = None
        # Indices of the two actions in the policy's output distribution.
        self.Y_UP = 0
        self.Y_NONE = 1
        self.max_pipes = 0
        self.device = device
        # Reuse a previously pickled model when a path is given, otherwise start fresh.
        self.model = self.load(path)
        if self.model is None:
            self.model = Policy(number_of_frames)
        self.model = self.model.to(device)
        #self.optimizer = optim.Adam(self.model.parameters(), lr=1e-2)
        self.optimizer = optim.SGD(self.model.parameters(), lr=0.085, momentum=0.5)

    def load(self, filename):
        if filename is not None and os.path.exists(filename):
            print("Loading model...")
            with open(filename, 'rb') as infile:
                model = pickle.load(infile)
            print("Model loaded :D!")
            return model
        return None

    def save(self, filename):
        print("Writing model to disk...")
        with open(filename, 'wb') as outfile:
            pickle.dump(self.model, outfile)

    def pick_action(self, x):
        # Sample an action index from the policy's categorical output and keep
        # its log-probability for the policy-gradient update.
        x = torch.from_numpy(x).float().to(self.device)
        probs = self.model(x)
        m = Categorical(probs)
        action = m.sample()
        log_prob = m.log_prob(action)
        return action.item(), log_prob, action

    def translate_o_to_action(self, o):
        # Map the sampled index to the environment's action value.
        if o == self.Y_UP:
            return self.UP
        return self.NONE

    def train(self, log_probs, rewards):
        # REINFORCE update: minimise -sum(log_prob * return) over the episode.
        self.optimizer.zero_grad()
        rewards_applied = -log_probs.float() * rewards.float()
        loss = rewards_applied.sum()
        loss.backward()
        self.optimizer.step()
        return loss
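
For context, a minimal sketch of how this Agent could be driven by a REINFORCE training loop. It is illustrative only: env_reset and env_step are hypothetical helpers standing in for a Flappy-Bird-style environment wrapper that returns stacked frames as a NumPy array plus per-step rewards, and NUM_FRAMES, the model path, the discount factor, and the return normalisation are assumptions, not part of the original repository.

# Hypothetical usage sketch (not part of agent.py).
import torch
from agent import Agent

NUM_FRAMES = 4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
agent = Agent(NUM_FRAMES, device, path="model.pkl")

for episode in range(1000):
    log_probs, rewards = [], []
    state, done = env_reset(), False          # hypothetical helper
    while not done:
        o, log_prob, _ = agent.pick_action(state)
        action = agent.translate_o_to_action(o)
        state, reward, done = env_step(action)  # hypothetical helper
        log_probs.append(log_prob)
        rewards.append(reward)

    # Discounted returns, normalised before the REINFORCE update.
    returns, G = [], 0.0
    for r in reversed(rewards):
        G = r + 0.99 * G
        returns.insert(0, G)
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)

    loss = agent.train(torch.stack(log_probs), returns.to(device))

agent.save("model.pkl")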