In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import gym
import torch
from torch.nn import functional as F
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from torch.nn import Linear
import cartpole_data
import random
%matplotlib inline

https://pythonprogramming.net/openai-cartpole-neural-network-example-machine-learning-tutorial/ 


In [4]:
goal_steps = 1000
score_requirement = 100
initial_games = 100000

In [5]:
training_data = cartpole_data.initial_population(goal_steps,score_requirement,initial_games)

Average accepted score: 109.21428571428571
Median score for accepted scores: 106.0
Counter({102.0: 8, 106.0: 6, 101.0: 6, 103.0: 4, 104.0: 4, 100.0: 3, 119.0: 3, 107.0: 2, 120.0: 2, 105.0: 2, 109.0: 2, 116.0: 2, 115.0: 1, 113.0: 1, 138.0: 1, 127.0: 1, 130.0: 1, 132.0: 1, 118.0: 1, 124.0: 1, 108.0: 1, 122.0: 1, 111.0: 1, 121.0: 1})


Split into training testing and validation data? 

In [6]:
class make_Dataset(Dataset):
    def __init__(self, mode):

        # read in trainig data

        observations = np.array([training_data[i][0] for i in range(len(training_data))])
        actions = np.array([training_data[i][1] for i in range(len(training_data))])

        observations = torch.tensor(observations , dtype = torch.float)
        actions = torch.tensor(actions , dtype = torch.float)
        realizations = len(observations)
        if   mode=='train':  
            size, offset = int(realizations*0.70), int(realizations*0.00)
        elif mode=='valid':  
            size, offset = int(realizations*0.15), int(realizations*0.70)
        elif mode=='test':   
            size, offset = int(realizations*0.15), int(realizations*0.85)
        else:    raise Exception('Wrong name!')


        # define size, input and output matrices
        self.size   = size
        self.input  = torch.zeros((size, 4), dtype=torch.float) # I think this is right.. 
        self.output = torch.zeros((size, 2),dtype=torch.float)

        # fill matrices with the data
        self.input[:] = observations[offset:offset+size]
        self.output[:]  = actions[offset:offset+size]

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        return self.input[idx], self.output[idx]


In [7]:
def create_datasets(batch_size):
    
    train_Dataset = make_Dataset('train')
    train_loader  = DataLoader(dataset=train_Dataset, batch_size=batch_size, 
                               shuffle=True)

    valid_Dataset = make_Dataset('valid')
    valid_loader  = DataLoader(dataset=valid_Dataset, batch_size=batch_size, 
                               shuffle=True)

    test_Dataset  = make_Dataset('test')
    test_loader   = DataLoader(dataset=test_Dataset,  batch_size=batch_size, 
                               shuffle=True)

    return train_loader , valid_loader , test_loader

In [8]:
train_loader , valid_loader , test_loader  = create_datasets(batch_size = 16)

# Input = Previous Observation 

# Output = Action to take

This is based entirely on random training and seeing which actions perform best, nothing to do with Q learning (from what I can tell). If I wanted this to include some future discount of rewards, that information would need to be saved as a time series to predict future value versus current. 

For the model, I'm going to use Pytorch (not Lightning) :-)

In [9]:
class CartpoleModel(nn.Module):
    """
    Simple 2 hidden layer network to learn based on observation, what move to make next. 
    
    Output's a softmax so it is a probability :)
    """
    def __init__(self):
        super(CartpoleModel, self).__init__()

        self.fc1 = Linear(4, 128)
        self.fc2 = Linear(128, 128)
        self.fc3 = Linear(128, 2)


    def forward(self, x):

        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        output = F.softmax(x,dim=1)
        return output
    
 

In [10]:

#initialize model parameters
epochs       = 15
learning_rate = 1e-3
# weight_decay = 1e-5 

In [11]:
model = CartpoleModel()

In [12]:
total_params = sum(p.numel() for p in model.parameters())
print('Total number of parameters in the network: %d'%total_params)

Total number of parameters in the network: 17410


In [13]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
print("device " , device)

device  cpu


In [14]:
# best-model fname
f_best_model = 'BestModel_Cartpole.pt'


In [15]:
# define the loss function and optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 


In [16]:

# load best-model in case it exists
if os.path.exists(f_best_model):  
    print('loading best model...')
    model.load_state_dict(torch.load(f_best_model))

loading best model...


In [17]:
# do validation with the best-model and compute loss
model.eval() 
count, best_loss = 0, 0.0
with torch.no_grad():
    for observations, actions_true  in valid_loader:
        actions_valid = model(observations)
        error    = criterion(actions_valid, actions_true)
        best_loss += error.cpu().numpy()
        count += 1
best_loss /= count
print('validation error = %.3e'%best_loss)


validation error = 6.608e-01


In [18]:
for epoch in range(epochs):
    # TRAIN
    model.train()
    count, loss_train = 0, 0.0
    for observations, actions_true in train_loader:
        # Forward Pass
        optimizer.zero_grad()
        actions_pred = model(observations)
        loss    = criterion(actions_pred, actions_true)
        loss_train += loss.detach().numpy()
        
        # Backward Prop
        loss.backward()
        optimizer.step()
        
        count += 1
        
    loss_train /= count
    
    
    # VALID
    model.eval() 
    count, loss_valid = 0, 0.0
    with torch.no_grad():
        for observations, actions_true  in valid_loader:
            actions_pred = model(observations)
            error    = criterion(actions_pred, actions_true)   
            loss_valid += error.cpu().numpy()
            count += 1
    loss_valid /= count
    
    # TEST
    model.eval() 
    count, loss_test = 0, 0.0
    with torch.no_grad():
        for observations, actions_true  in test_loader:
          
            actions_pred = model(observations)
            error    = criterion(actions_pred, actions_true) 
            loss_test += error.cpu().numpy()
            count += 1
    
    # Save Best Model 
    if loss_valid<best_loss:
        best_loss = loss_valid
        torch.save(model.state_dict(), f_best_model)
        print('%03d %.4e %.4e %.4e (saving)'\
              %(epoch, loss_train, loss_valid, loss_test))    
        
    else:
        print('%03d %.4e %.4e %.4e'%(epoch, loss_train, loss_valid, loss_test))
    

000 6.5692e-01 6.5128e-01 3.6591e+01 (saving)
001 6.4591e-01 6.4820e-01 3.6667e+01 (saving)
002 6.4361e-01 6.4794e-01 3.6761e+01 (saving)
003 6.4225e-01 6.4450e-01 3.6678e+01 (saving)
004 6.4128e-01 6.4461e-01 3.6754e+01
005 6.3927e-01 6.4886e-01 3.6878e+01
006 6.3784e-01 6.4523e-01 3.6746e+01
007 6.3803e-01 6.4645e-01 3.6857e+01
008 6.3598e-01 6.4715e-01 3.6776e+01
009 6.3609e-01 6.4798e-01 3.7249e+01
010 6.3521e-01 6.5087e-01 3.6983e+01
011 6.3615e-01 6.4736e-01 3.7094e+01
012 6.3487e-01 6.4946e-01 3.6854e+01
013 6.3491e-01 6.4854e-01 3.7031e+01
014 6.3347e-01 6.5200e-01 3.7141e+01


# See how this best model does in the wild.  

In [19]:
model_best = CartpoleModel()
model_best.load_state_dict(torch.load(f_best_model))
#load best model in now 

<All keys matched successfully>

In [20]:
scores = []
choices = []
env = gym.make("CartPole-v0")
env.reset()
for each_game in range(10):
    score = 0
    game_memory = []
    prev_obs = []
    env.reset()
    for _ in range(goal_steps):
        env.render()

        if len(prev_obs)==0:
            action = random.randrange(0,2)
        else:
            action = np.argmax(model_best.forward(torch.Tensor(prev_obs).view(1,4)).detach().numpy())

        choices.append(action)
                
        new_observation, reward, done, info = env.step(action)
        prev_obs = new_observation
        game_memory.append([new_observation, action])
        score+=reward
        if done: break
            
    scores.append(score)
    
print('Best Score:',max(scores))
print('Worst Score:',min(scores))

print('Average Score:',sum(scores)/len(scores))
print('choice 1:{}  choice 0:{}'.format(choices.count(1)/len(choices),choices.count(0)/len(choices)))
print(score_requirement)

Best Score: 200.0
Worst Score: 200.0
Average Score: 200.0
choice 1:0.4985  choice 0:0.5015
100


What's interesting is that it looks like this trains on data that never reaches 200, but can still get that far. 