# AI Pong project
### M.L. Hofsté (914714) and S. Deloddere

In [2]:
import torch
import torch.nn.functional as F

class Policy(torch.nn.Module):
    """Two-layer fully connected network mapping a state to action probabilities.

    Args:
        state_space: Number of input features of one observation.
        action_space: Number of discrete actions (size of the output).
    """

    def __init__(self, state_space, action_space):
        super().__init__()
        hidden = 256
        self.fc1 = torch.nn.Linear(state_space, hidden)
        self.fc2 = torch.nn.Linear(hidden, action_space)
        self.init_weights()

    def init_weights(self):
        """Initialise every linear layer with N(0, 1) weights and zero biases."""
        for m in self.modules():
            if isinstance(m, torch.nn.Linear):
                torch.nn.init.normal_(m.weight)
                torch.nn.init.zeros_(m.bias)

    def forward(self, x):
        """Return a probability distribution over actions for input ``x``.

        Args:
            x: Tensor whose last dimension has ``state_space`` features.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size ``action_space`` that sums to 1.
        """
        x = F.relu(self.fc1(x))
        logits = self.fc2(x)
        # Apply the output layer exactly once and normalise with the functional
        # softmax; torch.nn.Softmax(...) would only build a module object
        # instead of computing probabilities.
        return F.softmax(logits, dim=-1)


class Agent:
    """Agent that owns a ``Policy`` network, its Adam optimizer and a display name."""

    def __init__(self, env, player_id=1):
        # Prefer the GPU when one is available, otherwise stay on the CPU.
        if torch.cuda.is_available():
            device_name = 'cuda'
        else:
            device_name = 'cpu'
        self.device = torch.device(device_name)

        # Network input size is the last observation dimension; output size is
        # the number of discrete actions of the environment.
        n_inputs = env.observation_space.shape[-1]
        n_actions = env.action_space.n
        network = Policy(n_inputs, n_actions)
        self.policy = network.to(self.device)

        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=1e-4)
        self.player_id = player_id
        self.name = 'Pippi-O'

    def get_name(self):
        """Interface function returning the agent's name."""
        return self.name

    def load_model(self, location):
        """Interface function loading trained policy weights from ``location``.

        The stored tensors are mapped onto ``self.device`` while loading.
        """
        state_dict = torch.load(location, map_location=self.device)
        self.policy.load_state_dict(state_dict)
        self.policy = self.policy.to(self.device)

    def save_model(self, location):
        """Interface function writing the policy's state dict to ``location``."""
        weights = self.policy.state_dict()
        torch.save(weights, location)

    def get_action(self, ob=None):
        """Interface function meant to return the action for observation ``ob``.

        Currently a placeholder: ``ob`` is ignored and ``None`` is returned.
        """
        return None

    def reset(self):
        """Interface function meant to reset the agent; currently does nothing."""
        return None

In [4]:
import matplotlib.pyplot as plt
import gym
import numpy as np
import wimblepong

# Load the environment
# Create the two-player Wimblepong environment and an agent sized from its
# observation and action spaces.
env = gym.make("WimblepongMultiplayer-v0")
player = Agent(env)

# Round-trip the (still untrained) policy weights through a file to check
# that saving and loading work together.
location = 'policy.pth'
player.save_model(location)
player.load_model(location)

In [5]:
# Play `episodes` games between `player` and a wimblepong.SimpleAi opponent
# (registered as player 2), counting how often player 1 receives reward 10.
episodes = 1000
opponent = wimblepong.SimpleAi(env, 2)
env.set_names(player.get_name(), opponent.get_name())

win1 = 0
for i in range(episodes):
    done = False
    while not done:
        # Both sides pick an action, then the environment advances one step.
        action1 = player.get_action()
        action2 = opponent.get_action()
        (ob1, ob2), (rew1, rew2), done, info = env.step((action1, action2))
        # Debugging aid: the raw frames can be dumped to disk with
        #   Image.fromarray(ob1).save("ob1.png")
        #   Image.fromarray(ob2).save("ob2.png")
        win1 += 1 if rew1 == 10 else 0
        env.render()
        if done:
            # Episode finished: prepare the environment for the next game.
            observation = env.reset()

In [56]:
# Per-episode statistics, kept for the reward plot at the end.
reward_history, timestep_history = [], []
average_reward_history = []

# Run actual training
for episode_number in range(train_episodes):
    reward_sum, timesteps = 0, 0
    done = False
    # Reset the environment and observe the initial state
    observation = env.reset()

    # Loop until the episode is over
    while not done:
        # Get action from the agent
        action, action_probabilities = agent.get_action(observation)
        previous_observation = observation

        # Perform the action on the environment, get new state and reward.
        # .cpu() is a no-op for CPU tensors and is required before .numpy()
        # when the action tensor lives on a CUDA device.
        observation, reward, done, info = env.step(action.detach().cpu().numpy())

        # Store action's outcome (so that the agent can improve its policy)
        agent.store_outcome(previous_observation, observation, action_probabilities, reward, done)

        # Store total episode reward
        reward_sum += reward
        timesteps += 1

    if print_things:
        print("Episode {} finished. Total reward: {:.3g} ({} timesteps)"
              .format(episode_number, reward_sum, timesteps))

    # Bookkeeping (mainly for generating plots)
    reward_history.append(reward_sum)
    timestep_history.append(timesteps)
    # Average over at most the last 100 episodes. Slicing already clamps to
    # the available history, so no special case is needed for early episodes
    # and the window never exceeds 100 entries.
    avg = np.mean(reward_history[-100:])
    average_reward_history.append(avg)

    # Let the agent do its magic (update the policy)
    agent.update_policy(episode_number)

# Training is finished - plot rewards
plt.plot(reward_history)
plt.plot(average_reward_history)
plt.legend(["Reward", "100-episode average"])
plt.title("AC reward history (episodic)")
plt.show()
print("Training finished.")