# DIAMBRA AI Agent

Based on the [Stable Baselines](https://stable-baselines.readthedocs.io/en/master/) Reinforcement Learning library.

Uses the [Proximal Policy Optimization](https://arxiv.org/pdf/1707.06347.pdf) (PPO) algorithm.

Training this agent takes about 6 weeks of 24/7 running on a low- to mid-range workstation (i5 processor, 16 GB RAM, 4 GB Nvidia GPU) to reach about 70M steps and a mean cumulative reward of about 14.

In [None]:
import sys, os
import time
import cv2
import numpy as np

from diambra_environment.diambraGym import diambraGym
from diambra_environment.makeDiambraEnvSB import make_diambra_env



In [None]:
# Base directory of the DIAMBRA environment, built from the current working
# directory (os.path.abspath("")) plus two parent-directory hops.
repo_base_path = os.path.join(os.path.abspath(""), "../../")

# Settings forwarded to the DIAMBRA environment.
diambraEnvKwargs = {
    "gameId": "doapp",
    "roms_path": os.path.join(repo_base_path, "roms/"),  # Path to roms
    "mame_diambra_step_ratio": 6,
    "render": True,
    "lock_fps": False,  # Locks to 60 FPS
}

# Sound is switched on only when both FPS locking and rendering are enabled.
diambraEnvKwargs["sound"] = diambraEnvKwargs["lock_fps"] and diambraEnvKwargs["render"]

diambraEnvKwargs.update({
    # 1P
    "player": "Random",
    # Game specific
    "difficulty": 3,
    "characters": [["Kasumi", "Random"], ["Kasumi", "Random"]],
    "charOutfits": [2, 2],
})

In [None]:
# Output locations for TensorBoard logs and model checkpoints; both names
# embed the game id.
tensorBoardFolder, modelFolder = (
    template.format(diambraEnvKwargs["gameId"])
    for template in (
        "./{}_ppo2_TB_CustCnn_bL_d_noComb/",
        "./{}_ppo2_Model_CustCnn_bL_d_noComb/",
    )
)

# Only the checkpoint folder is created here.
os.makedirs(modelFolder, exist_ok=True)

In [None]:
import tensorflow as tf

from diambra_environment.customPolicies.utils import linear_schedule, AutoSave
from diambra_environment.customPolicies.f import *

from stable_baselines import PPO2

In [None]:
# Keyword arguments forwarded to the DIAMBRA gym wrapper.
# The two-element lists presumably hold one entry per player - TODO confirm.
diambraGymKwargs = {
    "P2brain": None,
    "continue_game": 0.0,
    "show_final": False,
    "gamePads": [None, None],
    "actionSpace": ["discrete", "multiDiscrete"],
    "attackButCombinations": [False, False],
    "actBufLen": 12,
}

In [None]:
# Keyword arguments for the environment wrappers (observation resize,
# reward handling, frame stacking, scaling).
wrapperKwargs = {
    "hwc_obs_resize": [256, 256, 1],
    "normalize_rewards": True,
    "clip_rewards": False,
    "frame_stack": 6,
    "dilation": 1,
    "scale": True,
    "scale_mod": 0,
}

In [None]:
# Names of the additional observations requested from the environment
# (passed as `key_to_add` when the environment is built).
keyToAdd = [
    "actionsBuf",
    "ownHealth",
    "oppHealth",
    "ownPosition",
    "oppPosition",
    "stage",
    "character",
]

In [None]:
# Number of environments run in parallel during training
numEnv = 8

# Time-dependent seed for the environments: an integer in 500-1499 derived
# from the sub-second part of the clock.
timeDepSeed = int((time.time() - int(time.time() - 0.5)) * 1000)

# Build the vectorized training environment (one subprocess per environment)
envId = "Train"
env = make_diambra_env(diambraGym, env_prefix=envId, num_env=numEnv, seed=timeDepSeed,
                       diambra_kwargs=diambraEnvKwargs,
                       diambra_gym_kwargs=diambraGymKwargs,
                       wrapper_kwargs=wrapperKwargs,
                       key_to_add=keyToAdd, use_subprocess=True)

In [None]:
# Show the observation space together with its dtype and bounds
obsSpace = env.observation_space
print("Obs_space = ", obsSpace)
print("Obs_space type = ", obsSpace.dtype)
print("Obs_space high = ", obsSpace.high)
print("Obs_space low = ", obsSpace.low)

In [None]:
# Show the action space; its size is read from `nvec` for a multiDiscrete
# space and from `n` otherwise.
actSpace = env.action_space
print("Act_space = ", actSpace)
print("Act_space type = ", actSpace.dtype)
isMultiDiscrete = diambraGymKwargs["actionSpace"][0] == "multiDiscrete"
print("Act_space n = ", actSpace.nvec if isMultiDiscrete else actSpace.n)

In [None]:
# Policy parameters

# Action counts as reported by the first environment
n_actions = env.get_attr("n_actions")[0][0]
actBufLen = diambraGymKwargs["actBufLen"]

policyKwargs = {
    # Size of the additional-info input: the action buffer contributes
    # actBufLen entries per action of both kinds, plus the extra keys
    # minus two (original note: "No Char Info").
    "n_add_info": actBufLen * (n_actions[0] + n_actions[1]) + len(keyToAdd) - 2,
    "layers": [64, 64],
}

In [None]:
# PPO hyper-parameters

# Discount factor
setGamma = 0.94

# Learning-rate schedule built by linear_schedule from the two given values
# (alternative kept for reference)
setLearningRate = linear_schedule(2.5e-4, 2.5e-6)
#setLearningRate = linear_schedule(5.0e-5, 2.5e-6)

# The policy and value-function clip ranges share one schedule object
# (alternative kept for reference)
setClipRange = setClipRangeVf = linear_schedule(0.15, 0.025)
#setClipRange = linear_schedule(0.05, 0.025)

In [None]:
# Create a new PPO2 agent on the training environment
model = PPO2(
    CustCnnPolicy,
    env,
    policy_kwargs=policyKwargs,
    gamma=setGamma,
    n_steps=128,
    nminibatches=4,
    noptepochs=4,
    learning_rate=setLearningRate,
    cliprange=setClipRange,
    cliprange_vf=setClipRangeVf,
    tensorboard_log=tensorBoardFolder,
    verbose=1,
)

# ...or, instead, resume from a previously saved agent:
#model = PPO2.load(os.path.join(modelFolder, "20M"), env=env, tensorboard_log=tensorBoardFolder,
#                  policy_kwargs=policyKwargs, gamma = setGamma, learning_rate=setLearningRate,
#                  cliprange=setClipRange, cliprange_vf=setClipRangeVf)

In [None]:
# Sanity check: show the discount factor stored on the model
print("Model discount factor = ", model.gamma, sep=" ")

In [None]:
# Total number of training steps for this run
time_steps = 20 * 10**6

# Autosave callback with a check frequency of one million steps; its save
# path is the model folder plus the "0M_" prefix
autoSaveCallback = AutoSave(
    check_freq=10**6,
    numEnv=numEnv,
    save_path=modelFolder + "0M_",
)

# Run the training
model.learn(total_timesteps=time_steps, callback=autoSaveCallback)

In [None]:
# Store the trained agent under the name "20M" inside the model folder
finalModelPath = os.path.join(modelFolder, "20M")
model.save(finalModelPath)

## Evaluation

In [None]:
# Initialize a new evaluation environment
diambraEnvKwargs["render"] = True

# Time-dependent seed for this environment: an integer in 500-1499 derived
# from the sub-second part of the clock.
timeDepSeed = int((time.time() - int(time.time() - 0.5)) * 1000)

# Single environment, same kwargs as used for training
envId = "Test"
env = make_diambra_env(diambraGym, env_prefix=envId, num_env=1, seed=timeDepSeed,
                       diambra_kwargs=diambraEnvKwargs, diambra_gym_kwargs=diambraGymKwargs,
                       wrapper_kwargs=wrapperKwargs, key_to_add=keyToAdd)

In [None]:
# Run the agent for a fixed number of episodes and record the cumulative
# reward of each one.
observation = env.reset()

cumulativeEpRew = 0.0      # reward accumulated in the current episode
cumulativeEpRewAll = []    # per-episode cumulative rewards
cumulativeTotRew = 0.0     # reward accumulated over all episodes

maxNumEp = 10
currNumEp = 0

try:
    while currNumEp < maxNumEp:

        # predict() returns a tuple; its first element is the action
        action, states = model.predict(observation, deterministic=True)
        #action_prob = model.action_probability(observation, states)
        #print("Action probabilities = ", action_prob)
        #print("Max action = ", np.argmax(action_prob))
        #print("Action = ", action)

        observation, reward, done, info = env.step(action)

        cumulativeEpRew += reward

        if np.any(done):
            currNumEp += 1
            print("Ep. # = ", currNumEp)
            print("Ep. Cumulative Rew # = ", cumulativeEpRew)
            cumulativeEpRewAll.append(cumulativeEpRew)
            cumulativeTotRew += cumulativeEpRew
            cumulativeEpRew = 0.0
finally:
    # Always close the environment, even if the loop is interrupted or fails
    env.close()

In [None]:
# Summary of the evaluation: raw per-episode rewards, then mean and std
print("Cumulative reward = ", cumulativeEpRewAll)
for statLabel, statFn in (
    ("Mean cumulative reward = ", np.mean),
    ("Std cumulative reward = ", np.std),
):
    print(statLabel, statFn(cumulativeEpRewAll))