In [1]:
from typing import Dict
from absl import app
from absl import flags
from absl import logging

import android_env
from dm_env import specs
import numpy as np
import os


In [2]:
# Load the Android environment with android_env.load().
# This starts the emulator.
env = android_env.load(
    task_path=f'{os.curdir}/tasks/catch/catch_the_ball_default.textproto',
    avd_name='my_avd',
    android_avd_home='~/.android/avd',
    android_sdk_root='~/Android/Sdk',
    emulator_path='~/Android/Sdk/emulator/emulator',
    adb_path='~/Android/Sdk/platform-tools/adb',
    run_headless=False,
)

In [3]:
from android_env.wrappers.discrete_action_wrapper import DiscreteActionWrapper
from android_env.wrappers.image_rescale_wrapper import ImageRescaleWrapper
from android_env.wrappers.float_pixels_wrapper import FloatPixelsWrapper
from android_env.wrappers.tap_action_wrapper import TapActionWrapper

def make_env(env):
    """Wrap ``env`` with image rescaling and a discrete touch-grid action space.

    After each wrapper is applied, the wrapped environment's action spec and
    observation spec are printed beneath a 128-character dashed separator.

    Args:
        env: The environment to wrap; handed to ImageRescaleWrapper first.

    Returns:
        The environment wrapped by ImageRescaleWrapper and then by
        DiscreteActionWrapper.
    """
    def _show_specs(wrapped):
        # Separator, action spec, blank line, observation spec.
        print('-'*128)
        print(wrapped.action_spec())
        print()
        print(wrapped.observation_spec())

    rescaled = ImageRescaleWrapper(env, zoom_factors=(0.0625, 0.0745), grayscale=True)
    _show_specs(rescaled)

    # The TapActionWrapper stage is currently disabled:
    # env = TapActionWrapper(env, touch_only=True)

    # action touch grid: 54 blocks
    discretized = DiscreteActionWrapper(rescaled, (6, 9), redundant_actions=False)
    _show_specs(discretized)

    return discretized



In [4]:
# Inspect the task's observation and action specs, and cache them for later cells.
# NOTE(review): `env` is rebound to its wrapped version here, so re-running this
# cell without first re-running the load cell applies the wrappers a second time.
env = make_env(env)
action_spec = env.action_spec() 
obs_spec = env.observation_spec()

--------------------------------------------------------------------------------------------------------------------------------
{'action_type': DiscreteArray(shape=(), dtype=int32, name=action_type, minimum=0, maximum=2, num_values=3), 'touch_position': BoundedArray(shape=(2,), dtype=dtype('float32'), name='touch_position', minimum=[0. 0.], maximum=[1. 1.])}

{'pixels': Array(shape=(120, 80, 1), dtype=dtype('uint8'), name='pixels'), 'timedelta': Array(shape=(), dtype=dtype('int64'), name='timedelta'), 'orientation': Array(shape=(4,), dtype=dtype('uint8'), name='orientation')}
--------------------------------------------------------------------------------------------------------------------------------
{'action_id': DiscreteArray(shape=(), dtype=int32, name=action_id, minimum=0, maximum=55, num_values=56)}

{'pixels': Array(shape=(120, 80, 1), dtype=dtype('uint8'), name='pixels'), 'timedelta': Array(shape=(), dtype=dtype('int64'), name='timedelta'), 'orientation': Array(shape=(4,), dt

In [5]:
# Reset the environment. The episode starts over from the beginning.
# _ = env.reset()

In [6]:
# print(_)


In [7]:
# step_type, reward, time_delta, obs = env.reset()

In [8]:
# print(step_type)
# print(reward)
# print(time_delta)
# print(obs)


In [9]:
# print(len(_))
# print(_[0])
# print(_[1])
# print(_[2])

# print(_.reward)
# print(_.discount)
# print(_.observation['pixels'].shape)
# print(_.observation['pixels'][0])
# print(_.observation['pixels'][0].shape)
# print(_.observation['orientation'])
# print(_.observation['timedelta'])

In [5]:
def get_random_action() -> Dict[str, np.ndarray]:
    """Sample one random action matching the module-level ``action_spec``.

    Entries whose spec is a ``specs.DiscreteArray`` receive a uniform integer
    in ``[0, num_values)``; every other entry receives uniform samples in
    ``[0, 1)`` with the spec's shape, cast to the spec's dtype.

    Returns:
        A dict with one sampled value per key of ``action_spec``.
    """
    def _draw(spec):
        # Discrete specs -> random index; anything else -> random floats.
        if isinstance(spec, specs.DiscreteArray):
            return np.random.randint(low=0, high=spec.num_values, dtype=spec.dtype)
        return np.random.random(size=spec.shape).astype(spec.dtype)

    return {name: _draw(spec) for name, spec in action_spec.items()}

In [6]:
import numpy as np
import random
import torch
import torch.optim as optim
from models.DQN import ClassicCNN, train_dqn
from buffer.replay_buffer import ReplayBuffer
from utils import *
from torch.utils.tensorboard import SummaryWriter
import os

In [7]:
# Show the wrapped environment's specs, one per line: the full action spec,
# the number of discrete actions, the full observation spec, and the pixel
# spec followed by its shape.
print(
    env.action_spec(),
    env.action_spec()['action_id'].num_values,
    env.observation_spec(),
    env.observation_spec()['pixels'],
    env.observation_spec()['pixels'].shape,
    sep='\n',
)


{'action_id': DiscreteArray(shape=(), dtype=int32, name=action_id, minimum=0, maximum=55, num_values=56)}
56
{'pixels': Array(shape=(120, 80, 1), dtype=dtype('uint8'), name='pixels'), 'timedelta': Array(shape=(), dtype=dtype('int64'), name='timedelta'), 'orientation': Array(shape=(4,), dtype=dtype('uint8'), name='orientation')}
Array(shape=(120, 80, 1), dtype=dtype('uint8'), name='pixels')
(120, 80, 1)


In [8]:
# Show the (name, spec) pairs of the action spec dict.
print(env.action_spec().items())

dict_items([('action_id', DiscreteArray(shape=(), dtype=int32, name=action_id, minimum=0, maximum=55, num_values=56))])


In [12]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Training settings read by main().
# NOTE(review): the trailing numbers in the comments below look like previously
# tried values — confirm.
GAMMA=0.9  # forwarded to train_dqn; presumably the discount factor — confirm in models.DQN
MEMORY_SIZE = 100000  # constructor argument of ReplayBuffer
BATCH_SIZE = 256   # 32
LEARNING_RATE = 0.000625   # 0.01
TARGET_UPDATE = 2000  # 5
SAVE_PATH = "/home/slowlab/android_env_tutorial/weights/dqn/"  # forwarded to save_model
MODEL_NAME = 'DQN'  # forwarded to save_model
SAVE_PERIOD = 1000  # forwarded to save_model
START_SIZE = 2000  # training starts once the replay buffer holds more than this many transitions



def converter(obs):
    """Convert an observation's channel-last pixels into a float tensor on ``device``.

    Args:
        obs: Mapping with a ``'pixels'`` array. An array with fewer than four
            dimensions is treated as a single (H, W, C) frame; an array with
            four dimensions is treated as a batch of frames, (N, H, W, C).

    Returns:
        A float tensor on the module-level ``device``: (1, C, H, W) for a
        single frame, (N, C, H, W) for a batch.
    """
    obs_pixels = obs['pixels']
    if len(obs_pixels.shape) < 4:
        # Single frame: move channels first, then add the batch dimension.
        obs_pixels = obs_pixels.transpose(2, 0, 1)
        obs_tensor = torch.tensor(obs_pixels).to(device).float()
        obs_tensor = obs_tensor.unsqueeze(0)
    else:
        # Batch of frames: the batch axis already exists, so only move the
        # channel axis. (Previously this branch read `obs_pixels` before it was
        # assigned and applied a 3-axis transpose to a 4-D array.)
        obs_pixels = obs_pixels.transpose(0, 3, 1, 2)
        obs_tensor = torch.tensor(obs_pixels).to(device).float()
    return obs_tensor

In [14]:

    
    
def main():
    """Train a DQN agent on the module-level ``env`` with epsilon-greedy exploration.

    Builds a behavior network and a target network (both ``ClassicCNN``),
    interacts with ``env`` for ``total_steps`` steps, stores every transition
    in a ``ReplayBuffer`` and, once the buffer holds more than ``START_SIZE``
    transitions, calls ``train_dqn`` after every step. Per-step reward,
    cumulative reward, epsilon and loss are written to TensorBoard. The target
    network is synchronised every ``TARGET_UPDATE`` steps and ``save_model``
    is invoked every step with ``SAVE_PERIOD``.

    Reads the module-level names ``env``, ``device``, ``GAMMA``,
    ``MEMORY_SIZE``, ``BATCH_SIZE``, ``LEARNING_RATE``, ``TARGET_UPDATE``,
    ``SAVE_PATH``, ``MODEL_NAME``, ``SAVE_PERIOD`` and ``START_SIZE``.

    Returns:
        None.
    """
    writer = SummaryWriter("/home/slowlab/android_env_tutorial/experiments/dqn/train")

    # The writer is closed in `finally` so that interrupting the long training
    # loop (or an error inside it) still flushes the logged scalars.
    try:
        startEpsilon = 0.95
        endEpsilon = 0.05
        total_steps = 1000000
        epsilon = startEpsilon
        # Linear decay: with the factor 3, epsilon reaches endEpsilon after
        # about one third of total_steps.
        stepDrop = (startEpsilon - endEpsilon) * 3 / total_steps
        n_actions = env.action_spec()['action_id'].num_values
        state_dim = env.observation_spec()['pixels'].shape
        H, W, C = state_dim[0], state_dim[1], state_dim[2]
        print(H, W, C)
        behavior_policy = ClassicCNN(C, H, W, 3, 2, n_actions).to(device).float()    # C, H, W, K, S, num_actions
        target_policy = ClassicCNN(C, H, W, 3, 2, n_actions).to(device).float()
        # Start both networks from identical weights.
        target_policy.load_state_dict(behavior_policy.state_dict())
        optimizer = optim.Adam(behavior_policy.parameters(), lr=LEARNING_RATE)
        memory = ReplayBuffer(MEMORY_SIZE)

        total_rewards = 0
        # The returned timestep unpacks as (step_type, reward, discount, observation).
        step_type, reward, discount, obs = env.reset()
        for step in range(total_steps):
            if epsilon > endEpsilon:
                epsilon -= stepDrop
            action_index = behavior_policy.sample_action(converter(obs), epsilon)
            action = {'action_id': action_index}
            step_type, reward, discount, next_obs = env.step(action=action)
            writer.add_scalar("reward", reward, step)
            total_rewards += reward
            writer.add_scalar("total_rewards", total_rewards, step)
            writer.add_scalar("epsilon", epsilon, step)

            # NOTE(review): step_type is not stored, so a transition that spans
            # an episode boundary looks like any other — confirm this is intended.
            transition = (obs, action_index, reward, next_obs)
            memory.put(transition)
            obs = next_obs

            if memory.size() > START_SIZE:
                loss = train_dqn(behavior_policy, target_policy, memory, optimizer, GAMMA, BATCH_SIZE)
                # Log the loss under the same guard that produces it, so `loss`
                # can never be read before it has been assigned.
                writer.add_scalar("loss", loss, step)

            if step % TARGET_UPDATE == 0:
                target_policy.load_state_dict(behavior_policy.state_dict())
            save_model(step, SAVE_PERIOD, SAVE_PATH, target_policy, MODEL_NAME)

            if step % 100 == 0:
                print(f'rewards of step {step}: {total_rewards}')
                print(f"# of transitions in memory: {memory.size()}")
    finally:
        writer.close()
# Start training against the live environment; main() runs 1,000,000 environment steps.
main()

        


120 80 1
8064
8064
rewards of step 0: 0.0
# of transitions in memory: 1
rewards of step 100: 0.0
# of transitions in memory: 101
rewards of step 200: 0.0
# of transitions in memory: 201
rewards of step 300: -1.0
# of transitions in memory: 301
rewards of step 400: -1.0
# of transitions in memory: 401
rewards of step 500: -1.0
# of transitions in memory: 501
rewards of step 600: 0.0
# of transitions in memory: 601
rewards of step 700: 0.0
# of transitions in memory: 701
rewards of step 800: -1.0
# of transitions in memory: 801
rewards of step 900: -1.0
# of transitions in memory: 901
rewards of step 1000: -1.0
# of transitions in memory: 1001
rewards of step 1100: -2.0
# of transitions in memory: 1101
rewards of step 1200: -2.0
# of transitions in memory: 1201
rewards of step 1300: -3.0
# of transitions in memory: 1301
rewards of step 1400: -3.0
# of transitions in memory: 1401
rewards of step 1500: -3.0
# of transitions in memory: 1501
rewards of step 1600: -2.0
# of transitions in mem

In [None]:
# NOTE(review): in the visible notebook `step` and `target_policy` are only
# assigned inside main(), so at module scope this call raises NameError unless
# they are defined elsewhere — confirm before running this cell.
save_model(step, SAVE_PERIOD, SAVE_PATH,target_policy, MODEL_NAME)


In [13]:
def eval(checkpoint_path="/home/slowlab/android_env_tutorial/weights/dqn/DQN495000.pt",
         total_steps=10000):
    """Run a saved DQN policy greedily on the module-level ``env`` and log rewards.

    NOTE: this function shadows the built-in ``eval``; the name is kept so
    existing calls keep working.

    Args:
        checkpoint_path: Path handed to ``load_model`` to restore the network
            weights. Defaults to the checkpoint that was previously hard-coded.
        total_steps: Number of environment steps to run. Defaults to 10000.

    Returns:
        None. Per-step and cumulative rewards are written to TensorBoard.
    """
    writer = SummaryWriter("/home/slowlab/android_env_tutorial/experiments/dqn/test")

    # Close the writer even if the rollout is interrupted or fails.
    try:
        n_actions = env.action_spec()['action_id'].num_values
        state_dim = env.observation_spec()['pixels'].shape
        H, W, C = state_dim[0], state_dim[1], state_dim[2]
        print(H, W, C)
        behavior_policy = ClassicCNN(C, H, W, 3, 2, n_actions).to(device).float()    # C, H, W, K, S, num_actions
        load_model(behavior_policy, checkpoint_path)
        # Inference only: switch train-mode-dependent layers (if any) to eval mode.
        behavior_policy.eval()

        total_rewards = 0
        epsilon = 0.0  # passed to sample_action; no exploration during evaluation
        # The returned timestep unpacks as (step_type, reward, discount, observation).
        step_type, reward, discount, obs = env.reset()
        # No gradients are needed while only selecting actions.
        with torch.no_grad():
            for step in range(total_steps):
                action_index = behavior_policy.sample_action(converter(obs), epsilon)
                action = {'action_id': action_index}
                step_type, reward, discount, next_obs = env.step(action=action)
                writer.add_scalar("reward", reward, step)
                total_rewards += reward
                writer.add_scalar("total_rewards", total_rewards, step)
                obs = next_obs
    finally:
        writer.close()
# Run the evaluation rollout defined above (this notebook's eval(), not the built-in).
eval()

120 80 1
8064
