In [None]:
%load_ext autoreload
%autoreload 2
from google.colab import drive
drive.mount('/content/drive')
git_token='tok'
username='maxmarsakov'
repository='durak-project'
%cd '/content/drive/MyDrive/'
%mkdir 'Github'
base_folder='/content/drive/MyDrive/Github'
%cd {base_folder}
%pwd

Mounted at /content/drive
/content/drive/MyDrive
mkdir: cannot create directory ‘Github’: File exists
/content/drive/MyDrive/Github


'/content/drive/MyDrive/Github'

### Run this once (first time only): **clone** the repository

In [None]:
# One-time clone of {username}/{repository} into the current directory,
# authenticating with git_token.
# NOTE(review): the token is part of the clone URL; git normally records that
# URL as the `origin` remote, so it may persist on Drive — confirm this is OK.
!git clone https://{git_token}@github.com/{username}/{repository}


### Every later session: **pull**
**Important!** The next cell runs `git reset --hard HEAD`, which discards all uncommitted local changes. Save or commit your work before running it.

In [None]:
# Move into the cloned repository.
%cd {base_folder}/{repository}
# Throw away uncommitted changes to tracked files so the pull cannot conflict,
# then fetch and merge the remote branch.
!git reset --hard HEAD
!git pull

/content/drive/MyDrive/Github/durak-project
Checking out files: 100% (54/54), done.
HEAD is now at 195768d probabalistic threshold
remote: Enumerating objects: 11, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 6 (delta 2), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (6/6), done.
From https://github.com/maxmarsakov/durak-project
   195768d..c7b46f3  master     -> origin/master
Updating 195768d..c7b46f3
Fast-forward
 experiments/dmc_result/durak/model.tar | Bin [31m19020181[m -> [32m19020181[m bytes
 1 file changed, 0 insertions(+), 0 deletions(-)


## Run this every session: install dependencies and set `PROJECT_PATH`

In [None]:
%cd {base_folder}/{repository}
!pip install -r requirements.txt
%cd {base_folder}/{repository}
%env PROJECT_PATH={base_folder}/{repository}

/content/drive/MyDrive/Github/durak-project
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting colorama
  Downloading colorama-0.4.5-py2.py3-none-any.whl (16 kB)
Collecting rlcard[torch]
  Downloading rlcard-1.0.7.tar.gz (268 kB)
[K     |████████████████████████████████| 268 kB 9.5 MB/s 
[?25hCollecting translation
  Downloading translation-1.0.5-py3-none-any.whl (9.3 kB)
Collecting GitPython
  Downloading GitPython-3.1.27-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 49.6 MB/s 
[?25hCollecting gitdb2
  Downloading gitdb2-4.0.2-py3-none-any.whl (1.1 kB)
Collecting gitdb>=4.0.1
  Downloading gitdb-4.0.9-py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 2.1 MB/s 
[?25hCollecting smmap<6,>=3.0.1
  Downloading smmap-5.0.0-py3-none-any.whl (24 kB)
Building wheels for collected packages: rlcard
  Building wheel for rlcard (setup.py) ... [?25l[?25hdone
  Created wheel for rlc

In [None]:
import os
import sys

import torch
import time
sys.path.insert(0,os.environ['PROJECT_PATH'])
sys.path.insert(0,os.environ['PROJECT_PATH']+"/durak_rlcard")

import rlcard
from rlcard.agents import RandomAgent
from rlcard.utils import (
    get_device,
    set_seed,
    tournament,
    reorganize,
    Logger,
    plot_curve,
)
from durak_rlcard.env import DurakEnv
from durak_rlcard.agents import SimpleAgent, SimpleLearningAgent, SimpleProbaAgent
import random
from collections import namedtuple
# Lightweight stand-in for the argparse namespace used by the training scripts.
CustomArgs = namedtuple(
    'CustomArgs',
    [
        'agent',           # agent kind, e.g. "dqn", "nfsp", "simple_learning"
        'opponent',        # opponent kind: "random", "simple" or "self"
        'cuda',            # value written to CUDA_VISIBLE_DEVICES
        'seed',            # passed to set_seed()
        'num_episodes',    # number of training episodes
        'num_eval_games',  # games played per evaluation tournament
        'evaluate_every',  # evaluate every N episodes
        'save_every',      # checkpoint interval, in minutes
        'log_dir',         # directory for logs, plots and model.pth
    ],
)

In [None]:
def _save_agent(agent, log_dir):
    """Serialize ``agent`` to ``<log_dir>/model.pth`` and return that path."""
    save_path = os.path.join(log_dir, 'model.pth')
    torch.save(agent, save_path)
    return save_path


def train(args,env,agent,evaluate_vs=None):
    """Train ``agent`` in ``env`` and periodically evaluate and checkpoint it.

    Args:
        args: object exposing ``log_dir``, ``num_episodes``, ``agent``,
            ``opponent``, ``evaluate_every`` (episodes), ``num_eval_games``
            and ``save_every`` (minutes between checkpoints).
        env: environment providing ``run``, ``set_agents``, ``agents`` and
            ``timestep``. Its agents must already be set by the caller.
        agent: the learner; it receives the transitions of seat 0 only.
        evaluate_vs: optional opponent used only during evaluation
            tournaments. When ``None`` the tournament is played with
            whatever agents are currently installed in ``env``.

    Side effects:
        Writes logs and the learning curve through ``Logger``/``plot_curve``
        and saves the agent to ``<log_dir>/model.pth``.
    """
    # Timestamp of the last checkpoint; None forces a save on the first episode.
    last_checkpoint = None
    with Logger(args.log_dir) as logger:
        for episode in range(args.num_episodes):

            if args.agent == 'nfsp':
                agent.sample_episode_policy()

            # Generate data from the environment
            trajectories, payoffs = env.run(is_training=True)

            # Reorganize the data to be state, action, reward, next_state, done
            trajectories = reorganize(trajectories, payoffs)

            # Feed transitions into agent memory, and train the agent.
            # Only the trajectory of the first seat is used for learning.
            for ts in trajectories[0]:
                agent.feed(ts)

            if args.opponent=="self":
                # Self-play: the learner occupies both seats.
                env.set_agents([agent,agent])

            # Evaluate the performance, optionally against a fixed opponent.
            if episode % args.evaluate_every == 0:
                prev_agents = env.agents
                if evaluate_vs is not None:
                    env.set_agents([agent,evaluate_vs])
                try:
                    logger.log_performance(
                        env.timestep,
                        tournament(env, args.num_eval_games)[0],
                    )
                finally:
                    # Restore the training line-up even when the tournament
                    # raises or is interrupted from the notebook.
                    if evaluate_vs is not None:
                        env.set_agents(prev_agents)

            # Time-based checkpoint: save_every is expressed in minutes.
            if last_checkpoint is None or (
                (time.perf_counter() - last_checkpoint) > 60 * args.save_every
            ):
                _save_agent(agent, args.log_dir)
                last_checkpoint = time.perf_counter()

        # Get the paths
        csv_path, fig_path = logger.csv_path, logger.fig_path

    # Plot the learning curve
    plot_curve(csv_path, fig_path, args.agent)

    # Save the final model
    save_path = _save_agent(agent, args.log_dir)
    print('Model saved in', save_path)

In [None]:
def get_agent_opponent(env,args):
    """Build the learning agent and its training opponent from ``args``.

    Args:
        env: environment exposing ``num_actions`` and ``state_shape``.
        args: object with string fields ``agent`` (one of "dqn", "nfsp",
            "simple_learning", "simple_proba") and ``opponent`` (one of
            "random", "simple", "self").

    Returns:
        ``(agent, opponent)``. With ``opponent == "self"`` both entries are
        the same object.

    Raises:
        ValueError: if ``args.agent`` or ``args.opponent`` is not one of the
            supported names.

    Note:
        Reads the notebook-level variable ``device``; it must be assigned
        (e.g. ``device = get_device()``) before this function is called.
    """
    known_agents = ('dqn', 'nfsp', 'simple_learning', 'simple_proba')
    known_opponents = ('random', 'simple', 'self')
    # Fail early with a clear message instead of returning None and crashing
    # later inside the environment or the training loop.
    if args.agent not in known_agents:
        raise ValueError(
            "Unknown agent %r; expected one of %s" % (args.agent, known_agents)
        )
    if args.opponent not in known_opponents:
        raise ValueError(
            "Unknown opponent %r; expected one of %s" % (args.opponent, known_opponents)
        )

    agent,opponent=None,None

    if args.agent == 'dqn':
        from rlcard.agents import DQNAgent
        agent = DQNAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            mlp_layers=[64,64],
            device=device,
        )
    elif args.agent == 'nfsp':
        from rlcard.agents import NFSPAgent
        agent = NFSPAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            hidden_layers_sizes=[64,64],
            q_mlp_layers=[64,64],
            device=device,
        )
    elif args.agent == 'simple_learning':
        # Import through the package path used by the notebook's import cell,
        # so the same class object is shared everywhere.
        from durak_rlcard.agents import SimpleLearningAgent
        agent = SimpleLearningAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            mlp_layers=[64,64],
            device=device,
        )
    elif args.agent == 'simple_proba':
        # simple callback is needed to determine
        # when to use simple vs dqn strategy
        def pcallback(state):
            raw=state['raw_obs']
            deckSize=raw['deckSize']
            # set these hyperparameters
            endCardsSize=10
            probaStart=0.2
            probaEnd=0.8
            if deckSize<=endCardsSize:
                # endgame
                return 'dqn' if random.random() < probaEnd else 'simple'
            # start game
            return 'dqn' if random.random() < probaStart else 'simple'

        from durak_rlcard.agents import SimpleProbaAgent
        agent = SimpleProbaAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            mlp_layers=[64,64],
            device=device,
            use_strategy_callback=pcallback
        )

    # set opponent
    if args.opponent=='random':
        from rlcard.agents import RandomAgent
        opponent=RandomAgent(num_actions=env.num_actions)
    elif args.opponent=='simple':
        from durak_rlcard.agents import SimpleAgent
        opponent=SimpleAgent(num_actions=env.num_actions)
    elif args.opponent=='self':
        # self-play: the opponent is the very same agent object
        opponent=agent

    return agent, opponent

# Train DQN on gpu


In [None]:
#!python3 durak_rlcard/dqn.py --cuda=0

### DQN vs Random

In [None]:
# DQN agent trained against rlcard's RandomAgent.
args = CustomArgs(
    agent="dqn",
    opponent="random",
    cuda="0",
    seed=42,
    num_episodes=10000,
    num_eval_games=2000,
    evaluate_every=100,
    save_every=10,
    log_dir="experiments/dqn_vs_random",
)
# Select the visible GPU(s) before anything touches CUDA; setting this after
# the device/agent have been created has no effect.
os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
# Check whether gpu is available
device = get_device()
set_seed(args.seed)
env = DurakEnv()
agent, opponent = get_agent_opponent(env, args)
# Seat 0 is the learner, seat 1 the opponent.
agents = [agent, opponent]
env.set_agents(agents)
train(args, env, agent)

### DQN vs Simple

In [None]:
# DQN agent trained against the rule-based SimpleAgent.
args = CustomArgs(
    agent="dqn",
    opponent="simple",
    cuda="0",
    seed=42,
    num_episodes=5000,
    num_eval_games=2000,
    evaluate_every=100,
    save_every=15,
    log_dir="experiments/dqn_vs_simple",
)
# Select the visible GPU(s) before anything touches CUDA; setting this after
# the device/agent have been created has no effect.
os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
# Check whether gpu is available
device = get_device()
set_seed(args.seed)
env = DurakEnv()
agent, opponent = get_agent_opponent(env, args)
# Seat 0 is the learner, seat 1 the opponent.
agents = [agent, opponent]
env.set_agents(agents)
train(args, env, agent)

### Simple learning vs Simple

In [None]:
# SimpleLearningAgent trained against the rule-based SimpleAgent.
args = CustomArgs(
    agent="simple_learning",
    opponent="simple",
    cuda="0",
    seed=42,
    num_episodes=5000,
    num_eval_games=2000,
    evaluate_every=100,
    save_every=30,
    log_dir="experiments/simple_learning_vs_simple_deep",
)
# Select the visible GPU(s) before anything touches CUDA; setting this after
# the device/agent have been created has no effect.
os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
# Check whether gpu is available
device = get_device()
set_seed(args.seed)
env = DurakEnv()
agent, opponent = get_agent_opponent(env, args)
# The learner is rebuilt explicitly so its hyperparameters are visible here;
# only the opponent returned above is kept.
agent = SimpleLearningAgent(
    num_actions=env.num_actions,
    state_shape=env.state_shape[0],
    mlp_layers=[64,64],
    device=device,
)
# Seat 0 is the learner, seat 1 the opponent.
agents = [agent, opponent]
env.set_agents(agents)
train(args, env, agent)

### Simple learning vs SELF

In [None]:
# SimpleLearningAgent (3 hidden layers) trained by self-play, evaluated
# against the rule-based SimpleAgent.
args = CustomArgs(
    agent="simple_learning",
    opponent="self",
    cuda="0",
    seed=42,
    num_episodes=10000,
    num_eval_games=2000,
    evaluate_every=100,
    save_every=15,
    log_dir="experiments/simple_learning_vs_self_new_reward",
)
# Select the visible GPU(s) before anything touches CUDA; setting this after
# the device/agent have been created has no effect.
os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
# Check whether gpu is available
device = get_device()
set_seed(args.seed)
env = DurakEnv()
agent, opponent = get_agent_opponent(env, args)
# Replace the default agent built above with the 3-layer variant.
agent = SimpleLearningAgent(
    num_actions=env.num_actions,
    state_shape=env.state_shape[0],
    mlp_layers=[64,64,64],
    device=device,
)
# Self-play: the opponent must be the agent actually being trained, not the
# discarded one returned by get_agent_opponent().
opponent = agent
agents = [agent, opponent]
env.set_agents(agents)
train(args, env, agent, evaluate_vs=SimpleAgent(num_actions=env.num_actions))

### Simple probabilistic agent: self-play training, evaluated vs Simple

In [None]:
# SimpleProbaAgent trained by self-play, evaluated against SimpleAgent.
# args.agent stays "simple_learning": it only drives get_agent_opponent()
# and the label passed to plot_curve(); the real learner is built below.
args = CustomArgs(
    agent="simple_learning",
    opponent="self",
    cuda="0",
    seed=42,
    num_episodes=10000,
    num_eval_games=1000,
    evaluate_every=100,
    save_every=15,
    log_dir="experiments/simple_proba_vs_self",
)
# Select the visible GPU(s) before anything touches CUDA; setting this after
# the device/agent have been created has no effect.
os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
# Check whether gpu is available
device = get_device()
set_seed(args.seed)
env = DurakEnv()
agent, opponent = get_agent_opponent(env, args)
# Replace the default agent built above with the probabilistic variant.
agent = SimpleProbaAgent(
    num_actions=env.num_actions,
    state_shape=env.state_shape[0],
    mlp_layers=[64,64],
    device=device,
    proba_at_start=0.1,
    proba_at_end=0.9,
)
# Self-play: the opponent must be the agent actually being trained, not the
# discarded one returned by get_agent_opponent().
opponent = agent
agents = [agent, opponent]
env.set_agents(agents)
train(args, env, agent, evaluate_vs=SimpleAgent(num_actions=env.num_actions))


### Simple probabilistic agent with threshold: self-play training, evaluated vs Simple

In [None]:
# SimpleProbaAgent with a threshold, trained by self-play and evaluated
# against SimpleAgent.
# args.agent stays "simple_learning": it only drives get_agent_opponent()
# and the label passed to plot_curve(); the real learner is built below.
args = CustomArgs(
    agent="simple_learning",
    opponent="self",
    cuda="0",
    seed=42,
    num_episodes=5000,
    num_eval_games=1000,
    evaluate_every=100,
    save_every=15,
    log_dir="experiments/simple_proba_vs_self_threshold_0.1_0.6_thr_6",
)
# Select the visible GPU(s) before anything touches CUDA; setting this after
# the device/agent have been created has no effect.
os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
# Check whether gpu is available
device = get_device()
set_seed(args.seed)
env = DurakEnv()
agent, opponent = get_agent_opponent(env, args)
# Replace the default agent built above with the probabilistic variant.
agent = SimpleProbaAgent(
    num_actions=env.num_actions,
    state_shape=env.state_shape[0],
    mlp_layers=[64,64],
    device=device,
    proba_at_start=0.1,
    proba_at_end=0.6,
    threshold=6
)
# Self-play: the opponent must be the agent actually being trained, not the
# discarded one returned by get_agent_opponent().
opponent = agent
agents = [agent, opponent]
env.set_agents(agents)
train(args, env, agent, evaluate_vs=SimpleAgent(num_actions=env.num_actions))

### Deeper and wider network (3 × 365 units, longer epsilon decay)

In [None]:
# Larger SimpleProbaAgent trained by self-play, evaluated against SimpleAgent.
# args.agent stays "simple_learning": it only drives get_agent_opponent()
# and the label passed to plot_curve(); the real learner is built below.
# NOTE(review): the log_dir name advertises "0.1_0.8", "256x3" and "thr6",
# but the agent below uses proba_at_end=0.6, 365-unit layers and threshold=0
# — confirm which is intended before comparing runs.
args = CustomArgs(
    agent="simple_learning",
    opponent="self",
    cuda="0",
    seed=42,
    num_episodes=15000,
    num_eval_games=1000,
    evaluate_every=500,
    save_every=15,
    log_dir="experiments/simple_proba_vs_self_0.1_0.8_256x3_thr6",
)
# Select the visible GPU(s) before anything touches CUDA; setting this after
# the device/agent have been created has no effect.
os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
# Check whether gpu is available
device = get_device()
set_seed(args.seed)
env = DurakEnv()
agent, opponent = get_agent_opponent(env, args)
# Replace the default agent built above with the larger probabilistic variant.
agent = SimpleProbaAgent(
    num_actions=env.num_actions,
    state_shape=env.state_shape[0],
    mlp_layers=[365,365,365],
    epsilon_decay_steps=40000,
    device=device,
    proba_at_start=0.1,
    proba_at_end=0.6, # vary -> 0.6
    threshold=0
)
# Self-play: the opponent must be the agent actually being trained, not the
# discarded one returned by get_agent_opponent().
opponent = agent
agents = [agent, opponent]
env.set_agents(agents)
train(args, env, agent, evaluate_vs=SimpleAgent(num_actions=env.num_actions))

--> Running on the GPU

----------------------------------------
  timestep     |  55
  reward       |  0.391
----------------------------------------


  state_batch, action_batch, reward_batch, next_state_batch, legal_actions_batch, done_batch = self.memory.sample()


INFO - Step 100, rl-loss: 0.29810842871665955
INFO - Copied model parameters to target network.
INFO - Step 1100, rl-loss: 0.08695834875106812
INFO - Copied model parameters to target network.
INFO - Step 2100, rl-loss: 0.10081396996974945
INFO - Copied model parameters to target network.
INFO - Step 3100, rl-loss: 0.05264813452959061
INFO - Copied model parameters to target network.
INFO - Step 4100, rl-loss: 0.014624183997511864
INFO - Copied model parameters to target network.
INFO - Step 5100, rl-loss: 0.025551527738571167
INFO - Copied model parameters to target network.
INFO - Step 6100, rl-loss: 0.022654615342617035
INFO - Copied model parameters to target network.
INFO - Step 7100, rl-loss: 0.04111135005950928
INFO - Copied model parameters to target network.
INFO - Step 8100, rl-loss: 0.0181155726313591
INFO - Copied model parameters to target network.
INFO - Step 9100, rl-loss: 0.023861585184931755
INFO - Copied model parameters to target network.
INFO - Step 10100, rl-loss:

KeyboardInterrupt: ignored

# Evaluate - TODO

In [None]:
# Run the project's evaluate.py on the experiments_10000_vs_random checkpoint
# with `random` as the second --models entry. The script path is relative, so
# the working directory must be the repository root.
# NOTE(review): the printed numbers are presumably average payoffs per seat — confirm in evaluate.py.
!python3 durak_rlcard/evaluate.py --models /content/drive/MyDrive/Github/durak-project/durak_rlcard/experiments_10000_vs_random/model.pth random --cuda=0

--> Running on the GPU
0 /content/drive/MyDrive/Github/durak-project/durak_rlcard/experiments_10000_vs_random/model.pth 0.348
1 random 0.652


In [None]:
# Same checkpoint as the previous evaluation, this time with `simple` as the
# second --models entry. Requires the repository root as working directory.
!python3 durak_rlcard/evaluate.py --models /content/drive/MyDrive/Github/durak-project/durak_rlcard/experiments_10000_vs_random/model.pth simple --cuda=0

--> Running on the GPU
0 /content/drive/MyDrive/Github/durak-project/durak_rlcard/experiments_10000_vs_random/model.pth 0.0145
1 simple 0.9855


In [None]:
# Evaluate the checkpoint in experiments/simple_proba_vs_self_0.1_0.5_64x3_thr0
# with `random` as the second --models entry.
!python3 durak_rlcard/evaluate.py --models /content/drive/MyDrive/Github/durak-project/experiments/simple_proba_vs_self_0.1_0.5_64x3_thr0/model.pth random --cuda=0

--> Running on the GPU
0 /content/drive/MyDrive/Github/durak-project/experiments/simple_proba_vs_self_0.1_0.5_64x3_thr0/model.pth 0.9795
1 random 0.0205


In [None]:
# Evaluate the checkpoint in experiments/simple_proba_vs_self
# with `random` as the second --models entry.
!python3 durak_rlcard/evaluate.py --models /content/drive/MyDrive/Github/durak-project/experiments/simple_proba_vs_self/model.pth random --cuda=0

--> Running on the GPU
0 /content/drive/MyDrive/Github/durak-project/experiments/simple_proba_vs_self/model.pth 0.759
1 random 0.241


In [None]:
# Evaluate the checkpoint in experiments/simple_proba_vs_self
# with `simple` as the second --models entry.
!python3 durak_rlcard/evaluate.py --models /content/drive/MyDrive/Github/durak-project/experiments/simple_proba_vs_self/model.pth simple --cuda=0

--> Running on the GPU
0 /content/drive/MyDrive/Github/durak-project/experiments/simple_proba_vs_self/model.pth 0.074
1 simple 0.926


In [None]:
# Evaluate the checkpoint in experiments/simple_proba_vs_self_threshold_0.1_0.6_thr_6
# with `random` as the second --models entry.
!python3 durak_rlcard/evaluate.py --models /content/drive/MyDrive/Github/durak-project/experiments/simple_proba_vs_self_threshold_0.1_0.6_thr_6/model.pth random --cuda=0

--> Running on the GPU
0 /content/drive/MyDrive/Github/durak-project/experiments/simple_proba_vs_self_threshold_0.1_0.6_thr_6/model.pth 0.8985
1 random 0.1015


# Train NFSP on gpu

In [None]:
# Launch the project's dqn.py training script with --algorithm=nfsp.
# The script path is relative, so the working directory must be the repository root.
!python3 durak_rlcard/dqn.py --algorithm=nfsp --cuda=0

# Train DMC on gpu

In [None]:
# Launch the project's dmc.py training script.
# The script path is relative, so the working directory must be the repository root.
!python3 durak_rlcard/dmc.py --cuda=0

In [None]:
# Run evaluate.py without --models, i.e. with whatever defaults the script defines.
!python3 durak_rlcard/evaluate.py --cuda=0