Copyright **`(c)`** 2022 Giovanni Squillero `<squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  


# Lab 3: Policy Search

## Task

Write agents able to play [*Nim*](https://en.wikipedia.org/wiki/Nim), with an arbitrary number of rows and an upper bound $k$ on the number of objects that can be removed in a turn (a.k.a., *subtraction game*).

The player **taking the last object wins**.

* Task3.1: An agent using fixed rules based on *nim-sum* (i.e., an *expert system*)
* Task3.2: An agent using evolved rules
* Task3.3: An agent using minmax
* Task3.4: An agent using reinforcement learning

## Instructions

* Create the directory `lab3` inside the course repo 
* Put a `README.md` and your solution (all the files, code and auxiliary data if needed) in it

## Notes

* Working in groups is not only allowed, but recommended (see: [Ubuntu](https://en.wikipedia.org/wiki/Ubuntu_philosophy) and [Cooperative Learning](https://files.eric.ed.gov/fulltext/EJ1096789.pdf)). Collaborations must be explicitly declared in the `README.md`.
* [Yanking](https://www.emacswiki.org/emacs/KillingAndYanking) from the internet is allowed, but sources must be explicitly declared in the `README.md`.

## Deadlines ([AoE](https://en.wikipedia.org/wiki/Anywhere_on_Earth))

* Sunday, December 4th for Task3.1 and Task3.2
* Sunday, December 11th for Task3.3 and Task3.4
* Sunday, December 18th for all reviews

In [8]:
import logging
from collections import namedtuple
import random
from typing import Callable
from copy import deepcopy
from itertools import accumulate
from operator import xor

## The *Nim* and *Nimply* classes

In [9]:
# A single move: take `num_objects` objects from the row at index `row`.
Nimply = namedtuple("Nimply", "row, num_objects")

In [10]:
class Nim:
    """Mutable Nim board.

    Row ``i`` starts with ``2 * i + 1`` objects (1, 3, 5, ...). ``k`` is an
    optional upper bound on how many objects one move may remove; ``None``
    means no bound.
    """

    def __init__(self, num_rows: int, k: int = None) -> None:
        self._rows = [i * 2 + 1 for i in range(num_rows)]
        self._k = k

    def __bool__(self):
        # Truthy while at least one object is left, i.e. the game is still on.
        return sum(self._rows) > 0

    def __str__(self):
        return "<" + " ".join(str(_) for _ in self._rows) + ">"

    @property
    def rows(self) -> tuple:
        """Read-only snapshot of the current row sizes."""
        return tuple(self._rows)

    @property
    def k(self) -> int:
        """Per-move upper bound, or ``None`` when moves are unbounded."""
        return self._k

    def nimming(self, ply: Nimply) -> None:
        """Apply the move ``(row, num_objects)`` to the board in place.

        Raises:
            AssertionError: if the move removes fewer than one object, more
                objects than the row holds, or more than ``k`` objects.
        """
        row, num_objects = ply
        # A move must remove something: zero would stall the game and a
        # negative amount would add objects back to the row.
        assert num_objects >= 1
        assert self._rows[row] >= num_objects
        assert self._k is None or num_objects <= self._k
        self._rows[row] -= num_objects

## Sample (and silly) strategies

In [11]:
def pure_random(state: Nim) -> Nimply:
    """Return a random legal move.

    A row is drawn uniformly among the non-empty ones, then the amount is
    drawn uniformly from ``1 .. min(row size, k)`` so the move never exceeds
    the per-move bound checked by ``Nim.nimming``. When ``state.k`` is
    ``None`` the whole row may be taken.

    ``random.choice`` raises ``IndexError`` if every row is empty.
    """
    row = random.choice([r for r, c in enumerate(state.rows) if c > 0])
    max_take = state.rows[row]
    if state.k is not None:
        max_take = min(max_take, state.k)
    num_objects = random.randint(1, max_take)
    return Nimply(row, num_objects)

In [12]:
def gabriele(state: Nim) -> Nimply:
    """Always take the maximum legal number of objects from the lowest-index non-empty row.

    "Legal" honors ``state.k``: at most ``k`` objects are taken when a bound
    is set. ``max`` raises ``ValueError`` when no move is available.
    """
    limit = state.k
    possible_moves = [
        (r, o)
        for r, c in enumerate(state.rows)
        for o in range(1, (c if limit is None else min(c, limit)) + 1)
    ]
    # Negating the row index makes `max` prefer the lowest row first, then
    # the largest amount within that row.
    return Nimply(*max(possible_moves, key=lambda m: (-m[0], m[1])))

## Evaluation on *NUM_MATCHES* matches for *NIM_SIZE* size nim 

In [13]:
# Number of games played by each call to evaluate().
NUM_MATCHES = 100
# Fixed board size; evaluate() currently uses a random size instead (its
# only reference to NIM_SIZE is commented out).
NIM_SIZE = 25

# Lower the root logger threshold to DEBUG.
logging.getLogger().setLevel(logging.DEBUG)

def evaluate(strategy: Callable, opponent_strategy: Callable) -> float:
    """Play NUM_MATCHES games and return the win percentage of `strategy`.

    Each game uses a fresh board with a random number of rows in [10, 20]
    and a randomly chosen first player. `strategy` is player 0 and
    `opponent_strategy` is player 1; whoever takes the last object wins.
    """
    contenders = (strategy, opponent_strategy)
    victories = 0

    for _ in range(NUM_MATCHES):
        board = Nim(random.randint(10, 20))
        turn = 0 if random.random() >= 0.5 else 1
        while board:
            move = contenders[turn](board)
            board.nimming(move)
            turn = 1 - turn
        # `turn` was flipped after the final move, so it names the loser:
        # turn == 1 means player 0 (`strategy`) took the last object.
        if turn == 1:
            victories += 1
    return victories / NUM_MATCHES * 100

## Oversimplified match

In [14]:
# logging.getLogger().setLevel(logging.DEBUG)

# strategy = (make_strategy({"p": 0.1}), optimal_startegy)

# nim = Nim(11)
# logging.debug(f"status: Initial board  -> {nim}")
# player = 0
# while nim:
#     ply = strategy[player](nim)
#     nim.nimming(ply)
#     logging.debug(f"status: After player {player} -> {nim}")
#     player = 1 - player
# winner = 1 - player
# logging.info(f"status: Player {winner} won!")

## Task 1: expert system

In [15]:
def nim_sum(state: Nim) -> int:
    """Return the nim-sum of the board: the bitwise XOR of all row sizes.

    A board with no rows has nim-sum 0.
    """
    result = 0
    for count in state.rows:
        result ^= count
    return result

In [16]:
def expert_strategy_nim(state: Nim) -> Nimply:
    """Pick the first legal move that leaves the board with nim-sum zero.

    Rows are scanned in index order. A move is legal when it removes between
    1 and ``min(row size, k)`` objects (the whole row when ``state.k`` is
    ``None``). If no legal move reaches nim-sum zero, the fallback removes a
    single object from the first row holding the most objects.

    Returns:
        A plain ``(row, num_objects)`` tuple, which unpacks like a ``Nimply``.

    NOTE(review): with a bound ``k`` the classical winning criterion works on
    row sizes modulo ``k + 1``; this function keeps the plain nim-sum test,
    confirm that is intended.
    """
    rows = state.rows
    limit = state.k

    total = 0
    for count in rows:
        total ^= count

    for row_num, count in enumerate(rows):
        max_take = count if limit is None else min(count, limit)
        # Shrinking this row to `total ^ count` is the only way to zero the
        # nim-sum through it, so no per-move board copy is needed.
        take = count - (total ^ count)
        if 1 <= take <= max_take:
            return (row_num, take)

    # No zeroing move exists: make the least disruptive move instead.
    fullest_row = max(range(len(rows)), key=rows.__getitem__, default=0)
    return (fullest_row, 1)

In [17]:
def expert_strategy(state: Nim) -> Nimply:
    """Fixed rule aiming for an endgame made of an odd number of one-object rows.

    Always plays on the first row holding the most objects:

    * if the number of one-object rows is odd and some row still holds more
      than one object, shrink the longest row down to a single object;
    * otherwise remove the longest row entirely.

    The amount is capped at ``state.k`` when a per-move bound is set, so the
    move stays legal.

    Returns:
        A plain ``(row, num_objects)`` tuple, which unpacks like a ``Nimply``.
    """
    longest_index, longest_count = max(enumerate(state.rows), key=lambda item: item[1])
    single_rows = sum(count == 1 for count in state.rows)
    active_rows = sum(count > 0 for count in state.rows)

    if single_rows % 2 == 1 and active_rows > single_rows:
        take = longest_count - 1  # leave exactly one object in the longest row
    else:
        take = longest_count  # empty the longest row

    if state.k is not None:
        take = min(take, state.k)
    return (longest_index, take)


In [18]:
def variable_expert_strategy(ramp_calls: int = None, expert: Callable = None, fallback: Callable = None) -> Callable:
    """Build a strategy that plays like the expert more often the more it is called.

    On its n-th invocation (counting from 0) the returned strategy plays the
    expert move with probability ``n / ramp_calls`` and the fallback move
    otherwise. The counter advances on every invocation, i.e. once per move.

    Args:
        ramp_calls: number of invocations after which the expert is always
            used; must be non-zero. Defaults to ``NUM_MATCHES``.
        expert: strategy used with increasing probability. Defaults to
            ``expert_strategy_nim``.
        fallback: strategy used otherwise. Defaults to ``pure_random``.

    Returns:
        A callable taking a ``Nim`` state and returning a move.
    """
    # Defaults are resolved here rather than in the signature so they pick up
    # the module-level definitions at call time.
    if ramp_calls is None:
        ramp_calls = NUM_MATCHES
    if expert is None:
        expert = expert_strategy_nim
    if fallback is None:
        fallback = pure_random

    calls = 0

    def variable_expert(state: Nim) -> Nimply:
        # The counter lives in the enclosing scope; without `nonlocal` the
        # increment would make it an unbound local.
        nonlocal calls
        p = calls / ramp_calls
        calls += 1
        if random.random() < p:
            return expert(state)
        return fallback(state)

    return variable_expert


In [175]:
# Raise the root logger threshold to INFO.
logging.getLogger().setLevel(logging.INFO)
# Win percentage of the nim-sum strategy against the random player.
res = evaluate(expert_strategy_nim, pure_random)
logging.info(f"status: best strategy won vs random strategy {res}% of matches")
# Win percentage of the hand-written rule against the random player.
res = evaluate(expert_strategy, pure_random)
logging.info(f"status: expert player won vs random strategy {res}% of matches")

INFO:root:status: best strategy won vs random strategy 100.0% of matches
INFO:root:status: expert player won vs random strategy 91.0% of matches


## Task 2: evolved strategy

In [19]:
def cook_status(state: Nim) -> dict:
    """Summarize a board into the features used by the evolved strategy.

    Returns a dict with:
        possible_moves: every legal ``(row, num_objects)`` pair, removing
            between 1 and ``min(row size, k)`` objects (the whole row when
            ``state.k`` is ``None``).
        active_rows_number: number of rows that still hold objects.
        once_rows_number: number of rows holding exactly one object.
        shortest_row: index of the first smallest non-empty row, or ``None``
            when every row is empty.
        longest_row: ``(index, size)`` of the first largest row.
    """
    rows = state.rows
    limit = state.k
    active = [(index, count) for index, count in enumerate(rows) if count > 0]

    return {
        "possible_moves": [
            (index, take)
            for index, count in enumerate(rows)
            for take in range(1, (count if limit is None else min(count, limit)) + 1)
        ],
        "active_rows_number": len(active),
        "once_rows_number": sum(count == 1 for count in rows),
        "shortest_row": min(active, key=lambda item: item[1], default=(None, 0))[0],
        "longest_row": max(enumerate(rows), key=lambda item: item[1]),
    }

In [181]:
def make_strategy(genome: dict) -> Callable:
    """Build a strategy parameterized by ``genome["end_game_p"]``.

    The returned callable always plays on the longest row and either leaves
    one object in it or empties it:

    * when ``end_game_p`` exceeds the fraction of rows still active, it
      leaves one object whenever the longest row holds more than one;
    * otherwise it leaves one object only if the number of one-object rows
      is odd and some row still holds more than one object.
    """
    def evolvable(state: Nim) -> Nimply:
        data = cook_status(state)
        active_fraction = data["active_rows_number"] / len(state.rows)
        row_index, row_size = data["longest_row"]

        if genome["end_game_p"] > active_fraction:
            leave_one = row_size > 1
        else:
            odd_singles = data["once_rows_number"] % 2 == 1
            leave_one = odd_singles and data["active_rows_number"] - data["once_rows_number"] > 0

        # Either shrink the longest row to a single object or remove it whole.
        take = row_size - 1 if leave_one else row_size
        return (row_index, take)

    return evolvable

In [240]:
# Raise the root logger threshold to INFO.
logging.getLogger().setLevel(logging.INFO)

# Hand-picked genome, evaluated against the random player.
player_strategy = make_strategy({"end_game_p" : 0.001})
system_strategy = pure_random

res = evaluate(player_strategy, system_strategy)
logging.info(f"status: player won vs {system_strategy.__name__} {res}% of matches")

INFO:root:status: player won vs pure_random 95.0% of matches


In [28]:
# Number of individuals created initially and kept after each generation.
POPULATION_SIZE = 50
# Number of distinct offspring produced per generation.
OFFSPRING_SIZE = 20
# Number of generations run by evolve_strategy().
N = 100

In [94]:
def select_parent(population, tournament_size=2):
    """Tournament selection: draw `tournament_size` individuals with
    replacement and return the one with the highest fitness (element 1)."""
    contenders = random.choices(population, k=tournament_size)
    return max(contenders, key=lambda individual: individual[1])

def mutation(g):
    """Return a copy of genome `g` with "end_game_p" nudged by 0.005.

    With probability 0.5 the value is decreased, provided it is above 0.005;
    in every other case it is increased as long as it is at most 0.995.
    If neither applies the copy is returned unchanged. `g` is not modified.
    """
    child = deepcopy(g)
    step_down = random.random() < 0.5
    current = g["end_game_p"]

    if step_down and current > 0.005:
        child["end_game_p"] = current - 0.005
    elif current <= 0.995:
        child["end_game_p"] = current + 0.005
    return child

In [242]:
def evaluate_mean(params):
    """Average win percentage of the strategy built from `params` against
    the random player, over 5 independent evaluate() runs."""
    repetitions = 5  # number of evaluation
    total = sum(evaluate(make_strategy(params), pure_random) for _ in range(repetitions))
    return total / repetitions

In [224]:
def init_population():
    """Create POPULATION_SIZE random genomes and pair each with its fitness.

    All genomes are drawn first and only then evaluated; the result is a
    list of ``(genome, mean win percentage)`` tuples.
    """
    genomes = []
    for _ in range(POPULATION_SIZE):
        genomes.append({"end_game_p": random.random()})
    return [(genome, evaluate_mean(genome)) for genome in genomes]

In [226]:
def evolve_strategy():
    """Run N generations of a (mu + lambda) style search on "end_game_p".

    Each generation breeds OFFSPRING_SIZE children with distinct
    "end_game_p" values by mutating tournament-selected parents, merges them
    with the current population and keeps the POPULATION_SIZE fittest.

    Returns:
        The best ``(genome, fitness)`` pair of the final population.
    """
    population = init_population()
    for generations in range(1, N + 1):
        # Keyed by parameter value so identical children count only once.
        children = {}
        while len(children) < OFFSPRING_SIZE:
            parent = select_parent(population)
            child = mutation(parent[0])
            children[child["end_game_p"]] = (child, evaluate_mean(child))
        merged = population + list(children.values())
        ranked = sorted(merged, key=lambda individual: -individual[1])
        population = ranked[:POPULATION_SIZE]
        logging.debug(f"gen: {generations}, param: {population[0]}")
    return population[0]

In [227]:
# Lower the root logger threshold to DEBUG for the per-generation messages.
logging.getLogger().setLevel(logging.DEBUG)

# evolve_strategy() returns the best (genome, fitness) pair.
best_param, wr = evolve_strategy()
logging.info(f"best param: {best_param}, wr: {wr}")

DEBUG:root:gen: 1, param: ({'end_game_p': 0.04353971502607968}, 19.4)
DEBUG:root:gen: 2, param: ({'end_game_p': 0.04353971502607968}, 19.4)
DEBUG:root:gen: 3, param: ({'end_game_p': 0.04353971502607968}, 19.4)
DEBUG:root:gen: 4, param: ({'end_game_p': 0.12857623549669617}, 20.0)
DEBUG:root:gen: 5, param: ({'end_game_p': 0.12857623549669617}, 20.0)
DEBUG:root:gen: 6, param: ({'end_game_p': 0.12857623549669617}, 20.0)
DEBUG:root:gen: 7, param: ({'end_game_p': 0.12857623549669617}, 20.0)
DEBUG:root:gen: 8, param: ({'end_game_p': 0.12857623549669617}, 20.0)
DEBUG:root:gen: 9, param: ({'end_game_p': 0.12857623549669617}, 20.0)
DEBUG:root:gen: 10, param: ({'end_game_p': 0.12857623549669617}, 20.0)
DEBUG:root:gen: 11, param: ({'end_game_p': 0.12857623549669617}, 20.0)
DEBUG:root:gen: 12, param: ({'end_game_p': 0.12857623549669617}, 20.0)
DEBUG:root:gen: 13, param: ({'end_game_p': 0.12857623549669617}, 20.0)
DEBUG:root:gen: 14, param: ({'end_game_p': 0.12857623549669617}, 20.0)
DEBUG:root:gen:

KeyboardInterrupt: 