Copyright **`(c)`** 2022 Giovanni Squillero `<squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  


# Lab 1: Set Covering

First lab + peer review. List this activity in your final report, it will be part of your exam.

## Task

Given a number $N$ and some lists of integers $P = (L_0, L_1, L_2, ..., L_n)$, 
determine, if possible, a selection of those lists $S = (L_{s_0}, L_{s_1}, L_{s_2}, ..., L_{s_k})$
such that each number between $0$ and $N-1$ appears in at least one list

$$\forall x \in [0, N-1] \ \exists i : x \in L_{s_i}$$

and that the total number of elements in all $L_{s_i}$ is minimum. 

## Instructions

* Create the directory `lab1` inside the course repo (the one you registered with Andrea)
* Put a `README.md` and your solution (all the files, code and auxiliary data if needed) inside it
* Use `problem` to generate the problems with different $N$
* In the `README.md`, report the total number of elements in $L_{s_i}$ for the problems with $N \in [5, 10, 20, 100, 500, 1000]$ and the total number of nodes visited during the search. Use `seed=42`.
* Use `GitHub Issues` to peer review others' lab

## Notes

* Working in groups is not only allowed, but recommended (see: [Ubuntu](https://en.wikipedia.org/wiki/Ubuntu_philosophy) and [Cooperative Learning](https://files.eric.ed.gov/fulltext/EJ1096789.pdf)). Collaborations must be explicitly declared in the `README.md`.
* [Yanking](https://www.emacswiki.org/emacs/KillingAndYanking) from the internet is allowed, but sources must be explicitly declared in the `README.md`.

**Deadline**

* Sunday, October 16th 23:59:59 for the working solution
* Sunday, October 23rd 23:59:59 for the peer reviews

# Code

In [1]:
import logging
import random
from copy import copy

In [2]:
import random
import platform
from collections import Counter

In [3]:
from gx_utils import *

## Problem instances generator

In [4]:
def problem(N, seed=None):
    """Build a random set-covering instance over the numbers ``0 .. N-1``.

    The module-level ``random`` generator is re-seeded with ``seed`` first, so
    a fixed ``(N, seed)`` pair always reproduces the same instance.

    The instance holds between ``N`` and ``5 * N`` lists (both inclusive).
    Each list is made from between ``N // 5`` and ``N // 2`` random draws in
    ``0 .. N-1``; repeated draws are collapsed, so a list never contains the
    same number twice.

    Returns a list of lists of integers.
    """

    random.seed(seed)
    # The number of lists is drawn before any list content, and each list draws
    # its length before its members: this order fixes the random sequence.
    how_many_lists = random.randint(N, N * 5)
    instance = []
    for _ in range(how_many_lists):
        draws = random.randint(N // 5, N // 2)
        members = {random.randint(0, N - 1) for _ in range(draws)}
        instance.append(list(members))
    return instance

## Greedy

A simplistic greedy algorithm. *Bloat* is the relative excess of the solution's total size $w$ over $N$, the smallest size any cover could have: $(w - N) / N$.

In [5]:
def greedy(N, all_lists):
    """Greedy set cover: scan the lists shortest-first until ``0 .. N-1`` is covered.

    The lists are visited in order of increasing length (ties keep their
    input order). A list is taken only if it contributes at least one number
    of ``0 .. N-1`` that is not covered yet, so empty lists and lists that
    merely repeat what is already covered are never selected.

    Args:
        N: size of the universe; the goal is to cover every integer in
            ``range(N)``.
        all_lists: iterable of lists of integers. It is not modified.

    Returns:
        The selected lists, in the order they were picked.

    Raises:
        ValueError: if the lists taken together do not cover ``range(N)``.
    """

    goal = set(range(N))
    covered = set()
    solution = []
    for candidate in sorted(all_lists, key=len):
        if covered == goal:
            break
        # Only numbers inside the universe that are still missing count as gain.
        gain = goal.intersection(candidate) - covered
        if gain:
            solution.append(candidate)
            covered |= gain
    if covered != goal:
        raise ValueError(f"the given lists cannot cover all numbers in range({N})")
    logging.debug("%s", solution)
    return solution

In [6]:
# logging.info(f" Benchmarking on {platform.platform()}")
# tmp = problem(1_000, seed=42)
#%timeit greedy(1_000, tmp)

In [7]:
# Show INFO messages, then report the greedy result for each benchmark size.
logging.getLogger().setLevel(logging.INFO)

for N in (5, 10, 20, 100, 500, 1000):
    solution = greedy(N, problem(N, seed=42))
    # w is the total number of elements in the chosen lists; bloat is its
    # relative excess over N, as a percentage.
    logging.info(
        f" Greedy solution for N={N:,}: "
        f"w={sum(map(len, solution)):,} "
        f"(bloat={(sum(map(len, solution)) - N) / N * 100:.0f}%)"
    )

INFO:root: Greedy solution for N=5: w=5 (bloat=0%)
INFO:root: Greedy solution for N=10: w=13 (bloat=30%)
INFO:root: Greedy solution for N=20: w=46 (bloat=130%)
INFO:root: Greedy solution for N=100: w=332 (bloat=232%)
INFO:root: Greedy solution for N=500: w=2,162 (bloat=332%)
INFO:root: Greedy solution for N=1,000: w=4,652 (bloat=365%)


## Dijkstra's

In [8]:
def dijkstra(N, all_lists):
    """Priority-driven search for a cover of ``0 .. N-1`` (vanilla Dijkstra's).

    A state is a tuple of the lists chosen so far. Starting from the empty
    tuple, the current state is expanded by appending every list that still
    adds at least one uncovered number; each successor is pushed on the
    frontier with priority ``w(successor)`` and the next state is popped.

    Args:
        N: size of the universe; the goal is to cover every integer in
            ``range(N)``.
        all_lists: iterable of lists of integers. Duplicate lists are
            collapsed before searching.

    Returns:
        The first popped state (a tuple of tuples) whose union equals
        ``set(range(N))``, or ``None`` if the frontier yields ``None`` first.
    """

    GOAL = set(range(N))
    # Make the lists hashable and drop duplicates.
    all_lists = tuple(set(tuple(_) for _ in all_lists))
    # NOTE(review): PriorityQueue comes from gx_utils; presumably pop() returns
    # the entry with the smallest priority -- confirm against that module.
    frontier = PriorityQueue()
    nodes = 0

    def state_to_set(state):
        """Return the set of all numbers covered by the lists in ``state``."""
        return set().union(*state)

    def goal_test(state):
        """Tell whether ``state`` covers exactly the goal set."""
        return state_to_set(state) == GOAL

    def possible_steps(state):
        """Return the lists that would add at least one new number to ``state``."""
        current = state_to_set(state)
        return [l for l in all_lists if not set(l) <= current]

    def w(state):
        """Return the priority pair of ``state``.

        First item: surplus occurrences, i.e. for every number the count
        beyond its first appearance. Second item: minus the amount of
        numbers that appear exactly once.
        """
        cnt = Counter(item for chosen in state for item in chosen)
        surplus = sum(k - 1 for k in cnt.values() if k > 1)
        singles = sum(k == 1 for k in cnt.values())
        return surplus, -singles

    state = tuple()
    while state is not None and not goal_test(state):
        nodes += 1
        for s in possible_steps(state):
            successor = (*state, s)
            frontier.push(successor, p=w(successor))
        state = frontier.pop()

    # NOTE(review): the loop condition suggests pop() may return None when the
    # frontier is empty (TODO confirm). The summary line below would fail on
    # None, so that case is reported separately.
    if state is None:
        logging.debug(f"dijkstra: no solution found; nodes={nodes:,}")
        return None

    logging.debug(f"dijkstra: SOLVED! nodes={nodes:,}; w={sum(len(_) for _ in state):,}; iw={w(state)})")
    return state

In [9]:
# Show DEBUG messages too, so the search's own summary line is visible.
logging.getLogger().setLevel(logging.DEBUG)

for N in (5, 10, 20):
    solution = dijkstra(N, problem(N, seed=42))
    # w is the total number of elements in the chosen lists; bloat is its
    # relative excess over N, as a percentage.
    logging.info(
        f" Solution for N={N:,}: "
        f"w={sum(map(len, solution)):,} "
        f"(bloat={(sum(map(len, solution)) - N) / N * 100:.0f}%)"
    )

DEBUG:root:dijkstra: SOLVED! nodes=3; w=5; iw=(0, -5))
INFO:root: Solution for N=5: w=5 (bloat=0%)
DEBUG:root:dijkstra: SOLVED! nodes=3; w=10; iw=(0, -10))
INFO:root: Solution for N=10: w=10 (bloat=0%)
DEBUG:root:dijkstra: SOLVED! nodes=14,095; w=23; iw=(3, -17))
INFO:root: Solution for N=20: w=23 (bloat=15%)


## Hill Climbing

In [10]:
def hc(N, all_lists):
    """Hill climber over subsets of ``all_lists`` (vanilla hill climbing).

    The search starts from the empty selection and repeatedly tries a random
    tweak, keeping it only when it scores strictly better. It stops after
    10,000 consecutive tweaks without improvement.

    Args:
        N: not used by the climber. The score rewards the amount of distinct
            numbers covered; it never checks coverage of ``range(N)``.
        all_lists: iterable of lists of integers. Duplicate lists are
            collapsed.

    Returns:
        The best selection found, as a set of tuples.
    """
    all_lists = set(tuple(_) for _ in all_lists)

    def evaluate(state):
        """Score a selection as ``(distinct numbers covered, -total elements)``.

        Pairs compare lexicographically: more coverage wins, then fewer
        elements overall.
        """
        cnt = Counter(item for chosen in state for item in chosen)
        return len(cnt), -sum(cnt.values())

    def tweak(solution):
        """Return a randomly modified copy of ``solution``.

        While a coin flip (p=0.7) succeeds, one randomly chosen list is
        removed; then, while another coin flip succeeds, one random list that
        was not in the incoming ``solution`` is added.
        """
        new_solution = set(solution)
        while new_solution and random.random() < 0.7:
            r = random.choice(list(new_solution))
            new_solution.remove(r)
        # The pool does not change while adding, so build it once. When it is
        # empty there is nothing to add; random.choice would raise on it.
        addable = list(all_lists - solution)
        while addable and random.random() < 0.7:
            new_solution.add(random.choice(addable))
        return new_solution

    current_solution = set()
    # Cache the score of the incumbent instead of recomputing it every step.
    current_score = evaluate(current_solution)
    useless_steps = 0
    while useless_steps < 10_000:
        useless_steps += 1
        candidate_solution = tweak(current_solution)
        candidate_score = evaluate(candidate_solution)
        if candidate_score > current_score:
            useless_steps = 0
            current_solution = candidate_solution
            current_score = candidate_score
            logging.debug(f"New solution: {current_score}")
    return current_solution

In [11]:
# Back to INFO verbosity, then report the hill climber on each benchmark size.
logging.getLogger().setLevel(logging.INFO)

for N in (5, 10, 20, 100, 500, 1000):
    solution = hc(N, problem(N, seed=42))
    # w is the total number of elements in the chosen lists; bloat is its
    # relative excess over N, as a percentage.
    logging.info(
        f" Solution for N={N:,}: "
        f"w={sum(map(len, solution)):,} "
        f"(bloat={(sum(map(len, solution)) - N) / N * 100:.0f}%)"
    )

INFO:root: Solution for N=5: w=5 (bloat=0%)
INFO:root: Solution for N=10: w=11 (bloat=10%)
INFO:root: Solution for N=20: w=24 (bloat=20%)
INFO:root: Solution for N=100: w=214 (bloat=114%)
INFO:root: Solution for N=500: w=1,504 (bloat=201%)
INFO:root: Solution for N=1,000: w=3,383 (bloat=238%)
