# Assignment

In [178]:
# Import 

import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
from degree_freedom_queen import *
from degree_freedom_king1 import *
from degree_freedom_king2 import *
from generate_game import *
from Chess_env import *
import torch.nn.functional as F
import random

size_board = 4

## The Environment

You can find the environment in the file Chess_env, which contains the class Chess_env. To define an object, you need to provide the board size considered as input. In our example, size_board=4. 
Chess_env is composed by the following methods:

1. Initialise_game. The method initialises an episode by placing the three pieces considered (Agent's king and queen, enemy's king) in the chess board. The outputs of the method are described below in order.

     S $\;$ A matrix representing the board locations filled with 4 numbers: 0, no piece in that position; 1, location of the 
     agent's king; 2 location of the queen; 3 location of the enemy king.
     
     X $\;$ The features, that is the input to the neural network. See the assignment for more information regarding the            definition of the features adopted. To personalise this, go into the Features method of the class Chess_env() and change        accordingly.
     
     allowed_a $\;$ The allowed actions that the agent can make. The agent is moving a king, with a total number of 8                possible actions, and a queen, with a total number of $(board_{size}-1)\times 8$ actions. The total number of possible actions correspond      to the sum of the two, but not all actions are allowed in a given position (movements to locations outside the borders or      against chess rules). Thus, the variable allowed_a is a vector that is one (zero) for an action that the agent can (can't)      make. Be careful, apply the policy considered on the actions that are allowed only.
     

2. OneStep. The method performs a one step update of the system. Given as input the action selected by the agent, it updates the chess board by performing that action and the response of the enemy king (which is a random allowed action in the settings considered). The first three outputs are the same as for the Initialise_game method, but the variables are computed for the position reached after the update of the system. The fourth and fifth outputs are:

     R $\;$ The reward. To change this, look at the OneStep method of the class where the rewards are set.
     
     Done $\;$ A variable that is 1 if the episode has ended (checkmate or draw).
     
     
3. Features. Given the chessboard position, the method computes the features.

This information and a quick analysis of the class should be all you need to get going. The other functions that the class exploits are uncommented and constitute an example on how not to write a python code. You can take a look at them if you want, but it is not necessary.






In [179]:
## INITIALISE THE ENVIRONMENT

env=Chess_Env(size_board)

In [180]:
## PRINT 5 STEPS OF AN EPISODE CONSIDERING A RANDOM AGENT

S,X,allowed_a=env.Initialise_game()                       # INTIALISE GAME

print(S)                                                  # PRINT CHESS BOARD (SEE THE DESCRIPTION ABOVE)

print('check? ',env.check)                                # PRINT VARIABLE THAT TELLS IF ENEMY KING IS IN CHECK (1) OR NOT (0)
print('dofk2 ',np.sum(env.dfk2_constrain).astype(int))    # PRINT THE NUMBER OF LOCATIONS THAT THE ENEMY KING CAN MOVE TO


for i in range(5):
    
    a,_=np.where(allowed_a==1)                  # FIND WHAT THE ALLOWED ACTIONS ARE

    a_agent=np.random.permutation(a)[0]         # MAKE A RANDOM ACTION


    S,X,allowed_a,R,Done=env.OneStep(a_agent)   # UPDATE THE ENVIRONMENT
    
    
    ## PRINT CHESS BOARD AND VARIABLES
    print('')
    print(S)
    print(R,'', Done)
    print('check? ',env.check)
    print('dofk2 ',np.sum(env.dfk2_constrain).astype(int))
    
    
    # TERMINATE THE EPISODE IF Done=True (DRAW OR CHECKMATE)
    if Done:
        break
        


[[3 0 0 0]
 [0 0 0 0]
 [0 1 0 0]
 [0 2 0 0]]
check?  0
dofk2  1

[[3 0 0 0]
 [0 0 0 0]
 [1 0 0 0]
 [0 2 0 0]]
0.0  1
check?  0
dofk2  0


In [181]:
# PERFORM N_episodes=1000 EPISODES MAKING RANDOM ACTIONS AND COMPUTE THE AVERAGE REWARD AND NUMBER OF MOVES 

S,X,allowed_a=env.Initialise_game()
N_episodes=1000

# VARIABLES WHERE TO SAVE THE FINAL REWARD IN AN EPISODE AND THE NUMBER OF MOVES 
R_save_random = np.zeros([N_episodes, 1])
N_moves_save_random = np.zeros([N_episodes, 1])

for n in range(N_episodes):
    
    S,X,allowed_a=env.Initialise_game()     # INITIALISE GAME
    Done=0                                  # SET Done=0 AT THE BEGINNING
    i=1                                     # COUNTER FOR THE NUMBER OF ACTIONS (MOVES) IN AN EPISODE
    
    # UNTIL THE EPISODE IS NOT OVER...(Done=0)
    while Done==0:
        
        # SAME AS THE CELL BEFORE, BUT SAVING THE RESULTS WHEN THE EPISODE TERMINATES 
        
        a,_=np.where(allowed_a==1)
        a_agent=np.random.permutation(a)[0]

        S,X,allowed_a,R,Done=env.OneStep(a_agent)
        
        
        if Done:
            
            R_save_random[n]=np.copy(R)
            N_moves_save_random[n]=np.copy(i)

            break

        i=i+1                               # UPDATE THE COUNTER



# AS YOU SEE, THE PERFORMANCE OF A RANDOM AGENT ARE NOT GREAT, SINCE THE MAJORITY OF THE POSITIONS END WITH A DRAW 
# (THE ENEMY KING IS NOT IN CHECK AND CAN'T MOVE)

print('Random_Agent, Average reward:',np.mean(R_save_random),'Number of steps: ',np.mean(N_moves_save_random))



Random_Agent, Average reward: 0.18 Number of steps:  7.215


In [189]:
# INITIALISE THE PARAMETERS OF YOUR NEURAL NETWORK AND...
# PLEASE CONSIDER TO USE A MASK OF ONE FOR THE ACTION MADE AND ZERO OTHERWISE IF YOU ARE NOT USING VANILLA GRADIENT DESCENT...
# WE SUGGEST A NETWORK WITH ONE HIDDEN LAYER WITH SIZE 200. 


S,X,allowed_a=env.Initialise_game()
N_a=np.shape(allowed_a)[0]   # TOTAL NUMBER OF POSSIBLE ACTIONS

N_in=np.shape(X)[0]    ## INPUT SIZE
N_h=200                ## NUMBER OF HIDDEN NODES


## INITALISE YOUR NEURAL NETWORK...
class QNetwork(nn.Module):
    """Actor (Policy) Model."""

    def __init__(self, N_in, N_h, N_a):
        """Initialize parameters and build model.
        Params
"""
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(N_in, N_h)
        self.fc2 = nn.Linear(N_h, N_a)


    def forward(self, state):
        """Build a network that maps state -> action values."""
        x = F.relu(self.fc1(state))
        return self.fc2(x)


# HYPERPARAMETERS SUGGESTED (FOR A GRID SIZE OF 4)

epsilon_0 = 0.2     # STARTING VALUE OF EPSILON FOR THE EPSILON-GREEDY POLICY
beta = 0.00005      # THE PARAMETER SETS HOW QUICKLY THE VALUE OF EPSILON IS DECAYING (SEE epsilon_f BELOW)
gamma = 0.85        # THE DISCOUNT FACTOR
eta = 0.0035        # THE LEARNING RATE
update_every =200

N_episodes = 100000 # THE NUMBER OF GAMES TO BE PLAYED 

# SAVING VARIABLES
R_save = np.zeros([N_episodes, 1])
N_moves_save = np.zeros([N_episodes, 1])


In [190]:
policy_n = QNetwork(N_in, N_h, N_a).float()
target_n = QNetwork(N_in, N_h, N_a).float()
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(policy_n.parameters(), lr=eta)


In [191]:
# TRAINING LOOP BONE STRUCTURE...
# I WROTE FOR YOU A RANDOM AGENT (THE RANDOM AGENT WILL BE SLOWER TO GIVE CHECKMATE THAN AN OPTIMISED ONE, 
# SO DON'T GET CONCERNED BY THE TIME IT TAKES), CHANGE WITH YOURS ...
tau = 1
rewards = []
for n in range(N_episodes):
    memory = {"state": [], "action": [], "reward": [], "next_state":[], "done":[], "q_expected": []}
    

    epsilon_f = epsilon_0 / (1 + beta * n)   ## DECAYING EPSILON
    Done=0                                   ## SET DONE TO ZERO (BEGINNING OF THE EPISODE)
    i = 1                                    ## COUNTER FOR NUMBER OF ACTIONS
    
    
    S,X,allowed_a=env.Initialise_game()      ## INITIALISE GAME
    #print(n)                                 ## REMOVE THIS OF COURSE, WE USED THIS TO CHECK THAT IT WAS RUNNING
    state = X
    print(f"State: {S}")

    
    
    
    while Done==0:                           ## START THE EPISODE
        
        
        ## THIS IS A RANDOM AGENT, CHANGE IT...
        
        # get indexes from allowed actions
        a,_=np.where(allowed_a==1)

        random_int = random.randint(0,1)
        if epsilon_f > random_int:
            # exploration => perform a random move
           # print("rand move")
            a_agent_idx=np.random.permutation(a)[0]
        else:
            # exploitation
            # print("exploitation")
            policy_n.eval()
            X = torch.from_numpy(X)
            with torch.no_grad():
                a_agent = policy_n(X.float())
                print(a_agent)
            a_agent = a_agent.numpy()
            a_agent = [a_agent[i] for i in a]
            a_agent_idx = np.argmax(a_agent)
            a_agent_idx = a[a_agent_idx]
            policy_n.train()
                
        
        action = int(a_agent_idx)
        
        print(f"ACTION:{action}")

        S_next,X_next,allowed_a_next,R,Done=env.OneStep(action)
        
        memory["state"].extend([state])
        memory["action"].extend([action])
        memory["reward"].extend([R])
        print(f"Check: {env.check}")
        if not Done:
            memory["next_state"].extend([X_next])
            memory["done"].extend([Done])
            memory["q_expected"].append(max(a_agent))

        
        
        ## THE EPISODE HAS ENDED, UPDATE...BE CAREFUL, THIS IS THE LAST STEP OF THE EPISODE
        if Done==1 and i > 1:

            Q_targets_next = target_n(torch.Tensor(memory["next_state"])).max(1)[0].unsqueeze(1)[:,0]
            print(f"Q_targets_next:{Q_targets_next}")
            
            print(f'REWARDS: {memory["reward"]}')
            Q_targets = torch.Tensor(memory["reward"][1:]) + (gamma * Q_targets_next)
                                       
            
            Q_expected = torch.Tensor(memory["q_expected"])
            print(f"q_targets: {Q_targets}")    
            print(f"q_Expected: {Q_expected}")
            
            loss = F.mse_loss(Q_expected, Q_targets)
                                
            # Minimize the loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
                                       
            for target_param, policy_param in zip(target_n.parameters(), policy_n.parameters()):
                target_param.data.copy_(tau*policy_param.data + (1.0-tau)*target_param.data)
            print(loss)
            R_save[n] = np.copy(R)
            N_moves_save[n] = np.copy(i)
                                       
            break
        
        
        
        
        # IF THE EPISODE IS NOT OVER...
        else:
            
            ## ONLY TO PUT SUMETHING
            PIPPO=1
            
            
        # NEXT STATE AND CO. BECOME ACTUAL STATE...     
        S=np.copy(S_next)
        X=np.copy(X_next)
        allowed_a=np.copy(allowed_a_next)
        
        i += 1  # UPDATE COUNTER FOR NUMBER OF ACTIONS
    
print('Random_Agent, Average reward:',np.mean(R_save),'Number of steps: ',np.mean(N_moves_save)
        
        
    
    

SyntaxError: unexpected EOF while parsing (Temp/ipykernel_7256/2901104805.py, line 118)