# Laboratorio 7

## Integrantes

### Sergio Orellana - 221122

### Andre Marroquin - 22266

### Rodrigo Mansilla - 22611

# Link del repositorio

https://github.com/mar22266/LABORATORIOS-IA.git

# Link del video


# Task 1

# Task 2

In [7]:
import math
import random
import copy
import numpy as np
import tensorflow as tf
from tensorflow.keras import models, layers, optimizers

# Game dimensions and the marker used for an empty cell
ROW_COUNT = 6
COLUMN_COUNT = 7
EMPTY = 0

# Q-learning hyperparameters (default values; fine_tuning() can overwrite all except EPSILON_MIN)
ALPHA = 0.001       # Learning rate passed to the Adam optimizer in build_model()
GAMMA = 0.95        # Discount factor applied to the best Q-value of the next state
EPSILON = 1.0       # Initial exploration rate of the epsilon-greedy policy
EPSILON_MIN = 0.1   # Threshold below which train_agent() stops decaying epsilon
EPSILON_DECAY = 0.995  # Multiplicative epsilon decay applied once per episode

# Model dimensions
INPUT_SIZE = ROW_COUNT * COLUMN_COUNT * 3  # 126: three one-hot slots per board cell
OUTPUT_SIZE = COLUMN_COUNT  # 7 actions, one per column

# ---------------- Modelo de Red Neuronal ---------------- #
def build_model():
    """Build and compile the dense network that approximates Q-values.

    The network takes a flattened one-hot board of length ``INPUT_SIZE`` and
    produces ``OUTPUT_SIZE`` linear outputs, one Q-value per column. It is
    compiled with mean-squared-error loss and an Adam optimizer whose
    learning rate is the value of ``ALPHA`` at call time.

    Returns:
        The compiled Keras ``Sequential`` model (freshly initialised weights).
    """
    model = models.Sequential([
        # An explicit Input layer replaces the ``input_dim`` keyword on the
        # first Dense layer, which recent Keras releases warn against.
        layers.Input(shape=(INPUT_SIZE,)),
        layers.Dense(128, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(OUTPUT_SIZE, activation='linear'),  # one Q-value per action
    ])
    model.compile(loss='mse', optimizer=optimizers.Adam(learning_rate=ALPHA))
    return model

q_model = build_model()  # Module-level model used by choose_action() and update_Q(); fine_tuning() may replace it

# ---------------- Funciones del juego ---------------- #
def create_board():
    """Return a new ROW_COUNT x COLUMN_COUNT grid with every cell set to EMPTY.

    Each row is a separate list, so writing to one row never affects another.
    """
    board = []
    for _ in range(ROW_COUNT):
        board.append([EMPTY] * COLUMN_COUNT)
    return board

def drop_piece(board, row, col, piece):
    """Store ``piece`` at position (``row``, ``col``) of ``board``, in place.

    No validation is performed; the caller picks the row and column.
    """
    target_row = board[row]
    target_row[col] = piece

def is_valid_location(board, col):
    """Tell whether column ``col`` can still accept a piece.

    A column is playable while its cell in row 0 is still EMPTY.
    """
    top_cell = board[0][col]
    return top_cell == EMPTY

def get_next_open_row(board, col):
    """Return the highest-index row of column ``col`` that is EMPTY.

    Rows are scanned from ``ROW_COUNT - 1`` down to 0. ``None`` is returned
    when no cell in the column is EMPTY.
    """
    rows_from_last = reversed(range(ROW_COUNT))
    return next((r for r in rows_from_last if board[r][col] == EMPTY), None)

def winning_move(board, piece):
    """Return True if ``piece`` occupies four aligned consecutive cells.

    Horizontal, vertical and both diagonal directions are checked over the
    ROW_COUNT x COLUMN_COUNT grid.
    """
    # Each entry: (starting rows, starting columns, row step, column step).
    scans = (
        (range(ROW_COUNT), range(COLUMN_COUNT - 3), 0, 1),       # horizontal
        (range(ROW_COUNT - 3), range(COLUMN_COUNT), 1, 0),       # vertical
        (range(3, ROW_COUNT), range(COLUMN_COUNT - 3), -1, 1),   # diagonal, row index decreasing
        (range(ROW_COUNT - 3), range(COLUMN_COUNT - 3), 1, 1),   # diagonal, row index increasing
    )
    for row_starts, col_starts, d_row, d_col in scans:
        for r in row_starts:
            for c in col_starts:
                if all(board[r + k * d_row][c + k * d_col] == piece for k in range(4)):
                    return True
    return False

def get_winning_positions(board, piece):
    """Return the four (row, col) cells of the first winning line of ``piece``.

    Lines are searched in this order: horizontal, vertical, diagonal with
    decreasing row index, diagonal with increasing row index. An empty list
    is returned when ``piece`` has no four-in-a-row.
    """
    n_row_starts = ROW_COUNT - 3
    n_col_starts = COLUMN_COUNT - 3
    # Each entry: (ordered starting cells, row step, column step). The start
    # order mirrors the nested loops of a plain scan so the first hit is the same.
    scans = (
        ([(r, c) for r in range(ROW_COUNT) for c in range(n_col_starts)], 0, 1),
        ([(r, c) for c in range(COLUMN_COUNT) for r in range(n_row_starts)], 1, 0),
        ([(r, c) for r in range(3, ROW_COUNT) for c in range(n_col_starts)], -1, 1),
        ([(r, c) for r in range(n_row_starts) for c in range(n_col_starts)], 1, 1),
    )
    for starts, d_row, d_col in scans:
        for r, c in starts:
            line = [(r + k * d_row, c + k * d_col) for k in range(4)]
            if all(board[y][x] == piece for y, x in line):
                return line
    return []

def get_valid_locations(board):
    """Return, in ascending order, the columns that can still take a piece."""
    return [col for col in range(COLUMN_COUNT) if is_valid_location(board, col)]

def is_terminal_node(board):
    """Return True when the game is over: either piece has won or no column is playable."""
    if winning_move(board, 1):
        return True
    if winning_move(board, 2):
        return True
    return not get_valid_locations(board)

# Representación del estado: one-hot
def get_state_one_hot(board):
    """Flatten ``board`` into a one-hot NumPy vector, three entries per cell.

    Cells are visited row by row. A cell holding 0, 1 or 2 contributes
    ``[1, 0, 0]``, ``[0, 1, 0]`` or ``[0, 0, 1]`` respectively; any other
    value raises ``KeyError``.
    """
    encoding = {0: (1, 0, 0), 1: (0, 1, 0), 2: (0, 0, 1)}
    flat = [bit for row in board for cell in row for bit in encoding[cell]]
    return np.array(flat)

def print_board(board, winning_positions=()):
    """Print ``board`` row by row, highlighting the winning cells in red.

    Args:
        board: Grid of cell values (a list of rows).
        winning_positions: Iterable of ``(row, col)`` tuples to highlight
            with ANSI red. Defaults to an empty, immutable tuple so no
            mutable object is shared between calls.

    Every cell is followed by a single space, and a blank line is printed
    after the board.
    """
    highlighted = winning_positions or ()
    for r, row in enumerate(board):
        cells = []
        for c, cell in enumerate(row):
            text = str(cell)
            if (r, c) in highlighted:
                # ANSI escape: switch to bright red, then reset.
                text = "\033[91m" + text + "\033[0m"
            cells.append(text + " ")
        print("".join(cells))
    print("")

# ---------------- Agente TD Learning con Modelo ---------------- #
def choose_action(state, board, epsilon):
    """Pick a column with an epsilon-greedy policy.

    With probability ``epsilon`` a random valid column is returned.
    Otherwise the valid column with the highest Q-value predicted by
    ``q_model`` for ``state`` is returned; ties go to the lowest column.
    """
    candidates = get_valid_locations(board)
    explore = np.random.rand() < epsilon
    if explore:
        return random.choice(candidates)
    predicted = q_model.predict(state.reshape(1, INPUT_SIZE), verbose=0)
    q_row = predicted[0]
    # Only valid columns compete, so a full column is never selected.
    return max(candidates, key=lambda col: q_row[col])

def update_Q(state, action, reward, next_state, done):
    """Run one Q-learning update of ``q_model`` for a single transition.

    The training target equals the current prediction for ``state`` except
    at index ``action``, which is set to ``reward`` when ``done`` is true
    and to ``reward + GAMMA * max(Q(next_state))`` otherwise. The model is
    then fitted for one epoch on that single sample.

    Args:
        state: One-hot state vector before the move (``INPUT_SIZE`` values).
        action: Column index that was played.
        reward: Reward obtained for the move.
        next_state: One-hot state vector after the move.
        done: Whether the move ended the episode.
    """
    state_input = state.reshape(1, INPUT_SIZE)
    target = q_model.predict(state_input, verbose=0).copy()
    if done:
        # Terminal transition: there is no future value, so the forward
        # pass on the next state is skipped entirely.
        target[0][action] = reward
    else:
        next_state_input = next_state.reshape(1, INPUT_SIZE)
        q_next = q_model.predict(next_state_input, verbose=0)
        target[0][action] = reward + GAMMA * np.max(q_next)
    q_model.fit(state_input, target, epochs=1, verbose=0)

# ---------------- Ciclo de Entrenamiento ---------------- #
def train_agent(episodes):
    """Train ``q_model`` through self-play for ``episodes`` games.

    Both players choose moves with the same epsilon-greedy policy and every
    move triggers one ``update_Q`` step. Rewards per move:
      * a winning move by piece 2 yields +100, by piece 1 yields -100;
      * a move that ends the game without a winner yields 0;
      * any other move yields -1.
    After each episode ``EPSILON`` is decayed by ``EPSILON_DECAY`` (never
    below ``EPSILON_MIN``) and a summary line is printed.

    Args:
        episodes: Number of games to play.
    """
    global EPSILON, current_board
    for ep in range(episodes):
        board = create_board()
        # The board is only mutated in place, so this single assignment
        # keeps the module-level reference current for the whole episode.
        current_board = board
        game_over = False
        turn = 0
        total_reward = 0
        while not game_over:
            state = get_state_one_hot(board)
            if not get_valid_locations(board):
                break
            # Self-play: both players share the same policy.
            action = choose_action(state, board, EPSILON)
            row = get_next_open_row(board, action)
            # Turn 0 plays piece 1, turn 1 plays piece 2.
            piece = 1 if turn == 0 else 2
            drop_piece(board, row, action, piece)
            if winning_move(board, piece):
                reward = 100 if piece == 2 else -100
                game_over = True
            elif is_terminal_node(board):
                reward = 0
                game_over = True
            else:
                reward = -1
            total_reward += reward
            next_state = get_state_one_hot(board)
            update_Q(state, action, reward, next_state, game_over)
            turn = (turn + 1) % 2
        if EPSILON > EPSILON_MIN:
            # Clamp so a decay step can never push epsilon under its floor.
            EPSILON = max(EPSILON_MIN, EPSILON * EPSILON_DECAY)
        print(f"Episode {ep+1}/{episodes} - Total Reward: {total_reward} - Epsilon: {EPSILON:.3f}")

# ---------------- Fine Tuning ---------------- #
def _prompt_float(prompt):
    """Prompt until the reply is blank (returns None) or parses as a float."""
    while True:
        reply = input(prompt).strip()
        if reply == "":
            return None
        try:
            return float(reply)
        except ValueError:
            print("Entrada inválida.")


def fine_tuning():
    """Interactively update ALPHA, GAMMA, EPSILON and EPSILON_DECAY.

    Each prompt shows the current value; a blank reply keeps it and a
    non-numeric reply is rejected and asked again instead of raising.
    Entering a new ALPHA rebuilds ``q_model`` through ``build_model()``,
    which replaces the existing model with a freshly initialised one.
    The resulting values are printed at the end.
    """
    global ALPHA, GAMMA, EPSILON, EPSILON_DECAY, q_model
    print("Ajuste de parámetros del algoritmo TD Learning")
    new_alpha = _prompt_float(f"Ingrese la tasa de aprendizaje (ALPHA) actual ({ALPHA}): ")
    if new_alpha is not None:
        ALPHA = new_alpha
        # The learning rate is fixed at compile time, so the model is rebuilt.
        q_model = build_model()
    new_gamma = _prompt_float(f"Ingrese el factor de descuento (GAMMA) actual ({GAMMA}): ")
    if new_gamma is not None:
        GAMMA = new_gamma
    new_epsilon = _prompt_float(f"Ingrese la tasa de exploración (EPSILON) actual ({EPSILON}): ")
    if new_epsilon is not None:
        EPSILON = new_epsilon
    new_decay = _prompt_float(f"Ingrese el factor de decaimiento (EPSILON_DECAY) actual ({EPSILON_DECAY}): ")
    if new_decay is not None:
        EPSILON_DECAY = new_decay
    print("Nuevos parámetros:")
    print(f"ALPHA = {ALPHA}, GAMMA = {GAMMA}, EPSILON = {EPSILON}, EPSILON_DECAY = {EPSILON_DECAY}")

# ---------------- Modo de Evaluación: Jugar contra el agente entrenado ---------------- #
def play_game():
    """Play an interactive game: the human is piece 1, the trained agent piece 2.

    The human moves first and types a column index; invalid or full columns
    are rejected and asked again. The agent plays greedily (epsilon 0). The
    board is printed after every move. The game ends when either side
    connects four (the winning cells are highlighted) or when the board is
    full, which is reported as a draw.
    """
    board = create_board()
    game_over = False
    turn = 0
    print_board(board)
    while not game_over:
        if turn == 0:
            piece = 1
            win_message = "¡Ganaste!"
            valid_cols = get_valid_locations(board)
            col = -1
            while col not in valid_cols:
                try:
                    col = int(input(f"Col (0-{COLUMN_COUNT-1}): "))
                except ValueError:
                    print("Entrada inválida.")
                    continue
                if col not in valid_cols:
                    print("No válido, intente otra vez.")
        else:
            piece = 2
            win_message = "IA gana."
            state = get_state_one_hot(board)
            # No exploration; choose_action only returns playable columns.
            col = choose_action(state, board, 0.0)
        row = get_next_open_row(board, col)
        drop_piece(board, row, col, piece)
        if winning_move(board, piece):
            print_board(board, get_winning_positions(board, piece))
            print(win_message)
            game_over = True
        else:
            print_board(board)
            if not get_valid_locations(board):
                # Full board without a winner: stop instead of prompting
                # forever for a column that cannot exist.
                print("Empate.")
                game_over = True
        turn = (turn + 1) % 2

# ---------------- Opciones de Ejecución ---------------- #
def ejecutar():
    """Ask which mode to run and dispatch to it.

    Accepted modes (case-insensitive, surrounding spaces ignored):
      * ``train`` asks for an episode count and calls ``train_agent``;
      * ``play`` calls ``play_game``;
      * ``tune`` calls ``fine_tuning``.
    Any other answer, or a non-integer episode count, prints a message and
    returns without doing anything else.
    """
    mode_input = input("Elija 'train' para entrenamiento, 'play' para jugar, o 'tune' para ajustar parámetros: ").strip().lower()
    if mode_input == "train":
        try:
            episodes = int(input("Ingrese el número de episodios para entrenar: "))
        except ValueError:
            # Report the bad count instead of aborting with a traceback.
            print("Entrada inválida.")
            return
        train_agent(episodes)
    elif mode_input == "play":
        play_game()
    elif mode_input == "tune":
        fine_tuning()
    else:
        print("Opción no válida.")

# Module-level reference to the board of the episode in progress; train_agent() assigns it.
# NOTE(review): choose_action() receives the board as an argument and does not read this variable.
current_board = None

# Start the interactive menu as soon as this cell/module is executed.
ejecutar()


Ajuste de parámetros del algoritmo TD Learning
Nuevos parámetros:
ALPHA = 0.005, GAMMA = 0.9, EPSILON = 1.0, EPSILON_DECAY = 0.995
