<a href="https://colab.research.google.com/github/neuromatch/NeuroAI_Course/blob/main/tutorials/W2D5_Mysteries/student/W2D5_Tutorial3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a> &nbsp; <a href="https://kaggle.com/kernels/welcome?src=https://raw.githubusercontent.com/neuromatch/NeuroAI_Course/main/tutorials/W2D5_Mysteries/student/W2D5_Tutorial3.ipynb"  target="_parent"><img src="https://kaggle.com/static/images/open-in-kaggle.svg" alt="Open in Kaggle"/></a>

# (Bonus) Tutorial 3: Consciousness (Extended)

**Week 2, Day 5: Mysteries**

**By Neuromatch Academy**

__Content creators:__ Steve Fleming, Guillaume Dumas, Samuele Bolotta, Juan David Vargas, Hakwan Lau, Anil Seth, Megan Peters

__Content reviewers:__ Samuele Bolotta, Lily Chamakura, RyeongKyung Yoon, Yizhou Chen, Ruiyi Zhang, Patrick Mineault, Alex Murphy

__Production editors:__ Konstantine Tsafatinos, Ella Batty, Spiros Chavlis, Samuele Bolotta, Hlib Solodzhuk, Patrick Mineault, Alex Murphy


In [None]:
# @title Install and import feedback gadget

!pip install vibecheck numpy matplotlib Pillow torch torchvision transformers ipywidgets gradio trdg scikit-learn networkx pickleshare seaborn tabulate --quiet

from vibecheck import DatatopsContentReviewContainer
def content_review(notebook_section: str):
    """
    Render the feedback widget for one section of the notebook.

    Parameters:
    - notebook_section (str): identifier of the section the feedback refers to;
      it is passed unchanged to `DatatopsContentReviewContainer`.

    Returns:
    - Whatever `DatatopsContentReviewContainer(...).render()` returns.
    """
    # Endpoint configuration handed to the feedback container
    datatops_settings = {
        "url": "https://pmyvdlilci.execute-api.us-east-1.amazonaws.com/klab",
        "name": "neuromatch_neuroai",
        "user_key": "wb2cxze8",
    }
    text_prompt = ""  # No text prompt
    container = DatatopsContentReviewContainer(
        text_prompt, notebook_section, datatops_settings
    )
    return container.render()

feedback_prefix = "W2D5_T3"

In [None]:
# @title Import dependencies
# @markdown

import contextlib
import io

with contextlib.redirect_stdout(io.StringIO()):
    # Standard Libraries
    import copy
    import logging
    import os
    import random
    import requests

    # Data Handling and Visualization Libraries
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.metrics import precision_score, recall_score, fbeta_score
    from sklearn.linear_model import LinearRegression
    from tabulate import tabulate

    # Scientific Computing and Statistical Libraries
    from numpy.linalg import inv
    from scipy.special import logsumexp
    from scipy.stats import multivariate_normal

    # Deep Learning Libraries
    import torch
    from torch import nn, optim, save, load
    from torch.nn import functional as F
    from torch.utils.data import DataLoader
    import torch.nn.init as init
    from torch.optim.lr_scheduler import StepLR

    # Image Processing Libraries
    from PIL import Image
    from matplotlib.patches import Patch
    from mpl_toolkits.mplot3d import Axes3D

    # Interactive Elements and Web Applications
    from IPython.display import IFrame
    from IPython.display import Image as IMG
    import gradio as gr
    import ipywidgets as widgets
    from ipywidgets import interact, IntSlider

    # Graph Analysis Libraries
    import networkx as nx

    # Progress Monitoring Libraries
    from tqdm import tqdm

    # Utilities and Miscellaneous Libraries
    from itertools import product

    import math
    !pip install torch_optimizer
    import torch_optimizer as optim2

In [None]:
# @title Figure settings
# @markdown

# Disable the logger used by matplotlib's font manager
logging.getLogger('matplotlib.font_manager').disabled = True

%matplotlib inline
%config InlineBackend.figure_format = 'retina' # perform high-definition rendering for images and plots
# Apply the plotting style sheet fetched from the URL below
plt.style.use("https://raw.githubusercontent.com/NeuromatchAcademy/course-content/main/nma.mplstyle")

In [None]:
# @title Set device (GPU or CPU)

def set_device():
    """
    Choose the device PyTorch should run on and report the choice.

    The function asks `torch.cuda.is_available()` and prints a short hint on how
    to change the hardware accelerator from the `Runtime` menu.

    Outputs:
    - device (str): 'cuda' when a CUDA-capable GPU is available, otherwise 'cpu'.
      The string can be passed directly to PyTorch (e.g. `tensor.to(device)`).
    """

    if torch.cuda.is_available():
        device = "cuda"
        print("GPU is enabled in this notebook. \n"
              "If you want to disable it, in the menu under `Runtime` -> \n"
              "`Hardware accelerator.` and select `None` from the dropdown menu")
    else:
        device = "cpu"
        print("GPU is not enabled in this notebook. \n"
              "If you want to enable it, in the menu under `Runtime` -> \n"
              "`Hardware accelerator.` and select `GPU` from the dropdown menu")

    return device

device = set_device()

In [None]:
# @title Helper functions

# Reconstruction criterion of the CAE loss: binary cross-entropy summed over all
# elements. The name `mse_loss` is historical - the criterion is BCE, not MSE.
# `reduction='sum'` is the non-deprecated spelling of `size_average=False`.
mse_loss = nn.BCELoss(reduction='sum')

# Default weight of the contractive (Jacobian) penalty
lam = 1e-4

from torch.autograd import Variable

def CAE_loss(W, x, recons_x, h, lam):
    """Compute the Contractive AutoEncoder Loss

    Evaluates the CAE loss, which is the sum of a reconstruction term (the
    summed binary cross-entropy stored in `mse_loss`) and the weighted squared
    l2-norm of the Jacobian of the hidden units with respect to the inputs.

    See reference below for an in-depth discussion:
      #1: http://wiseodd.github.io/techblog/2016/12/05/contractive-autoencoder

    Args:
        `W` (Tensor): (N_hidden x N), where N_hidden and N are the
          dimensions of the hidden units and input respectively. It is used
          as a constant: no gradient flows into `W` through the penalty.
        `x` (Tensor): the reconstruction target, with dims (N_batch x N)
        `recons_x` (Tensor): the reconstruction of the input, with dims
          N_batch x N and values in [0, 1] (required by BCE).
        `h` (Tensor): the hidden units of the network, with dims
          N_batch x N_hidden
        `lam` (float): the weight given to the jacobian regulariser term

    Returns:
        Tensor: the CAE loss, as a tensor holding a single element (shape (1,))
    """
    reconstruction = mse_loss(recons_x, x)

    # W is N_hidden x N, so it does not need to be transposed (as opposed to #1).
    # h * (1 - h) is the derivative of a sigmoid unit.
    # NOTE(review): confirm this matches the activation that produced `h`.
    dh = h * (1 - h)  # Hadamard product, size N_batch x N_hidden

    # Sum over the input dimension first to improve efficiency, as suggested
    # in #1. `detach()` keeps W a constant, as the legacy `Variable(W)` wrapper
    # it replaces did. keepdim gives shape N_hidden x 1 for the matmul.
    w_sum = torch.sum(W.detach() ** 2, dim=1, keepdim=True)

    contractive_loss = torch.sum(torch.mm(dh ** 2, w_sum), 0)

    # Out-of-place scaling: avoids mutating a tensor that is part of the graph
    return reconstruction + lam * contractive_loss

class FirstOrderNetwork(nn.Module):
    """First-order autoencoder mapping 100 inputs to a hidden code and back to 100 outputs."""

    def __init__(self, hidden_units, data_factor, use_gelu):
        """
        Builds the layers of the first-order network and initializes their weights.

        Parameters:
        - hidden_units (int): The number of units in the hidden layer.
        - data_factor (int): Stored on the instance as `self.data_factor`;
                             it is not used by the layers themselves.
        - use_gelu (bool): Accepted for interface compatibility. The network
                           always uses ReLU in the encoder; the flag is not read.
        """
        super().__init__()

        # Bias-free linear maps. `hidden` is created and initialized but is not
        # used by `encoder`, `decoder` or `forward`.
        self.fc1 = nn.Linear(100, hidden_units, bias=False)  # Encoder
        self.hidden = nn.Linear(hidden_units, hidden_units, bias=False)  # Hidden
        self.fc2 = nn.Linear(hidden_units, 100, bias=False)  # Decoder

        # Activations
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

        # Dropout applied to the hidden code (p = 0.1)
        self.dropout = nn.Dropout(0.1)

        self.data_factor = data_factor

        # Kept as an attribute; not used inside this class
        self.softmax = nn.Softmax()

        self.initialize_weights()

    def initialize_weights(self):
        """Draws the encoder, decoder and hidden weights uniformly from [-1, 1] (in that order)."""
        for layer in (self.fc1, self.fc2, self.hidden):
            init.uniform_(layer.weight, -1.0, 1.0)

    def encoder(self, x):
        """Flattens `x` to rows of 100 values and returns dropout(relu(fc1(x)))."""
        flat_input = x.view(-1, 100)
        return self.dropout(self.relu(self.fc1(flat_input)))

    def decoder(self, z):
        """Maps a hidden code `z` back to 100 outputs through fc2 and a sigmoid."""
        return self.sigmoid(self.fc2(z))

    def forward(self, x):
        """
        Defines the forward pass through the network.

        Parameters:
        - x (Tensor): The input tensor to the network (viewed as rows of 100 values).

        Returns:
        - tuple(Tensor, Tensor): the hidden code produced by `encoder` and the
          reconstruction produced by `decoder`, in that order.
        """
        hidden_code = self.encoder(x)
        reconstruction = self.decoder(hidden_code)
        return hidden_code, reconstruction

class SecondOrderNetwork(nn.Module):
    """Second-order network that turns the first-order input/output mismatch into a wager."""

    def __init__(self, use_gelu):
        """
        Builds the comparison and wager layers and initializes their weights.

        Parameters:
        - use_gelu (bool): Accepted for interface compatibility. The activation
                           is always `torch.relu`; the flag is not read.
        """
        super().__init__()

        # Linear layer applied to the difference between first-order input and output
        self.comparison_layer = nn.Linear(100, 100)

        # Linear layer mapping the 100 comparison features to a single wager value
        self.wager = nn.Linear(100, 1)

        # Dropout (p = 0.5) applied to the comparison features
        self.dropout = nn.Dropout(0.5)

        # Activation after the comparison layer, and squashing for the wager
        self.activation = torch.relu
        self.sigmoid = torch.sigmoid

        # Kept as an attribute; not used inside this class
        self.softmax = nn.Softmax()

        self._init_weights()

    def _init_weights(self):
        """Uniform init: comparison weights in [-1, 1], wager weights in [0, 0.1]; biases keep their defaults."""
        init.uniform_(self.comparison_layer.weight, -1.0, 1.0)
        init.uniform_(self.wager.weight, 0.0, 0.1)

    def forward(self, first_order_input, first_order_output):
        """
        Computes the wager for each pattern.

        Parameters:
        - first_order_input (Tensor): input that was given to the first-order network.
        - first_order_output (Tensor): reconstruction returned by the first-order network.

        Returns:
        - Tensor: sigmoid(wager(dropout(relu(comparison_layer(input - output))))).
          Dropout is applied to the comparison features only, not to the wager.
        """
        mismatch = first_order_input - first_order_output

        compared = self.activation(self.comparison_layer(mismatch))
        compared = self.dropout(compared)

        return self.sigmoid(self.wager(compared))

def initialize_global():
    """
    (Re)creates the module-level configuration and bookkeeping variables.

    All names in the `global` statements below are declared global; every one
    except `testing_graph_names` is assigned here. The module-level `device`
    must already exist, because `patterns_matrix1` is allocated on it.
    """
    global optimizer, n_epochs, learning_rate_1
    global Input_Size_1, Hidden_Size_1, Output_Size_1, Input_Size_2
    global num_units, patterns_number
    global learning_rate_2, momentum, temperature, Threshold
    global First_set, Second_set, Third_set
    global First_set_targets, Second_set_targets, Third_set_targets
    global epoch_list, epoch_1_order, epoch_2_order, patterns_matrix1
    global testing_graph_names

    # First-order training schedule and optimizer choice
    learning_rate_1, n_epochs, optimizer = 0.5, 100, "ADAMAX"

    # Network sizes
    Input_Size_1, Hidden_Size_1, Output_Size_1, Input_Size_2 = 100, 60, 100, 100

    # Patterns
    num_units, patterns_number = 100, 200

    # Pre-training and hyperparameters
    learning_rate_2, momentum, temperature, Threshold = 0.1, 0.9, 1.0, 0.5

    # Testing: one independent empty list per set
    First_set, Second_set, Third_set = [], [], []
    First_set_targets, Second_set_targets, Third_set_targets = [], [], []

    # Bookkeeping for the pre-training graphs
    epoch_list = list(range(1, n_epochs + 1))
    epoch_1_order, epoch_2_order = np.zeros(n_epochs), np.zeros(n_epochs)
    # Zero tensor of shape (n_epochs, patterns_number) allocated on `device`
    patterns_matrix1 = torch.zeros((n_epochs, patterns_number), device=device)

def compute_metrics(TP, TN, FP, FN):
    """
    Compute precision, recall, F1 score, and accuracy from confusion-matrix counts.

    Parameters:
    - TP, TN, FP, FN (int): true-positive, true-negative, false-positive and
      false-negative counts.

    Returns:
    - tuple: (precision, recall, f1_score, accuracy), each rounded to two
      decimals. A metric whose denominator is zero is reported as 0.
    """
    predicted_positive = TP + FP
    actual_positive = TP + FN
    total = TP + TN + FP + FN
    # F1 = 2TP / (2TP + FP + FN). Working from the raw counts avoids compounding
    # the rounding error of the already-rounded precision and recall.
    f1_denominator = 2 * TP + FP + FN

    precision = round(TP / predicted_positive, 2) if predicted_positive > 0 else 0
    recall = round(TP / actual_positive, 2) if actual_positive > 0 else 0
    f1_score = round(2 * TP / f1_denominator, 2) if f1_denominator > 0 else 0
    accuracy = round((TP + TN) / total, 2) if total > 0 else 0
    return precision, recall, f1_score, accuracy

# define the architecture, optimizers, loss functions, and schedulers for pre training
def prepare_pre_training(hidden,factor,gelu,stepsize, gam):
    """
    Builds the networks, loss functions, optimizers and schedulers for pre-training.

    Reads the module-level globals `optimizer` (name of the optimizer to use),
    `learning_rate_1`, `learning_rate_2` and `device`.

    Parameters:
    - hidden (int): number of hidden units of the first-order network.
    - factor: data factor forwarded to `FirstOrderNetwork`.
    - gelu (bool): flag forwarded to both network constructors.
    - stepsize (int): `step_size` of both StepLR schedulers.
    - gam (float): `gamma` of both StepLR schedulers.

    Returns:
    - tuple: (first_order_network, second_order_network, criterion_1,
      criterion_2, optimizer_1, optimizer_2, scheduler_1, scheduler_2)

    Raises:
    - ValueError: if the global `optimizer` is not one of the supported names.
    """
    # Resolved lazily so that `optim2` is only touched when SWATS is requested
    optimizer_classes = {
        "ADAM": lambda: optim.Adam,
        "SGD": lambda: optim.SGD,
        "SWATS": lambda: optim2.SWATS,
        "ADAMW": lambda: optim.AdamW,
        "RMS": lambda: optim.RMSprop,
        "ADAMAX": lambda: optim.Adamax,
    }
    if optimizer not in optimizer_classes:
        # Previously an unknown name fell through every branch and crashed later
        # with an UnboundLocalError when the schedulers were created.
        raise ValueError(
            f"Unknown optimizer {optimizer!r}; expected one of {sorted(optimizer_classes)}"
        )

    first_order_network = FirstOrderNetwork(hidden, factor, gelu).to(device)
    second_order_network = SecondOrderNetwork(gelu).to(device)

    criterion_1 = CAE_loss
    # `reduction='sum'` is the non-deprecated spelling of `size_average=False`
    criterion_2 = nn.BCELoss(reduction='sum')

    optimizer_class = optimizer_classes[optimizer]()
    optimizer_1 = optimizer_class(first_order_network.parameters(), lr=learning_rate_1)
    optimizer_2 = optimizer_class(second_order_network.parameters(), lr=learning_rate_2)

    # Learning rate schedulers
    scheduler_1 = StepLR(optimizer_1, step_size=stepsize, gamma=gam)
    scheduler_2 = StepLR(optimizer_2, step_size=stepsize, gamma=gam)

    return first_order_network, second_order_network, criterion_1 , criterion_2, optimizer_1, optimizer_2, scheduler_1, scheduler_2

def title(string):
    """
    Draws `string` as a title card: white text centered in a semi-transparent
    blue rectangle, rendered with matplotlib's XKCD style on an axis-free figure.

    Parameters:
    - string (str): the text to display.
    """
    # The import cell only brings in `Patch`, so the `patches` module used for
    # the rectangle is imported here; without it the call raises a NameError.
    from matplotlib import patches

    # Enable XKCD plot styling
    with plt.xkcd():
        # Create a figure and an axes.
        fig, ax = plt.subplots()

        # Create a rectangle patch with specified dimensions and styles
        rectangle = patches.Rectangle((0.05, 0.1), 0.9, 0.4, linewidth=1, edgecolor='r', facecolor='blue', alpha=0.5)
        ax.add_patch(rectangle)

        # Place text inside the rectangle, centered
        plt.text(0.5, 0.3, string, horizontalalignment='center', verticalalignment='center', fontsize=26, color='white')

        # Set plot limits
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)

        # Disable axis display
        ax.axis('off')

        # Display the plot
        plt.show()

        # Close the figure to free up memory
        plt.close(fig)

# Function to generate the three sets of testing patterns (one per stimulus condition)
def get_test_patterns(factor):
    """
    Generates the testing patterns for the three stimulus conditions.

    `create_patterns` is called once for each condition 0, 1 and 2 (in that
    order) with the given `factor`.

    Parameters:
    - factor: data factor forwarded to `create_patterns`.

    Returns:
    - Tuple of (testing_patterns, n_samples): `testing_patterns` is a list of
      three `[patterns, targets]` pairs, and `n_samples` is the length of the
      first set's patterns (assumed consistent across all sets).
    """
    testing_patterns = []
    for condition in (0, 1, 2):
        condition_patterns, condition_targets = create_patterns(condition, factor)
        testing_patterns.append([condition_patterns, condition_targets])

    n_samples = len(testing_patterns[0][0])

    return testing_patterns, n_samples

# Function to plot the input and the output of the first-order network side by side
def plot_input_output(input_data, output_data, index):
    """
    Shows two tensors side by side as heat maps with colour bars.

    Parameters:
    - input_data (Tensor): shown in the left panel, titled 'Input'.
    - output_data (Tensor): shown in the right panel, titled 'Output'.
    - index (int): zero-based index of the testing set; the figure title
      displays `index + 1`.
    """
    fig, axes = plt.subplots(1, 2, figsize=(10, 6))

    panels = ((axes[0], input_data, 'Input'), (axes[1], output_data, 'Output'))
    for axis, tensor, label in panels:
        # Move the tensor to host memory before handing it to matplotlib
        image = axis.imshow(tensor.cpu().numpy(), aspect='auto', cmap='viridis')
        axis.set_title(label)
        fig.colorbar(image, ax=axis)

    plt.suptitle(f'Testing Pattern {index+1}')
    plt.show()

# Function to test the model using the configured testing patterns
def testing(testing_patterns, n_samples, loaded_model, loaded_model_2,factor):
    """
    Evaluates the first- and second-order networks on every set of testing patterns.

    For each `[patterns, targets]` pair the function runs both networks without
    gradients, prints and records the discrimination performance of the
    first-order network, scores the wagers of the second-order network against
    the presence of a stimulus, and draws the input/output heat maps and two
    scatter plots (location and value of the strongest unit) with a fitted curve.

    Only the rows from `100 * factor` onwards are scored and plotted. This
    assumes the first `100 * factor` rows of each set are the noise-only half
    of the patterns - TODO confirm against the pattern generator.

    Parameters:
    - testing_patterns (list): `[patterns, targets]` pairs, one per condition.
    - n_samples (int): not used; kept for interface compatibility.
    - loaded_model: first-order network, called as `loaded_model(patterns)` and
      expected to return `(hidden_representation, output)`.
    - loaded_model_2: second-order network, called as
      `loaded_model_2(patterns, output)` and expected to return the wagers.
    - factor (int): data factor used to locate the first scored row.

    Returns:
    - tuple: (f1_scores_wager, mse_losses_indices, mse_losses_values,
      discrimination_performances, results_for_plotting), each a list with one
      entry per testing set.
    """

    def generate_chance_level(shape):
      chance_level = np.random.rand(*shape).tolist()
      return chance_level

    results_for_plotting = []
    max_values_output_first_order = []
    max_indices_output_first_order = []
    max_values_patterns_tensor = []
    max_indices_patterns_tensor = []
    f1_scores_wager = []

    mse_losses_indices = []
    mse_losses_values = []
    discrimination_performances = []

    # Iterate through each set of testing patterns and targets
    for i in range(len(testing_patterns)):
        with torch.no_grad():  # Ensure no gradients are computed during testing

            # Decision threshold for wagers and targets. The third set (low
            # vision) uses 0.15, i.e. half of the 0.3 intensity multiplier that
            # generate_patterns applies in that condition.
            threshold=0.5
            if i==2:
                threshold=0.15

            # Obtain output from the first order model
            input_data = testing_patterns[i][0]
            hidden_representation,  output_first_order = loaded_model(input_data)
            output_second_order = loaded_model_2(input_data, output_first_order)

            # Index of the first scored row
            delta=100*factor

            # Discrimination: does the strongest output unit match the strongest input unit?
            match_rate = (output_first_order[delta:].argmax(dim=1) == input_data[delta:].argmax(dim=1)).to(float).mean()
            print("driscriminator")
            print(match_rate)
            discrimination_performance = round(match_rate.item(), 2)
            discrimination_performances.append(discrimination_performance)

            # Chance level: random activations with the same shape as the input
            chance_level = torch.Tensor( generate_chance_level(tuple(input_data.shape))).to(device)
            discrimination_random= round((chance_level[delta:].argmax(dim=1) == input_data[delta:].argmax(dim=1)).to(float).mean().item(), 2)
            print("chance level" , discrimination_random)

            # Wagers for the scored rows
            wagers = output_second_order[delta:].cpu()

            # A stimulus counts as present when its target row has a non-zero
            # entry. Using the arg-max index instead would label a stimulus
            # located at unit 0 as absent.
            target_max, _ = torch.max(testing_patterns[i][1], 1)
            targets_2 = (target_max[delta:].cpu() > 0).int()

            # Convert tensors to NumPy arrays for metric calculations
            predicted_np = wagers.numpy().flatten()
            targets_2_np = targets_2.numpy()

            print(predicted_np)
            print(targets_2_np)

            # Calculate True Positives, True Negatives, False Positives, and False Negatives
            TP = np.sum((predicted_np >  threshold) & (targets_2_np > threshold))
            TN = np.sum((predicted_np <  threshold ) & (targets_2_np < threshold))
            FP = np.sum((predicted_np >  threshold) & (targets_2_np <  threshold))
            FN = np.sum((predicted_np <  threshold) & (targets_2_np >  threshold))

            # Compute precision, recall, F1 score, and accuracy of the wagers
            precision_h, recall_h, f1_score_h, accuracy_h = compute_metrics(TP, TN, FP, FN)

            f1_scores_wager.append(f1_score_h)

            # Collect results for plotting
            results_for_plotting.append({
                "counts": [[TP, FP, TP + FP]],
                "metrics": [[precision_h, recall_h, f1_score_h, accuracy_h]],
                "title_results": f"Results Table - Set {i+1}",
                "title_metrics": f"Metrics Table - Set {i+1}"
            })

            # Plot input and output of the first-order network
            plot_input_output(input_data, output_first_order, i)

            # Strongest unit (value and location) per scored row of the output.
            # Sliced with `delta`, consistent with the metrics above.
            max_vals_out, max_inds_out = torch.max(output_first_order[delta:], dim=1)
            max_inds_out[max_vals_out == 0] = 0
            max_values_output_first_order.append(max_vals_out.tolist())
            max_indices_output_first_order.append(max_inds_out.tolist())

            # Same for the input patterns
            max_vals_pat, max_inds_pat = torch.max(input_data[delta:], dim=1)
            max_inds_pat[max_vals_pat == 0] = 0
            max_values_patterns_tensor.append(max_vals_pat.tolist())
            max_indices_patterns_tensor.append(max_inds_pat.tolist())

            fig, axs = plt.subplots(1, 2, figsize=(15, 5))

            # Scatter plot of indices: patterns_tensor vs. output_first_order
            axs[0].scatter(max_indices_patterns_tensor[i], max_indices_output_first_order[i], alpha=0.5)
            axs[0].set_title(f'Stimuli location: Condition {i+1} - First Order Input vs. First Order Output')
            axs[0].set_xlabel('First Order Input Indices')
            axs[0].set_ylabel('First Order Output Indices')

            # Add quadratic fit to scatter plot
            x_indices = max_indices_patterns_tensor[i]
            y_indices = max_indices_output_first_order[i]
            y_pred_indices = perform_quadratic_regression(x_indices, y_indices)
            axs[0].plot(x_indices, y_pred_indices, color='skyblue')

            # Calculate MSE loss for indices
            mse_loss_indices = np.mean((np.array(x_indices) - np.array(y_indices)) ** 2)
            mse_losses_indices.append(mse_loss_indices)

            # Scatter plot of values: patterns_tensor vs. output_first_order
            axs[1].scatter(max_values_patterns_tensor[i], max_values_output_first_order[i], alpha=0.5)
            axs[1].set_title(f'Stimuli Values: Condition {i+1} - First Order Input vs. First Order Output')
            axs[1].set_xlabel('First Order Input Values')
            axs[1].set_ylabel('First Order Output Values')

            # Add quadratic fit to scatter plot
            x_values = max_values_patterns_tensor[i]
            y_values = max_values_output_first_order[i]
            y_pred_values = perform_quadratic_regression(x_values, y_values)
            axs[1].plot(x_values, y_pred_values, color='skyblue')

            # Calculate MSE loss for values
            mse_loss_values = np.mean((np.array(x_values) - np.array(y_values)) ** 2)
            mse_losses_values.append(mse_loss_values)

            plt.tight_layout()
            plt.show()

    return f1_scores_wager, mse_losses_indices , mse_losses_values, discrimination_performances, results_for_plotting

def generate_patterns(patterns_number, num_units, factor, condition = 0):
    """
    Generates patterns and targets for training the networks.

    The first half of the patterns contains noise only; each pattern of the
    second half additionally carries one stimulus at a randomly chosen unit,
    with an intensity drawn uniformly from [0, multiplier). A stimulus counts
    as detected when its intensity is at least `multiplier / 2`.

    Parameters:
    - patterns_number (int): Base number of patterns; `patterns_number * factor`
      patterns are generated.
    - num_units (int): Number of units in each pattern.
    - factor (int): Multiplier applied to `patterns_number`.
    - condition (int): 0: suprathreshold (no noise, multiplier 1),
      1: subthreshold (noise up to 0.02 plus baseline 0.0012, multiplier 1),
      2: low vision (same noise and baseline, multiplier 0.3).

    Returns:
    - patterns_tensor: the generated patterns, one row per pattern.
    - stim_present_tensor: one row per pattern, all zeros except a 1.0 at the
      stimulus unit when the stimulus was detected.
    - stim_absent_tensor: always an empty tensor (nothing is ever recorded for
      it); kept for interface compatibility.
    - order_2_tensor: one `[high, low]` wager target per pattern, `[1, 0]` for
      a detected stimulus and `[0, 1]` otherwise.
    All tensors live on the module-level `device` and have `requires_grad=True`.

    Raises:
    - ValueError: if `condition` is not 0, 1 or 2.
    """
    # (random_limit, baseline, multiplier) for each supported condition
    condition_settings = {
        0: (0.0, 0, 1),
        1: (0.02, 0.0012, 1),
        2: (0.02, 0.0012, 0.3),
    }
    if condition not in condition_settings:
        # Previously an unsupported condition only failed later, with an
        # unbound-variable error inside the generation loop.
        raise ValueError(f"condition must be 0, 1 or 2, got {condition!r}")
    random_limit, baseline, multiplier = condition_settings[condition]

    def to_tensor(rows):
        # Converting through one ndarray avoids the slow list-of-ndarrays path
        # of the tensor constructor; dtype matches what `torch.Tensor` produces.
        as_array = np.asarray(rows)
        tensor = torch.tensor(as_array, dtype=torch.get_default_dtype(), device=device)
        return tensor.requires_grad_(True)

    patterns_number = patterns_number * factor

    patterns = []  # Store generated patterns
    stim_present = []  # Indicators for when a stimulus is present in the pattern
    stim_absent = []  # Never filled; returned as an empty tensor
    order_2_pr = []  # Second order network targets based on the presence or absence of stimulus

    # Generate patterns, half noise and half potential stimuli.
    # The order of the random draws is kept as-is so seeded runs stay reproducible.
    for i in range(patterns_number):

        # First half: Noise patterns
        if i < patterns_number // 2:
            pattern = multiplier * np.random.uniform(0.0, random_limit, num_units) + baseline
            patterns.append(pattern)
            stim_present.append(np.zeros(num_units))  # Stimulus absent
            order_2_pr.append([0.0 , 1.0])  # No stimulus, low wager

        # Second half: Stimulus patterns
        else:
            stimulus_number = random.randint(0, num_units - 1)  # Unit that carries the stimulus
            pattern = np.random.uniform(0.0, random_limit, num_units) + baseline
            pattern[stimulus_number] = np.random.uniform(0.0, 1.0) * multiplier  # Stimulus intensity
            patterns.append(pattern)

            present = np.zeros(num_units)
            # Detected only if the stimulus reaches half of the maximum intensity
            if pattern[stimulus_number] >= multiplier/2:
                order_2_pr.append([1.0 , 0.0])  # Stimulus detected, high wager
                present[stimulus_number] = 1.0
            else:
                order_2_pr.append([0.0 , 1.0])  # Stimulus not detected, low wager
            stim_present.append(present)

    patterns_tensor = to_tensor(patterns)
    stim_present_tensor = to_tensor(stim_present)
    stim_absent_tensor = to_tensor(stim_absent)
    order_2_tensor = to_tensor(order_2_pr)

    return patterns_tensor, stim_present_tensor, stim_absent_tensor, order_2_tensor

def create_patterns(stimulus,factor):
    """
    Generates input patterns and their targets for one stimulus condition.

    The module-level `patterns_number` and `num_units` are forwarded to
    `generate_patterns`, together with `factor` and `stimulus`.

    Parameters:
    - stimulus (int): Determines the type of patterns to generate.
                      Acceptable values:
                      - 0: Suprathreshold stimulus
                      - 1: Subthreshold stimulus
                      - 2: Low vision condition
    - factor: data factor forwarded to `generate_patterns`.

    Returns:
    - torch.Tensor: Tensor of generated patterns.
    - torch.Tensor: Tensor of target values (the stimulus-present indicators)
      corresponding to the generated patterns.
    """
    generated = generate_patterns(patterns_number, num_units, factor, stimulus)
    # Only the patterns and the stimulus-present indicators are needed here
    raw_patterns, raw_targets = generated[0], generated[1]

    # Wrap and place both tensors on the configured device (CPU/GPU)
    patterns = torch.Tensor(raw_patterns).to(device)
    targets = torch.Tensor(raw_targets).to(device)
    return patterns, targets

def pre_train(first_order_network, second_order_network, criterion_1,  criterion_2, optimizer_1, optimizer_2, scheduler_1, scheduler_2, factor, meta):
    """
    Conducts pre-training for first-order and second-order networks.

    Runs `n_epochs` iterations (module-level global). Each iteration generates a
    fresh batch with `generate_patterns(patterns_number, num_units, factor, 0)`
    (condition 0), passes the whole batch through both networks and takes one
    optimizer step and one scheduler step per trained network.

    Parameters:
    - first_order_network (torch.nn.Module): called as `net(patterns)`, expected
      to return `(hidden_representation, output)`.
    - second_order_network (torch.nn.Module): called as `net(patterns, output)`,
      expected to return the wagers.
    - criterion_1: loss of the first-order network. Its positional-argument
      count is read from `criterion_1.__code__`: with exactly two arguments it
      is called as `criterion_1(output, targets)`, otherwise as
      `criterion_1(W, targets, output, hidden, lam)` with the module-level `lam`.
      NOTE(review): this presumably requires a plain Python function (an
      `nn.Module` instance has no `__code__`) - confirm.
    - criterion_2: loss of the second-order network, called as
      `criterion_2(wagers.squeeze(), order_2_tensor[:, 0])`.
    - optimizer_1, optimizer_2 (torch.optim): Optimizers for the respective networks.
    - scheduler_1, scheduler_2 (torch.optim.lr_scheduler): Schedulers for learning rate adjustment.
    - factor: multiplier forwarded to `generate_patterns`.
    - meta (bool): when True the second-order network is trained; when False it
      is only run forward without gradients and `epoch_2_order` stays all zeros.

    Returns:
    Tuple `(first_order_network, second_order_network, epoch_1_order,
    epoch_2_order, last_epoch_stats)`, where the two `epoch_*_order` arrays hold
    the per-epoch losses and `last_epoch_stats` is the tuple
    `(max output values, max output indices, max pattern values, max pattern indices)`
    computed on rows `100:` of the final epoch's batch.

    """
    # Number of positional parameters declared by a Python function
    def get_num_args(func):
      return func.__code__.co_argcount

    # Per-epoch records of the strongest unit; only the last epoch's entries are returned
    max_values_output_first_order = []
    max_indices_output_first_order = []
    max_values_patterns_tensor = []
    max_indices_patterns_tensor = []

    # Local per-epoch loss records (these shadow the module-level arrays of the same name)
    epoch_1_order = np.zeros(n_epochs)
    epoch_2_order = np.zeros(n_epochs)

    for epoch in range(n_epochs):
        # Generate training patterns and targets for each epoch
        patterns_tensor, stim_present_tensor, stim_absent_tensor, order_2_tensor = generate_patterns(patterns_number, num_units,factor, 0)

        # Forward pass through the first-order network
        hidden_representation , output_first_order = first_order_network(patterns_tensor)

        patterns_tensor=patterns_tensor.requires_grad_(True)
        output_first_order=output_first_order.requires_grad_(True)

        # Get max values and indices for output_first_order
        # NOTE(review): the slice start 100 is hard-coded; presumably it should
        # skip the noise-only half of the batch, which would be 100 * factor rows - confirm.
        max_vals_out, max_inds_out = torch.max(output_first_order[100:], dim=1)
        max_inds_out[max_vals_out == 0] = 0
        max_values_output_first_order.append(max_vals_out.tolist())
        max_indices_output_first_order.append(max_inds_out.tolist())

        # Get max values and indices for patterns_tensor
        max_vals_pat, max_inds_pat = torch.max(patterns_tensor[100:], dim=1)
        max_inds_pat[max_vals_pat == 0] = 0
        max_values_patterns_tensor.append(max_vals_pat.tolist())
        max_indices_patterns_tensor.append(max_inds_pat.tolist())

        # Clear first-order gradients before any backward pass of this epoch.
        # NOTE(review): this happens before loss_2.backward(), so any gradient
        # loss_2 sends into the first-order parameters presumably adds to the
        # loss_1 gradients used by optimizer_1.step() - confirm this is intended.
        optimizer_1.zero_grad()

        # Conditionally execute the second-order network pass and related operations
        if meta:

            # Forward pass through the second-order network with inputs from the first-order network
            output_second_order = second_order_network(patterns_tensor, output_first_order)

            # Calculate the loss for the second-order network (wagering decision based on comparison)
            # Only the first column of the wager targets (the "high wager" entry) is used
            loss_2 = criterion_2(output_second_order.squeeze(), order_2_tensor[:, 0])

            optimizer_2.zero_grad()


            # Backpropagate the second-order network's loss
            loss_2.backward(retain_graph=True)  # Allows further backpropagation for loss_1 after loss_2

            # Update second-order network weights
            optimizer_2.step()

            scheduler_2.step()

            epoch_2_order[epoch] = loss_2.item()
        else:
            # Skip computations for the second-order network
            with torch.no_grad():
                # Forward pass only; the result is not used further in this branch
                output_second_order = second_order_network(patterns_tensor, output_first_order)

        # Calculate the loss for the first-order network (accuracy of stimulus representation)

        num_args = get_num_args(criterion_1)

        if num_args == 2:
          loss_1 = criterion_1(  output_first_order , stim_present_tensor )
        else:
          # Encoder weights handed to the contractive penalty.
          # NOTE(review): tensors read from state_dict() are presumably detached
          # from the graph, so the penalty would not backpropagate into W directly - confirm.
          W = first_order_network.state_dict()['fc1.weight']
          loss_1 = criterion_1( W, stim_present_tensor.view(-1, 100), output_first_order,
                             hidden_representation, lam )

        # Backpropagate the first-order network's loss
        loss_1.backward()

        # Update first-order network weights
        optimizer_1.step()

        # (The first-order gradients are reset at the top of the next iteration's update section.)

        # Update the first-order scheduler
        scheduler_1.step()

        epoch_1_order[epoch] = loss_1.item()

    # Only the statistics of the final epoch are returned
    return first_order_network, second_order_network, epoch_1_order, epoch_2_order , (max_values_output_first_order[-1],
            max_indices_output_first_order[-1],
            max_values_patterns_tensor[-1],
            max_indices_patterns_tensor[-1])

def HOSS_evaluate(X, mu, Sigma, Aprior, Wprior):
    """
    Inference on 2D Bayes net for asymmetric inference on presence vs. absence.

    Parameters:
    - X: Observation passed to ``multivariate_normal.pdf``.
    - mu: Iterable of mean vectors, one per world state W. The first entry is treated as the
          "absence" state, so ``len(mu)`` must equal ``len(Wprior) + 1``.
    - Sigma: Covariance used for every world state.
    - Aprior (float): Prior probability of the "aware" state (A = 1).
    - Wprior: Probabilities of the non-absent world states given awareness.

    Returns:
    - post_W: Posterior over world states, marginalised over A (first entry is absence).
    - post_A: Posterior over the awareness state, ordered [unaware, aware].
    - KL_W: KL divergence from the prior ``p_W`` to ``post_W``.
    - KL_A: KL divergence from the prior ``p_A`` to ``post_A``.
    """

    def _kl_divergence(posterior, prior):
        # Sum only over the support of the posterior (convention 0 * log 0 = 0). Evaluating the
        # zero-mass entries directly gives 0 * -inf = NaN, e.g. when Aprior is exactly 0 or 1.
        support = posterior > 0
        return (posterior[support] * np.log(posterior[support] / prior[support])).sum()

    # Initialise variables and conditional prob tables
    p_A = np.array([1 - Aprior, Aprior])  # prior on awareness state A
    p_W_a1 = np.append(0, Wprior)  # likelihood of world states W given aware, first entry is absence
    p_W_a0 = np.append(1, np.zeros(len(Wprior)))  # likelihood of world states W given unaware, first entry is absence
    p_W = (p_W_a1 + p_W_a0) / 2  # prior on W marginalising over A (for KL)

    # Compute likelihood of observed X for each possible W (P(X|mu_w, Sigma))
    lik_X_W = np.array([multivariate_normal.pdf(X, mean=mu_i, cov=Sigma) for mu_i in mu])
    p_X_W = lik_X_W / lik_X_W.sum()  # normalise to get P(X|W)

    # Combine with likelihood of each world state w given awareness state A
    # (row 0: unaware, row 1: aware)
    lik_W_A = np.vstack((p_X_W * p_W_a0 * p_A[0], p_X_W * p_W_a1 * p_A[1]))
    post_A = lik_W_A.sum(axis=1)  # sum over W
    post_A = post_A / post_A.sum()  # normalise

    # Posterior over W (P(W|X=x) marginalising over A)
    post_W = lik_W_A.sum(axis=0)  # sum over A
    post_W = post_W / post_W.sum()  # normalise

    # KL divergences between posteriors and priors
    KL_W = _kl_divergence(post_W, p_W)
    KL_A = _kl_divergence(post_A, p_A)

    return post_W, post_A, KL_W, KL_A

In [None]:
# @title Plotting functions
# @markdown

def plot_testing(results_seed, discrimination_seed, seeds, title):
    """
    Renders a colour-coded table of test metrics averaged over seeds, plus a chance-level table.

    Parameters:
    - results_seed (list): One entry per seed. ``results_seed[s][i]["metrics"][0]`` is read as the
      metric vector of scenario ``i``; following the column labels used below, it is indexed as
      [precision, recall, F1, accuracy].
    - discrimination_seed (list): ``discrimination_seed[s][i]`` is the first-order discrimination
      performance for seed ``s`` and scenario ``i``.
    - seeds (int): Number of seeds to read from ``discrimination_seed``.
    - title (str): Title drawn above the table.
    """
    # Echo the raw inputs so the numbers behind the table remain inspectable
    print(results_seed)
    print(discrimination_seed)

    # Row labels; the table therefore supports at most three scenarios
    Testing_graph_names = ["Suprathreshold stimulus", "Subthreshold stimulus", "Low Vision"]
    n_scenarios = len(results_seed[0])

    fig, ax = plt.subplots(figsize=(14, n_scenarios * 2 + 2))  # Adjusted for added header space
    ax.axis('off')
    ax.axis('tight')

    # Define column labels
    col_labels = ["Scenario", "F1 SCORE\n(2nd order network)", "RECALL\n(2nd order network)", "PRECISION\n(2nd order network)", "Discrimination Performance\n(1st order network)", "ACCURACY\n(2nd order network)"]

    # One table row per scenario: mean ± std over seeds
    full_data = []
    for i in range(n_scenarios):
        metrics_list = [result[i]["metrics"][0] for result in results_seed]  # Collect metrics for each seed
        discrimination_list = [discrimination_seed[j][i] for j in range(seeds)]

        # Calculate averages and standard deviations for metrics
        avg_metrics = np.mean(metrics_list, axis=0).tolist()
        std_metrics = np.std(metrics_list, axis=0).tolist()

        # Calculate average and standard deviation for discrimination performance
        avg_discrimination = np.mean(discrimination_list)
        std_discrimination = np.std(discrimination_list)

        # Format the row with averages and standard deviations
        row = [
            Testing_graph_names[i],
            f"{avg_metrics[2]:.2f} ± {std_metrics[2]:.2f}",  # F1 SCORE
            f"{avg_metrics[1]:.2f} ± {std_metrics[1]:.2f}",  # RECALL
            f"{avg_metrics[0]:.2f} ± {std_metrics[0]:.2f}",  # PRECISION
            f"{avg_discrimination:.2f} ± {std_discrimination:.2f}",  # Discrimination Performance
            f"{avg_metrics[3]:.2f} ± {std_metrics[3]:.2f}"  # ACCURACY
        ]
        full_data.append(row)

    # Extract the displayed means of every numeric column (everything except the scenario name).
    # Column k of `metric_values` corresponds to table column k + 1.
    metric_values = np.array([[float(x.split(" ± ")[0]) for x in row[1:]] for row in full_data])
    max_value = np.max(metric_values)
    # Normalise for colour mapping; guard against an all-zero table (would divide by zero)
    colors = metric_values / max_value if max_value > 0 else np.zeros_like(metric_values)

    # Scenario-name cell stays white; each numeric cell is shaded by its OWN normalised value.
    # (Previously the accuracy cell reused the discrimination value.)
    cell_colors = [["white"] + [plt.cm.RdYlGn(shade) for shade in row_shades] for row_shades in colors]

    # Create the main table with cell colors
    table = ax.table(cellText=full_data, colLabels=col_labels, loc='center', cellLoc='center', cellColours=cell_colors)
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1.5, 1.5)

    # Set the height of the header row to be double that of the other rows
    for j, col_label in enumerate(col_labels):
        cell = table[(0, j)]
        cell.set_height(cell.get_height() * 2)

    # Add chance level table
    chance_level_data = [["Chance Level\nDiscrimination(1st)", "Chance Level\nAccuracy(2nd)"],
                         ["0.010", "0.50"]]

    chance_table = ax.table(cellText=chance_level_data, bbox=[1.0, 0.8, 0.3, 0.1], cellLoc='center', colWidths=[0.1, 0.1])
    chance_table.auto_set_font_size(False)
    chance_table.set_fontsize(10)
    chance_table.scale(1.2, 1.2)

    # Set the height of the header row to be double that of the other rows in the chance level table
    for j in range(len(chance_level_data[0])):
        cell = chance_table[(0, j)]
        cell.set_height(cell.get_height() * 2)

    plt.title(title, pad=20, fontsize=16)
    plt.show()
    plt.close(fig)


def plot_signal_max_and_indicator(patterns_tensor, plot_title="Training Signals"):
    """
    Draws two stacked step plots for a batch of patterns: the largest unit activation of each
    pattern, and a 0/1 flag telling whether that maximum exceeds 0.5.

    Parameters:
    - patterns_tensor: A tensor of patterns; the maximum is taken along dimension 1 and the
      x-axis runs over dimension 0.
    - plot_title (str): Overall figure title.
    """
    with plt.xkcd():
        # Peak activation per pattern, moved to CPU / NumPy for plotting
        peak_per_pattern = patterns_tensor.max(dim=1).values.cpu().numpy()

        # 1 where the peak exceeds 0.5, otherwise 0
        above_half = (peak_per_pattern > 0.5).astype(int)

        pattern_ids = range(patterns_tensor.size(0))

        # Two rows, one column: peaks on top, indicator below
        fig, (ax_peak, ax_flag) = plt.subplots(2, 1, figsize=(8, 8))
        fig.suptitle(plot_title, fontsize=16)

        panels = (
            (ax_peak, peak_per_pattern, {}, 'Max Value of Signal Units'),
            (ax_flag, above_half, {'color': 'red'}, 'Indicator (Max > 0.5) in each signal'),
        )
        for axis, series, line_kwargs, y_label in panels:
            axis.plot(pattern_ids, series, drawstyle='steps-mid', **line_kwargs)
            axis.set_xlabel('Pattern Number')
            axis.set_ylabel(y_label)
            axis.set_ylim(-0.1, 1.1)  # small margin around the [0, 1] range
            axis.grid(True)

        plt.tight_layout()
        plt.show()


def perform_quadratic_regression(epoch_list, values):
    """
    Fits a degree-2 polynomial to (epoch_list, values) by least squares and returns the fitted
    values evaluated at the same x positions, in the same order as ``epoch_list``.
    """
    quadratic_coeffs = np.polyfit(epoch_list, values, deg=2)
    return np.polyval(quadratic_coeffs, epoch_list)


def _plot_loss_with_fit(ax, epoch_list, losses, color, curve_label, panel_title):
    """Draws one loss curve and its quadratic fit on `ax`."""
    ax.plot(epoch_list, losses, linestyle='--', marker='o', color=color)
    ax.plot(epoch_list, perform_quadratic_regression(epoch_list, losses), linestyle='-', color='r', label='Quadratic Fit')
    ax.legend([curve_label, 'Quadratic Fit'])
    ax.set_title(panel_title)
    ax.set_xlabel('Epochs - Pretraining Phase')
    ax.set_ylabel('Loss')


def _scatter_with_quadratic_fit(ax, x_values, y_values, panel_title, x_label, y_label):
    """Scatters `y_values` against `x_values` on `ax` and overlays a quadratic fit line."""
    ax.scatter(x_values, y_values, alpha=0.5)

    fitted = np.asarray(perform_quadratic_regression(x_values, y_values))
    # Lines are drawn by joining points in the order given, so sort by x first; otherwise the
    # fit is rendered as chords jumping back and forth across the curve.
    order = np.argsort(np.asarray(x_values), kind='stable')
    ax.plot(np.asarray(x_values)[order], fitted[order], color='skyblue', linestyle='--', label='Quadratic Fit')

    ax.set_title(panel_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.legend()


def pre_train_plots(epoch_1_order, epoch_2_order, title, max_values_indices):
    """
    Plots the training progress with regression lines and scatter plots of indices and values of max elements.

    The figure is shown and also written to the working directory as
    ``Blindsight_Pre_training_Loss_<title>.png`` (spaces and slashes in the title become underscores).

    Parameters:
    - epoch_1_order (list): Loss values for the first-order network over epochs.
    - epoch_2_order (list): Loss values for the second-order network over epochs
      (indexed with the same epoch axis as ``epoch_1_order``).
    - title (str): Title for the plots.
    - max_values_indices (tuple): ``(max_values_output_first_order, max_indices_output_first_order,
      max_values_patterns_tensor, max_indices_patterns_tensor)``.
    """
    (max_values_output_first_order,
     max_indices_output_first_order,
     max_values_patterns_tensor,
     max_indices_patterns_tensor) = max_values_indices

    # The epoch axis is derived from the length of the first-order loss history
    epoch_list = list(range(len(epoch_1_order)))

    # Set up the plot with 2 rows and 2 columns
    fig, axs = plt.subplots(2, 2, figsize=(15, 10))

    # Top row: loss curves with quadratic fits
    _plot_loss_with_fit(axs[0, 0], epoch_list, epoch_1_order, 'g', '1st Order Network', '1st Order Network Loss')
    _plot_loss_with_fit(axs[0, 1], epoch_list, epoch_2_order, 'b', '2nd Order Network', '2nd Order Network Loss')

    # Bottom-left: location of the max element, input vs. output
    _scatter_with_quadratic_fit(
        axs[1, 0], max_indices_patterns_tensor, max_indices_output_first_order,
        'Stimuli location: First Order Input vs. First Order Output',
        'First Order Input Indices', 'First Order Output Indices',
    )

    # Bottom-right: value of the max element, input vs. output
    _scatter_with_quadratic_fit(
        axs[1, 1], max_values_patterns_tensor, max_values_output_first_order,
        'Stimuli Values: First Order Input vs. First Order Output',
        'First Order Input Values', 'First Order Output Values',
    )

    plt.suptitle(title, fontsize=16, y=1.02)

    # Display the plots in a 2x2 grid
    plt.tight_layout()
    plt.savefig('Blindsight_Pre_training_Loss_{}.png'.format(title.replace(" ", "_").replace("/", "_")), bbox_inches='tight')
    plt.show()
    plt.close(fig)

# Round-trips the trained networks through disk and builds the three testing sets
def config_training(first_order_network, second_order_network, hidden, factor, gelu):
    """
    Saves the state of both networks to disk, generates the three testing pattern sets, and
    restores the saved states into freshly constructed networks placed in evaluation mode.

    Parameters:
    - first_order_network: The first order network instance whose weights are saved.
    - second_order_network: The second order network instance whose weights are saved.
    - hidden: Number of hidden units, forwarded to the FirstOrderNetwork constructor.
    - factor: Forwarded to create_patterns and to the FirstOrderNetwork constructor.
    - gelu: Activation flag forwarded to both network constructors.

    Returns:
    - (Testing_patterns, n_samples, loaded_model, loaded_model_2), where Testing_patterns is a
      list of three [patterns, targets] pairs and n_samples is the length of the first patterns set.
    """
    # Checkpoint files are written to the current working directory
    first_order_path = './cnn1.pth'
    second_order_path = './cnn2.pth'

    checkpoints = (
        (first_order_network, first_order_path),
        (second_order_network, second_order_path),
    )
    for network, checkpoint_path in checkpoints:
        torch.save(network.state_dict(), checkpoint_path)

    # One [patterns, targets] pair per testing condition (create_patterns arguments 0, 1, 2)
    Testing_patterns = []
    for condition in range(3):
        patterns, targets = create_patterns(condition, factor)
        Testing_patterns.append([patterns, targets])

    # Sample count is taken from the first set (assumed consistent across all sets)
    n_samples = len(Testing_patterns[0][0])

    # Fresh network instances that receive the saved weights
    loaded_model = FirstOrderNetwork(hidden, factor, gelu)
    loaded_model_2 = SecondOrderNetwork(gelu)

    loaded_model.load_state_dict(torch.load(first_order_path))
    loaded_model_2.load_state_dict(torch.load(second_order_path))

    # Move to the active device, then switch to evaluation mode
    restored_models = (loaded_model, loaded_model_2)
    for restored in restored_models:
        restored.to(device)
    for restored in restored_models:
        restored.eval()

    return Testing_patterns, n_samples, loaded_model, loaded_model_2

---

# Introduction

This bonus tutorial extends much of the content covered in Tutorial 1 around the theme of consciousness. At the end of Section 2, we discussed and implemented many ideas around first-order models and briefly mentioned second-order models. In this tutorial, we develop these ideas further and model the effects of blindsight, the phenomenon we introduced earlier today, in which patients have no conscious experience of sight but are still able to navigate around objects (showing that their brains process sensory information even though it does not reach the level of subjective experience). We first introduce the code for the first-order model, followed by the second-order model. Then we show you some ways to plot the results from these models.

After this we end on some further high-level thoughts on the theme of consciousness. 


## Section 1: Train a First-Order Network

This section invites you to engage with a straightforward, auto-generated dataset on blindsight, originally introduced by [Pasquali et al. in 2010](https://www.sciencedirect.com/science/article/abs/pii/S0010027710001794). Blindsight is a fascinating condition where individuals who are cortically blind due to damage in their primary visual cortex can still respond to visual stimuli without conscious perception. This intriguing phenomenon underscores the intricate nature of sensory processing and the brain's ability to process information without conscious awareness.

In [None]:
# Generate one set of auto-generated patterns and visualise it
factor = 2

# NOTE(review): initialize_global() is defined outside this cell; presumably it (re)creates the
# notebook-wide globals used by create_patterns -- TODO confirm.
initialize_global()

# First argument 0 is the same pattern set that is titled "Suprathreshold dataset" further below
set_pre, _ = create_patterns(0, factor)

plot_signal_max_and_indicator(
    set_pre.detach().cpu(),
    plot_title="Example - Pre training dataset",
)

The pre-training dataset for the network consisted of 200 patterns. These were evenly divided: half were purely noise (with unit activations randomly chosen between 0.0 and 0.02), and the other half represented potential stimuli. In the stimulus patterns, 99 out of 100 units had activations ranging between 0.0 and 0.02, with one unique unit having an activation between 0.0 and 1.0.

**Testing patterns**

As we have seen before, the network underwent evaluations under three distinct conditions, each modifying the signal-to-noise ratio in a unique way to explore different degrees and types of blindness.

Suprathreshold stimulus condition: here, the network was exposed to the identical set of 200 patterns used during pre-training, testing the network's response to familiar inputs.

Subthreshold stimulus condition (blindsight simulation): this condition aimed to mimic blindsight. It was achieved by introducing a slight noise increment (+0.0012) to every input of the first-order network, barring the one designated as the stimulus. This setup tested the network's ability to discern faint signals amidst noise.

Low vision condition: to simulate low vision, the activation levels of the stimuli were reduced. Unlike the range from 0.0 to 1.0 used in pre-training, the stimuli's activation levels were adjusted to span from 0.0 to 0.3. This condition examined the network's capability to recognize stimuli with diminished intensity.

In [None]:
factor = 2

# Compare your results with the patterns generated below:
# one pattern set per testing condition (create_patterns arguments 0, 1, 2)
set_1, _ = create_patterns(0, factor)
set_2, _ = create_patterns(1, factor)
set_3, _ = create_patterns(2, factor)

# Plot each set under its condition name
for pattern_set, dataset_title in (
    (set_1, "Suprathreshold dataset"),
    (set_2, "Subthreshold dataset"),
    (set_3, "Low Vision dataset"),
):
    plot_signal_max_and_indicator(pattern_set.detach().cpu(), dataset_title)

### Coding Exercise 1: Building a network for a blindsight situation

In this activity, we'll construct a neural network model using our auto-generated dataset, focusing on blindsight scenarios. The model will primarily consist of fully connected layers, establishing a straightforward, first-order network. The aim here is to assess the basic network's performance.

**Steps to follow**

1. Examine the network architecture: understand the structure of the neural network you're about to work with.
2. Visualize loss metrics: observe and analyze the network's performance during pre-training by visualizing the loss over epochs.
3. Evaluate the model: use the provided code snippets to calculate and interpret the model's accuracy, recall, and F1-score, giving you insight into the network's capabilities.

**Understanding the process**

The goal is to gain a thorough comprehension of the network's architecture and to interpret the pre-training results visually. This will provide a clearer picture of the model's potential and limitations.

The network is designed as a backpropagation autoassociator. It features a 100-unit input layer, directly linked to a 40-unit hidden layer, which in turn connects to a 100-unit output layer. Initial connection weights are set within the range of -1.0 to 1.0 for the first-order network. To mitigate overfitting, dropout is employed within the network architecture. The architecture includes a configurable activation function. This flexibility allows for adjustments and tuning in Activity 3, aiming for optimal model performance.

In [None]:
class FirstOrderNetwork(nn.Module):
    """
    First-order autoencoder-style network: 100 inputs -> hidden_units -> 100 outputs.

    All linear layers are bias-free and their weights are initialised uniformly in [-1, 1].
    """

    def __init__(self, hidden_units, data_factor, use_gelu):
        """
        Initializes the FirstOrderNetwork with specific configurations.

        Parameters:
        - hidden_units (int): The number of units in the hidden layer.
        - data_factor (int): Factor to scale the amount of data processed.
                             A factor of 1 indicates the default data amount,
                             while 10 indicates 10 times the default amount.
                             Only stored on the instance; not used by the layers themselves.
        - use_gelu (bool): Flag to use GELU (True) or ReLU (False) as the encoder activation.
        """
        super(FirstOrderNetwork, self).__init__()

        # Encoder, hidden, and decoder layers. Creation order is kept as-is so that random
        # initialisation (and therefore seeded runs) is unchanged.
        self.fc1 = nn.Linear(100, hidden_units, bias=False)  # Encoder
        # `hidden` is not used in the forward pass; it is kept so that state_dict keys
        # ('hidden.weight') remain compatible with previously saved checkpoints.
        self.hidden = nn.Linear(hidden_units, hidden_units, bias=False)  # Hidden
        self.fc2 = nn.Linear(hidden_units, 100, bias=False)  # Decoder

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

        # Dropout layer to prevent overfitting
        self.dropout = nn.Dropout(0.1)

        # Set the data factor
        self.data_factor = data_factor

        # Other activation functions for various purposes
        self.softmax = nn.Softmax()

        # Encoder activation selected by the flag (previously the flag was silently ignored
        # and ReLU was always used). Parameter-free, so checkpoints are unaffected.
        self.activation = nn.GELU() if use_gelu else nn.ReLU()

        # Initialize network weights
        self.initialize_weights()

    def initialize_weights(self):
        """Initializes weights of the encoder, hidden, and decoder layers uniformly in [-1, 1]."""
        init.uniform_(self.fc1.weight, -1.0, 1.0)

        init.uniform_(self.fc2.weight, -1.0, 1.0)
        init.uniform_(self.hidden.weight, -1.0, 1.0)

    def encoder(self, x):
        """Flattens `x` to (-1, 100) and returns dropout(activation(fc1(x)))."""
        h1 = self.dropout(self.activation(self.fc1(x.view(-1, 100))))
        return h1

    def decoder(self, z):
        """Maps a hidden representation back to 100 units through fc2 and a sigmoid."""
        h2 = self.sigmoid(self.fc2(z))
        return h2

    def forward(self, x):
        """
        Defines the forward pass through the network.

        Parameters:
        - x (Tensor): The input tensor to the network (reshaped to rows of 100 units).

        Returns:
        - tuple(Tensor, Tensor): ``(h1, h2)`` where ``h1`` is the hidden representation produced
          by the encoder and ``h2`` is the reconstruction produced by the decoder.
        """
        h1 = self.encoder(x)
        h2 = self.decoder(h1)

        return h1, h2

For now, we will train the first-order network only.

In [None]:
# Pre-train and evaluate both networks once per seed, then summarise all seeds in one table
seeds = 15

# Per-seed outputs collected for the final summary
results_seed = []
discrimination_seed = []

# Hyperparameters
optimizer = "ADAMAX"
hidden = 40
factor = 2
gelu = False
gam = 0.98
meta = True
stepsize = 25

for i in range(seeds):
    print(f"Seed {i}")

    # NOTE(review): defined outside this cell; presumably resets the notebook-wide globals
    # before each run -- TODO confirm.
    initialize_global()

    # Networks, loss functions, optimizers, and schedulers for pre-training
    (first_order_network, second_order_network,
     criterion_1, criterion_2,
     optimizer_1, optimizer_2,
     scheduler_1, scheduler_2) = prepare_pre_training(hidden, factor, gelu, stepsize, gam)

    # Pre-training of the first-order and second-order networks
    (first_order_network_pre, second_order_network_pre,
     epoch_1_order, epoch_2_order,
     max_value_indices) = pre_train(
        first_order_network, second_order_network,
        criterion_1, criterion_2,
        optimizer_1, optimizer_2,
        scheduler_1, scheduler_2,
        factor, meta,
    )

    # Loss curves and input/output scatter plots for this seed
    pre_train_plots(epoch_1_order, epoch_2_order, f"1st & 2nd Order Networks - Seed {i}", max_value_indices)

    # Patterns for the testing conditions
    testing_patterns, n_samples = get_test_patterns(factor)

    # Evaluate with both networks in evaluation mode
    first_order_network_pre.eval()
    second_order_network_pre.eval()
    (f1_scores_wager, mse_losses_indices, mse_losses_values,
     discrimination_performances, results_for_plotting) = testing(
        testing_patterns, n_samples, first_order_network_pre, second_order_network_pre, factor
    )
    results_seed.append(results_for_plotting)
    discrimination_seed.append(discrimination_performances)

plot_testing(results_seed, discrimination_seed, seeds, "Test Results")

In [None]:
# Define the architecture, optimizer, loss function, and scheduler, then pre-train the
# first-order network on its own.
# NOTE(review): patterns_number, num_units, lam, device, epoch_1_order and epoch_2_order are not
# defined in this cell; presumably they are created by initialize_global() or an earlier helper
# cell -- TODO confirm. epoch_2_order is never written here, so its plot shows whatever it
# already contains.

# Hyperparameters
# (A module-level `global` statement is a no-op, so it has been dropped.)
learning_rate_1 = 0.5
n_epochs = 100
optimizer="ADAMAX"
hidden=40
factor=2
gelu=False
gam=0.98
meta=True
stepsize=25
initialize_global()


# Networks instantiation
first_order_network = FirstOrderNetwork(hidden,factor,gelu).to(device)
second_order_network = SecondOrderNetwork(gelu).to(device) # We define it, but won't use it until activity 3

# Loss function
criterion_1 = CAE_loss

# Optimizer
optimizer_1 = optim.Adamax(first_order_network.parameters(), lr=learning_rate_1)

# Learning rate schedulers
scheduler_1 = StepLR(optimizer_1, step_size=stepsize, gamma=gam)

# Per-epoch history of the max element (value and index) of outputs and inputs
max_values_output_first_order = []
max_indices_output_first_order = []
max_values_patterns_tensor = []
max_indices_patterns_tensor = []

# Training loop
for epoch in range(n_epochs):
    # Generate training patterns and targets for each epoch.
    patterns_tensor, stim_present_tensor, stim_absent_tensor, order_2_tensor = generate_patterns(patterns_number, num_units,factor, 0)

    # Forward pass through the first-order network
    hidden_representation , output_first_order = first_order_network(patterns_tensor)

    output_first_order=output_first_order.requires_grad_(True)

    # Skip training of the second-order network
    with torch.no_grad():

        # Forward pass through the second-order network without tracking gradients
        output_second_order = second_order_network(patterns_tensor, output_first_order)

    # Calculate the loss for the first-order network (accuracy of stimulus representation)
    # NOTE(review): state_dict() returns detached tensors by default, so no gradient flows
    # through W inside the loss -- confirm this is intended.
    W = first_order_network.state_dict()['fc1.weight']
    loss_1 = criterion_1( W, stim_present_tensor.view(-1, 100), output_first_order,
                        hidden_representation, lam )
    # Backpropagate the first-order network's loss
    loss_1.backward()

    # Update first-order network weights
    optimizer_1.step()

    # Reset first-order optimizer gradients to zero for the next iteration.
    # Without this call, gradients from all previous epochs keep accumulating.
    optimizer_1.zero_grad()

    # Update the first-order scheduler
    scheduler_1.step()

    epoch_1_order[epoch] = loss_1.item()

    # Get max values and indices for output_first_order (rows from index 100 onwards)
    max_vals_out, max_inds_out = torch.max(output_first_order[100:], dim=1)
    max_inds_out[max_vals_out == 0] = 0
    max_values_output_first_order.append(max_vals_out.tolist())
    max_indices_output_first_order.append(max_inds_out.tolist())

    # Get max values and indices for patterns_tensor (same rows)
    max_vals_pat, max_inds_pat = torch.max(patterns_tensor[100:], dim=1)
    max_inds_pat[max_vals_pat == 0] = 0
    max_values_patterns_tensor.append(max_vals_pat.tolist())
    max_indices_patterns_tensor.append(max_inds_pat.tolist())


# Keep only the statistics of the final epoch for plotting
max_values_indices = (max_values_output_first_order[-1],
            max_indices_output_first_order[-1],
            max_values_patterns_tensor[-1],
            max_indices_patterns_tensor[-1])


# Plot training loss curve using the statistics computed in THIS cell
# (the previous name, max_value_indices, referred to a variable left over from another cell).
pre_train_plots(epoch_1_order, epoch_2_order, "1st & 2nd Order Networks" , max_values_indices )

### Testing under 3 Blindsight Conditions

We will now use the testing auto-generated datasets from activity 1 to test the network's performance.

In [None]:
# Start fresh result containers for this single run
results_seed = []
discrimination_seed = []

# Save/reload the trained networks and build the testing pattern sets
testing_patterns, n_samples, loaded_model, loaded_model_2 = config_training(
    first_order_network, second_order_network, hidden, factor, gelu
)

# Evaluate the reloaded networks on the testing patterns
(f1_scores_wager, mse_losses_indices, mse_losses_values,
 discrimination_performances, results_for_plotting) = testing(
    testing_patterns, n_samples, loaded_model, loaded_model_2, factor
)

results_seed.append(results_for_plotting)
discrimination_seed.append(discrimination_performances)

# Summarise the single run (seed count of 1) as a table
plot_testing(results_seed, discrimination_seed, 1, "Seed")

In [None]:
# @title Submit your feedback
# Render the feedback widget for this section; content_review and feedback_prefix are defined
# in the notebook's first setup cell.
content_review(f"{feedback_prefix}_First_order_network")

## Section 2: Train a Second-Order network

Having previously examined the first-order network, we now switch to the second-order network, described in more detail back in Tutorial 1 (please revisit the text and video content there if you need to recap the concepts or want to refresh your understanding of the difference between these models).
To study this, we use a simulated dataset that mimics the conditions of blindsight. This dataset contains 400 patterns, equally split between two types:

- **Random noise patterns** consist of low activations ranging between 0.0 and 0.02.
- **Designed stimulus patterns** - each pattern includes one unit that shows a higher activation level, varying between 0.0 and 1.0.

This dataset allows us to test hypotheses concerning how sensory processing and network responses adapt under different conditions of visual impairment.

We have three main testing scenarios, each designed to alter the signal-to-noise ratio to simulate different levels of visual impairment:

- **Suprathreshold stimulus condition**: here, the network is tested against familiar patterns used during training to assess its response to known stimuli.
- **Subthreshold stimulus condition**: this condition slightly increases the noise level, akin to actual blindsight conditions, testing the network's capability to discern subtle signals.
- **Low vision condition**: the intensity of stimuli is decreased to evaluate how well the network performs with significantly reduced sensory input.

In [None]:
factor = 2

# NOTE(review): initialize_global() is defined outside this cell; presumably it (re)creates the
# notebook-wide globals used by create_patterns -- TODO confirm.
initialize_global()

# One pattern set per testing condition (create_patterns arguments 0, 1, 2)
set_1, _ = create_patterns(0, factor)
set_2, _ = create_patterns(1, factor)
set_3, _ = create_patterns(2, factor)

# Plot each set under its condition name
for pattern_set, dataset_title in (
    (set_1, "Suprathreshold dataset"),
    (set_2, "Subthreshold dataset"),
    (set_3, "Low Vision dataset"),
):
    plot_signal_max_and_indicator(pattern_set.detach().cpu(), dataset_title)

The first-order network model lays the groundwork for our experiments and is structured as follows:

- Input layer: consists of 100 units representing either noise or stimulus patterns.
- Hidden layer: includes a 40-unit layer tasked with processing the inputs.
- Output layer: comprises 100 units where the responses to stimuli are recorded.
- Dropout and activation: includes dropout layers to prevent overfitting and a temperature-controlled activation function to fine-tune response sharpness.

The primary aim of the first-order network is to accurately capture and react to the input patterns, setting a baseline for comparison with more complex models.

### Coding Exercise 2: Developing a Second-Order Network

Your task is to expand upon the first-order network by integrating a second-order network that incorporates a metacognitive layer assessing the predictions of the first-order network. This metacognitive layer introduces a wagering mechanism, wherein the network "bets" on its confidence in its predictions. 

- The first-order network is designed as an autoencoder, a type of neural network trained to reconstruct the input stimulus. The autoencoder consists of an encoder that compresses the input into a latent representation and a decoder that reconstructs the input from this representation.
- The second-order network, or metacognitive layer, operates by examining the difference (delta) between the original input and the output generated by the autoencoder. This difference provides insight into the reconstruction error, which is a measure of how accurately the autoencoder has learned to replicate the input data. By evaluating this reconstruction error, the second-order network can make a judgement about the certainty of the first-order network's predictions.

These are the steps for completion:

1. Architectural development: grasp the underlying principles of a second-order network and complete the architectural code.
2. Performance evaluation: visualize training losses and test the model using provided code, assessing its initial performance.
3. Model fine-tuning: leveraging the provided training function, experiment with fine-tuning the model to enhance its accuracy and efficiency.

The second-order network is structured as a feedforward backpropagation network.

- Input layer: comprises a 100-unit comparison matrix. This matrix quantifies the discrepancy between each corresponding pair of input and output units from the first-order network. For example, if an input unit and its corresponding output unit have activations of 0.6 and 0.7, respectively, the comparison unit's activation would be -0.1. This setup essentially encodes the prediction error of the first-order network's outputs as an input pattern for the second-order network.
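- Output layer: consists of two units representing "high" and "low" wagers, indicating the network's confidence in its predictions. The initial weights for these output units range between 0.0 and 0.1. (Note: the `SecondOrderNetwork` code below implements the wager layer with a single output unit, `nn.Linear(100, 1)`, rather than two separate units.)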
- Comparator weights: set to 1.0 for connections from the first-order input layer to the comparison matrix, and -1.0 for connections from the first-order output layer. This configuration emphasizes the differential error as a critical input for the second-order decision-making process.

The second-order network's novel approach uses the error generated by the first-order network as a direct input for making decisions—specifically, wagering on the confidence of its outputs. This methodology reflects a metacognitive layer of processing, akin to evaluating one's confidence in their answers or predictions.

By exploring these adjustments, you can optimize the network's functionality, making it a powerful tool for understanding and simulating complex cognitive phenomena like blindsight.

In [None]:
class SecondOrderNetwork(nn.Module):
    """Second-order network that produces a wager from the first-order network's error.

    The element-wise difference between the first-order network's input and
    output is passed through a 100-unit comparison layer (activation followed
    by dropout) and then mapped to a single wager value.

    Args:
        use_gelu: if truthy, GELU is used as the comparison-layer activation;
            otherwise ReLU is used.
    """

    def __init__(self, use_gelu):
        super(SecondOrderNetwork, self).__init__()
        # Define a linear layer for comparing the difference between input and output of the first-order network
        self.comparison_layer = nn.Linear(100, 100)

        # Linear layer for determining wagers, mapping from 100 features to a single output
        self.wager = nn.Linear(100, 1)

        # Dropout layer to prevent overfitting by randomly setting input units to 0 with a probability of 0.5 during training
        self.dropout = nn.Dropout(0.5)

        # Select activation function based on the `use_gelu` flag
        self.activation = torch.nn.functional.gelu if use_gelu else torch.relu

        # Additional activation functions for potential use in network operations
        self.sigmoid = torch.sigmoid

        # Kept for optional experimentation; `forward` does not use it
        self.softmax = nn.Softmax()

        # Initialize the weights of the network
        self._init_weights()

    def _init_weights(self):
        """Uniformly initialize the weights (not the biases) of both linear layers."""
        # Comparison weights in [-1, 1], wager weights in [0, 0.1]
        init.uniform_(self.comparison_layer.weight, -1.0, 1.0)
        init.uniform_(self.wager.weight, 0.0, 0.1)

    def forward(self, first_order_input, first_order_output):
        """Compute the wager from the first-order network's input and output.

        Args:
            first_order_input: tensor fed to the first-order network; its last
                dimension must be 100 to match `comparison_layer`.
            first_order_output: the first-order network's output for that
                input, with the same shape as `first_order_input`.

        Returns:
            The wager tensor (once the exercise below is completed).
        """
        ############################################################
        # Fill in the wager value
        # Apply the sigmoid activation to the output of the wager layer
        # (dropout is already applied to the comparison layer's output)
        raise NotImplementedError("Student exercise")
        ############################################################

        # Calculate the difference between the first-order input and output
        comparison_matrix = first_order_input - first_order_output

        #Another option is to directly calculate the per unit MSE to use as input for the comparator matrix
        #comparison_matrix = nn.MSELoss(reduction='none')(first_order_output, first_order_input)

        # Pass the difference through the comparison layer and apply the chosen activation function
        comparison_out=self.dropout(self.activation(self.comparison_layer(comparison_matrix)))

        # Calculate the wager value by applying the sigmoid activation to the output of the wager layer
        wager = ...

        return wager

In [None]:
# to_remove solution
class SecondOrderNetwork(nn.Module):
    """Second-order network that produces a wager from the first-order network's error.

    The element-wise difference between the first-order network's input and
    output is passed through a 100-unit comparison layer (activation followed
    by dropout) and then mapped to a single sigmoid wager value.

    Args:
        use_gelu: if truthy, GELU is used as the comparison-layer activation;
            otherwise ReLU is used.
    """

    def __init__(self, use_gelu):
        super(SecondOrderNetwork, self).__init__()
        # Define a linear layer for comparing the difference between input and output of the first-order network
        self.comparison_layer = nn.Linear(100, 100)

        # Linear layer for determining wagers, mapping from 100 features to a single output
        self.wager = nn.Linear(100, 1)

        # Dropout layer to prevent overfitting by randomly setting input units to 0 with a probability of 0.5 during training
        self.dropout = nn.Dropout(0.5)

        # Select activation function based on the `use_gelu` flag
        self.activation = torch.nn.functional.gelu if use_gelu else torch.relu

        # Additional activation functions for potential use in network operations
        self.sigmoid = torch.sigmoid

        # Kept for optional experimentation; `forward` does not use it
        self.softmax = nn.Softmax()

        # Initialize the weights of the network
        self._init_weights()

    def _init_weights(self):
        """Uniformly initialize the weights (not the biases) of both linear layers."""
        # Comparison weights in [-1, 1], wager weights in [0, 0.1]
        init.uniform_(self.comparison_layer.weight, -1.0, 1.0)
        init.uniform_(self.wager.weight, 0.0, 0.1)

    def forward(self, first_order_input, first_order_output):
        """Compute the wager from the first-order network's input and output.

        Args:
            first_order_input: tensor fed to the first-order network; its last
                dimension must be 100 to match `comparison_layer`.
            first_order_output: the first-order network's output for that
                input, with the same shape as `first_order_input`.

        Returns:
            Tensor of sigmoid wagers with a trailing dimension of size 1.
        """
        # Calculate the difference between the first-order input and output
        comparison_matrix = first_order_input - first_order_output

        #Another option is to directly calculate the per unit MSE to use as input for the comparator matrix
        #comparison_matrix = nn.MSELoss(reduction='none')(first_order_output, first_order_input)

        # Pass the difference through the comparison layer and apply the chosen activation function
        comparison_out=self.dropout(self.activation(self.comparison_layer(comparison_matrix)))

        # Calculate the wager value by applying the sigmoid activation to the output of the wager layer
        # (dropout was already applied to `comparison_out` above)
        wager = self.sigmoid(self.wager(comparison_out))

        return wager

In [None]:
# @title Submit your feedback
# Render the feedback widget for the second-order network exercise
content_review(f"{feedback_prefix}_Second_Order_Network")

In [None]:
# Hyperparameters
# (defined before first use so this cell does not rely on `hidden`, `factor`
# and `gelu` being left over from an earlier cell)
optimizer="ADAMAX"
hidden=40
factor=2
gelu=False
gam=0.98
meta=True
stepsize=25

# First order network instantiation
# NOTE: this instance is replaced inside the loop below by the network returned from `prepare_pre_training`
first_order_network = FirstOrderNetwork(hidden, factor, gelu).to(device)

# Number of independent runs; results are collected per run and plotted together at the end
seeds=15

results_seed=[]
discrimination_seed=[]

for i in range(seeds):
  print(f"Seed {i}")

  # Compare your results with the patterns generated below
  initialize_global()

  # Prepare networks, loss functions, optimizers, and schedulers for pre-training
  first_order_network, second_order_network, criterion_1, criterion_2, optimizer_1, optimizer_2, scheduler_1, scheduler_2 = prepare_pre_training(hidden, factor, gelu, stepsize, gam)

  # Conduct pre-training for both the first-order and second-order networks
  first_order_network_pre, second_order_network_pre, epoch_1_order, epoch_2_order , max_value_indices = pre_train(first_order_network, second_order_network, criterion_1,  criterion_2, optimizer_1, optimizer_2, scheduler_1, scheduler_2, factor, meta)

  # Plot the training progress of both networks to visualize performance and learning trends
  pre_train_plots(epoch_1_order, epoch_2_order, f"1st & 2nd Order Networks - Seed {i}" , max_value_indices )

  # Build the patterns used to test the pre-trained networks
  testing_patterns, n_samples = get_test_patterns(factor)

  # Switch both networks to evaluation mode (disables dropout) before testing
  first_order_network_pre.eval()
  second_order_network_pre.eval()

  # Test the models using the configured testing patterns
  f1_scores_wager, mse_losses_indices , mse_losses_values , discrimination_performances, results_for_plotting = testing(testing_patterns, n_samples, first_order_network_pre, second_order_network_pre,factor)
  results_seed.append(results_for_plotting)
  discrimination_seed.append(discrimination_performances)

# Summarize the test results across all seeds
plot_testing(results_seed, discrimination_seed, seeds, "Test Results")

### Discussion point

Let's dive into the outcomes!

- Did you notice any variations between the two models?
- Can you explain how these differences influenced the performance?
- What role does a second-order network play, and in which situations would it be more effective?

In [None]:
# @title Submit your feedback
# Render the feedback widget for the second-order network discussion point
content_review(f"{feedback_prefix}_Discussion_Point_Second_Order_Network")

In [None]:
# @title Video 1: Second Order Network

from ipywidgets import widgets
from IPython.display import YouTubeVideo
from IPython.display import IFrame
from IPython.display import display

class PlayVideo(IFrame):
  """IFrame that embeds a video player for a Bilibili or OSF video.

  Args:
    id: video identifier on the hosting platform (also stored as `self.id`).
    source: hosting platform, either 'Bilibili' or 'Osf'.
    page: page number inserted into the Bilibili player URL.
    width: iframe width.
    height: iframe height.
    **kwargs: forwarded unchanged to `IFrame`.

  Raises:
    ValueError: if `source` is not one of the supported platforms.
  """
  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):
    self.id = id
    if source == 'Bilibili':
      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'
    elif source == 'Osf':
      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'
    else:
      # Fail with a clear message instead of an UnboundLocalError on `src`
      raise ValueError(f"Unsupported video source: {source!r} (expected 'Bilibili' or 'Osf')")
    super(PlayVideo, self).__init__(src, width, height, **kwargs)

def display_videos(video_ids, W=400, H=300, fs=1):
  """Create one output widget per video, each showing the player and its link.

  Args:
    video_ids: sequence of (source, id) entries, where source is 'Youtube',
      'Bilibili' or 'Osf'.
    W: player width.
    H: player height.
    fs: value passed to the player as `fs`.

  Returns:
    List of `widgets.Output` objects, in the same order as `video_ids`.
  """
  panels = []
  for entry in video_ids:
    source, vid = entry[0], entry[1]
    panel = widgets.Output()
    with panel:
      if source == 'Youtube':
        video = YouTubeVideo(id=vid, width=W, height=H, fs=fs, rel=0)
        print(f'Video available at https://youtube.com/watch?v={video.id}')
      else:
        video = PlayVideo(id=vid, source=source, width=W, height=H,
                          fs=fs, autoplay=False)
        # Print the public link matching the hosting platform
        if source == 'Bilibili':
          print(f'Video available at https://www.bilibili.com/video/{video.id}')
        elif source == 'Osf':
          print(f'Video available at https://osf.io/{video.id}')
      display(video)
    panels.append(panel)
  return panels

# (platform, video id) pairs; one tab is created per entry
video_ids = [('Youtube', 'lHRP14mxXv8'), ('Bilibili', 'BV1jM4m1S7ek')]
tab_contents = display_videos(video_ids, W=854, H=480)
tabs = widgets.Tab()
tabs.children = tab_contents
# Label each tab with the platform name of its video
for i in range(len(tab_contents)):
  tabs.set_title(i, video_ids[i][0])
display(tabs)

In [None]:
# @title Submit your feedback
# Render the feedback widget for Video 1
content_review(f"{feedback_prefix}_Video_1")

### Coding Exercise 3: Plot Surfaces for Content / Awareness Inference

To explore the properties of the HOSS model, we can simulate inference at different levels of the hierarchy over the full 2D space of possible input X's. The left panel below shows that the probability of awareness (of any stimulus contents) rises in a graded manner from the lower left corner of the graph (low activation of any feature) to the upper right (high activation of both features). In contrast, the right panel shows that confidence in making a discrimination response (e.g. rightward vs. leftward) increases away from the major diagonal, as the model becomes sure that the sample was generated by either a leftward or rightward tilted stimulus.

Together, the two surfaces make predictions about the relationships we might see between discrimination confidence and awareness in a simple psychophysics experiment. One notable prediction is that discrimination could still be possible - and lead to some degree of confidence - even when the higher-order node is "reporting" unawareness of the stimulus.

Now, let's get hands-on and plot those auto-generated patterns!


In [None]:
def _kl_divergence(posterior, prior):
    """Return KL(posterior || prior), treating terms with zero posterior mass as 0."""
    # `!= 0` (rather than `> 0`) lets NaNs in the posterior propagate instead of being hidden
    support = posterior != 0
    return (posterior[support] * np.log(posterior[support] / prior[support])).sum()


def HOSS_evaluate(X, mu, Sigma, Aprior, Wprior):
    """
    Inference on 2D Bayes net for asymmetric inference on presence vs. absence.

    Args:
        X: observed sample, passed to `multivariate_normal.pdf` (its dimension
            must match the rows of `mu`).
        mu: one mean per world state W; the first entry is the "absence" state,
            so `len(mu)` must equal `len(Wprior) + 1`.
        Sigma: covariance matrix shared by all world states.
        Aprior: prior probability of the "aware" state (A = 1).
        Wprior: prior over the non-absence world states given awareness.

    Returns:
        post_W: posterior over world states (same order as `mu`).
        post_A: posterior over awareness, `[P(unaware | X), P(aware | X)]`.
        KL_W: KL divergence from the prior to the posterior over W.
        KL_A: KL divergence from the prior to the posterior over A.
    """

    # Initialise variables and conditional prob tables
    p_A = np.array([1 - Aprior, Aprior])  # prior on awareness state A
    p_W_a1 = np.append(0, Wprior)  # likelihood of world states W given aware, first entry is absence
    p_W_a0 = np.append(1, np.zeros(len(Wprior)))  # likelihood of world states W given unaware, first entry is absence
    # Prior on W marginalising over A (for KL): weight each conditional by P(A).
    # A plain average of the two conditionals is only correct when Aprior == 0.5.
    p_W = p_A[0] * p_W_a0 + p_A[1] * p_W_a1

    # Compute likelihood of observed X for each possible W (P(X|mu_w, Sigma))
    lik_X_W = np.array([multivariate_normal.pdf(X, mean=mu_i, cov=Sigma) for mu_i in mu])
    # Rescale the likelihoods to sum to 1 across W; the posteriors below are
    # normalised again, so this only affects numerical scale
    p_X_W = lik_X_W / lik_X_W.sum()

    # Combine with likelihood of each world state w given awareness state A
    # (row 0: unaware, row 1: aware; columns: world states)
    lik_W_A = np.vstack((p_X_W * p_W_a0 * p_A[0], p_X_W * p_W_a1 * p_A[1]))
    post_A = lik_W_A.sum(axis=1)  # sum over W
    post_A = post_A / post_A.sum()  # normalise

    # Posterior over W (P(W|X=x) marginalising over A)
    post_W = lik_W_A.sum(axis=0)  # sum over A
    post_W = post_W / post_W.sum()  # normalise

    # KL divergences (0 * log 0 is taken as 0, so degenerate priors do not give NaN)
    KL_W = _kl_divergence(post_W, p_W)
    KL_A = _kl_divergence(post_A, p_A)

    return post_W, post_A, KL_W, KL_A

In [None]:
# Evaluate the HOSS model on a 2D grid of inputs (X1, X2) and plot two surfaces:
# the posterior probability of awareness and the confidence in stimulus identity.
# Define the grid
xgrid = np.arange(0, 2.01, 0.01)

# Define the means for the Gaussian distributions
# (one row per world state; HOSS_evaluate treats the first row as "absence")
mu = np.array([[0.5, 0.5], [0.5, 1.5], [1.5, 0.5]])

# Define the covariance matrix
Sigma = np.array([[1, 0], [0, 1]])

# Prior probabilities
# (Wprior: prior over the two non-absence world states; Aprior: prior probability of awareness)
Wprior = np.array([0.5, 0.5])
Aprior = 0.5

# Initialize arrays to hold confidence and posterior probability
# (first index follows X1, second index follows X2)
confW = np.zeros((len(xgrid), len(xgrid)))
posteriorAware = np.zeros((len(xgrid), len(xgrid)))
KL_w = np.zeros((len(xgrid), len(xgrid)))
KL_A = np.zeros((len(xgrid), len(xgrid)))

# Compute confidence and posterior probability for each point in the grid
for i, xi in tqdm(enumerate(xgrid), total=len(xgrid), desc='Outer Loop'):
    for j, xj in enumerate(xgrid):
        X = [xi, xj]
        post_w, post_A, KL_w[i, j], KL_A[i, j] = HOSS_evaluate(X, mu, Sigma, Aprior, Wprior)
        # Confidence: the larger posterior among the two non-absence world states
        confW[i, j] = max(post_w[1], post_w[2])
        # post_A[1] is the posterior probability of the "aware" state
        posteriorAware[i, j] = post_A[1]

with plt.xkcd():

    # Plotting
    plt.figure(figsize=(10, 5))

    # Posterior probability "seen"
    # (arrays are indexed [X1, X2]; contourf expects rows along the y-axis, hence the transpose)
    plt.subplot(1, 2, 1)
    plt.contourf(xgrid, xgrid, posteriorAware.T)
    plt.colorbar()
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.title('Posterior probability "seen"')
    plt.axis('square')

    # Confidence in identity
    plt.subplot(1, 2, 2)
    # NOTE: `contour_set` is not used again in this cell
    contour_set = plt.contourf(xgrid, xgrid, confW.T)
    plt.colorbar()
    plt.contour(xgrid, xgrid, posteriorAware.T, levels=[0.5], linewidths=4, colors=['white'])  # Line contour for threshold
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.title('Confidence in identity')
    plt.axis('square')

    plt.show()

### Simulate KL-divergence surfaces

We can also simulate KL-divergences (a measure of Bayesian surprise) at each layer in the network, which, under predictive coding models of the brain, have been proposed to scale with neural activation (e.g., Friston, 2005; Summerfield & de Lange, 2014).

In [None]:
# Evaluate the HOSS model on the same 2D grid as in the previous cell and plot the
# KL divergence (prior -> posterior) at the perceptual (W) and awareness (A) levels.
# Define the grid
xgrid = np.arange(0, 2.01, 0.01)

# Define the means for the Gaussian distributions
# (one row per world state; HOSS_evaluate treats the first row as "absence")
mu = np.array([[0.5, 0.5], [0.5, 1.5], [1.5, 0.5]])

# Define the covariance matrix
Sigma = np.array([[1, 0], [0, 1]])

# Prior probabilities
Wprior = np.array([0.5, 0.5])
Aprior = 0.5

# Initialize arrays to hold confidence and posterior probability
# (first index follows X1, second index follows X2)
confW = np.zeros((len(xgrid), len(xgrid)))
posteriorAware = np.zeros((len(xgrid), len(xgrid)))
KL_w = np.zeros((len(xgrid), len(xgrid)))
KL_A = np.zeros((len(xgrid), len(xgrid)))

# Compute confidence and posterior probability for each point in the grid
for i, xi in enumerate(xgrid):
    for j, xj in enumerate(xgrid):
        X = [xi, xj]
        post_w, post_A, KL_w[i, j], KL_A[i, j] = HOSS_evaluate(X, mu, Sigma, Aprior, Wprior)

        # Confidence: the larger posterior among the two non-absence world states
        confW[i, j] = max(post_w[1], post_w[2])
        posteriorAware[i, j] = post_A[1]

# Calculate the mean K-L divergence for absent and present awareness states
# (grid points are split by whether the posterior probability of awareness reaches 0.5;
# these four means are used by the bar plots in a later cell)
KL_A_absent = np.mean(KL_A[posteriorAware < 0.5])
KL_A_present = np.mean(KL_A[posteriorAware >= 0.5])
KL_w_absent = np.mean(KL_w[posteriorAware < 0.5])
KL_w_present = np.mean(KL_w[posteriorAware >= 0.5])

with plt.xkcd():

    # Plotting
    plt.figure(figsize=(18, 6))

    # K-L divergence, perceptual states
    # (arrays are indexed [X1, X2]; contourf expects rows along the y-axis, hence the transpose)
    plt.subplot(1, 2, 1)
    plt.contourf(xgrid, xgrid, KL_w.T, cmap='viridis')
    plt.colorbar()
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.title('KL-divergence, perceptual states')
    plt.axis('square')

    # K-L divergence, awareness state
    plt.subplot(1, 2, 2)
    plt.contourf(xgrid, xgrid, KL_A.T, cmap='viridis')
    plt.colorbar()
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.title('KL-divergence, awareness state')
    plt.axis('square')

    plt.show()

### Discussion point

Can you recognise the difference between the KL divergence for the W-level and the one for the A-level?

In [None]:
# to_remove explanation
"""
At the level of perceptual states W, there is a substantial asymmetry in the KL-divergence expected when the
model says ‘seen’ vs. ‘unseen’ (lefthand panel). This is due to the large belief updates invoked in the
perceptual layer W by samples that deviate from the lower lefthand corner - from absence. In contrast, when
we compute KL-divergence for the A-level (righthand panel), the level of prediction error is symmetric across
seen and unseen decisions, leading to "hot" zones both at the upper righthand (present) and lower lefthand
(absent) corners of the 2D space.

Intuitively, this means that at the W-level, there's a noticeable difference in the KL-divergence values
between "seen" and "unseen" predictions. This large difference is mainly due to significant updates in the
model's beliefs at this level when the detected samples are far from what is expected under the condition of
"absence." However, when we analyze the K-L divergence at the A-level, the discrepancies in prediction errors
between "seen" and "unseen" are balanced. This creates equally strong responses in the model, whether something
is detected or not detected.

We can also sort the KL-divergences as a function of whether the model "reported" presence or absence. As
can be seen in the bar plots below, there is more asymmetry in the prediction error at the W compared to the
A levels.

"""

In [None]:
# Bar plots of the mean KL divergence at each level, split by whether the model
# "reported" unseen or seen. Uses KL_w_absent, KL_w_present, KL_A_absent and
# KL_A_present computed in the KL-divergence surface cell.
with plt.xkcd():

    # Create figure with specified size
    plt.figure(figsize=(10, 5))

    # KL divergence for W states
    plt.subplot(1, 2, 1)
    plt.bar(['unseen', 'seen'], [KL_w_absent, KL_w_present], color='k')
    plt.ylabel('KL divergence, W states')
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)

    # KL divergence for A states
    plt.subplot(1, 2, 2)
    plt.bar(['unseen', 'seen'], [KL_A_absent, KL_A_present], color='k')
    plt.ylabel('KL divergence, A states')
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)

    plt.tight_layout()

    # Show plot
    plt.show()

In [None]:
# @title Submit your feedback
# Render the feedback widget for the HOSS bonus content
content_review(f"{feedback_prefix}_HOSS_Bonus_Content")

---
# Discussion
This section contains an extra discussion exercise if you have time and inclination.

In this bonus section, Megan and Anil will delve into the complexities of defining and testing for consciousness, particularly in the context of artificial intelligence. We will explore various theoretical perspectives, examine classic and contemporary tests for consciousness, and discuss the challenges and ethical implications of determining whether a system truly possesses conscious experience.

In [None]:
# @title Video 2: Consciousness Bonus Content

from ipywidgets import widgets
from IPython.display import YouTubeVideo
from IPython.display import IFrame
from IPython.display import display

class PlayVideo(IFrame):
  """IFrame that embeds a video player for a Bilibili or OSF video.

  Args:
    id: video identifier on the hosting platform (also stored as `self.id`).
    source: hosting platform, either 'Bilibili' or 'Osf'.
    page: page number inserted into the Bilibili player URL.
    width: iframe width.
    height: iframe height.
    **kwargs: forwarded unchanged to `IFrame`.

  Raises:
    ValueError: if `source` is not one of the supported platforms.
  """
  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):
    self.id = id
    if source == 'Bilibili':
      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'
    elif source == 'Osf':
      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'
    else:
      # Fail with a clear message instead of an UnboundLocalError on `src`
      raise ValueError(f"Unsupported video source: {source!r} (expected 'Bilibili' or 'Osf')")
    super(PlayVideo, self).__init__(src, width, height, **kwargs)

def display_videos(video_ids, W=400, H=300, fs=1):
  """Create one output widget per video, each showing the player and its link.

  Args:
    video_ids: sequence of (source, id) entries, where source is 'Youtube',
      'Bilibili' or 'Osf'.
    W: player width.
    H: player height.
    fs: value passed to the player as `fs`.

  Returns:
    List of `widgets.Output` objects, in the same order as `video_ids`.
  """
  panels = []
  for entry in video_ids:
    source, vid = entry[0], entry[1]
    panel = widgets.Output()
    with panel:
      if source == 'Youtube':
        video = YouTubeVideo(id=vid, width=W, height=H, fs=fs, rel=0)
        print(f'Video available at https://youtube.com/watch?v={video.id}')
      else:
        video = PlayVideo(id=vid, source=source, width=W, height=H,
                          fs=fs, autoplay=False)
        # Print the public link matching the hosting platform
        if source == 'Bilibili':
          print(f'Video available at https://www.bilibili.com/video/{video.id}')
        elif source == 'Osf':
          print(f'Video available at https://osf.io/{video.id}')
      display(video)
    panels.append(panel)
  return panels

# (platform, video id) pairs; one tab is created per entry
video_ids = [('Youtube', '00dL8q7WgcU'), ('Bilibili', 'BV12n4y1Q7C2')]
tab_contents = display_videos(video_ids, W=854, H=480)
tabs = widgets.Tab()
tabs.children = tab_contents
# Label each tab with the platform name of its video
for i in range(len(tab_contents)):
  tabs.set_title(i, video_ids[i][0])
display(tabs)

In [None]:
# @title Submit your feedback
# Render the feedback widget for Video 2
content_review(f"{feedback_prefix}_Video_2")

## Discussion activity: Is it actually conscious?

We discussed the difference between these two...
- "Forward" tests: passing means the machine is conscious (or intelligent).
- "Reverse" tests: passing means humans are convinced that a machine is conscious (or intelligent).

**Discuss!** If a system (AI, other animal, other human) exhibited all the "right signs" of being conscious, how can we know for sure it is actually conscious? How could you design a test to be a true forward test?

- Room 1: I think you could design a forward test in this way... [share your ideas]
- Room 2: I think a forward test is impossible, and here's why [share your ideas]

In [None]:
# @title Submit your feedback
# Render the feedback widget for the discussion activity
content_review(f"{feedback_prefix}_Discussion_activity")