<a href="https://colab.research.google.com/github/luca-g97/Master-Thesis/blob/main/Interactive_AI_Playground.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initialization: Imports and Dependencies

In [None]:
# @title Click `Show code` in the code cell. { display-mode: "form" }
# Set TF_ENABLE_ONEDNN_OPTS=0 (TensorFlow's switch for oneDNN custom ops) in the
# kernel's own environment. A `!export ...` line runs in a temporary subshell,
# so its setting would be gone before any library in this notebook is imported.
import os
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

# Suppress FutureWarning globally
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Detect the host: the notebook counts as "local" unless the string form of the
# IPython shell mentions 'google.colab'.
# If locally hosted, make sure all dependencies are generated
runningLocal = 'google.colab' not in str(get_ipython())
print("Running locally" if runningLocal else "Running in Colab")

# if(runningLocal):
import sys
import site

# Add the user site-packages directory to the Python path (for local usage especially)
site.addsitedir('/tf/.local/lib/python3.11/site-packages')

In [None]:
# @title Click `Show code` in the code cell. { display-mode: "form" }

# Upgrade pip itself (quietly) before the package installs done in later cells.
!python3 -m pip install -q --upgrade pip

# The pinned installs below are intentionally left commented out; uncomment
# them to install exactly these versions.
# # Install specific versions of the libraries
# !pip install -q torch==2.4.1
# !pip install -q keras==3.4.1
# !pip install -q tqdm==4.66.5
# !pip install -q plotly==5.24.1
# !pip install -q ipython==7.34.0

**If something doesn't work, please uncomment the block above and install the specific versions - those were the ones we last tested with.**

In [None]:
# @title Click `Show code` in the code cell. { display-mode: "form" }

import timeit

#Libraries related to Torch
import torch
from torch.utils.data import DataLoader
from torch import nn
import torch.nn.functional as F

#Libraries needed for MNIST dataset
from tqdm import tqdm
from keras.datasets import mnist
from keras.utils import to_categorical

#Libraries needed for HSV-RGB
import colorsys

#Libraries needed for LLM dataset
!pip install -q lorem==0.1.1
!pip install -q tiktoken==0.8.0
#!pip install -q numpy==1.26.4 scipy spacy
#!python -m spacy download en_core_web_sm -q
#import spacy
import lorem
import tiktoken
import random
from transformers import GPT2Tokenizer
#!pip install -q datasets
#from datasets import load_dataset
!pip install -q stanza
import stanza
# Suppress logging from stanza
nlp = stanza.Pipeline('en', verbose=False)
stanza.download('en', verbose=False)

#Library needed for correct displaying widgets in Google Colab
!pip install -q ipywidgets==7.7.1

#Libraries needed for Visualization
!pip install -q plotly
from IPython.display import display, clear_output
import plotly.graph_objects as go
import plotly.io as pio
# Make 'colab' the default Plotly renderer for all figures shown afterwards.
# NOTE(review): this is set unconditionally, i.e. also when runningLocal is
# True - confirm the renderer behaves as intended in the local setup.
pio.renderers.default = 'colab'

#Libraries needed for Zipping
import io
import pandas as pd
!pip install -q pyarrow
import pyarrow as pa
!pip install -q pyarrow-parquet
import pyarrow.parquet as pq
!pip install -q zstandard
import zstandard as zstd

# Pick the compute device once for the whole notebook: CUDA when torch reports
# an available GPU, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device type: ", device)

**If the notebook is run via Colab, download all necessary files from GitHub if they aren't downloaded yet. If you run it locally, please provide them yourself to be able to change them at runtime. We recommend using the following Docker image for this: https://hub.docker.com/r/lucag97/colab-server**

In [None]:
#@title Click `Show code` in the code cell. { display-mode: "form" }

import os

# Helper modules that must sit next to the notebook for the later cells to run
expected_files = [
    'Customizable_RENN.py',
    'Images_HSVRGB.py',
    'Images_MNIST.py',
    'LLM_Small1x1.py',
    'LLM_Verdict.py',
    'Widgets.py',
]

def checkIfAllFilesArePresent(expected_files):
    """Report whether every entry of ``expected_files`` exists in the working directory.

    Each entry is resolved relative to "./". The search stops at the first
    entry that does not exist; that path is printed as ``Missing: <path>``
    and False is returned. True is returned when nothing is missing
    (including for an empty list).
    """
    #candidate_paths would use "/content/" instead of "./" for a fixed Colab location
    candidate_paths = (os.path.join("./", name) for name in expected_files)
    first_missing = next((p for p in candidate_paths if not os.path.exists(p)), None)
    if first_missing is None:
        return True
    print(f'Missing: {first_missing}')
    return False

#If not locally running and not all files are present
if not ((checkIfAllFilesArePresent(expected_files)) and runningLocal):
    #!rm -rf /content/*
    !rm -rf ./*
    #Requesting the relevant files from Github
    #!git clone "https://github.com/luca-g97/Master-Thesis.git" /content/Interactive-AI-Playground
    !git clone "https://github.com/luca-g97/Master-Thesis.git" ./Interactive-AI-Playground

    #Move the files into the content folder and delete the original folder afterwards
    #!mv /content/Interactive-AI-Playground/Google_Colab-Interactive_AI_Playground/* /content/
    !mv ./Interactive-AI-Playground/Google_Colab-Interactive_AI_Playground/* ./
    #!rm -rf /content/Interactive-AI-Playground/
    !rm -rf ./Interactive-AI-Playground/

# Initialization: Datasets

**MNIST: Number Classification**

In [None]:
# Load the project-local MNIST helper module and hand it the objects imported
# earlier in the notebook (Keras `mnist`, `to_categorical`, torch `nn`,
# `DataLoader`) together with the selected compute device.
import Images_MNIST as MNIST
MNIST.initializePackages(mnist, to_categorical, nn, DataLoader, device)
# Train/test sets; these are registered under "MNIST" in the `datasets` dict below.
trainSetMNIST, testSetMNIST = MNIST.createTrainAndTestSet()

**HSV -> RGB Conversion**

In [None]:
# Load the project-local HSV->RGB helper module and pass it `colorsys`, the
# Plotly handles (`go`, `pio`), `DataLoader` and the compute device.
import Images_HSVRGB as HSVRGB
HSVRGB.initializePackages(colorsys, go, pio, DataLoader, device)
#Set visualize to True if you want to see the 3D-cube containing both the train and test samples
# presumably 50000 = number of train samples and 10000 = number of test samples
# - TODO confirm against Images_HSVRGB.createTrainAndTestSet
trainSetHSVRGB, testSetHSVRGB = HSVRGB.createTrainAndTestSet(50000, 10000, visualize=False)

**Small 1x1: LLM-Sourcecheck**

In [None]:
# Load the project-local "Small 1x1" helper module and pass it `random`,
# `lorem`, the compute device, `tiktoken` and `DataLoader`.
import LLM_Small1x1 as Small1x1
Small1x1.initializePackages(random, lorem, device, tiktoken, DataLoader)
# A single object is returned; it is sliced 80/20 into train/test when the
# `datasets` dict is built. The meaning of the argument 100 is defined inside
# LLM_Small1x1 (not visible here).
small1x1 = Small1x1.createTrainAndTestSet(100)

**The Verdict: LLM-Sourcecheck**

In [None]:
# Load the project-local "The Verdict" helper module; in addition to the
# arguments used for Small 1x1 it receives the stanza pipeline `nlp` and
# `GPT2Tokenizer`.
import LLM_Verdict as Verdict
Verdict.initializePackages(random, lorem, device, tiktoken, DataLoader, nlp, GPT2Tokenizer)
# Exactly one of the three data sources is active; the other two calls are
# kept commented out as alternatives.
#verdict = Verdict.createTrainSet()
verdict = Verdict.createWikiTrainSet("sports")
#verdict = Verdict.createEnglishWikiTrainSet("./english_wikipedia/data/train-00021-of-00022-8014350d27e6cde7.parquet")

In [None]:
# Print the object returned by createWikiTrainSet for manual inspection.
# NOTE(review): depending on its size this may produce a very long output.
print(verdict)

# Initialization

In [None]:
# The two LLM datasets arrive as one sequence each; cut them at 80% of their
# length so that the first part serves as train data and the rest as test data.
small1x1SplitIndex = int(len(small1x1) * 0.8)
verdictSplitIndex = int(len(verdict) * 0.8)

# Maps each dataset name to its (train, test) pair.
datasets = {
    "MNIST": (trainSetMNIST, testSetMNIST),
    "HSV-RGB": (trainSetHSVRGB, testSetHSVRGB),
    "Small 1x1": (small1x1[:small1x1SplitIndex], small1x1[small1x1SplitIndex:]),
    "The Verdict": (verdict[:verdictSplitIndex], verdict[verdictSplitIndex:]),
}

**After the first run of the notebook, the number of samples in the Training tab is sometimes reset to 10 for an unknown reason. Just rerun the cell below and everything should work again.**

In [None]:
# Load the project-local widget module and initialise it with the MNIST
# train/test sets and the full `datasets` mapping. The widget objects it
# exposes (e.g. Widgets.datasetChoice) are read in the selection cell below.
import Widgets
Widgets.initialize(trainSetMNIST, testSetMNIST, datasets)

**To use your settings, please click on the cell below and choose "Runtime -> Run cell and below" from the options menu of Colab**

# Selection Initialization

In [None]:
# @title Click `Show code` in the code cell. { display-mode: "form" }

datasetChoice = Widgets.datasetChoice.value

# Dispatch table from the dataset name chosen in the widget to its helper module.
# All four modules are imported by the dataset cells above.
datasetModules = {
    "MNIST": MNIST,
    "HSV-RGB": HSVRGB,
    "Small 1x1": Small1x1,
    "The Verdict": Verdict,
}
# An unknown name leaves chosenDataSet untouched.
if datasetChoice in datasetModules:
    chosenDataSet = datasetModules[datasetChoice]

# An empty string stands for "no fixed seed" (widget set to "Random").
seed = Widgets.seedChoice.value if Widgets.seedChoice.value != "Random" else ""
useBitLinear = Widgets.useBitLinearChoice.value

if datasetChoice not in ("Small 1x1", "The Verdict"):
    # Image datasets: one (layer type, layer size, activation) tuple per
    # configured layer, read from the three parallel widget lists ...
    hidden_sizes = [
        (layerType.value, layerSize.value, activation.value)
        for layerType, layerSize, activation in zip(
            Widgets.normalLayerChoice,
            Widgets.normalLayerSizeChoice,
            Widgets.activationLayerChoice,
        )
    ]
    # ... followed by the output layer, whose size is fixed to 10.
    hidden_sizes += [(Widgets.outputLayerChoice.value, 10, Widgets.outputActivationLayerChoice.value)]

else:
    # LLM datasets: the dataset module supplies the layer descriptions.
    LLM_Layers, TransformerBlockLayer = chosenDataSet.setGPTSettings(Widgets.layerAmountChoice.value, Widgets.learningRateChoice.value, Widgets.epochsChoice.value)
    # Flat layer list: first LLM layer, then the entries of one transformer
    # block repeated once per requested layer, then the entries of LLM_Layers[1].
    hidden_sizes = [LLM_Layers[0]]
    for _ in range(Widgets.layerAmountChoice.value):
        hidden_sizes.extend(TransformerBlockLayer)
    hidden_sizes.extend(LLM_Layers[1])

# One ((first, second), activation type) entry per configured layer, taken from
# the first two elements of each neuronChoice widget value, plus a final entry
# for the output layer.
# presumably the pair is a (from, to) neuron range - TODO confirm in Widgets
visualizeCustom = [
    ((neuronRange.value[0], neuronRange.value[1]), activationType.value)
    for neuronRange, activationType in zip(Widgets.neuronChoice, Widgets.activationLayerTypeChoice)
] + [
    (
        (Widgets.outputLayerSizeChoice.value[0], Widgets.outputLayerSizeChoice.value[1]),
        Widgets.outputLayerActivationChoiceType.value,
    )
]

# --- Training hyper-parameters, read from the widgets ---
epochs, learning_rate = Widgets.epochsChoice.value, Widgets.learningRateChoice.value
train_samples, test_samples = Widgets.trainSamplesChoice.value, Widgets.testSamplesChoice.value
batch_size_training, batch_size_test = Widgets.batchSizeTraining.value, Widgets.batchSizeTest.value
loss_function = Widgets.lossChoice.value
optimizer = Widgets.optimizerChoice.value
eval_samples = Widgets.evalSamplesChoice.value

# --- Source-check / visualization settings ---
# The number of closest sources can never exceed the number of train samples.
closestSources = min(Widgets.closestSourcesChoice.value, train_samples)
showClosestMostUsedSources = Widgets.showClosestMostUsedSourcesChoice.value
visualizationChoice = Widgets.visualizationChoice.value

In [None]:
# @title Click `Show code` in the code cell. { display-mode: "form" }

import numpy as np
from collections import defaultdict

# Structured record layout: a source label (up to 20 unicode characters), its
# value and a difference, the latter two as 64-bit floats.
dtype = np.dtype([('source', 'U20'), ('value', 'f8'), ('difference', np.float64)])

# Placeholder record that matches the layout above
fill_value = np.array(('None', np.inf, np.inf), dtype=dtype)

#evaluationTest = np.full((train_samples+eval_samples, len(hidden_sizes), np.max([h[1] for h in hidden_sizes]), closestSources),fill_value=fill_value, dtype=dtype)

# Assumed bytes per stored element: source (3) + value (8) + difference (8)
element_memory = 3 + 8 + 8
bytes_per_mb = 1024 ** 2

# Element counts summed over all layers (layer[1] is the layer's neuron count):
# the sparse variant keeps `closestSources` entries per neuron and evaluation
# sample, the list variant keeps one entry per train sample.
total_non_zero_elements = sum(
    eval_samples * layer[1] * closestSources for layer in hidden_sizes
)
total_non_zero_elements_in_list = sum(
    eval_samples * layer[1] * train_samples for layer in hidden_sizes
)

# Bytes needed (every element is counted twice), then converted to MB
total_storage_bytes = (total_non_zero_elements * 2) * element_memory
total_storage_for_list_bytes = (total_non_zero_elements_in_list * 2) * element_memory
total_storage_mb = total_storage_bytes / bytes_per_mb
total_storage_for_list_mb = total_storage_for_list_bytes / bytes_per_mb

print(f"Total memory usage for the sparse tensor: {total_storage_mb:.2f} MB")
print(f"Total memory usage for the list: {total_storage_for_list_mb:.2f} MB")

In [None]:
# Load the project-local RENN module and pass it the compute device, the
# serialisation libraries imported earlier (io, pandas, pyarrow,
# pyarrow.parquet, zstandard) and the seed / BitLinear choices read from the
# widgets.
import Customizable_RENN as RENN
RENN.initializePackages(device, io, pd, pa, pq, zstd, seed, useBitLinear)

# Training

In [None]:
# Prepare the selected dataset module with the sample counts, batch sizes and
# seed chosen in the widgets, then train a model with the configured layers,
# loss function, optimizer, learning rate and number of epochs.
chosenDataSet.initializeDatasets(train_samples, test_samples, eval_samples, batch_size_training, batch_size_test, seed)
chosenDataSet.trainModel(hidden_sizes, loss_function, optimizer, learning_rate, epochs)

# Sourcecheck Customisable RENN

In [None]:
# Run the hook initialisation of the selected dataset module exactly once and
# report how long that single run took.
elapsed_time = timeit.Timer(
    lambda: chosenDataSet.initializeHook(hidden_sizes, train_samples)
).timeit(number=1)
print(f"Time taken: {elapsed_time:.2f} seconds")

# Visualization Customisable RENN

In [None]:
# Run the visualization of the selected dataset module exactly once and report
# how long that single run took.
# NOTE(review): the meaning of the trailing `True` argument is defined in the
# dataset module - not visible here.
elapsed_time = timeit.Timer(
    lambda: chosenDataSet.visualize(hidden_sizes, closestSources, showClosestMostUsedSources, visualizationChoice, visualizeCustom, True)
).timeit(number=1)
print(f"Time taken: {elapsed_time:.2f} seconds")

**Only for HSV-RGB conversion: The visualization cell (cell above) must be run first, otherwise nothing will be displayed!**

In [None]:
# @title Click `Show code` in the code cell. { display-mode: "form" }

if(datasetChoice == "HSV-RGB"):
    # The layer slider spans all entries of hidden_sizes; the neuron slider
    # spans the neurons (index 1 of the layer entry) of the selected layer.
    layerChoiceSlider = Widgets.createIntSlider(value=0, min=0, max=len(hidden_sizes)-1, description="Layer")
    neuronChoiceSlider = Widgets.createIntSlider(value=0, min=0, max=hidden_sizes[layerChoiceSlider.value][1]-1, description="Neuron")

    def update_sliders(*args):
        """Re-limit the neuron slider to the selected layer and redraw both sliders."""
        neuronChoiceSlider.max = hidden_sizes[layerChoiceSlider.value][1]-1
        clear_output()
        display(layerChoiceSlider, neuronChoiceSlider)

    # Both sliders call the same callback whenever their value changes.
    layerChoiceSlider.observe(update_sliders, names='value')
    neuronChoiceSlider.observe(update_sliders, names='value')

    display(layerChoiceSlider, neuronChoiceSlider)

**Only for HSV-RGB conversion: Pick the layer and neuron you want to be displayed, then run the cell below to display it**

In [None]:
# @title Click `Show code` in the code cell. { display-mode: "form" }

if(datasetChoice == "HSV-RGB"):
    def update_visualization(*args):
        """Clear the cell output and draw the 3D cube for the currently selected layer and neuron."""
        clear_output()
        # The last argument is the highest neuron index of the selected layer.
        chosenDataSet.visualize3DCube(closestSources, layerChoiceSlider.value, neuronChoiceSlider.value, hidden_sizes[layerChoiceSlider.value][1]-1)

    #Observing causes problems here, which are not fixable somehow (Windows disappearing)
    #layerChoiceSlider.observe(update_visualization, names='value')
    #neuronChoiceSlider.observe(update_visualization, names='value')

    # Draw once for the current slider values; rerun this cell after changing them.
    update_visualization()