### KARPATHY NANO-GPT FOLLOW ALONG

### -- * [ Utilities Import ] * --

In [3]:
import sys
from pathlib import Path

# @note 🧠 : In Jupyter cell use `Path.cwd().parent` to get the parent directory 
# of the current working directory
parent_directory = str(Path.cwd().parent)
if parent_directory not in sys.path:
    sys.path.append(parent_directory)

from utils.dataframe.util_dataframe_table import UtilDataFrameTable as df_table

### -- [PLAYGROUND] -- character prediction model

In [2]:
#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


### -- * [ Read Data From File ] * --

In [4]:
# Load the whole tiny-shakespeare corpus into memory as a single string
text = Path("input.txt").read_text(encoding="utf-8")

print(f"length of dataset in characters : {len(text)}")


length of dataset in characters : 1115394


In [6]:
# let's look at the first 100 characters
print(text[:100])


First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


### -- * [ Gather Unique Character Sets ] * --

In [5]:
# Build the vocabulary: every distinct character of `text`, in ascending
# code-point order
# - set(text) drops ALL duplicate characters (a set is an unordered collection
#   of unique elements)
# - sorted() returns a new list with those characters in ascending order
# - if text = 'Hello, world' the resulting chars list will be :
# [' ', ',', 'H', 'd', 'e', 'l', 'o', 'r', 'w']
chars = sorted(set(text))
# here are all the unique characters in the file
# NOTE : '\n' (code point 10) sorts before ' ' (code point 32), so the newline
# is the 1st entry and the space is the 2nd
print(
    "--[characters]--"
    "\n- 1st entry is the [newline] character"
    "\n- 2nd entry is the [space] character"
    f"\n{''.join(chars)}\n"
)
print("--[vocab size]--\n" f"\n  {len(chars)}")


--[characters]--
- 1st entry is the [space] character
- 2nd entry is the [newline] character

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz

--[vocab size]--

  65


### -- * [ Tokenize ] * --

`Mapping` from `Characters to Integers` tradeoffs :

- `CHARACTER` to integer mapping => long integer sequence, SMALLER vocabulary  
- `SUBWORD` tokenization => short integer sequence, LARGER vocabulary
  
For this exercise, we will favor simpler `CHARACTER` tokenization, however large  
production systems would likely use `SUBWORD` tokenization

These mappings can be used to convert a string into a sequence of integers or  
vice versa, which is `useful for working with machine learning models` that  
`require numerical input`. 

> Note that the code snippet comments mention subword tokenization as an  
alternative technique often used in large production systems.

Compared with whole-word tokenization, subword tokenization can create :  
- a smaller vocabulary size  
- while preserving the information about common subwords and morphemes in the  
text.  

In [6]:
import numpy as np
import pandas as pd
from IPython.display import display

# Two lookup tables derived from the sorted vocabulary `chars` :
# - itos : integer id -> character (the id is the character's index in `chars`)
# - stoi : character -> integer id (the inverse of itos)
itos = dict(enumerate(chars))  # integer to string
stoi = {ch: i for i, ch in itos.items()}  # string to integer

# Flatten the (id, character) pairs into one list : [id0, ch0, id1, ch1, ...]
flatten_array = []
for token_id, ch in itos.items():
    flatten_array.extend((token_id, ch))
table = df_table.array_to_table(np.array(flatten_array))
df_table.show_table(pd.DataFrame(table), label="itos")




--[itos]--


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0,\n,1,,2,!,3,$,4,&,5,'
1,6,",",7,-,8,.,9,3,10,:,11,;
2,12,?,13,A,14,B,15,C,16,D,17,E
3,18,F,19,G,20,H,21,I,22,J,23,K
4,24,L,25,M,26,N,27,O,28,P,29,Q
5,30,R,31,S,32,T,33,U,34,V,35,W
6,36,X,37,Y,38,Z,39,a,40,b,41,c
7,42,d,43,e,44,f,45,g,46,h,47,i
8,48,j,49,k,50,l,51,m,52,n,53,o
9,54,p,55,q,56,r,57,s,58,t,59,u


True

### -- * [Encode and Decode] * --
- Character : Integer

In [8]:
# Encoder / decoder helpers built on the `stoi` / `itos` lookup tables
# - written as named functions rather than lambdas bound to names, so they
#   carry docstrings and show up by name in tracebacks


def encode(s):
    """
    Encoder : takes a string (sequence of characters) and returns a list of
    integers, one per character.

    Raises KeyError when `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """
    Decoder : takes an iterable of integers and returns the string spelled by
    the matching characters.

    Raises KeyError when an integer is not a key of `itos`.
    """
    return "".join(itos[i] for i in l)


# input_phrase = 'six eggs and avocado skins forever lol'
input_phrase = "Bee3"
print(encode(input_phrase))
print(decode(encode(input_phrase)))

# our shakespeare text doesn't have any numerical characters other than 3
# so this should error out
input_phrase = "Bee7"
try:
    print(encode(input_phrase))
except KeyError as e:
    print(f"--[ERROR]--\n{e}")


[14, 43, 43, 9]
Bee3
--[ERROR]--
'7'


### -- * [ Generate Data Tensor ] * --

In [9]:
# let's encode the entire shakespeare text dataset and store it in a torch.Tensor
import torch

# Encode the full corpus once and keep it as a 1-D tensor of token ids
data = torch.tensor(encode(text), dtype=torch.int64)

# Preview : the first 15 raw characters next to the ids the model will see
preview_text = text[:15]
preview_ids = data[:15]
report = "\n".join(
    [
        f"--[data.shape]-- {data.shape}",
        f"--[data.dtype]-- {data.dtype}",
        "",
        "# the 15 characters we looked at earlier to GPT look like this :",
        "",
        preview_text,
        f"{preview_ids}",
        "",
    ]
)
print(report)

# One line per id, with the character it maps back to
for token_id in preview_ids:
    print(f"--[{token_id}]-- {itos[int(token_id)]}")


--[data.shape]-- torch.Size([1115394])
--[data.dtype]-- torch.int64

# the 15 characters we looked at earlier to GPT look like this :

First Citizen:

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0])

--[18]-- F
--[47]-- i
--[56]-- r
--[57]-- s
--[58]-- t
--[1]--  
--[15]-- C
--[47]-- i
--[58]-- t
--[47]-- i
--[64]-- z
--[43]-- e
--[52]-- n
--[10]-- :
--[0]-- 



### -- * [ Split Training and Validation Sets ] *--

In [10]:
# Hold out the tail of the corpus for validation :
# - train_data : the first 90% of the token ids
# - val_data   : the remaining ids
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]


### **-- [Block Size] --**

When selecting the training data, you use `train_data[:block_size+1]` because  
you want to `predict` the `next` character after the `CURRENT BLOCK`, and that  
character is the first one of the `NEIGHBORING BLOCK`.

By including one extra character, you provide the model with `both` the : 

- `input` sequence (the first 8 characters) and  
- the `target` character (the 9th character) to predict. 

This allows the model to `learn` the relationship between the `input` sequence  
and the `subsequent` character, which is essential for character-level language  
modeling tasks.

The choice of `block_size` is a trade-off because:

1. Processing `8 characters` at a time allows the model to learn relationships  
between characters within `shorter sequences`, which can help with  
`generalization` and computational `efficiency`.
2. Using a larger `block_size` would let the model learn longer-range  
dependencies between characters, but it would also increase the memory  
requirements and computational complexity of the model.

In this example, the training data is sliced to include `block_size + 1`  
characters, which is 9 characters in total. The `reason for adding 1` to the  
block size is that the model needs to `predict the next character` given the  
current sequence.  

So, the first `8 characters are the input sequence`, and the `9th character` is  
the `target to be predicted`.

Here's a brief explanation of the code:

- **`train_data`**: This is the entire training dataset containing character  
sequences.
- **`block_size`**: The number of characters to be considered at a time during  
training.
- **`train_data[:block_size+1]`**: This slice of the training data includes the  
first 9 characters. The first `8 characters are the input`, and the  
`9th character` is the `target to be predicted` by the model.

In summary, the `block_size` is set to 8 to balance the trade-off between  
processing characters in shorter sequences and learning relationships between  
characters in the text. The additional character added to the block size is  
needed for the model to predict the next character in the sequence.

Higher res notes on **`BLOCK`** and **`BATCH`** size trade offs 
***
Setting the `batch_size` and `block_size` in a character-based language model  
can affect the training and performance of the model. Here's an explanation of  
the trade-offs and other considerations when choosing these values:

1. **Batch size (`batch_size`):** The number of independent sequences processed  
in parallel.

   - *Pros:*
     - Larger batch sizes can lead to faster training as the model processes  
     more samples in parallel. This can potentially lead to better utilization  
     of GPU resources.
     - Larger batch sizes can provide a better estimate of the gradient during  
     each training step, which can lead to more stable convergence.

   - *Cons:*
     - Larger batch sizes require more memory. If the batch size is too large,  
     it may not fit in the GPU memory, leading to out-of-memory errors.
     - Sometimes, larger batch sizes can lead to worse generalization  
     performance. Smaller batch sizes introduce more `noise in the gradient`  
     updates, which can effectively act as a `form of regularization`.

2. **Block size (`block_size`):** The maximum context length for prediction.

   - *Pros:*
     - Larger block sizes allow the model to learn longer-range dependencies in  
     the text, which can potentially improve modeling performance.
     - Larger block sizes can lead to better utilization of GPU resources, as  
     more positions of each sequence are processed in parallel.

   - *Cons:*
     - Larger block sizes require more memory. If the block size is too large,  
     it may not fit in the GPU memory, leading to out-of-memory errors.
     - Longer sequences may be more difficult for the model to learn, as the  
     gradients can have a harder time propagating through the entire sequence in  
     the backpropagation process (vanishing/exploding gradient problem).

It's essential to strike a balance between these trade-offs when choosing  
`batch_size` and `block_size`. You also need to consider other factors, such as:

- **Hardware limitations:** The GPU memory available on your system will limit  
the maximum batch size and block size you can use. You need to choose values  
that can efficiently utilize the available resources without causing  
out-of-memory issues.

- **Dataset size and complexity:** The size and complexity of your dataset can  
affect the optimal batch size and block size. For larger and more complex  
datasets, you may need to use larger block sizes to capture longer-range  
dependencies.

- **Model architecture:** The choice of model architecture can affect the  
optimal batch size and block size. For example, some architectures like  
`transformers can handle longer sequences more effectively` than recurrent  
neural networks (RNNs), so you might consider using larger block sizes with  
transformers.

In summary, choosing the right `batch` size and `block` size involves balancing  
various trade-offs :  

- considering hardware limitations,  
- dataset characteristics,  
- and model architecture.  

It's often helpful to experiment with different values to find the best combination that works for your specific problem and setup.

### -- * [ Train Data Load ] * --

In [11]:
# `block_size` is the context length : how many characters are considered at a
# time. It is hard-coded to 8 here as a trade off :
# - the model processes 8 characters at a time
# - it learns the relationships between characters in shorter sequences
block_size = 8  # characters to consider at a time, can't load entire file
# @note 🧠 : the slice below has block_size + 1 = 9 elements, so we ARE
# INCLUDING a CHARACTER from the NEIGHBORING BLOCK : the 9th element is the
# target for the full 8-character context
train_data[: block_size + 1]  # +1 because we need to predict the next character


tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

### -- | Predicting Batches |--

Predicting (even across `BATCHES`) the next character in a sequence :

- Using a `sliding window` approach demonstrates how to create input-output  
pairs for a character-level language model
- Which considers both the context of characters in the input sequence and the  
next character to be predicted 
- By `iterating` through the `input sequence` and `creating` `input-output pairs` for  
`each position`, the model learns to `predict` the `next character` based on the  
context of `previously seen characters`.

In [11]:
# The objective of the model is to predict the next character in a sequence
# given a context of previous characters
# TIME dimension : series of characters that are dependent on each other

# x_0 is the first `block_size`-character chunk of `train_data`
# x_1 is the next `block_size`-character chunk of `train_data`
x_0 = train_data[:block_size]
x_1 = train_data[block_size :block_size*2]
print(f"-- {x_0.shape} -- \n" f"-- [x_0] -- batch => (train_data[:block_size])\n{x_0}")

# @note 🧠 : passing -1 to reshape() will infer the correct dimension given :
# - the number of total elements in the tensor
# - the number of other dimensions that are specified
# The lambda passed as `func` looks a value up in the `itos` dictionary
df_table.show_table(x_0.reshape(-1, block_size), func=lambda x: itos[x])

print(f"-- [x_1] -- batch => (train_data[block_size :block_size*2])\n{x_1}\n")

df_table.show_table(x_1.reshape(-1, block_size), func=lambda x: itos[x])

# CREATES the target sequence `y` : the same window of `train_data` as `x_0`
# shifted one position to the right, so y[t] is the character that follows
# x_0[t]
# - +1 to both start/end because we want to predict the next character
y = train_data[1 : block_size + 1]
# loop iterates over each position `t` in the input sequence `x_0`
print(f" @t   context | x_0 | target | y[t] |\n" "----  -------         ------")
for t in range(block_size):
    # the `context` is created by taking the characters from BEGINNING of `x_0`
    # up to and INCLUDING the character at position `t`.
    context = x_0[: t + 1]
    # the corresponding target character for the current `context` is the
    # character at position `t` in the target sequence `y`
    target = y[t]
    print(
        f"[{t:2}] {context[-1]:6}     [{itos[int(context[-1])]}]  {target:5}     [{itos[int(target)]}]"
        f"    => {context} the target is {target}"
    )


-- torch.Size([8]) -- 
-- [x_0] -- batch => (train_data[:block_size])
tensor([18, 47, 56, 57, 58,  1, 15, 47])


Unnamed: 0,0,1,2,3,4,5,6,7
0,18,47,56,57,58,1.0,15,47
0,F,i,r,s,t,,C,i


-- [x_1] -- batch => (train_data[block_size :block_size*2])
tensor([58, 47, 64, 43, 52, 10,  0, 14])



Unnamed: 0,0,1,2,3,4,5,6,7
0,58,47,64,43,52,10,0,14
0,t,i,z,e,n,:,\n,B


 @t   context | x_0 | target | y[t] |
----  -------         ------
[ 0]     18     [F]     47     [i]    => tensor([18]) the target is 47
[ 1]     47     [i]     56     [r]    => tensor([18, 47]) the target is 56
[ 2]     56     [r]     57     [s]    => tensor([18, 47, 56]) the target is 57
[ 3]     57     [s]     58     [t]    => tensor([18, 47, 56, 57]) the target is 58
[ 4]     58     [t]      1     [ ]    => tensor([18, 47, 56, 57, 58]) the target is 1
[ 5]      1     [ ]     15     [C]    => tensor([18, 47, 56, 57, 58,  1]) the target is 15
[ 6]     15     [C]     47     [i]    => tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
[ 7]     47     [i]     58     [t]    => tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58


This code snippet shows how to create a batch of input-output pairs for training or validation from the dataset using a random sampling approach. It defines a function `get_batch(split)` that generates a small batch of data based on the specified `split` ('train' or 'val'). Let's break down the code step by step.

1. `torch.manual_seed(1337)`: This line sets the random seed for PyTorch to ensure reproducibility of the results.

2. `batch_size` and `block_size` are set to 4 and 8, respectively. `batch_size` indicates the number of independent sequences to process in parallel, and `block_size` defines the maximum context length for prediction.

3. `get_batch(split)` function:
   - `data = train_data if split == 'train' else val_data`: This line sets the data source based on the specified `split`. If `split` is 'train', the `data` variable will be set to `train_data`. Otherwise, the `data` variable will be set to `val_data`.
   
   - `ix = torch.randint(len(data)-block_size-1, (batch_size,))`: This line generates a random offset within the dataset for each sequence in the batch. The offset is chosen so that the selected sequences do not exceed the dataset's length.
   
   - `x = torch.stack([data[i:i+block_size] for i in ix])`: This line creates a 2D tensor `x` containing the input sequences. For each random offset `i` in `ix`, it extracts a sequence of length `block_size` from the `data`, stacks these sequences as rows, and forms a 2D tensor of shape `(batch_size, block_size)`, i.e., 4 rows of 8 characters.
   
   - `y = torch.stack([data[i+1:i+block_size+1] for i in ix])`: This line creates a 2D tensor `y` containing the target sequences. It is similar to the previous line, but the extracted sequences are shifted by one character to the right, effectively creating a sequence of target characters for each input sequence in `x`.
   
The `get_batch` function returns a pair of tensors, `x` and `y`, which represent the input-output pairs for the character-level language model. This function allows you to generate batches of data for training or validation in a memory-efficient way using random sampling.

### -- * [ Get Batch Method ] * --

In [12]:
from typing import Tuple

# BATCH dimension : chunks of the data that are totally independent of each other
torch.manual_seed(1337)  # fixed seed so the sampled batches are reproducible
batch_size = 4  # number of independent sequences processed in parallel
block_size = 8  # maximum context length for prediction


def get_batch(split) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Sample one random batch of (inputs, targets) :
    - `split` : "train" samples from `train_data`, any other value samples
    from `val_data`
    - returns `(x, y)`, two tensors of shape `(batch_size, block_size)` where
    each row of `y` is the matching row of `x` shifted one position ahead
    """
    # NOTE : reads the notebook globals train_data / val_data / batch_size /
    # block_size
    source = val_data
    if split == "train":
        source = train_data
    # one random start offset per sequence ; the upper bound keeps both the
    # input window and the shifted target window inside `source`
    starts = torch.randint(len(source) - block_size - 1, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        # input window : `block_size` ids beginning at `start`
        inputs.append(source[start : start + block_size])
        # target window : the same span moved one position to the right
        targets.append(source[start + 1 : start + block_size + 1])
    # stack the windows as rows -> (batch_size, block_size) each
    return torch.stack(inputs), torch.stack(targets)


### -- * [ Instantiate xb (input) yb (targets) ] * --

In [13]:
# Draw one training batch : xb holds the inputs, yb the targets
xb, yb = get_batch("train")
print(f"--[inputs]--\n xb.shape : {xb.shape}\n{xb}")
df_table.show_table(xb, func=lambda x: itos[x])


# @audit 🛑 : Explain how the last element of each yb row is generated
# ANSWER : get_batch builds each yb row from data[i + 1 : i + block_size + 1],
# i.e. the xb row shifted by one, so its last element is data[i + block_size] :
# the character that comes right after the 8-character input window
print(f"--[targets]--\n yb.shape : {yb.shape}\n{yb}")
df_table.show_table(yb, func=lambda x: itos[x])


--[inputs]--
 xb.shape : torch.Size([4, 8])
tensor([[53, 59,  6,  1, 58, 56, 47, 40],
        [49, 43, 43, 54,  1, 47, 58,  1],
        [13, 52, 45, 43, 50, 53,  8,  0],
        [ 1, 39,  1, 46, 53, 59, 57, 43]])


Unnamed: 0,0,1,2,3,4,5,6,7
0,53,59,6,1,58,56,47,40
0,o,u,",",,t,r,i,b
1,49,43,43,54,1,47,58,1
1,k,e,e,p,,i,t,
2,13,52,45,43,50,53,8,0
2,A,n,g,e,l,o,.,\n
3,1,39,1,46,53,59,57,43
3,,a,,h,o,u,s,e


--[targets]--
 yb.shape : torch.Size([4, 8])
tensor([[59,  6,  1, 58, 56, 47, 40, 59],
        [43, 43, 54,  1, 47, 58,  1, 58],
        [52, 45, 43, 50, 53,  8,  0, 26],
        [39,  1, 46, 53, 59, 57, 43,  0]])


Unnamed: 0,0,1,2,3,4,5,6,7
0,59,6,1,58,56,47,40,59
0,u,",",,t,r,i,b,u
1,43,43,54,1,47,58,1,58
1,e,e,p,,i,t,,t
2,52,45,43,50,53,8,0,26
2,n,g,e,l,o,.,\n,N
3,39,1,46,53,59,57,43,0
3,a,,h,o,u,s,e,\n


True

In [24]:
# print out the 32 (context, target) examples packed into this single batch
# (batch_size = 4 sequences x block_size = 8 positions each)
for b in range(batch_size):
    for t in range(block_size):
        # context : row b of the inputs, up to and including position t
        context = xb[b, : t + 1]
        # target : the entry of yb at the same row and position
        target = yb[b, t]
        print(f"when input (context) is {context} the target is {target}")


when input (context) is tensor([53]) the target is 59
when input (context) is tensor([53, 59]) the target is 6
when input (context) is tensor([53, 59,  6]) the target is 1
when input (context) is tensor([53, 59,  6,  1]) the target is 58
when input (context) is tensor([53, 59,  6,  1, 58]) the target is 56
when input (context) is tensor([53, 59,  6,  1, 58, 56]) the target is 47
when input (context) is tensor([53, 59,  6,  1, 58, 56, 47]) the target is 40
when input (context) is tensor([53, 59,  6,  1, 58, 56, 47, 40]) the target is 59
when input (context) is tensor([49]) the target is 43
when input (context) is tensor([49, 43]) the target is 43
when input (context) is tensor([49, 43, 43]) the target is 54
when input (context) is tensor([49, 43, 43, 54]) the target is 1
when input (context) is tensor([49, 43, 43, 54,  1]) the target is 47
when input (context) is tensor([49, 43, 43, 54,  1, 47]) the target is 58
when input (context) is tensor([49, 43, 43, 54,  1, 47, 58]) the target is 

### -- * [ BigramLanguageModel Class ] * --

In [14]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import pdb

# set the seed for generating random numbers
# - we are manually setting to `1337` for reproducibility
torch.manual_seed(1337)

# @audit-ok : result is not captured in variable so it is NOT USED in our code?
# WE will comment it out @mike
# nn.CrossEntropyLoss()


class BigramLanguageModel(nn.Module):
    """
    Bigram character-level language model.

    The whole model is one embedding table of shape (vocab_size, vocab_size) :
    looking up token `i` returns row `i`, which is used directly as the logits
    (unnormalized scores) for the token that follows `i`. A prediction
    therefore depends only on the last token of the context.

    - vocab_size : the number of unique tokens in the vocabulary
    """

    def __init__(self, vocab_size):
        super().__init__()
        # An embedding layer is a lookup table : each token id in the
        # vocabulary is associated with its own learnable vector.
        #
        # "Explain it like I am 5" illustration (made-up 2-D vectors) :
        # vocab  id  vector
        # -----  --  -----------
        # King    0  [1.0,  0.7]
        # Queen   1  [1.0, -0.5]
        # Man     2  [0.0,  0.5]
        # Woman   3  [0.0, -0.5]
        #
        # ------ graph of the embedding table ----
        #
        # King  (1,  0.7)      x
        # Man   (0,  0.5)     x
        #
        # Queen (1, -0.5)  x
        # Woman (0, -0.5)  x
        #
        # ------ take aways ----
        #
        # - first coordinate  : "King"/"Queen" share 1.0, "Man"/"Woman" share 0.0
        # - second coordinate : "King"/"Man" are positive (approximately
        #   aligned), "Queen"/"Woman" are both exactly -0.5
        # - tokens used in similar ways end up with similar coordinates
        #
        # num_embeddings == embedding_dim here only because each row is read
        # directly as one score per vocabulary token; in general embedding_dim
        # is a free hyper-parameter and differs from the vocabulary size.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None, debug=False):
        """
        Compute next-token logits and, when `targets` is given, the loss.

        - idx     : (B, T) tensor of integer token ids
        - targets : optional (B, T) tensor of integer token ids; leave as None
          at inference time, when no targets exist or are needed
        - debug   : when True, print the shapes and values of the tensors
        - returns `(logits, loss)` :
          - targets is None : logits has shape (B, T, C) and loss is None
          - otherwise       : logits is flattened to (B * T, C) and loss is
            the scalar cross-entropy between logits and targets

        B = batch (number of sequences), T = time (positions per sequence),
        C = channels (one score per vocabulary token).
        """
        # The lookup is always needed : the logits are returned in both the
        # training and the inference case.
        logits = self.token_embedding_table(idx)  # (B,T,C)
        if debug:
            print("[BGL] forward() \n idx.shape @ token_embedding_table : ", idx.shape)
            print("[BGL] forward() \n logits.shape @ token_embedding_table", logits.shape)
            print("[BGL] forward() \n idx @ token_embedding_table : ", idx)
            print("[BGL] forward() \n logits @ token_embedding_table", logits)

        # `is None` is an identity check; `== None` on a tensor goes through
        # the tensor's comparison operator instead.
        if targets is None:
            # inference path (used by generate) : nothing to score against
            loss = None
        else:
            # | UNPACK AND CALCULATE LOSS |
            # F.cross_entropy wants (N, C) logits and (N,) class indices, so
            # the batch and time dimensions are unrolled into one.
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            # targets.view(-1) also works : -1 means "infer this dimension
            # from the total number of elements".
            # .view() vs .reshape() :
            # - .view never copies and requires a memory layout compatible
            #   with the new shape
            # - .reshape falls back to copying the data when a view is not
            #   possible, which uses extra memory
            targets = targets.view(B * T)  # (B * T,) 1D tensor of integers
            loss = F.cross_entropy(logits, targets)

        if debug:
            if targets is None:
                print("[BLG] forward() return targets NONE")
            else:
                print("[BGL] forward() \n idx.shape : ", idx.shape)
                print("[BGL] forward() return \n targets.shape : ", targets.shape)
                print("[BGL] forward() return \n  : logits.shape ", logits.shape)
                print("[BGL] forward() return \n  :  loss.shape ", loss.shape)
                print("[BGL] forward() \n idx : ", idx)
                print("[BGL] forward() return \n targets : ", targets)
                print("[BGL] forward() return \n  : logits ", logits)
                print("[BGL] forward() return \n  :  loss ", loss)
        return logits, loss

    def generate(self, idx, max_new_tokens, debug=False):
        """
        Extend each sequence in `idx` by sampling `max_new_tokens` new tokens.

        - idx            : (B, T) tensor of token ids to start generating from
        - max_new_tokens : how many tokens to append to every sequence
        - debug          : when True, print tensors before and after sampling
        - returns the (B, T + max_new_tokens) tensor of token ids; `idx`
          itself is returned unchanged when max_new_tokens <= 0
        """
        if debug:
            logits, _ = self(idx)
            print("[BGL] generate() \n max_new_tokens : ", max_new_tokens)
            print("[BGL] generate() \n idx.shape : ", idx.shape)
            print("[BGL] generate() \n logits.shape : ", logits.shape)
            print("[BGL] generate() \n idx : ", idx)
            print("[BGL] generate() \n logits : ", logits)
        # idx is (B,T) array of indices in the current context
        for _ in range(max_new_tokens):
            # self(idx) is nn.Module.__call__ : it runs self.forward(idx) plus
            # any registered hooks. No targets are passed, so the returned
            # loss is None and is ignored.
            logits, _loss = self(idx)  # (B, T, C)
            # keep only the last [T]ime step : (B, T, C) -> (B, C)
            logits = logits[:, -1, :]
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        # probs / idx_next only exist once the loop body has run, so the
        # summary is skipped when no token was generated
        if debug and max_new_tokens > 0:
            print("[BGL] generate() \n logits.shape : ", logits.shape)
            print("[BGL] generate() \n probs.shape : ", probs.shape)
            print("[BGL] generate() \n idx.shape : ", idx.shape)
            print("[BGL] generate() \n idx_next.shape : ", idx_next.shape)
            print("[BGL] generate() \n logits : ", logits)
            print("[BGL] generate() \n probs : ", probs)
            print("[BGL] generate() \n idx : ", idx)
            print("[BGL] generate() \n idx_next : ", idx_next)
        return idx


### -- * [ Instantiate BigramLanguage Model ] * --

In [15]:
# Size the model from the data instead of hard-coding 65 : the vocabulary is
# the list of unique characters gathered in `chars`
vocab_size = len(chars)
m = BigramLanguageModel(vocab_size)
# calling the 'forward' method of the model
# - pytorch allows us to call the model like a function
# - m(xb, yb) goes through nn.Module.__call__, which runs m.forward(xb, yb)
logits, loss = m(xb, yb, debug=True)
# - logits = the model's raw scores for the next character; forward() returns
#   them flattened to (B * T, C) because targets were given
# - loss - used to update model's weights (backpropagation)

# an f-string shows only the value of a 0-dim tensor such as `loss`, while
# print(loss) below shows the full tensor repr, including its grad_fn
print(f"logits.shape : {logits.shape} loss : {loss}")
# sanity check : a model guessing uniformly over 65 characters would score
# -ln(1/65) = 4.174387269895637
print(loss)  # to check which grad_fn tensor is using


[BGL] forward() 
 idx.shape @ token_embedding_table :  torch.Size([4, 8])
[BGL] forward() 
 logits.shape @ token_embedding_table torch.Size([4, 8, 65])
[BGL] forward() 
 idx @ token_embedding_table :  tensor([[53, 59,  6,  1, 58, 56, 47, 40],
        [49, 43, 43, 54,  1, 47, 58,  1],
        [13, 52, 45, 43, 50, 53,  8,  0],
        [ 1, 39,  1, 46, 53, 59, 57, 43]])
[BGL] forward() 
 logits @ token_embedding_table tensor([[[-0.1324, -0.5489,  0.1024,  ..., -0.8599, -1.6050, -0.6985],
         [-0.4002,  0.3302,  1.5454,  ...,  1.3688,  0.4620,  0.2040],
         [ 0.4160,  0.3362, -0.4512,  ..., -1.6525, -0.8816, -1.4546],
         ...,
         [-0.6722,  0.2322, -0.1632,  ...,  0.1390,  0.7560,  0.4296],
         [ 1.6515, -0.0424, -0.7355,  ...,  0.8682,  2.0593, -0.8159],
         [ 1.4311,  0.4160, -2.2246,  ...,  0.7330,  0.3551,  0.1472]],

        [[-0.2950, -0.6511,  1.4937,  ..., -0.5250,  1.2672,  2.6002],
         [ 0.3323, -0.0872, -0.7470,  ..., -0.6716, -0.9572, -0.9594

#### -- | PreTraining Generate | --

In [None]:
# BEFORE training
# creates a 2D tensor full of zeros with shape (1, 1)
# - tensor is of **type long** because it is **intended to hold integer** token
# ids
# - tensor is of **shape (1, 1)** = (B, T) : one sequence holding a single
# starting token
# - the starting token id is 0, which `itos` maps to the newline character
# - idx is used as the **initial sequence of tokens** to feed into the model for
# generating new tokens.
idx = torch.zeros((1, 1), dtype=torch.long)
# output is random garbage because we haven't trained the model yet
# - generate() appends the new tokens along dim 1 ; [0] selects the single
# sequence and .tolist() turns it into ints that decode() can look up
print(decode(m.generate(idx, max_new_tokens=100, debug=True)[0].tolist()))


### -- * [ Instantiate Optimizer] * --

In [16]:
# create pytorch optimizer
# Notes on AdamW vs plain SGD (per the torch.optim documentation) :
# - both are STOCHASTIC : every step uses the gradient of one sampled
#   mini-batch rather than of the whole dataset
# - plain SGD applies the same learning rate to every parameter
# - AdamW is ADAPTIVE : it keeps running averages of the gradient (a form of
#   momentum, which smooths the update direction across steps) and of the
#   squared gradient, and scales each parameter's step size with them
# - AdamW also applies decoupled weight decay, a regularization technique that
#   shrinks the weights toward zero at every step to help prevent overfitting
# - lr=1e-3 is the learning rate ; every other setting keeps its default
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)


### -- * [ Train Loop Cycle ] * --

In [17]:
# Mini-batch size. Presumably read by get_batch() through this notebook-level
# name -- TODO confirm against get_batch's definition.
# NOTE(review): the old comment mentions moving from 4 to 32, but the value
# here is 4 and every measurement below was taken with 4.
batch_size = 4

# Recorded runs with batch_size = 4 (approximate loss, wall-clock time):
#
#   steps         loss     time
#   1,000         ~4.3     ~1 s
#   10,000        ~2.6     ~2.5 s   tensor(2.6720, grad_fn=<NllLossBackward0>)
#   100,000       ~2.4     ~22 s    tensor(2.4284, grad_fn=<NllLossBackward0>)
#   1,000,000     ~2.4     ~4 min   tensor(2.4611, grad_fn=<NllLossBackward0>)
#   10,000,000    ~2.5     ~37 min  tensor(2.5116, grad_fn=<NllLossBackward0>)
#
#   another recorded 100,000-step run: 1 min 17 s, loss ~2.4068
#
# @audit answered -- "is each step leaving RESIDUE to build up?":
# Yes, by design. This cell re-creates neither `m` nor `optimizer`, so running
# it again continues training from wherever the previous run stopped. That is
# why repeated 1,000-step runs drift from ~4.3 down towards ~2.6.
# The printed value is the loss of the LAST sampled batch only, so it is noisy.

for steps in range(10_000):  # loss ~ 2.6
    # Clear the gradients left by the previous step before computing new ones.
    # set_to_none=True drops the .grad tensors (sets them to None) instead of
    # overwriting them with zeros, which lowers the memory footprint.
    optimizer.zero_grad(set_to_none=True)

    # draw a fresh random mini-batch of inputs and targets
    xb, yb = get_batch("train")

    # forward pass: predictions plus the loss against the targets
    logits, loss = m(xb, yb, debug=False)

    # backward pass: gradient of the loss w.r.t. every model parameter
    loss.backward()

    # Apply the update. Every tensor returned by m.parameters() is adjusted;
    # which tensors that covers depends on how the model class is defined
    # (TODO confirm whether it holds anything besides the embedding table).
    optimizer.step()

if loss is not None:
    print(loss)


tensor(2.6720, grad_fn=<NllLossBackward0>)


#### -- | Post Training Generate | --

In [18]:
# AFTER training: sample again from the same one-token seed context
# (a (1, 1) integer tensor holding token id 0).
idx = torch.zeros(1, 1, dtype=torch.int64)
# Ask for 300 new tokens this time. A longer sample, on top of the training
# above, is where the output starts to look reasonable.
print(decode(m.generate(idx, max_new_tokens=300, debug=True)[0].tolist()))


[BGL] generate() 
 max_new_tokens :  300
[BGL] generate() 
 idx.shape :  torch.Size([1, 1])
[BGL] generate() 
 logits.shape :  torch.Size([1, 1, 65])
[BGL] generate() 
 idx :  tensor([[0]])
[BGL] generate() 
 logits :  tensor([[[ 2.1517e+00, -4.6084e+00, -4.8821e+00, -5.4069e+00, -3.9509e+00,
          -7.8209e-01, -3.6401e+00, -4.4815e+00, -4.2010e+00, -2.0870e+00,
          -5.8173e+00, -5.0243e+00, -4.3193e+00,  1.3392e+00,  3.5719e-01,
           5.4461e-01,  2.2572e-01, -4.8994e-01,  5.4563e-01,  1.2686e-01,
          -2.4877e-01,  1.0518e+00, -6.0338e-01, -1.2103e-01,  2.6311e-01,
          -6.6172e-02, -1.7088e-01,  2.4896e-01, -1.8473e-04, -9.9460e-01,
          -2.0433e-02,  8.3727e-01,  1.3871e+00, -7.9710e-01, -8.2070e-01,
           1.0448e+00, -4.3926e+00,  1.0927e-01, -5.6323e+00, -1.1992e+00,
          -1.4673e+00, -1.4421e+00, -1.3720e+00, -2.5076e+00, -2.0195e+00,
          -1.8701e+00, -1.1588e+00, -1.7623e+00, -2.4397e+00, -2.3857e+00,
          -1.6633e+00, -1.2070e

In [17]:
# Check whether PyTorch can use a CUDA GPU on this machine; the recorded run
# returned False.
torch.cuda.is_available()


False

#### -- [VERSION 1] -- self-attention toy example
***

In [23]:
# ---VERSION 1--- self-attention toy example
# Seed the RNG so the random toy tensor below is reproducible.
torch.manual_seed(1337)
# Toy dimensions:
# - B (batch)    : independent sequences handled side by side
# - T (time)     : positions within a sequence, one token per position
# - C (channels) : how many numbers describe the token at each position
B, T, C = (4, 8, 2)
x = torch.randn((B, T, C))
x.shape


torch.Size([4, 8, 2])

In [24]:
# How do we summarise everything up to the current position without looking
# ahead? Use a causal running mean ("bag of words"):
#     xbow[b, t] = mean of x[b, i] over all i <= t
xbow = torch.zeros(B, T, C)
for b in range(B):
    for t in range(T):
        # rows 0..t inclusive: the current time step together with its past
        xprev = x[b, : t + 1]  # (t + 1, C)
        # collapse the time axis, leaving one mean per channel
        xbow[b, t] = xprev.mean(dim=0)  # (C,)

# Hand check for x[0] (values rounded to 2 decimals).
# Row t of xbow[0] is the MEAN of rows 0..t of x[0], so the last row of
# xbow[0] is the mean (not the sum) of the whole x[0] tensor.
#
#   row (t)      col[0]          col[1]
#      0          0.18           -0.07
#      1         -0.35           -0.92
#      2          0.63            0.03
#      3          0.95            0.06
#      4          0.36            1.17
#      5         -1.35           -0.51
#      6          0.24           -0.24
#      7         -0.92            1.54
# ----------------------------------------------
# .18-.35+.63+.95+.36-1.35+.24-.92
# -.07-.92+.03+.06+1.17-.51-.24+1.54
# ----------------------------------------------
# SUMMED        :  -0.26            1.06
# AVERAGED (/8) :  -0.03            0.13    <- matches xbow[0][7]
#
# This "mix in the past with some weights" step is the skeleton of attention;
# here every past position simply gets the same weight.

print(f"----x[0]-----\n{x[0]}")
print(
    "----xbow[0]----------------\n...\n"
    " # @note: 1st row == x[0][0]\n...\n"
    f"{xbow[0]}\n...\n"
    " # row t is the MEAN of x[0] rows 0..t\n"
    " # last row == MEAN of the FULL x[0] tensor"
)


----x[0]-----
tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])
----xbow[0]----------------
...
 # @note: 1st row == x[0]
...
tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])
...
 # rows progressively AVERAGE towards
 # the SUM of x[0] FULL tensor


In [28]:
# The double loop above can be replaced by one matrix multiplication.
# Warm-up: multiplying by an all-ones matrix makes every row of the result
# the column-wise SUM of all rows of b.
torch.manual_seed(42)
a = torch.ones((3, 3))
b = torch.randint(low=0, high=10, size=(3, 2)).float()
c = torch.matmul(a, b)

# The last call is left as the final expression, so its return value remains
# the cell's output.
df_table.show_table(a, label="a")
df_table.show_table(b, label="b")
df_table.show_table(c, label="c = a @ b")

--[a]--


Unnamed: 0,0,1,2
0,1.0,1.0,1.0
1,1.0,1.0,1.0
2,1.0,1.0,1.0


--[b]--


Unnamed: 0,0,1
0,2.0,7.0
1,6.0,4.0
2,6.0,5.0


--[c = a @ b]--


Unnamed: 0,0,1
0,14.0,16.0
1,14.0,16.0
2,14.0,16.0


True

##### WOW Triangular matrix is essentially a CORNER b&w PIXEL image! WOW
- Generates a **`LOWER LEFT TRIANGLE`** matrix of 1s, and flood fills the rest with 0s

In [29]:
# Lower-triangular matrix of ones: row i has ones in columns 0..i only, so
# row i of (a_tri @ b) is the running SUM of rows 0..i of b.
a_tri = torch.ones(3, 3).tril()
c = torch.matmul(a_tri, b)

df_table.show_table(a_tri, label="a_tri")
df_table.show_table(b, label="b")
df_table.show_table(c, label="a_tri @ b")

# @audit answered: a_tri.sum(dim=1, keepdim=True) is a (3, 1) column holding
# the number of ones in each row (1, 2, 3). Dividing broadcasts that column
# across the row, so every row now sums to 1:
#   [1, 0, 0] / 1 -> [1.00, 0.00, 0.00]
#   [1, 1, 0] / 2 -> [0.50, 0.50, 0.00]
#   [1, 1, 1] / 3 -> [0.33, 0.33, 0.33]
# Multiplying by this matrix therefore gives a running MEAN instead of a
# running sum.
a_tri_sum = a_tri / a_tri.sum(dim=1, keepdim=True)
df_table.show_table(a_tri_sum, label="a_tri/ torch.sum(a_tri, 1, keepdim=True)")

--[a_tri]--


Unnamed: 0,0,1,2
0,1.0,0.0,0.0
1,1.0,1.0,0.0
2,1.0,1.0,1.0


--[b]--


Unnamed: 0,0,1
0,2.0,7.0
1,6.0,4.0
2,6.0,5.0


--[a_tri @ b]--


Unnamed: 0,0,1
0,2.0,7.0
1,8.0,11.0
2,14.0,16.0


--[a_tri/ torch.sum(a_tri, 1, keepdim=True)]--


Unnamed: 0,0,1,2
0,1.0,0.0,0.0
1,0.5,0.5,0.0
2,0.333333,0.333333,0.333333


True

In [30]:
# Same construction at full sequence length: a (T, T) lower-triangular matrix
# of ones ...
weights = torch.ones(T, T).tril()
print(f"--[starting weights]--\n{weights}\n")
# ... whose rows are divided by their own sums, so row t ends up holding
# 1 / (t + 1) in columns 0..t and 0 elsewhere (uniform averaging weights).
weights = weights / weights.sum(dim=1, keepdim=True)
print(f"--[weighted aggregation]--(summed and averaged)\n{weights}")


--[starting weights]--
tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

--[weighted aggregation]--(summed and averaged)
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]]

#### -- [VERSION 2] -- loop => matrix weighted aggregation
***

In [31]:
# ---VERSION 2--- loop => matrix weighted aggregation
# Shapes: weights is (T, T), x is (B, T, C).
# matmul treats the trailing two dimensions as the matrices and broadcasts
# everything in front of them. weights has no batch dimension, so it behaves
# like (1, T, T) and is reused for each of the B sequences:
#     (B, T, T) @ (B, T, C) -> (B, T, C)
# For each sequence, output row t is sum_i weights[t, i] * x[b, i], which is
# exactly the running mean the double loop computed.
xbow_matrix = torch.matmul(weights, x)
is_xbow_matrix_equal = torch.allclose(xbow, xbow_matrix)
print(
    f"is xbow and xbow_matrix equal? : {is_xbow_matrix_equal}",
    f"xbow_matrix.shape : {xbow_matrix.shape}",
    f"x.shape : {x.shape}",
    f"xbow_matrix : {xbow_matrix[0]}",
    f"xbow : {xbow[0]}",
    f"x : {x[0]}",
    sep="\n",
)


is xbow and xbow_matrix equal? : True
xbow_matrix.shape : torch.Size([4, 8, 2])
x.shape : torch.Size([4, 8, 2])
xbow_matrix : tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])
xbow : tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])
x : tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])


#### -- [VERSION 3] -- softmax and tril mask
***

In [32]:
# ---VERSION 3--- softmax and tril mask
# Same running mean as version 2, rebuilt in the shape attention uses:
#     scores -> hide the future -> softmax -> weighted sum
# The triangular matrix is now only a MASK (1 = visible, 0 = future); it is
# no longer the weight matrix itself.
tril = torch.ones(T, T).tril()
print(f"--[tril]--\n{tril}\n")

# --- [IMPORTANT AFFINITY] ---
# w_softmax holds one raw affinity score per pair of positions within a
# sequence (row = the position that is looking, column = the position being
# looked at). All zeros means "no preference", which is why this version
# still collapses to a plain average. The self-attention cell (version 4)
# replaces these zeros with scores computed from the data.
w_softmax = torch.zeros((T, T))
# print(f'--[w_softmax]--(zeros)\n{w_softmax}\n')

# MASK the future so it cannot influence the past:
# - decoder : keep the mask; we are predicting the future, so it must stay
#             hidden
# - encoder : drop this step so every position can see the whole sequence
# masked_fill returns a copy in which every entry where the mask is True
# (here: wherever tril is 0) is replaced by the given value. -inf is used
# because e ** (-inf) == 0, so masked entries receive exactly zero weight
# after the softmax below.
w_softmax = w_softmax.masked_fill(tril.eq(0), float("-inf"))
print(
    f"--[w_softmax]--(-inf) \n# (1) init to zero \n# (2) use tril as a mask : where 0 == -inf\n{w_softmax}\n"
)

# Softmax along the last dimension turns every row into weights in [0, 1]
# that sum to 1:
#     softmax(x_i) = exp(x_i) / sum_j exp(x_j)
# "Exponentiate" means raising e (the base of the natural logarithm) to the
# given power: e ** 0 == 1 and e ** (-inf) == 0.
#
#   [w_softmax]           [exponentiated]      [normalized]
#   [0, -inf, -inf]   =>  [1, 0, 0]        =>  [1.00, 0.00, 0.00]
#   [0,    0, -inf]   =>  [1, 1, 0]        =>  [0.50, 0.50, 0.00]
#   [0,    0,    0]   =>  [1, 1, 1]        =>  [0.33, 0.33, 0.33]
w_softmax = w_softmax.softmax(dim=-1)
print(f"--[w_softmax]--(softmax)\n{w_softmax}\n")

# Weighted sum using those weights: with all-zero scores this is a simple
# average of each token and the tokens before it.
xbow_softmax = torch.matmul(w_softmax, x)


--[tril]--
tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

--[w_softmax]--(-inf) 
# (1) init to zero 
# (2) use tril as a mask : where 0 == -inf
tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

--[w_softmax]--(softmax)
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]

In [33]:
# First position of the first sequence. It can only see itself, so the
# recorded value equals the first row of x[0].
xbow_softmax[0, 0]


tensor([ 0.1808, -0.0700])

#### -- [VERSION 4] -- Self-Attention
***

In [34]:
# ---VERSION 4--- Self-Attention
torch.manual_seed(1337)
# Toy dimensions: B sequences per batch, T positions per sequence and
# C numbers (channels / features) describing the token at each position.
# Loose image analogy: C is to a token what the colour channels
# (1 grayscale, 3 RGB, 4 RGBA) are to a pixel. 32 is simply the toy width
# chosen for this example.
B, T, C = (4, 8, 32)
x = torch.randn((B, T, C))  # torch.Size([4, 8, 32])

# @note One single head performing self-attention.
# head_size is a hyperparameter: the width of the key / query / value vectors
# this head works with.
head_size = 16

# Three bias-free linear layers, each mapping C (32) input channels to
# head_size (16) outputs. nn.Linear computes y = x A^T + b; bias=False drops
# the b term.
# - key   : "what do I contain?"
# - query : "what am I looking for?"
# - value : what the token hands over once another token attends to it
# Each layer owns its own weight matrix, so the three projections are learned
# separately. They are constructed in the order key, query, value because the
# initial weights come from the RNG seeded above, so this order determines the
# numbers recorded in the output below.
key = nn.Linear(in_features=C, out_features=head_size, bias=False)
query = nn.Linear(in_features=C, out_features=head_size, bias=False)
value = nn.Linear(in_features=C, out_features=head_size, bias=False)

# A linear layer acts on the last dimension only, independently at every
# position: (B, T, C) -> (B, T, head_size). No position has exchanged any
# information with another one yet.
k = key(x)  # (B, T, 16)
q = query(x)  # (B, T, 16)
v = value(x)  # (B, T, 16)

# @note Affinity scores = dot product of every query with every key of the
# same sequence; the better a query and a key align, the higher the score.
# transpose(-2, -1) swaps the last two axes of k, (B, T, 16) -> (B, 16, T),
# so k[0] goes from [8, 16] to [16, 8]:
#     (B, T, 16) @ (B, 16, T) -> (B, T, T)
w_self_attention = torch.matmul(q, k.transpose(-2, -1))

# Causal mask of shape (T, T). It has no batch dimension because the same
# "no looking ahead" rule applies to every sequence; it is broadcast across
# the B axis of the (B, T, T) scores.
tril = torch.ones(T, T).tril()

# Do NOT reset the scores to zeros here (as version 3 did): that would turn
# each row back into a plain average and discard the data-dependent
# affinities just computed.
#
# @note Masking stops the future from influencing the past (DECODER block):
# - a predictive model must not see the tokens it is asked to predict
# - for a task such as sentiment analysis the whole sequence may be visible;
#   an ENCODER block drops this mask so all positions can talk to each other
# -inf becomes exactly 0 after the softmax below.
w_self_attention = w_self_attention.masked_fill(tril.eq(0), float("-inf"))

# dim=-1 normalises along the last axis, i.e. across the positions a given
# row is looking at, so every row becomes weights that sum to 1.
w_self_attention = w_self_attention.softmax(dim=-1)
print(
    f"--[self_attention[0]]-- (softmax) => {w_self_attention[0].shape}\n{w_self_attention[0]}"
)
# Reading the recorded output for sequence 0:
# - tril mask : row N is non-zero only up to column N, so no row reads the
#               future
# - softmax   : every row sums to 1.0
# - last row (8th token):
#     [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]
#   the 8th token puts 0.2391 on itself, 0.2423 on the 7th token and 0.2297
#   on the 4th; those are the positions it found most relevant. The other
#   positions contribute little to its output.

# Aggregate the VALUES, not the raw x: x is what a token privately holds,
# v = value(x) is what it chooses to communicate. Because v has head_size
# channels, the result is (B, T, 16) rather than (B, T, 32).
# out = w_self_attention @ x
out = torch.matmul(w_self_attention, v)
print(f"--[out[0]]-- (w_self_attention @ v) => {out[0].shape}\n{out[0]}")


--[self_attention[0]]-- (softmax) => torch.Size([8, 8])
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)
--[out[0]]-- (w_self_attention @ v) => torch.Size([8, 16])
tensor([[-0.1571,  0.8801,  0.1615, -0.7824, -0.1429,  0.7468,  0.1007, -0.5239,
         -0.8873,  0.1907,  0.1762, -0.5943, -0.4812, -0.4860,  0.2862,  0.5710],
        [ 0.6764, -0.5477, -0.2478,  0.3143, -0.1280, -0.2952, -0.4296, -0.1089,
         -0.04

In [35]:
# Compare aggregating the raw inputs with aggregating the value projections.
# The attention weights are (B, T, T) in both cases; only the right-hand
# operand's channel count differs, which is what the printed shapes show.
out_x = torch.matmul(w_self_attention, x)
out_v = torch.matmul(w_self_attention, v)
print(f"out_x.shape : {out_x.shape} out_v.shape : {out_v.shape}")


out_x.shape : torch.Size([4, 8, 32]) out_v.shape : torch.Size([4, 8, 16])


#### -- [FOOTNOTES] --
***

Notes :
- Attention is a **communication mechanism**. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- There is no notion of space.  Attention simply acts over a set of vectors.  This is why we need to positionally encode tokens.  `# TODO - is CONVOLUTION an IMPLICIT attention mechanism, where nearest neighbors are the IMPLICIT positional embedding`
- Each example across batch dimension is of course processed completely independently and never "talk" to each other
- "self-attention" just means that the keys and values are produced from the same source as queries.  In "cross-attention" the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
- In an "encoder" attention block just **delete the single line that does masking** with `tril`, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings like language modeling.
- "Scaled" attention additionally multiplies `wei` by 1/sqrt(`head_size`) (i.e. divides it by sqrt(`head_size`)).  This makes it so when input Q,K are unit variance, `wei` will be unit variance too and Softmax will stay diffuse and not saturate too much. Illustration below `???`

##### Explain Footnotes ^ like I am 5 

Okay, let's break it down into simpler terms:

1. **Attention**: Imagine you're in a classroom where everyone is talking at once, and you want to focus on what your best friend is saying. You would "pay more attention" to the sounds coming from your friend, and less to the other noise. That's what the attention mechanism does - it pays more attention to the important parts of the data and less to the less important parts.

2. **No notion of space**: Attention doesn't care about where things are in relation to each other. Imagine you are playing with a set of different toys - a car, a doll, a ball. You don't care where these toys are placed, you just play with each toy based on how much you like it. That's why we need to tell the model about the order of words in a sentence, because it doesn't understand this on its own.

3. **Each example is processed independently**: Think of it like doing your homework - you do each problem on its own, without mixing up the information from different problems.

4. **Self-attention and cross-attention**: Let's say you're trying to understand a story. If you think about the story and come up with questions and answers all by yourself, that's like self-attention. But if you get help from your teacher or a book (an external source) to understand the story, that's like cross-attention.

5. **Encoder and decoder**: Imagine you're playing a game of telephone. The person who whispers the message into the next person's ear is like the encoder, and the person who tries to understand and pass along the message is like the decoder. In some cases, we allow all the players to talk to each other freely (like the encoder), and in others we only allow each player to hear from the players who spoke before them (like the decoder).

6. **Scaled attention**: Imagine you're sharing a pie with your friends, and you cut bigger pieces for the friends who matter most right now (that is what softmax does with the attention scores). If the scores are very "loud", one friend ends up with almost the whole pie and everyone else gets crumbs. Scaling means turning the volume of the scores down (dividing them by the square root of `head_size`) before cutting the pie, so your best friend still gets the biggest piece but everybody else still gets some.

The parts with `???` are placeholders for information that should be filled in, based on the specific context or model that is being discussed.

##### Can you explain Self-attention and cross-attention

Sure, let's break those down:

**Self-attention:**

Self-attention, also sometimes called intra-attention, is a mechanism that allows each token in an input sequence to look at the other tokens in the same sequence to get better context and thereby improve the representation of the sequence. 

Here's a simplified explanation: Imagine you're reading a book and you come across a sentence that says, "Even though he was very tired, John decided to continue reading the book because it was so interesting." Now, if you want to understand who "he" is referring to in this sentence, you'll look at the other parts of the sentence, and realize that "he" is referring to "John". This is similar to what self-attention does - it allows a token to look at the other tokens in the sequence to get context and understand better.

**Cross-attention:**

Cross-attention, on the other hand, is a mechanism that allows each token in an input sequence to interact with tokens in a separate sequence. This is often used in tasks that involve pairs of sentences, like question answering or machine translation, where the model needs to understand the relationship between two different sequences.

Let's take another simplified example: Imagine you're translating a sentence from English to Spanish. You come across the English word "book", which can be translated into Spanish as "libro". However, the exact translation might depend on the other words in the English sentence and their Spanish counterparts. So, you need to look at both the English sentence and the Spanish sentence you're constructing to decide on the best translation. This is similar to what cross-attention does - it allows a token in one sequence to look at the tokens in a different sequence to get additional context and improve the translation.

In summary, self-attention and cross-attention are mechanisms that allow tokens to interact with each other to get better context and improve the model's understanding of the data. The difference between them lies in whether the tokens are looking at other tokens in the same sequence (self-attention) or a different sequence (cross-attention).

![Markdown Logo is here.](https://www.fullstackpython.com/img/logos/markdown.png)

#### -- KEY INSIGHTS --

##### Building a GPT from Scratch - Guide with Code & Notebooks

###### Technical aspects of GPT and Transformer architecture

- 🧠 The Transformer architecture, originally proposed for machine translation,  
ended up taking over the rest of AI in the next five years after its publication.
- 🤖 The Transformer neural network can model patterns in data, such as  
Shakespeare's works, and generate infinite sequences that look like  
Shakespeare's language.
- 💻 When training a Transformer, chunks of data are sampled and trained on  
individually to make predictions at every position, allowing the network to be  
used to seeing `contexts` from as little as `one` all the way up `to block size`
- 🧠 The `negative log likelihood loss`, also known as `cross entropy`, is a  
good way to measure the quality of predictions in language modeling.
- 🧐 GPT's approach of only allowing information to flow from previous context  
to the current timestep means that a token cannot get any information from the  
future; this restriction is necessary because the future is exactly what the  
model is trying to predict. (tri-mask 📐)
- 🧐 Understanding self-attention is the most important part of building GPT  
from scratch.
- 💡 ==**Self-attention**== solves the problem of gathering information from the  
past in a data-dependent way, allowing tokens to interact and learn more about  
specific tokens in the sequence.
- 🤯 The self-attention mechanism in GPT involves parallel and independent  
production of keys and queries for all tokens, followed by dot product to  
generate affinities and weighted aggregation in a data-dependent manner.
- 🤖 Scaling up the neural net with larger `batch size`, `block size`,  
`embedding dimension`, and `more layers` can significantly improve its  
`performance`, but it requires a powerful `GPU` for training.
- 🤖 The size of GPT-3's largest Transformer, with 175 billion parameters, is a  
massive infrastructure challenge to train, requiring thousands of GPUs to talk  
to each other.
- Implications and potential impact of AI-powered natural language interfaces
    - 💬 GPT can generate humorous and unexpected responses to prompts, such as  
    explaining HTML to a dog or writing release notes for chess 2.
- 🤖 The process of converting logits to probabilities via softmax and sampling  
from those probabilities is crucial in generating predictions for what comes  
next in AI-powered natural language interfaces.
- 🤖 The use of AI-powered natural language interfaces could potentially change  
the paradigm of programming, allowing ordinary people to get complex tasks done  
without needing programming expertise.

https://eightify.app/summary/data-science-and-ai/create-your-own-gpt-step-by-step-guide-with-code-notebooks

Explain the difference between numpy and pandas
***

`numpy` and `pandas` are both popular Python libraries used in data analysis and  
manipulation, but they serve different purposes and have different strengths.

**NumPy (Numerical Python):**

- NumPy is a fundamental library for numerical computing in Python. It provides  
support for arrays, matrices, and mathematical functions that operate on these  
data structures.
- The core of NumPy is the `ndarray` (n-dimensional array) object, which is a  
multi-dimensional, homogeneous, and fixed-size array.
- NumPy is designed for efficient numerical computations, and it provides a wide  
range of mathematical functions such as linear algebra, statistical operations,  
and element-wise operations.
- `NumPy` is the `foundation for` many other scientific and data analysis libraries  
in Python, including **`pandas`**, SciPy, and scikit-learn.

**pandas (Python Data Analysis Library):**

- `pandas` is a library built on top of NumPy that provides data manipulation and  
analysis tools specifically designed for working with structured data, such as  
**`tables`** or **`time series`**.
- The main data structures in pandas are the `Series` (1-dimensional) and  
`DataFrame` (2-dimensional) objects. These data structures are built on top of  
NumPy arrays but provide additional functionality for handling missing data,  
data alignment, and working with heterogeneous data types.
- `pandas is well-suited for` handling real-world datasets, as it provides  
powerful tools for **`reading and writing data`** in various formats such as :
    - CSV   
    - Excel     
    - JSON  
    - SQL databases  
- pandas includes many functions for data cleaning, transformation, aggregation,  
and visualization, making it a popular choice for data preprocessing and  
exploratory data analysis.

In summary, NumPy is a foundational library for numerical computing that focuses  
on providing efficient array and matrix operations, while pandas is a  
higher-level library specifically designed for data manipulation and analysis of  
structured data.  

While both libraries are essential in the data processing pipeline, you would  
typically use NumPy for lower-level mathematical operations and pandas for  
handling real-world datasets and performing data cleaning and analysis tasks.

What is the difference between .view() and .reshape()
***

In the context of NumPy, both `.view()` and `.reshape()` are used to change the  
shape of an array. However, they operate in different ways:

**view()**

`.view()` returns a new array object that shares the same data with the original  
array. It can also be used to change the data type of the array, which is not  
possible with `.reshape()`. However, the main difference is that NumPy's `.view()`  
never copies and does not take a new shape: it reinterprets the existing memory  
buffer as-is. (In PyTorch, by contrast, `tensor.view(shape)` does reshape, but only  
when the requested shape is compatible with the tensor's current memory layout.)

Here's an example:

```python
import numpy as np

arr = np.array([1, 2, 3, 4])

new_arr = arr.view(dtype=np.float64)
```

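In this case, `new_arr` shares the same underlying bytes as `arr`, but those bytes  
are reinterpreted as `float64`; the values are not converted. Use `.astype()` for  
a value-preserving conversion (which makes a copy).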

**reshape()**

`.reshape()` returns a new array object with a modified shape. Unlike `.view()`,  
it only changes the shape of the array, without affecting the data type. If it's  
possible to create the new shape without changing the memory layout, `.reshape()`  
will return a view of the original array. Otherwise, it will return a copy of  
the original array with the new shape.

Here's an example:

```python
import numpy as np

arr = np.array([1, 2, 3, 4, 5, 6])

new_arr = arr.reshape((2, 3))
```

In this case, `new_arr` will be a 2x3 array containing the same data as `arr`.

In summary, `.view()` can change both the shape and the data type of an array  
while sharing the same data, while `.reshape()` only changes the shape of the  
array. However, while both methods can return views of the original array,  
`.reshape()` might return a copy if the new shape requires a different memory  
layout.

Describe the context where I would use .view() vs .reshape()
***

Here are the contexts in which you would choose to use `.view()` or `.reshape()`:

**Use `.view()` when:**

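1. You want to change how the array's memory is interpreted (its data type) without  
making a copy of the data. For example, if you have an array of `int32` values, and  
you want to view their raw bytes as `float64` values, you can use `.view()` to  
achieve this without copying the data. Note that the numeric values are not  
converted: every two `int32` elements are reinterpreted as one `float64`.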
   
   ```python
   import numpy as np
   
   arr = np.array([1, 2, 3, 4], dtype=np.int32)
   float_view = arr.view(dtype=np.float64)
   ```

2. You want to create a new array that shares the same data with the original  
array, but you do not need to guarantee that the new array will have a contiguous  
memory layout. This is useful when you want to work with different views of  
the same data. Note that modifying the view also modifies the original array,  
because both share the same underlying buffer.

   ```python
   import numpy as np
   
   arr = np.array([[1, 2], [3, 4]])
   transposed_view = arr.T.view()
   ```

**Use `.reshape()` when:**

1. You want to change the shape of an array while keeping its data type unchanged.  
This is useful when you want to transform a 1D array into a 2D array, for example,  
or when you want to change the dimensions of a multi-dimensional array.

   ```python
   import numpy as np
   
   arr = np.array([1, 2, 3, 4, 5, 6])
   reshaped_arr = arr.reshape((2, 3))
   ```
  
2. You want to create a new array with a different shape, and you want to ensure  
that the new array has a contiguous memory layout. In cases where the desired shape  
cannot be created with a contiguous layout, `.reshape()` will return a copy of the  
original array with the new shape.

   ```python
   import numpy as np
   
   arr = np.array([[1, 2], [3, 4]])
   flattened_arr = arr.reshape(-1)
   ```

In summary, use `.view()` when you want to change the memory layout or data type  
without copying the data or when you do not need to guarantee contiguous memory  
layout. Use `.reshape()` when you want to change the shape of an array while  
keeping its data type the same and ensuring a contiguous memory layout if possible.

Walk me through Tensor reshape
***

Tensor reshaping is the process of changing the shape of a tensor without  
altering its underlying data. This is useful when you need to change the  
dimensions of a tensor for certain operations or when you need to rearrange the  
data to a different shape for further processing.

Here's a step-by-step guide to reshaping tensors using the `reshape` method in  
PyTorch:

1. **Import the necessary library**

   First, import the PyTorch library, which provides the `torch` module for  
   working with tensors:

   ```python
   import torch
   ```

2. **Create a tensor**

   Create a tensor with the initial shape. For example, let's create a 1D tensor  
   with 12 elements:

   ```python
   initial_tensor = torch.arange(12)
   print(initial_tensor)
   # Output: tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])
   ```

3. **Reshape the tensor**

   Now, use the `reshape` method to change the shape of the tensor. For  
   instance, let's reshape the 1D tensor into a 3x4 matrix:

   ```python
   reshaped_tensor = initial_tensor.reshape(3, 4)
   print(reshaped_tensor)
   # Output:
   # tensor([[ 0,  1,  2,  3],
   #         [ 4,  5,  6,  7],
   #         [ 8,  9, 10, 11]])
   ```

   The `reshape` method takes the desired dimensions as arguments and returns a  
   new tensor with the specified shape. Make sure that the product of the new  
   dimensions is equal to the total number of elements in the initial tensor.  
   In this case, 3 * 4 = 12.

4. **Using `-1` as a dimension**

   You can use `-1` as one of the dimensions in the `reshape` method to  
   automatically infer the size of that dimension based on the size of the other  
   dimensions and the total number of elements in the tensor. For example:

   ```python
   reshaped_tensor = initial_tensor.reshape(3, -1)
   print(reshaped_tensor)
   # Output:
   # tensor([[ 0,  1,  2,  3],
   #         [ 4,  5,  6,  7],
   #         [ 8,  9, 10, 11]])
   ```

   In this case, the `-1` is replaced with 4, as it's the only value that  
   ensures the product of dimensions is equal to the total number of elements (12).

That's it! With these steps, you can reshape tensors in PyTorch to fit your needs.  
Remember that `reshape` returns a new tensor (a view of the same data when possible);  
the original tensor's data and shape are left unchanged.

In [36]:
import numpy as np

# Build the flat vector 1..6, then fold it into 2 rows x 3 columns.
arr = np.arange(1, 7)
reshaped_arr = np.reshape(arr, (2, 3))
print(f"{arr} \n{reshaped_arr}")
# Neither axis has length 1, so squeezing leaves the (2, 3) shape as it is.
np.squeeze(reshaped_arr)

[1 2 3 4 5 6] 
[[1 2 3]
 [4 5 6]]


array([[1, 2, 3],
       [4, 5, 6]])

In [18]:
import torch

# 0..11 as a flat vector, folded into 3 rows; the -1 column count is inferred as 4.
initial_tensor = torch.arange(12)
reshaped_tensor = torch.reshape(initial_tensor, (3, -1))
# Swap the two axes: (3, 4) -> (4, 3).
transposed_tensor = torch.transpose(reshaped_tensor, 0, 1)
# There is no size-1 dimension, so squeeze leaves the (4, 3) shape untouched.
squeezed_tensor = torch.squeeze(transposed_tensor)

# Show every stage in the order it was produced, one tensor per line.
print(
    initial_tensor,
    reshaped_tensor,
    transposed_tensor,
    squeezed_tensor,
    sep="\n",
)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])
tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]])
tensor([[ 0,  4,  8],
        [ 1,  5,  9],
        [ 2,  6, 10],
        [ 3,  7, 11]])
tensor([[ 0,  4,  8],
        [ 1,  5,  9],
        [ 2,  6, 10],
        [ 3,  7, 11]])


In [23]:
# Two rows of raw scores; each entry of the second row sits 3.9 below the first row.
logits = torch.tensor(
    [
        [5.0, 6.0, 7.0, 8.0],
        [1.1, 2.1, 3.1, 4.1],
    ]
)
logits

tensor([[5.0000, 6.0000, 7.0000, 8.0000],
        [1.1000, 2.1000, 3.1000, 4.1000]])

In [24]:
# Normalise each row of the 2-D `logits` into probabilities; for a 2-D tensor the
# last axis (-1) is axis 1.
# NOTE(review): `F` is presumably torch.nn.functional, imported in an earlier cell.
probs = F.softmax(input=logits, dim=-1)
probs

tensor([[0.0321, 0.0871, 0.2369, 0.6439],
        [0.0321, 0.0871, 0.2369, 0.6439]])

In [25]:
# Draw a 2 x 4 tensor of uniform samples from [0, 1); no seed is set here, so the
# values differ from run to run.
random_logit = torch.rand((2, 4))
random_logit

tensor([[0.0560, 0.4175, 0.3114, 0.9418],
        [0.9193, 0.2046, 0.6022, 0.1689]])

In [32]:
# Fixed stand-in scores (overrides any earlier `random_logit`): row 0 is fairly
# spread out, row 1 has a single clearly largest entry.
random_logit = torch.tensor(
    [
        [0.0560, 0.4175, 0.3114, 0.9418],
        [0.9193, 0.1046, 0.0022, 0.0089],
    ]
)
random_logit

tensor([[0.0560, 0.4175, 0.3114, 0.9418],
        [0.9193, 0.1046, 0.0022, 0.0089]])

In [33]:
# Row-wise softmax over the 2-D `random_logit`; for a 2-D tensor the last axis (-1)
# is axis 1.
# NOTE(review): `F` is presumably torch.nn.functional, imported in an earlier cell.
random_probs = F.softmax(input=random_logit, dim=-1)
random_probs

tensor([[0.1626, 0.2334, 0.2099, 0.3942],
        [0.4455, 0.1972, 0.1780, 0.1792]])