In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/
!mkdir -p 11868
%cd /content/drive/MyDrive/11868
!git clone https://github.com/llmsystem/llmsys_code_examples.git
%cd /content/drive/MyDrive/11868/llmsys_code_examples/tensor_demo/miniTorch

## MiniTorch Installation

In [None]:
!pip install -r requirements.txt

Collecting colorama==0.4.3 (from -r requirements.txt (line 1))
  Downloading colorama-0.4.3-py2.py3-none-any.whl (15 kB)
Collecting hypothesis==6.54 (from -r requirements.txt (line 2))
  Downloading hypothesis-6.54.0-py3-none-any.whl (389 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m389.6/389.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mypy==0.971 (from -r requirements.txt (line 3))
  Downloading mypy-0.971-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
Collecting pre-commit==2.20.0 (from -r requirements.txt (line 6))
  Downloading pre_commit-2.20.0-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.5/199.5 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytest==7.1.2 (from -r requirements.txt (line 7)

In [None]:
!pip install -r requirements.extra.txt

Collecting datasets==2.4.0 (from -r requirements.extra.txt (line 1))
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.7/365.7 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting embeddings==0.0.8 (from -r requirements.extra.txt (line 2))
  Downloading embeddings-0.0.8-py3-none-any.whl (12 kB)
Collecting networkx==2.4 (from -r requirements.extra.txt (line 3))
  Downloading networkx-2.4-py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting plotly==4.14.3 (from -r requirements.extra.txt (line 4))
  Downloading plotly-4.14.3-py2.py3-none-any.whl (13.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.2/13.2 MB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydot==1.4.1 (from -r requirements.extra.txt (line 5))
  Downloading pydot-1.4.1-py2.py3-none-any.whl (19 kB)
C

In [None]:
!pip install -Ue .

Obtaining file:///content/drive/MyDrive/11868/assignment1/miniTorch
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: minitorch
  Running setup.py develop for minitorch
Successfully installed minitorch-0.4


In [None]:
# Run the repository's compile_cuda.sh from the current directory.
# NOTE(review): presumably this builds the shared library that the ctypes
# bindings (`lib.tensorMap`) load later in this notebook; confirm.
!bash compile_cuda.sh

In [None]:
import minitorch

## Go Through Minitorch

The diagram below gives a high-level overview of some of MiniTorch's basic components.

<img src="./imgs/high_level_abstraction.png"></img>

In [None]:
class Parameter:
    """
    A Parameter is a special container stored in a `Module`.

    It is designed to hold a `Variable`, but we allow it to hold
    any value for testing.

    Attributes:
        value : The wrapped value.
        name : Optional label; copied onto the value when the value
            supports gradient tracking.
    """

    def __init__(self, x: Any, name: Optional[str] = None) -> None:
        self.value = x
        self.name = name
        self._track(x)

    def update(self, x: Any) -> None:
        """
        Update the parameter value.

        Replaces the wrapped value with `x` and applies the same gradient
        tracking and naming that `__init__` applies, so optimizers that call
        `p.update(new_value)` actually change the parameter.
        """
        self.value = x
        self._track(x)

    def _track(self, x: Any) -> None:
        "Enable gradient tracking on `x` if it supports it, and label it."
        # Plain test values (ints, floats, ...) have no `requires_grad_`.
        if hasattr(x, "requires_grad_"):
            x.requires_grad_(True)
            if self.name:
                x.name = self.name

In [None]:
class Module:
    """
    Modules form a tree that store parameters and other
    submodules. They make up the basis of neural network stacks.

    This excerpt shows only part of the class: it calls `self.modules()`,
    `self.named_parameters()` and `self.forward()`, none of which is defined
    here. They must come from the full implementation or from a subclass.

    Attributes:
        _modules : Storage of the child modules
        _parameters : Storage of the module's parameters
        training : Whether the module is in training mode or evaluation mode

    """

    # Quoted forward references: `Module` is not bound yet while its own class
    # body is being evaluated, so the unquoted form raises NameError.
    _modules: Dict[str, "Module"]
    _parameters: Dict[str, "Parameter"]
    training: bool

    def parameters(self) -> Sequence["Parameter"]:
        "Enumerate over all the parameters of this module and its descendants."
        # `named_parameters()` yields (name, parameter) pairs; keep the parameters.
        return [j for _, j in self.named_parameters()]

    def __call__(self, *args: Any, **kwargs: Any) -> Any:
        "Calling a module forwards all arguments to `self.forward`."
        return self.forward(*args, **kwargs)

    def train(self) -> None:
        "Set the mode of this module and all descendant modules to `train`."
        # Each module returned by `modules()` recurses into its own children.
        for m in self.modules():
            m.train()
        self.training = True

    def eval(self) -> None:
        "Set the mode of this module and all descendant modules to `eval`."
        for m in self.modules():
            m.eval()
        self.training = False

In [None]:
# Initialization
def RParam(*shape):
    """
    Create a `minitorch.Parameter` wrapping a random tensor of shape `shape`.

    The tensor is drawn with `minitorch.rand` on the global `BACKEND`; 0.5 is
    subtracted from it and the result is scaled by 0.1 before wrapping.
    """
    centered = minitorch.rand(shape, backend=BACKEND) - 0.5
    return minitorch.Parameter(0.1 * centered)

In [None]:
class Optimizer:
    """Base optimizer: holds a sequence of parameters and can clear their gradients."""

    def __init__(self, parameters: Sequence[Parameter]):
        self.parameters = parameters

    def zero_grad(self) -> None:
        """Set every non-None `derivative` / `grad` on the parameter values to None."""
        for param in self.parameters:
            target = param.value
            if target is None:
                continue
            # A value may expose either attribute, both, or neither.
            for attr in ("derivative", "grad"):
                if hasattr(target, attr) and getattr(target, attr) is not None:
                    setattr(target, attr, None)

    def _print(self) -> None:
        """Print the shape and then the gradient of each parameter whose value is set."""
        for held in (param.value for param in self.parameters):
            if held is not None:
                print(held.shape)
                print(held.grad)


class SGD(Optimizer):
    """Gradient-descent optimizer with a fixed learning rate `lr`."""

    def __init__(self, parameters: Sequence[Parameter], lr: float = 1.0):
        super().__init__(parameters)
        self.lr = lr

    def step(self) -> None:
        """Replace each parameter value by `value - lr * value.grad` where a gradient exists."""
        for param in self.parameters:
            current = param.value
            # Skip unset values and values that do not carry a `grad` attribute.
            if current is None or not hasattr(current, "grad"):
                continue
            if current.grad is None:
                continue
            param.update(current - self.lr * current.grad)

In [None]:
class Function:
    """
    Base class for tensor functions.

    Subclasses provide `forward` and `backward`; `apply` runs `forward` on
    detached inputs and attaches a history to the result when any input
    requires a gradient.
    """

    @classmethod
    def _backward(cls, ctx: Context, grad_out: Tensor) -> Tuple[Tensor, ...]:
        """Call `cls.backward` and pass its result through `wrap_tuple`."""
        grads = cls.backward(ctx, grad_out)  # type: ignore
        return wrap_tuple(grads)

    @classmethod
    def _forward(cls, ctx: Context, *inps: Tensor) -> Tensor:
        """Call `cls.forward` with the context and the given inputs."""
        result = cls.forward(ctx, *inps)  # type: ignore
        return result

    @classmethod
    def apply(cls, *vals: Tensor) -> Tensor:
        """Run the function on `vals` and return a new tensor for the result."""
        tracks_grad = False
        detached = []
        for val in vals:
            tracks_grad = bool(val.requires_grad()) or tracks_grad
            detached.append(val.detach())

        # The context receives the negated flag: True when no input needs a gradient.
        ctx = Context(not tracks_grad)

        # Forward runs on the detached inputs.
        out = cls._forward(ctx, *detached)

        # Only results that need gradients get a history; it keeps the original inputs.
        history = minitorch.History(cls, ctx, vals) if tracks_grad else None
        return minitorch.Tensor(out._tensor, history, backend=out.backend)

In [None]:
class Add(Function):
    """Addition of two tensors via `add_zip` on `t1.f` (presumably the tensor's backend)."""

    @staticmethod
    def forward(ctx: Context, t1: Tensor, t2: Tensor) -> Tensor:
        """Return `t1.f.add_zip(t1, t2)`; nothing is stored on `ctx`."""
        add_zip = t1.f.add_zip
        return add_zip(t1, t2)

    @staticmethod
    def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor, Tensor]:
        """Return the incoming gradient unchanged for both inputs."""
        grad_t1 = grad_t2 = grad_output
        return grad_t1, grad_t2

In [None]:
class Tensor:
    """
    Tensor is a generalization of Scalar in that it is a Variable that
    handles multidimensional arrays.

    Only the attribute declarations are shown here.
    """

    # Names that are not bound when this class body runs (including `Tensor`
    # itself) are quoted; unquoted, they raise NameError at class creation.
    backend: "TensorBackend"
    history: Optional["History"]
    grad: Optional["Tensor"]
    _tensor: "TensorData"
    unique_id: int
    name: str

## Data Storage and Operators

The diagram below shows several important components behind `class Tensor`, including the data storage (`class TensorData`) and the backend (`class TensorBackend`), among others. The backend relies on `class TensorOps` to execute the actual computation for each operator. Basic functions are implemented and abstracted by `class Function`.

<img src="./imgs/data_storage_operators.png"></img>

In [None]:
from typing import Sequence

try:
    from typing import TypeAlias  # standard library from Python 3.10
except ImportError:
    from typing_extensions import TypeAlias

# `np` is required by the aliases below and was not imported in this cell.
import numpy as np
import numpy.typing as npt

# Array-typed aliases: float64 arrays for element storage, int32 arrays for
# indices, shapes and strides.
Storage: TypeAlias = npt.NDArray[np.float64]
OutIndex: TypeAlias = npt.NDArray[np.int32]
Index: TypeAlias = npt.NDArray[np.int32]
Shape: TypeAlias = npt.NDArray[np.int32]
Strides: TypeAlias = npt.NDArray[np.int32]

# Plain integer-sequence counterparts of the index, shape and strides aliases.
UserIndex: TypeAlias = Sequence[int]
UserShape: TypeAlias = Sequence[int]
UserStrides: TypeAlias = Sequence[int]

In [None]:
class TensorData:
    """
    Attribute declarations for the data container stored in `Tensor._tensor`.

    Shape and strides are each declared twice: under an underscore name with
    an array alias, and under a public name with an integer-sequence alias.
    """

    # Element values (alias `Storage`).
    _storage: Storage
    # Strides as an array (alias `Strides`).
    _strides: Strides
    # Shape as an array (alias `Shape`).
    _shape: Shape
    # Strides as a sequence of ints (alias `UserStrides`).
    strides: UserStrides
    # Shape as a sequence of ints (alias `UserShape`).
    shape: UserShape
    # NOTE(review): presumably the number of dimensions; confirm against the implementation.
    dims: int

### Indexing

<img src="./imgs/strides.png"></img>

In [None]:
# A 1-D tensor with six values; the cells below build other tensors from it.
x = minitorch.tensor([1, 2, 3, 4, 5, 6])

In [None]:
x.to_numpy()

In [None]:
# Shape and strides of the 1-D tensor.
x._tensor.shape, x._tensor.strides

In [None]:
# Build a (2, 3) tensor on x's storage array with strides (3, 1).
# Presumably index (i, j) then maps to storage position 3*i + 1*j; confirm.
y = minitorch.Tensor.make(
    storage=x._tensor._storage, 
    shape=(2, 3), 
    strides=(3, 1),
    backend=x.backend)
y

In [None]:
y._tensor._storage

In [None]:
# Identity check: does y hold the very same storage object as x?
y._tensor._storage is x._tensor._storage

In [None]:
# Same six elements viewed with shape (3, 2, 1).
z = x.view(3, 2, 1)
z

In [None]:
# Shape and strides of the view `z` (both taken from `z`, matching the cells for `x` and `p`).
z._tensor.shape, z._tensor.strides

In [None]:
# Identity check: does the view z hold the same storage object as x?
z._tensor._storage is x._tensor._storage

In [None]:
# Convert a multi-dimensional index into a storage position using z's strides,
# then compare indexing z with reading the storage at that position.
z_index = [1, 1, 0]
pos = minitorch.index_to_position(z_index, z._tensor._strides)
z[tuple(z_index)] == z._tensor._storage[pos], pos, z[tuple(z_index)]

In [None]:
# The return value of to_index is not used; out_index is displayed afterwards,
# so the result is presumably written into out_index in place. Confirm.
out_index = [0, 0, 0]
minitorch.to_index(3, z.shape, out_index)
out_index

In [None]:
# Strides computed from a shape alone.
shape = (2, 3)
minitorch.strides_from_shape(shape)

In [None]:
shape = (3, 2, 1)
minitorch.strides_from_shape(shape)

In [None]:
# Reorder z's dimensions as (2, 1, 0).
p = z.permute(2, 1, 0)
p

In [None]:
p._tensor.shape, p._tensor.strides

In [None]:
# Identity check: does the permuted tensor hold the same storage object as x?
p._tensor._storage is x._tensor._storage

In [None]:
# NOTE(review): the ordinal passed here is -1; whether to_index is meant to
# accept negative ordinals is not visible in this notebook. Confirm this is intended.
p_index = [0, 0, 0]
minitorch.to_index(-1, p.shape, p_index)
p_index

In [None]:
# Storage position of p_index under p's strides.
minitorch.index_to_position(p_index, p._tensor._strides)

### Backend & Operators

In [None]:
class TensorOps:
    """
    Interface of the operator factories consumed by `TensorBackend`.

    Each factory takes a scalar function and is expected to return the
    corresponding tensor-level operator. The versions here are stubs that
    return None; concrete backends override them.
    """

    @staticmethod
    def map(fn: Callable[[float], float]) -> MapProto:
        "Build a tensor-level operator from the unary scalar function `fn`. Stub."
        pass

    @staticmethod
    def cmap(fn: Callable[[float], float]):
        """
        Stub for the factory `TensorBackend.__init__` calls as `ops.cmap(operators.id)`.

        Declared so the interface matches what `TensorBackend` uses. Its
        return type is not shown in this excerpt, so it is left unannotated.
        """
        pass

    @staticmethod
    def zip(fn: Callable[[float, float], float]) -> Callable[[Tensor, Tensor], Tensor]:
        "Build a two-tensor operator from the binary scalar function `fn`. Stub."
        pass

    @staticmethod
    def reduce(
        fn: Callable[[float, float], float], start: float = 0.0
    ) -> Callable[[Tensor, int], Tensor]:
        "Build a (tensor, int) operator from the binary function `fn` and `start`. Stub."
        pass

    @staticmethod
    def matrix_multiply(a: Tensor, b: Tensor) -> Tensor:
        "Matrix multiplication; always raises NotImplementedError in this base class."
        raise NotImplementedError("Not implemented in this assignment")


In [None]:
class TensorBackend:
    """
    Bundle of tensor-level operators built from an ops class.

    Every operator is stored as an attribute named `<operator>_<kind>`, for
    example `neg_map`, `add_zip` or `mul_reduce`.
    """

    def __init__(self, ops: Type[TensorOps]):
        """Build all operators from `ops` and the scalar functions in `operators`."""
        # Maps
        for op_name in ("neg", "sigmoid", "relu", "log", "exp", "id"):
            setattr(self, f"{op_name}_map", ops.map(getattr(operators, op_name)))
        self.id_cmap = ops.cmap(operators.id)
        self.inv_map = ops.map(operators.inv)

        # Zips
        zip_names = (
            "add",
            "mul",
            "lt",
            "eq",
            "is_close",
            "relu_back",
            "log_back",
            "inv_back",
        )
        for op_name in zip_names:
            setattr(self, f"{op_name}_zip", ops.zip(getattr(operators, op_name)))

        # Reduce: each operator is paired with its start value.
        for op_name, start in (("add", 0.0), ("mul", 1.0)):
            reducer = ops.reduce(getattr(operators, op_name), start)
            setattr(self, f"{op_name}_reduce", reducer)

        # Matrix Multiply
        self.matrix_multiply = ops.matrix_multiply


In [None]:
class CudaKernelOps(TensorOps):
    """
    Skeleton of the CUDA-backed `TensorOps`.

    Every method is a placeholder to be filled in. Each one raises
    NotImplementedError so that this cell is valid Python, since a method
    body made only of comments is a syntax error.
    """

    @staticmethod
    def map(fn: Callable[[float], float]) -> MapProto:
        ### Your Implementation ###
        raise NotImplementedError("CudaKernelOps.map: your implementation goes here")

    @staticmethod
    def zip(fn: Callable[[float, float], float]) -> Callable[[Tensor, Tensor], Tensor]:
        ### Your Implementation ###
        raise NotImplementedError("CudaKernelOps.zip: your implementation goes here")

    @staticmethod
    def reduce(
        fn: Callable[[float, float], float], start: float = 0.0
    ) -> Callable[[Tensor, int], Tensor]:
        ### Your Implementation ###
        raise NotImplementedError("CudaKernelOps.reduce: your implementation goes here")

    @staticmethod
    def matrix_multiply(a: Tensor, b: Tensor) -> Tensor:
        ### Your Implementation ###
        raise NotImplementedError(
            "CudaKernelOps.matrix_multiply: your implementation goes here"
        )

In [None]:
# In project/run_sentiment.py
backend_name = "CudaKernelOps"

if backend_name == "CudaKernelOps":
    # Imported inside the branch, so the CUDA ops module is loaded only when selected.
    from minitorch.cuda_kernel_ops import CudaKernelOps
    BACKEND = minitorch.TensorBackend(CudaKernelOps)
else:
    # Fail here instead of later with a NameError on the undefined BACKEND.
    raise ValueError(f"Unsupported backend: {backend_name!r}")

### Connect to CUDA Kernels

In [None]:
def map(fn: Callable[[float], float]) -> MapProto:
    """
    See `tensor_ops.py`.

    Build a tensor-level map that calls the `tensorMap` entry point of `lib`
    for the function id that `fn_map` associates with `fn`.

    Args:
        fn: Scalar function; it is looked up in `fn_map`.

    Returns:
        A callable `ret(a, out=None)` that applies the map to `a`, stores the
        result in `out` (created with `a.zeros(a.shape)` when omitted) and
        returns `out`.
    """
    fn_id = fn_map[fn]

    def ret(a: Tensor, out: Optional[Tensor] = None) -> Tensor:
        if out is None:
            out = a.zeros(a.shape)

        # Define the argument type for the tensorMap function
        lib.tensorMap.argtypes = [
            ctypes.POINTER(ctypes.c_double),  # out
            ctypes.POINTER(ctypes.c_int),    # out_shape
            ctypes.POINTER(ctypes.c_int),    # out_strides
            ctypes.c_int,                    # out_size
            ctypes.POINTER(ctypes.c_double),  # in_storage
            ctypes.POINTER(ctypes.c_int),    # in_shape
            ctypes.POINTER(ctypes.c_int),    # in_strides
            ctypes.c_int,                    # shape_len
            ctypes.c_int,                    # fn_id
        ]

        # Define the return type for the tensorMap function
        lib.tensorMap.restype = None

        # Every uploaded array is recorded so that the `finally` block can free
        # it even if a later upload, the kernel call or the copy-back raises.
        uploaded = []

        def to_device(host_array):
            "Copy `host_array` to the GPU and remember it for cleanup."
            device_array = gpuarray.to_gpu(host_array)
            uploaded.append(device_array)
            return device_array

        try:
            # Convert the numpy arrays to gpuarrays that can be loaded to the gpu
            out_array_gpu = to_device(out._tensor._storage)
            out_shape_gpu = to_device(out._tensor._shape.astype(np.int32))
            out_strides_gpu = to_device(out._tensor._strides.astype(np.int32))
            in_array_gpu = to_device(a._tensor._storage)
            in_shape_gpu = to_device(a._tensor._shape.astype(np.int32))
            in_strides_gpu = to_device(a._tensor._strides.astype(np.int32))

            # Call the function
            lib.tensorMap(
                ctypes.cast(out_array_gpu.ptr, ctypes.POINTER(ctypes.c_double)),
                ctypes.cast(out_shape_gpu.ptr, ctypes.POINTER(ctypes.c_int)),
                ctypes.cast(out_strides_gpu.ptr, ctypes.POINTER(ctypes.c_int)),
                ctypes.c_int(out.size),
                ctypes.cast(in_array_gpu.ptr, ctypes.POINTER(ctypes.c_double)),
                ctypes.cast(in_shape_gpu.ptr, ctypes.POINTER(ctypes.c_int)),
                ctypes.cast(in_strides_gpu.ptr, ctypes.POINTER(ctypes.c_int)),
                ctypes.c_int(len(a.shape)),
                ctypes.c_int(fn_id)
            )

            # Copy the gpuarray back to the cpu
            out._tensor._storage = out_array_gpu.get()
        finally:
            # Free the gpuarrays, in the order they were allocated
            for device_array in uploaded:
                device_array.gpudata.free()
        return out

    return ret

In [None]:
// Host-side launcher for mapKernel: picks the launch configuration from
// out_size, forwards every argument unchanged to the kernel, and waits for
// the kernel to finish before returning.
//
// NOTE(review): the pointer arguments are passed straight to the kernel, so
// they are presumably device pointers (the Python binding passes gpuarray
// `.ptr` values); confirm for any other caller.
void tensorMap(
    scalar_t* out, 
    int* out_shape, 
    int* out_strides, 
    int out_size, 
    scalar_t* in_storage, 
    int* in_shape, 
    int* in_strides,
    int shape_size,
    int fn_id
) {
    // With no output elements the block count below would be 0, which is not
    // a valid launch configuration; there is nothing to compute either.
    if (out_size <= 0) {
        return;
    }

    int threadsPerBlock = BASE_THREAD_NUM;
    // Ceiling division: enough blocks to give each output element a thread.
    int blocksPerGrid = (out_size + threadsPerBlock - 1) / threadsPerBlock;
    mapKernel<<<blocksPerGrid, threadsPerBlock>>>(out, out_shape, out_strides, out_size, in_storage, in_shape, in_strides, shape_size, fn_id);
    // Block until the kernel has finished before returning to the caller.
    cudaDeviceSynchronize();
}