


## 1. Độ Sai số (Error) & Độ Tương đồng (Similarity)

Công thức bạn cung cấp là một biến thể của **sai số tương đối**. Nó không dùng để so sánh CUDA và PyTorch, mà thường được dùng trong học máy để **tính toán hàm mất mát (loss function)** hoặc đánh giá hiệu năng của một mô hình.

### Công thức toán học

Công thức tính độ sai số và độ tương đồng được biểu diễn như sau:

$$
\text{Error} = \frac{\left| a - b \right|}{|a| + \epsilon}
$$
$$
\text{Similarity} = \frac{\left| a + b \right|}{|2a| + \epsilon}
$$

Trong đó:
* `a`: Giá trị gốc, giá trị tham chiếu (ví dụ: nhãn đúng của dữ liệu).
* `b`: Giá trị dự đoán hoặc giá trị mới (ví dụ: đầu ra của mô hình).
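* `|a - b|`: Giá trị tuyệt đối của hiệu số giữa `a` và `b`.
* `|a + b|`: Giá trị tuyệt đối của tổng `a` và `b` (dùng trong công thức độ tương đồng); khi `b = a` thì độ tương đồng xấp xỉ 1.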
* `ε` (epsilon): Là một hằng số rất nhỏ (ví dụ: $10^{-8}$) được thêm vào mẫu số để tránh trường hợp chia cho không khi $a = 0$.



In [None]:
import os
import random
import numpy as np
import torch
from torchvision import datasets, transforms
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from numba import jit
import numpy as np
from numba import cuda, float32
import time
import torch.nn as nn
import math
# Report the versions of the core libraries in use.
print('torch', torch.__version__)
print('numpy', np.__version__)
print('matplotlib', plt.matplotlib.__version__)

# Seed the random number generators so that runs are reproducible
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
import math
def run_softmax_cuda(x_np):
    """Compute a per-sample softmax over the channel axis on the GPU.

    One block is launched per sample and one thread per channel; only the
    spatial element ``[b, c, 0, 0]`` of each channel is read.

    Args:
        x_np: 4-D array whose shape unpacks as ``(B, C, H, W)``. ``C`` must
            not exceed the device's maximum number of threads per block,
            because every channel gets its own thread.

    Returns:
        A float32 device array of shape ``(B, C)`` holding the softmax
        probabilities of each sample.
    """
    B, C, _, _ = x_np.shape

    @cuda.jit(cache=True)
    def softmax_Cuda(inp, out):
        b_idx = cuda.blockIdx.x   # each block handles one sample of the batch
        idx = cuda.threadIdx.x    # each thread handles one channel (class)

        # Stage the C logits of this sample in shared memory so that every
        # thread can read all of them.
        temp = cuda.shared.array(C, dtype=np.float32)
        temp[idx] = inp[b_idx, idx, 0, 0]
        cuda.syncthreads()

        # Subtract the row maximum before exponentiating: mathematically a
        # no-op for softmax, but it keeps exp() from overflowing.
        max_val = temp[0]
        for i in range(1, C):
            if temp[i] > max_val:
                max_val = temp[i]

        sum_exp = 0.0
        for i in range(C):
            sum_exp += math.exp(temp[i] - max_val)

        out[b_idx, idx] = math.exp(temp[idx] - max_val) / sum_exp

    d_out = cuda.device_array((B, C), dtype=np.float32)
    softmax_Cuda[B, C](x_np, d_out)
    cuda.synchronize()
    return d_out

In [None]:
def run_softmax_backward(output,Y):
    """Compute ``(output - Y) / B`` element-wise on the GPU.

    Presumably the gradient of a batch-averaged cross-entropy loss with
    respect to the logits when ``Y`` is one-hot -- TODO confirm with caller.

    Args:
        output: 2-D array whose shape unpacks as ``(B, C)``.
        Y: array indexed as ``[b, c]`` over the same range as ``output``.

    Returns:
        A float32 device array of shape ``(B, C, 1, 1)``.
    """
    n_batch, n_cls = output.shape

    @cuda.jit(cache=True)
    def softmax_backward(probs, target, grad):
        # One block per sample, one thread per class.
        row = cuda.blockIdx.x
        col = cuda.threadIdx.x
        diff = probs[row, col] - target[row, col]
        grad[row, col, 0, 0] = diff * 1 / n_batch

    grad_out = cuda.device_array((n_batch, n_cls, 1, 1), dtype=np.float32)
    softmax_backward[n_batch, n_cls](output, Y, grad_out)
    cuda.synchronize()
    return grad_out

In [None]:
def run_global_average_Cuda(x,TPB=32):
    """Average every (sample, channel) plane over its spatial extent.

    Args:
        x: 4-D array whose shape unpacks as ``(B, C, H, W)``.
        TPB: threads per (1-D) block.

    Returns:
        A float32 device array of shape ``(B, C, 1, 1)`` with the plane means.
    """
    n_batch, n_ch, height, width = x.shape
    n_planes = n_batch * n_ch

    @cuda.jit(cache=True)
    def Global_average_Cuda(inp, out):
        # One thread per (sample, channel) plane, addressed by a flat index.
        flat = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        if flat >= n_planes:
            return
        bi = flat // n_ch
        ci = flat % n_ch
        running = float32(0.0)
        for r in range(height):
            for s in range(width):
                running += inp[bi, ci, r, s]
        out[bi, ci, 0, 0] = running / (height * width)

    n_blocks = (n_planes + TPB - 1) // TPB
    result = cuda.device_array((n_batch, n_ch, 1, 1), dtype=np.float32)
    Global_average_Cuda[n_blocks, TPB](x, result)
    cuda.synchronize()

    return result

In [None]:
def run_avgpool_backward(inp,TPB=16,H_out=13,W_out=13):
    """Spread a global-average-pool gradient evenly over the pooled plane.

    Every spatial position of the result receives
    ``inp[b, c, 0, 0] / (H_out * W_out)``.

    Args:
        inp: 4-D array whose first two dimensions are ``(B, C)``; only the
            element ``[b, c, 0, 0]`` of each plane is read.
        TPB: threads per block along each of the x and y axes.
        H_out: height of the plane that was averaged in the forward pass
            (defaults to 13, the previously hard-coded value).
        W_out: width of that plane (defaults to 13).

    Returns:
        A float32 device array of shape ``(B, C, H_out, W_out)``.
    """
    B,C,_,_ = inp.shape

    @cuda.jit(cache=True)
    def avgpool_backward(inp_1,out):
        # blockIdx.z enumerates the (sample, channel) planes.
        z = cuda.blockIdx.z
        b = z // C
        c = z % C
        x = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x  # column
        y = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y  # row
        # x indexes the width axis and y the height axis of `out`.
        if x < W_out and y < H_out:
            out[b,c,y,x] = inp_1[b,c,0,0]/(H_out*W_out)

    d_out = cuda.device_array((B,C,H_out,W_out), dtype = np.float32)
    bpx = (W_out + TPB -1)//TPB
    bpy = (H_out + TPB -1)//TPB
    bpz = B*C
    grid_dim=(bpx,bpy,bpz)
    block_dim=(TPB,TPB,1)
    avgpool_backward[grid_dim,block_dim](inp,d_out)
    cuda.synchronize()
    return d_out

In [None]:
def run_conv2d_cuda_shared_2(x_conv, w_conv, b_conv, P=2, S=2, TPB=16):
    """2-D convolution on the GPU that stages input tiles in shared memory.

    Positions outside the input are treated as zero (padding ``P``) and the
    window advances by ``S``. The kernel size ``K`` is read from
    ``w_conv.shape[2]`` and used for both spatial axes, so square kernels
    are assumed.

    Args:
        x_conv: 4-D input whose shape unpacks as ``(B, C, H, W)``.
        w_conv: 4-D filters whose shape unpacks as ``(F, _, K, _)``.
        b_conv: bias, indexed by filter (``bias[f]``).
        P: zero padding applied on every side.
        S: stride.
        TPB: threads per block along each of the x and y axes.

    Returns:
        A float32 device array of shape ``(B, F, H_out, W_out)``.
    """
    B, C, H, W = x_conv.shape
    F, _, K, _ = w_conv.shape

    # Compute the output size
    H_pad = H + 2*P
    W_pad = W + 2*P
    H_out = (H_pad - K)//S + 1
    W_out = (W_pad - K)//S + 1

    # Compute the tile size held in shared memory: the padded-input footprint
    # needed by a TPB-wide run of outputs at stride S with a K-wide kernel
    tile_h = (TPB - 1) * S + K
    tile_w = tile_h
    @cuda.jit(cache=True)
    def conv2d_cuda_shared(inp, filters, bias, out):
        # 3-D shared memory: [C, tile_h, tile_w]
        shmem = cuda.shared.array((C, tile_h, tile_w), np.float32)
    
        # Split blockIdx.z into batch and filter indices
        z = cuda.blockIdx.z
        b = z // F
        f = z % F
    
        # Output coordinates
        x_out = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        y_out = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
    
        # Top-left corner of this block's tile in the padded input
        x0 = cuda.blockIdx.x * cuda.blockDim.x * S
        y0 = cuda.blockIdx.y * cuda.blockDim.y * S
        # 1) Threads of the block jointly fill the tile, each one striding
        #    over it by the block dimensions
        for c in range(C):
            for ly in range(cuda.threadIdx.y, tile_h, cuda.blockDim.y):
                gy = y0 + ly-P
                for lx in range(cuda.threadIdx.x, tile_w, cuda.blockDim.x):
                    gx = x0 + lx-P
                    # outside the input bounds store 0, otherwise read inp
                    if 0 <= gy < H and 0 <= gx < W:
                        shmem[c, ly, lx] = inp[b, c, gy, gx]
                    else:
                        shmem[c, ly, lx] = np.float32(0.0)
        # Every thread must reach this barrier, so it sits outside the
        # output-bounds check below
        cuda.syncthreads()
    
        # 2) Compute the convolution if this thread maps to a valid output
        if b < B and f < F and y_out < H_out and x_out < W_out:
            acc = np.float32(0.0)
            for c in range(C):
                for ky in range(K):
                    for kx in range(K):
                        sy = cuda.threadIdx.y * S + ky
                        sx = cuda.threadIdx.x * S + kx
                        acc += shmem[c, sy, sx] * filters[f, c, ky, kx]
            acc += bias[f]
            out[b, f, y_out, x_out] = acc
    d_out = cuda.device_array((B, F, H_out, W_out), dtype = np.float32)
    # grid & block dims
    bpg_x = (W_out + TPB - 1) // TPB
    bpg_y = (H_out + TPB - 1) // TPB
    bpg_z = B * F
    grid_dims  = (bpg_x, bpg_y, bpg_z)
    block_dims = (TPB, TPB, 1)
    # start = time.time()
    conv2d_cuda_shared[grid_dims, block_dims](x_conv, w_conv, b_conv, d_out)
    cuda.synchronize()
    # time_cu_shared = time.time() - start

    return d_out

In [None]:

def run_conv2d_cuda(x, w, b, P=2, S=2, TPB=16):
    """2-D convolution on the GPU with zero padding ``P`` and stride ``S``.

    Each thread produces one output element. The sum is accumulated in a
    local float32 variable and written to global memory once, rather than
    read-modify-writing the output array for every kernel tap.

    Args:
        x: 4-D input whose shape unpacks as ``(B, C, H, W)``.
        w: 4-D filters whose shape unpacks as ``(F, _, KH, KW)``.
        b: bias, indexed by filter.
        P: zero padding applied on every side.
        S: stride.
        TPB: threads per block along each of the x and y axes.

    Returns:
        A float32 device array of shape ``(B, F, H_out, W_out)``.
    """
    B, C, H, W = x.shape
    F, _, KH, KW = w.shape

    H_out = (H + 2*P - KH)//S + 1
    W_out = (W + 2*P - KW)//S + 1

    @cuda.jit(cache=True)
    def conv2d_cuda_kernel(input_arr, filters, bias, output):
        # blockIdx.z enumerates the (sample, filter) pairs.
        z = cuda.blockIdx.z
        b_idx = z // F
        f_idx = z % F

        x_out = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        y_out = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
        if x_out < W_out and y_out < H_out and b_idx < B and f_idx < F:
            acc = np.float32(0.0)
            for c in range(C):
                for kh in range(KH):
                    y_in = y_out * S + kh - P
                    # Rows that fall into the padding contribute zero.
                    if 0 <= y_in < H:
                        for kw in range(KW):
                            x_in = x_out * S + kw - P
                            # Columns in the padding contribute zero too.
                            if 0 <= x_in < W:
                                acc += input_arr[b_idx, c, y_in, x_in] * filters[f_idx, c, kh, kw]

            output[b_idx, f_idx, y_out, x_out] = acc + bias[f_idx]

    d_out = cuda.device_array((B, F, H_out, W_out), dtype = np.float32)
    bpg_x = (W_out + TPB - 1) // TPB
    bpg_y = (H_out + TPB - 1) // TPB
    bpg_z = B * F

    grid_dims = (bpg_x, bpg_y, bpg_z)
    block_dims = (TPB, TPB, 1)

    conv2d_cuda_kernel[grid_dims, block_dims](x, w, b, d_out)
    cuda.synchronize()

    return d_out

In [None]:
def run_filter_conv_backward(X, F, LO,V,M,step, S=1, P=1,TPB=256):
    """Compute the filter gradient and apply an Adam-style update to it.

    One thread handles one filter weight: it sums
    ``LO[k, f, i, j] * X[k, c, i*S - P + h, j*S - P + w]`` over the batch and
    all output positions, folds that gradient into the moment buffers and
    writes the updated weight.

    Args:
        X: 4-D layer input whose shape unpacks as ``(B, C, H, W)``.
        F: 4-D filters; ``F`` itself is not modified.
        LO: upstream gradient, indexed ``[sample, filter, i, j]``.
        V: second-moment buffer shaped like ``F``; updated in place.
        M: first-moment buffer shaped like ``F``; updated in place.
        step: update counter used for bias correction. It must be non-zero,
            otherwise the correction terms divide by zero.
        S: stride of the forward convolution.
        P: padding of the forward convolution.
        TPB: threads per (1-D) block.

    Returns:
        A float32 device array shaped like ``F`` holding the *updated*
        weights (decay rates 0.9 / 0.999, step size 0.0003, epsilon 1e-8).
    """
    B, C, H, W = X.shape
    B_f, C_f, H_f, W_f = F.shape
    @cuda.jit(cache=True)
    def filter_conv_backward(inp,fil, lo,V,M,step,out):
        # One thread per filter weight, addressed by a flat index.
        tid = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        total_elements = B_f * C_f * H_f * W_f

        if tid >= total_elements:
            return

        # Convert the 1-D index into 4-D filter coordinates
        w_idx = tid % W_f
        tid //= W_f
        h_idx = tid % H_f
        tid //= H_f
        c_idx = tid % C_f
        b_idx = tid // C_f

        # Gradient for filter[b_idx, c_idx, h_idx, w_idx]
        temp_sum = 0.0
        for k in range(B):  # batch of the input
            for i in range(lo.shape[2]):  # height of the output gradient
                for j in range(lo.shape[3]):  # width of the output gradient
                    # Matching position in the (unpadded) input
                    sy = i * S - P + h_idx
                    sx = j * S - P + w_idx

                    # Positions inside the padding contribute zero
                    if 0 <= sy < H and 0 <= sx < W:
                        temp_sum += lo[k, b_idx, i, j] * inp[k, c_idx, sy, sx]
        # The first moment must decay from its own previous value (M), not
        # from the second moment (V).
        M[b_idx, c_idx, h_idx, w_idx] = 0.9 * M[b_idx, c_idx, h_idx, w_idx] + (1-0.9)*temp_sum
        V[b_idx, c_idx, h_idx, w_idx] = 0.999 * V[b_idx, c_idx, h_idx, w_idx] + (1-0.999)*math.pow(temp_sum,2)
        # Bias-corrected moment estimates
        m_train = M[b_idx, c_idx, h_idx, w_idx]/(1 - math.pow(0.9,step))
        v_train = V[b_idx, c_idx, h_idx, w_idx]/(1 - math.pow(0.999,step))
        out[b_idx, c_idx, h_idx, w_idx] = fil[b_idx, c_idx, h_idx, w_idx] - 0.0003 * m_train/(math.sqrt(v_train) + 1e-8)

    # Output array for the updated weights
    d_out = cuda.device_array((B_f, C_f, H_f, W_f), dtype = np.float32)

    # One thread per filter weight
    total_threads_needed = B_f * C_f * H_f * W_f
    threads_per_block = TPB
    blocks_per_grid = (total_threads_needed + TPB -1) // TPB

    # Launch kernel
    filter_conv_backward[blocks_per_grid, threads_per_block](X,F, LO,V,M,step, d_out)
    cuda.synchronize()

    return d_out

In [None]:
def run_input_conv_backward(X, F, LO, S=1, P=1, TPB=16):
    """Back-propagate a convolution's upstream gradient to its input.

    The gradient is computed on the padded input grid and the padding border
    is sliced off before returning. The kernel size is read from
    ``F.shape[2]`` and used for both axes, so square kernels are assumed.

    Args:
        X: 4-D layer input; only its shape ``(B, C, H, W)`` is used.
        F: 4-D filters, indexed ``[out_channel, in_channel, ky, kx]``.
        LO: upstream gradient, indexed ``[sample, out_channel, i, j]``.
        S: stride of the forward convolution.
        P: padding of the forward convolution.
        TPB: threads per block along each of the x and y axes.

    Returns:
        A view of shape ``(B, C, H, W)`` into a float32 device array of
        shape ``(B, C, H + 2*P, W + 2*P)``.
    """
    n_batch, n_in, in_h, in_w = X.shape
    pad_h = in_h + 2 * P
    pad_w = in_w + 2 * P
    ksize = F.shape[2]
    n_out, _, _, _ = F.shape
    last = ksize - 1

    @cuda.jit(cache=True)
    def input_conv_backward(filt, lo, out):
        col = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x  # 0 .. pad_w-1
        row = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y  # 0 .. pad_h-1
        plane = cuda.blockIdx.z
        bi = plane // n_in
        ci = plane % n_in

        if col >= pad_w or row >= pad_h:
            return

        out[bi, ci, row, col] = 0.0
        for oc in range(n_out):
            for i in range(ksize):
                # Offset of this padded-input row relative to kernel row
                # (last - i); it must land exactly on a stride step.
                dy = row + i - last
                for j in range(ksize):
                    dx = col + j - last
                    if dy % S == 0 and dx % S == 0:
                        sy = dy // S
                        sx = dx // S
                        if 0 <= sy < lo.shape[2] and 0 <= sx < lo.shape[3]:
                            out[bi, ci, row, col] += lo[bi, oc, sy, sx] * filt[oc, ci, last - i, last - j]

    grad_padded = cuda.device_array((n_batch, n_in, pad_h, pad_w), dtype=np.float32)
    grid_dims = (
        (pad_w + TPB - 1) // TPB,
        (pad_h + TPB - 1) // TPB,
        n_batch * n_in,
    )
    block_dims = (TPB, TPB, 1)

    input_conv_backward[grid_dims, block_dims](F, LO, grad_padded)
    cuda.synchronize()

    # Drop the padding border so the result lines up with X.
    return grad_padded[:, :, P:pad_h-P, P:pad_w-P]

In [None]:
def run_bias_backward(inp,bias,V,M,step,TPB=64):
    """Compute the bias gradient and apply an Adam-style update to the bias.

    One thread per channel sums ``inp`` over the batch and both spatial axes,
    folds that gradient into the moment buffers and writes the updated bias.

    Args:
        inp: 4-D upstream gradient whose shape unpacks as ``(B, C, H, W)``.
        bias: current bias, indexed by channel.
        V: second-moment buffer, indexed by channel; updated in place.
        M: first-moment buffer, indexed by channel; updated in place.
        step: update counter used for bias correction.
        TPB: threads per (1-D) block.

    Returns:
        A float32 device array of shape ``(C,)`` holding the *updated* bias.
    """
    n_batch, n_ch, height, width = inp.shape

    @cuda.jit(cache=True)
    def bias1x1_backward(grad, bias_in, v_buf, m_buf, t, result):
        ch = cuda.blockIdx.x*cuda.blockDim.x + cuda.threadIdx.x
        if ch >= n_ch:
            return
        # `result[ch]` first serves as the gradient accumulator ...
        result[ch] = 0.0
        for n in range(n_batch):
            for r in range(height):
                for s in range(width):
                    result[ch] += grad[n, ch, r, s]
        m_buf[ch] = 0.9*m_buf[ch] + (1-0.9)*result[ch]
        v_buf[ch] = 0.999 * v_buf[ch] + (1-0.999)*math.pow(result[ch],2)
        m_hat = m_buf[ch]/(1-math.pow(0.9,t))
        v_hat = v_buf[ch]/(1-math.pow(0.999,t))
        # ... and is then overwritten with the updated bias value.
        result[ch] = bias_in[ch] - 0.0003 * m_hat/(math.sqrt(v_hat) + 1e-8)

    new_bias = cuda.device_array((n_ch,), dtype=np.float32)
    n_blocks = (n_ch + TPB - 1)//TPB
    bias1x1_backward[n_blocks, TPB](inp, bias, V, M, step, new_bias)
    cuda.synchronize()
    return new_bias

In [None]:
def run_conv_backward(X,F,B,LO,VF,VB,MF,MB,step,stride=1,padding=1,TPB=16):
    """Run the three backward kernels of one convolution layer.

    Args:
        X: layer input.
        F: filters.
        B: bias.
        LO: upstream gradient of the layer output.
        VF, VB: second-moment buffers for filters and bias.
        MF, MB: first-moment buffers for filters and bias.
        step: update counter forwarded to the filter and bias updates.
        stride: stride of the forward convolution.
        padding: padding of the forward convolution.
        TPB: block size forwarded to the input-gradient kernel only; the
            filter and bias kernels keep their own defaults.

    Returns:
        Tuple ``(input_result, filter_result, bias_result)`` as returned by
        ``run_input_conv_backward``, ``run_filter_conv_backward`` and
        ``run_bias_backward`` respectively.
    """
    filter_result = run_filter_conv_backward(X, F, LO, VF, MF, step, S=stride, P=padding)
    input_result = run_input_conv_backward(X, F, LO, S=stride, P=padding, TPB=TPB)
    bias_result = run_bias_backward(LO, B, VB, MB, step)
    return input_result, filter_result, bias_result

In [None]:
def run_Relu_Cuda(x_relu,TPB=16):
    """Apply ReLU element-wise on the GPU.

    Args:
        x_relu: 4-D array whose shape unpacks as ``(B, C, H, W)``.
        TPB: threads per block along each of the x and y axes.

    Returns:
        A float32 device array of the same shape with negatives and zeros
        mapped to 0.
    """
    B, C, H, W = x_relu.shape
    # Output buffer on the GPU
    d_out = cuda.device_array((B, C, H, W), dtype = np.float32)
    # The kernel uses x for the width axis and y for the height axis, so the
    # grid must be sized accordingly or non-square inputs are only partly
    # covered.
    bpg_x = (W + TPB - 1) // TPB
    bpg_y = (H + TPB - 1) // TPB
    bpg_z = B * C

    @cuda.jit(cache=True)
    def Relu_Cuda(inp, out):
        # blockIdx.z enumerates the (sample, channel) planes.
        z = cuda.blockIdx.z
        b = z // C
        c = z % C
        # output coordinates
        x = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        y = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
        if y<H and x<W:
            out[b, c, y, x] = inp[b, c, y, x] if inp[b, c, y, x] > 0 else 0

    grid_dims  = (bpg_x, bpg_y, bpg_z)
    block_dims = (TPB, TPB, 1)
    Relu_Cuda[grid_dims, block_dims](
        x_relu, d_out
    )
    cuda.synchronize()
    return d_out

In [None]:
def run_relu_backward(X,LO,TPB=16):
    """Back-propagate a gradient through ReLU.

    The upstream gradient passes through where ``X > 0`` and is zeroed
    elsewhere, mirroring the strict ``> 0`` test of the forward kernel.

    Args:
        X: values the forward ReLU was applied to, indexed like ``LO``.
        LO: 4-D upstream gradient whose shape unpacks as ``(B, C, H, W)``.
        TPB: threads per block along each of the x and y axes.

    Returns:
        A float32 device array with the shape of ``LO``.
    """
    B,C,H,W = LO.shape
    @cuda.jit(cache = True)
    def relu_backward(inp,lo,output):
        # blockIdx.z enumerates the (sample, channel) planes.
        z = cuda.blockIdx.z
        b = z//C
        c = z%C
        x = cuda.blockIdx.x*cuda.blockDim.x + cuda.threadIdx.x  # column
        y = cuda.blockIdx.y*cuda.blockDim.y + cuda.threadIdx.y  # row
        # x indexes the width axis and y the height axis.
        if x < W and y < H:
            if inp[b,c,y,x] <= 0:
                output[b,c,y,x]=0
            else:
                output[b,c,y,x] = lo[b,c,y,x]
    d_out = cuda.device_array((B,C,H,W),dtype = np.float32)
    bpg_x = (W + TPB - 1) // TPB
    bpg_y = (H + TPB - 1) // TPB
    bpg_z = B*C
    griddim  = (bpg_x,bpg_y,bpg_z)
    blockdim = (TPB,TPB,1)
    relu_backward[griddim,blockdim](X,LO,d_out)
    cuda.synchronize()
    return d_out

In [None]:
def run_maxpool2d_cuda(x,P=0, S=2, TPB=16):
    """3x3 max pooling with stride ``S`` and no padding on the GPU.

    Args:
        x: 4-D array whose shape unpacks as ``(B, C, H, W)``.
        P: accepted but not used by this implementation.
        S: stride of the pooling window.
        TPB: threads per block along each of the x and y axes.

    Returns:
        A float32 device array of shape
        ``(B, C, (H - 3)//S + 1, (W - 3)//S + 1)``.
    """
    n_batch, n_ch, in_h, in_w = x.shape
    win = 3
    # Output size
    out_h = (in_h - win)//S + 1
    out_w = (in_w - win)//S + 1

    @cuda.jit(cache = True)
    def maxpool2d_cuda(inp, out):
        # batch and channel are merged into blockIdx.z
        plane = cuda.blockIdx.z
        bi = plane // n_ch
        ci = plane % n_ch

        # output coordinates
        col = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        row = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y

        if bi >= n_batch or ci >= n_ch or row >= out_h or col >= out_w:
            return

        # top-left corner of the pooling window
        top = row * S
        left = col * S

        # start from the first element, then scan the whole window
        best = inp[bi, ci, top, left]
        for dy in range(win):
            for dx in range(win):
                cand = inp[bi, ci, top + dy, left + dx]
                if cand > best:
                    best = cand

        out[bi, ci, row, col] = best

    # Output buffer on the GPU
    pooled = cuda.device_array((n_batch, n_ch, out_h, out_w), dtype=np.float32)

    grid_dims = (
        (out_w + TPB - 1) // TPB,
        (out_h + TPB - 1) // TPB,
        n_batch * n_ch,
    )
    block_dims = (TPB, TPB, 1)
    maxpool2d_cuda[grid_dims, block_dims](x, pooled)
    cuda.synchronize()
    return pooled

In [None]:
def run_maxpooling_backward(X,LO,K=3,stride=2,TPB=16):
    """Route a max-pool upstream gradient back to the window maxima.

    One thread per pooled output re-scans its ``K x K`` window in ``X``,
    finds the first position holding the maximum and adds the upstream
    gradient there.

    Args:
        X: 4-D input of the forward pooling, shape ``(B, C, H, W)``.
        LO: 4-D upstream gradient, shape ``(B_lo, C_lo, H_lo, W_lo)``.
        K: pooling window size.
        stride: pooling stride.
        TPB: threads per block along each of the x and y axes.

    Returns:
        A float32 device array shaped like ``X``; zero wherever no window
        selected the element.
    """
    B,C,H,W = X.shape
    B_lo,C_lo,H_lo,W_lo = LO.shape
    @cuda.jit(cache=True)
    def maxpooling_backward(inp,lo,out):
        x = cuda.blockIdx.x*cuda.blockDim.x + cuda.threadIdx.x
        y = cuda.blockIdx.y*cuda.blockDim.y + cuda.threadIdx.y
        z = cuda.blockIdx.z
        b = z//C
        c = z%C
        if x < W_lo and y < H_lo:
            basex= x*stride
            basey= y*stride
            # Seed the search with the first window element (as the forward
            # kernel does) so the arg-max indices are always defined, even
            # when no value compares greater than the seed.
            index_y = basey
            index_x = basex
            maxval = inp[b, c, basey, basex]
            for i in range(K):
                for j in range(K):
                    val = inp[b, c, basey + i, basex + j]
                    if val > maxval:
                        maxval=val
                        index_y = basey + i
                        index_x = basex + j
            # Windows overlap whenever K > stride, so several threads may
            # target the same input element; the add has to be atomic.
            cuda.atomic.add(out, (b, c, index_y, index_x), lo[b, c, y, x])
    # The output must start at zero because the kernel only accumulates.
    dout = cuda.to_device(np.zeros((B,C,H,W),dtype = np.float32))

    bpx = (W_lo +TPB -1)//TPB
    bpy = (H_lo +TPB -1)//TPB
    bpz = B_lo * C_lo
    griddim = (bpx,bpy,bpz)
    blockdim = (TPB,TPB,1)
    maxpooling_backward[griddim,blockdim](X,LO,dout)
    cuda.synchronize()
    return dout

In [None]:
def run_concat_cuda(x_np_1,x_np_2,TPB=16):
    """Concatenate two 4-D arrays along the channel axis on the GPU.

    Args:
        x_np_1: 4-D array whose shape unpacks as ``(B, C1, H, W)``.
        x_np_2: 4-D array of shape ``(B, C2, H, W)``; ``C2`` may differ from
            ``C1``.
        TPB: threads per block along each of the x and y axes.

    Returns:
        A float32 device array of shape ``(B, C1 + C2, H, W)`` with the
        channels of ``x_np_1`` first.

    Raises:
        ValueError: if the batch or spatial dimensions of the inputs differ.
    """
    B, C1, H, W = x_np_1.shape
    B2, C2, H2, W2 = x_np_2.shape
    if (B2, H2, W2) != (B, H, W):
        raise ValueError(
            "inputs must agree on batch and spatial dimensions, got "
            f"{tuple(x_np_1.shape)} and {tuple(x_np_2.shape)}"
        )
    C_out = C1 + C2

    @cuda.jit(cache=True)
    def Concat_Cuda(inp_1,inp_2, out):
        # batch and output channel are merged into blockIdx.z
        z = cuda.blockIdx.z
        b = z // C_out
        c = z % C_out

        # output coordinates
        x = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        y = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
        if y < H and x < W:
            if c < C1:
                out[b, c, y, x] = inp_1[b, c, y, x]
            else:
                out[b, c, y, x] = inp_2[b, c - C1, y, x]

    d_out = cuda.device_array((B, C_out, H, W), dtype = np.float32)
    # x covers the width axis and y the height axis, matching the kernel.
    bpg_x = (W + TPB - 1) // TPB
    bpg_y = (H + TPB - 1) // TPB
    bpg_z = B * C_out

    grid_dims  = (bpg_x, bpg_y, bpg_z)
    block_dims = (TPB, TPB, 1)

    Concat_Cuda[grid_dims, block_dims](
        x_np_1,x_np_2, d_out
    )
    cuda.synchronize()
    return d_out

In [None]:
def run_sum_matrix(inp_1,inp_2,TPB=16):
    """Add two 4-D arrays element-wise on the GPU.

    Args:
        inp_1: 4-D array whose shape unpacks as ``(B, C, H, W)``.
        inp_2: array indexed over the same range as ``inp_1``.
        TPB: threads per block along each of the x and y axes.

    Returns:
        A float32 device array of shape ``(B, C, H, W)``.
    """
    B,C,H,W = inp_1.shape
    # x covers the width axis and y the height axis, matching the kernel.
    bpx = (W + TPB - 1)//TPB
    bpy = (H + TPB - 1)//TPB
    @cuda.jit(cache = True)
    def sum_matrix(matrix_1,matrix_2,out):
        x = cuda.blockIdx.x*cuda.blockDim.x + cuda.threadIdx.x  # column
        y = cuda.blockIdx.y*cuda.blockDim.y + cuda.threadIdx.y  # row
        # blockIdx.z enumerates the (sample, channel) planes.
        z = cuda.blockIdx.z
        b = z//C
        c = z%C
        if x < W and y < H:
            out[b,c,y,x] = matrix_1[b,c,y,x] + matrix_2[b,c,y,x]
    dout = cuda.device_array((B,C,H,W),dtype = np.float32)
    bpz = B*C
    griddim = (bpx,bpy,bpz)
    blockdim = (TPB,TPB,1)
    sum_matrix[griddim,blockdim](inp_1,inp_2,dout)
    cuda.synchronize()
    return dout

In [None]:
def fire_module_cuda(x,s_w1,s_b1,e_w1,e_b1,e_w3,e_b3):
    """Run one fire module: squeeze conv, ReLU, two expand convs, concat, ReLU.

    Args:
        x: module input.
        s_w1, s_b1: weights and bias of the squeeze convolution (P=0, S=1).
        e_w1, e_b1: weights and bias of the first expand convolution (P=0).
        e_w3, e_b3: weights and bias of the second expand convolution (P=1).

    Returns:
        Dict holding every intermediate result under the keys ``squeeze``,
        ``relu_squeeze``, ``expand_1x1``, ``expand_3x3``, ``concat`` and
        ``firemodule_out``.
    """
    squeezed = run_conv2d_cuda(x, s_w1, s_b1, P=0, S=1, TPB=16)
    squeezed_act = run_Relu_Cuda(squeezed, TPB=16)

    # Both expand branches read the same activated squeeze output.
    branch_1x1 = run_conv2d_cuda(squeezed_act, e_w1, e_b1, P=0, S=1, TPB=16)
    branch_3x3 = run_conv2d_cuda(squeezed_act, e_w3, e_b3, P=1, S=1, TPB=16)

    # Stack the branches along the channel axis, then activate.
    merged = run_concat_cuda(branch_1x1, branch_3x3, TPB=16)
    activated = run_Relu_Cuda(merged, TPB=16)

    return {
        "squeeze": squeezed,
        "relu_squeeze": squeezed_act,
        "expand_1x1": branch_1x1,
        "expand_3x3": branch_3x3,
        "concat": merged,
        "firemodule_out": activated,
    }

In [None]:
def run_firemodule_backward(X_previous_layer,
                            weights_squeeze1x1,bias_squeeze1x1,VFs1,VBs1,MFs1,MBs1,
                            X_squeeze,X_relu_squeeze,
                            weights_expand1x1,bias_expand1x1,VFe1,VBe1,MFe1,MBe1,
                            weights_expand3x3,bias_expand3x3,VFe3,VBe3,MFe3,MBe3,
                            X_concat,LO,step):
    """Back-propagate through one fire module.

    The upstream gradient goes back through the final ReLU, is split in half
    along the channel axis for the two expand branches, and the two branch
    input gradients are summed before going back through the squeeze ReLU
    and the squeeze convolution.

    Args:
        X_previous_layer: input the module received in the forward pass.
        weights_squeeze1x1, bias_squeeze1x1: squeeze conv parameters.
        VFs1, VBs1, MFs1, MBs1: moment buffers of the squeeze conv.
        X_squeeze: squeeze conv output before its ReLU.
        X_relu_squeeze: squeeze conv output after its ReLU.
        weights_expand1x1, bias_expand1x1: first expand conv parameters.
        VFe1, VBe1, MFe1, MBe1: moment buffers of the first expand conv.
        weights_expand3x3, bias_expand3x3: second expand conv parameters.
        VFe3, VBe3, MFe3, MBe3: moment buffers of the second expand conv.
        X_concat: concatenated expand outputs before the final ReLU.
        LO: upstream gradient of the module output.
        step: update counter forwarded to every ``run_conv_backward`` call.

    Returns:
        Dict with ``"LO"`` (result for the module input) and, per conv, the
        filter and bias results of ``run_conv_backward`` under
        ``"LF_*"`` / ``"LB_*"``.
    """
    grad_concat = run_relu_backward(X_concat, LO, TPB=16)

    # First half of the channels belongs to the 1x1 branch, the rest to 3x3.
    half = grad_concat.shape[1] // 2
    grad_e1 = grad_concat[:, :half, :, :]
    grad_e3 = grad_concat[:, half:, :, :]

    gx_e1, w_e1, b_e1 = run_conv_backward(
        X_relu_squeeze, weights_expand1x1, bias_expand1x1, grad_e1,
        VFe1, VBe1, MFe1, MBe1, step, stride=1, padding=0, TPB=16)
    gx_e3, w_e3, b_e3 = run_conv_backward(
        X_relu_squeeze, weights_expand3x3, bias_expand3x3, grad_e3,
        VFe3, VBe3, MFe3, MBe3, step, stride=1, padding=1, TPB=16)

    # Both branches consumed the same tensor, so their gradients add up.
    grad_relu_squeeze = run_sum_matrix(gx_e3, gx_e1, TPB=16)
    grad_squeeze = run_relu_backward(X_squeeze, grad_relu_squeeze, TPB=16)

    gx_prev, w_s, b_s = run_conv_backward(
        X_previous_layer, weights_squeeze1x1, bias_squeeze1x1, grad_squeeze,
        VFs1, VBs1, MFs1, MBs1, step, stride=1, padding=0, TPB=16)

    results = {}
    results["LO"] = gx_prev
    results["LF_s1x1"] = w_s
    results["LB_s1x1"] = b_s
    results["LF_e1x1"] = w_e1
    results["LB_e1x1"] = b_e1
    results["LF_e3x3"] = w_e3
    results["LB_e3x3"] = b_e3
    return results

In [None]:
def squeezenet_forward_cuda(x_cuda,weights_cuda):
    """Run the SqueezeNet forward pass and keep every intermediate result.

    Args:
        x_cuda: network input passed to the first convolution.
        weights_cuda: mapping from parameter names (``'features.0.weight'``,
            ``'features.<i>.squeeze.weight'``, ...,
            ``'classifier.0.bias'``) to the parameter arrays.

    Returns:
        Dict with the output of each stage (``conv1`` ... ``avgpool10``).
        Each ``fire<N>`` entry is the dict returned by ``fire_module_cuda``.
        No softmax is applied here; ``backward_squeezenet_cuda`` applies it
        to ``'avgpool10'``.
    """
    def _fire(inp, layer_idx):
        # Look up the six parameters of fire layer `layer_idx` and run it.
        prefix = f'features.{layer_idx}'
        return fire_module_cuda(
            inp,
            weights_cuda[f'{prefix}.squeeze.weight'],
            weights_cuda[f'{prefix}.squeeze.bias'],
            weights_cuda[f'{prefix}.expand1x1.weight'],
            weights_cuda[f'{prefix}.expand1x1.bias'],
            weights_cuda[f'{prefix}.expand3x3.weight'],
            weights_cuda[f'{prefix}.expand3x3.bias'],
        )

    conv1 = run_conv2d_cuda_shared_2(
        x_cuda, weights_cuda['features.0.weight'], weights_cuda['features.0.bias'],
        P=3, S=2, TPB=16)
    conv1_relu = run_Relu_Cuda(conv1, TPB=16)
    maxpool1 = run_maxpool2d_cuda(conv1_relu, P=0, S=2, TPB=32)

    # Fire2-4 use parameter indices 3-5.
    fire2 = _fire(maxpool1, 3)
    fire3 = _fire(fire2["firemodule_out"], 4)
    fire4 = _fire(fire3["firemodule_out"], 5)
    maxpool4 = run_maxpool2d_cuda(fire4["firemodule_out"], P=0, S=2, TPB=32)

    # Fire5-8 use parameter indices 7-10.
    fire5 = _fire(maxpool4, 7)
    fire6 = _fire(fire5["firemodule_out"], 8)
    fire7 = _fire(fire6["firemodule_out"], 9)
    fire8 = _fire(fire7["firemodule_out"], 10)
    maxpool8 = run_maxpool2d_cuda(fire8["firemodule_out"], P=0, S=2, TPB=32)

    # Fire9 uses parameter index 12.
    fire9 = _fire(maxpool8, 12)

    conv10 = run_conv2d_cuda(
        fire9["firemodule_out"],
        weights_cuda['classifier.0.weight'], weights_cuda['classifier.0.bias'],
        P=0, S=1, TPB=8)
    avgpool10 = run_global_average_Cuda(conv10, TPB=8)

    return {
        'conv1': conv1,
        'conv1_relu': conv1_relu,
        'maxpool1': maxpool1,
        'fire2': fire2,
        'fire3': fire3,
        'fire4': fire4,
        'maxpool4': maxpool4,
        'fire5': fire5,
        'fire6': fire6,
        'fire7': fire7,
        'fire8': fire8,
        'maxpool8': maxpool8,
        'fire9': fire9,
        'conv10': conv10,
        'avgpool10': avgpool10,
    }

In [None]:
def backward_squeezenet_cuda(X_input,X,weight,Y_cuda,V,M,step):
    """Run the backward pass of the CUDA SqueezeNet, last layer to first.

    The loss gradient is produced from the softmax output and then chained
    through the ``run_*_backward`` helpers in reverse layer order: conv10,
    fire9, maxpool8, fire8..fire5, maxpool4, fire4..fire2, maxpool1, ReLU
    and finally the first convolution.

    Args:
        X_input: Network input batch; only used for the first convolution's
            filter backward step.
        X: Activation cache from the forward pass. Keys read here are
            ``avgpool10``, ``conv1``, ``conv1_relu``, ``maxpool1``,
            ``maxpool4``, ``maxpool8`` and ``fire2`` .. ``fire9`` (each a
            mapping with ``squeeze``, ``relu_squeeze``, ``concat`` and
            ``firemodule_out``).
        weight: Mapping from parameter name (``features.N...`` /
            ``classifier.0...``) to the parameter array.
        Y_cuda: Targets handed to ``run_softmax_backward``.
        V, M: Per-parameter state keyed like ``weight`` and forwarded to the
            helpers unchanged (presumably Adam moment buffers -- confirm
            which one each helper treats as first/second moment).
        step: Step counter forwarded to the helpers.

    Returns:
        dict keyed by parameter name, holding what the helpers returned for
        that parameter (the ``LF_*`` / ``LB_*`` entries for fire modules).
        NOTE(review): the training loop assigns this dict to
        ``weights_cuda``, so the values are presumably updated parameters
        rather than raw gradients -- confirm against the helpers.
    """

    def fire_backward(prefix, module_input, cache, upstream):
        # One fire module. For squeeze, expand1x1 and expand3x3 the helper
        # receives (weight, bias) pairs taken from `weight`, `V` and `M`,
        # in exactly that order.
        sq = f"{prefix}.squeeze"
        e1 = f"{prefix}.expand1x1"
        e3 = f"{prefix}.expand3x3"
        return run_firemodule_backward(
            module_input,
            weight[f"{sq}.weight"], weight[f"{sq}.bias"],
            V[f"{sq}.weight"], V[f"{sq}.bias"],
            M[f"{sq}.weight"], M[f"{sq}.bias"],
            cache["squeeze"], cache["relu_squeeze"],
            weight[f"{e1}.weight"], weight[f"{e1}.bias"],
            V[f"{e1}.weight"], V[f"{e1}.bias"],
            M[f"{e1}.weight"], M[f"{e1}.bias"],
            weight[f"{e3}.weight"], weight[f"{e3}.bias"],
            V[f"{e3}.weight"], V[f"{e3}.bias"],
            M[f"{e3}.weight"], M[f"{e3}.bias"],
            cache["concat"], upstream, step)

    # ---- classifier head: softmax -> global average pool -> conv10 ----
    probs = run_softmax_cuda(X["avgpool10"])
    grad_avgpool10 = run_softmax_backward(probs, Y_cuda)
    grad_conv10 = run_avgpool_backward(grad_avgpool10)
    grad_fire9, conv10_w, conv10_b = run_conv_backward(
        X["fire9"]["firemodule_out"],
        weight["classifier.0.weight"], weight["classifier.0.bias"],
        grad_conv10,
        V["classifier.0.weight"], V["classifier.0.bias"],
        M["classifier.0.weight"], M["classifier.0.bias"],
        step,
        stride=1, padding=0, TPB=16)

    # ---- fire9 (features.12), then back through maxpool8 ----
    res_fire9 = fire_backward("features.12", X["maxpool8"], X["fire9"], grad_fire9)
    grad_fire8 = run_maxpooling_backward(
        X["fire8"]["firemodule_out"], res_fire9["LO"], K=3, stride=2)

    # ---- fire8 .. fire5 (features.10, 9, 8, 7), then maxpool4 ----
    res_fire8 = fire_backward(
        "features.10", X["fire7"]["firemodule_out"], X["fire8"], grad_fire8)
    res_fire7 = fire_backward(
        "features.9", X["fire6"]["firemodule_out"], X["fire7"], res_fire8["LO"])
    res_fire6 = fire_backward(
        "features.8", X["fire5"]["firemodule_out"], X["fire6"], res_fire7["LO"])
    res_fire5 = fire_backward(
        "features.7", X["maxpool4"], X["fire5"], res_fire6["LO"])
    grad_fire4 = run_maxpooling_backward(
        X["fire4"]["firemodule_out"], res_fire5["LO"], K=3, stride=2)

    # ---- fire4 .. fire2 (features.5, 4, 3), then maxpool1 and ReLU ----
    res_fire4 = fire_backward(
        "features.5", X["fire3"]["firemodule_out"], X["fire4"], grad_fire4)
    res_fire3 = fire_backward(
        "features.4", X["fire2"]["firemodule_out"], X["fire3"], res_fire4["LO"])
    res_fire2 = fire_backward(
        "features.3", X["maxpool1"], X["fire2"], res_fire3["LO"])
    grad_conv1_relu = run_maxpooling_backward(
        X["conv1_relu"], res_fire2["LO"], K=3, stride=2)
    grad_conv1 = run_relu_backward(X["conv1"], grad_conv1_relu, TPB=16)

    # ---- conv1 (features.0): bias first, then filters (stride 2, padding 3) ----
    conv1_bias = run_bias_backward(
        grad_conv1,
        weight["features.0.bias"],
        V["features.0.bias"],
        M["features.0.bias"],
        step)
    conv1_filters = run_filter_conv_backward(
        X_input,
        weight["features.0.weight"],
        grad_conv1,
        V["features.0.weight"],
        M["features.0.weight"],
        step,
        2, 3)

    # Assemble the result in forward-layer order.
    result = {
        'features.0.weight': conv1_filters,
        'features.0.bias': conv1_bias,
    }
    fire_results = (
        ("features.3", res_fire2),
        ("features.4", res_fire3),
        ("features.5", res_fire4),
        ("features.7", res_fire5),
        ("features.8", res_fire6),
        ("features.9", res_fire7),
        ("features.10", res_fire8),
        ("features.12", res_fire9),
    )
    for prefix, res in fire_results:
        result[f"{prefix}.squeeze.weight"] = res["LF_s1x1"]
        result[f"{prefix}.squeeze.bias"] = res["LB_s1x1"]
        result[f"{prefix}.expand1x1.weight"] = res["LF_e1x1"]
        result[f"{prefix}.expand1x1.bias"] = res["LB_e1x1"]
        result[f"{prefix}.expand3x3.weight"] = res["LF_e3x3"]
        result[f"{prefix}.expand3x3.bias"] = res["LB_e3x3"]
    result['classifier.0.weight'] = conv10_w
    result['classifier.0.bias'] = conv10_b
    return result

In [None]:
import torch.nn.functional as F
class Fire(nn.Module):
    """SqueezeNet fire module: a 1x1 squeeze conv feeding parallel 1x1 and 3x3 expand convs.

    Args:
        in_channels: Channels of the input feature map.
        squeeze_channels: Output channels of the 1x1 squeeze convolution.
        expand1x1_channels: Output channels of the 1x1 expand branch.
        expand3x3_channels: Output channels of the 3x3 expand branch
            (built with ``padding=1``).
    """

    def __init__(self, in_channels, squeeze_channels, expand1x1_channels, expand3x3_channels):
        super().__init__()
        self.squeeze      = nn.Conv2d(in_channels, squeeze_channels, kernel_size=1)
        self.expand1x1    = nn.Conv2d(squeeze_channels, expand1x1_channels, kernel_size=1)
        self.expand3x3    = nn.Conv2d(squeeze_channels, expand3x3_channels, kernel_size=3, padding=1)

    def forward(self, x, return_intermediates=False):
        """Apply the fire module to ``x``.

        Args:
            x: Input tensor accepted by the squeeze ``nn.Conv2d``.
            return_intermediates: When False (default) only the final
                activation is returned. When True, a dict with every
                intermediate tensor is returned instead.

        Returns:
            The ReLU of the channel-wise concatenation of both expand
            branches, or -- with ``return_intermediates=True`` -- a dict with
            the keys ``squeeze``, ``relu_squeeze``, ``expand1x1``,
            ``expand3x3``, ``concat`` and ``firemodule_out``.
        """
        squeeze_out = self.squeeze(x)
        relu_squeeze = F.relu(squeeze_out)
        expand1x1_out = self.expand1x1(relu_squeeze)
        expand3x3_out = self.expand3x3(relu_squeeze)
        concat = torch.cat([expand1x1_out, expand3x3_out], dim=1)
        fire_out = F.relu(concat)
        if return_intermediates:
            # Key names follow the per-module cache that
            # backward_squeezenet_cuda reads (squeeze / relu_squeeze /
            # concat / firemodule_out) so both paths can be compared.
            return {
                "squeeze": squeeze_out,
                "relu_squeeze": relu_squeeze,
                "expand1x1": expand1x1_out,
                "expand3x3": expand3x3_out,
                "concat": concat,
                "firemodule_out": fire_out,
            }
        return fire_out
class SqueezeNetManual(nn.Module):
    """Hand-assembled SqueezeNet whose parameter names line up with the CUDA pipeline.

    Layout: ``features`` (conv1, ReLU, max-pool, fire2-4, max-pool, fire5-8,
    max-pool, fire9), then ``classifier`` (a single 1x1 conv to
    ``num_classes`` channels) and a global average pool.

    Every parameter gets a tensor hook that stores a copy of its most recent
    gradient in ``self.gradients`` under the parameter's name.
    """

    def __init__(self, num_classes):
        super().__init__()
        # The position inside the Sequential defines the "features.N" prefix
        # of each parameter name, so the order below must not change.
        feature_layers = [
            nn.Conv2d(3, 96, kernel_size=7, stride=2, padding=3),   # conv1
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),                   # maxpool1
            Fire(96,  16,  64,  64),                                 # fire2
            Fire(128, 16,  64,  64),                                 # fire3
            Fire(128, 32, 128, 128),                                 # fire4
            nn.MaxPool2d(kernel_size=3, stride=2),                   # maxpool4
            Fire(256, 32, 128, 128),                                 # fire5
            Fire(256, 48, 192, 192),                                 # fire6
            Fire(384, 48, 192, 192),                                 # fire7
            Fire(384, 64, 256, 256),                                 # fire8
            nn.MaxPool2d(kernel_size=3, stride=2),                   # maxpool8
            Fire(512, 64, 256, 256),                                 # fire9
        ]
        self.features = nn.Sequential(*feature_layers)
        conv10 = nn.Conv2d(512, num_classes, kernel_size=1)
        self.classifier = nn.Sequential(conv10)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        # Gradient capture: parameter name -> copy of the latest gradient.
        self.gradients = {}
        self._register_gradient_hooks()

    def _gradient_recorder(self, param_name):
        """Build a tensor hook that saves a clone of the gradient under ``param_name``."""
        def record(grad):
            self.gradients[param_name] = grad.clone()
        return record

    def _register_gradient_hooks(self):
        """Attach a gradient-recording hook to every named parameter of the model."""
        for param_name, param in self.named_parameters():
            param.register_hook(self._gradient_recorder(param_name))

    def clear_gradients(self):
        """Drop the gradients recorded during the previous step."""
        self.gradients.clear()

    def forward(self, x):
        """Return per-sample class scores flattened to ``(batch, -1)``."""
        pooled = self.avgpool(self.classifier(self.features(x)))
        return pooled.view(pooled.size(0), -1)

In [None]:
# Dataset root with a 'train' and a 'test' split, each loaded through ImageFolder.
DIR_ROOT = '/kaggle/input/tomato-diseases'
DIR_TRAIN, DIR_TEST = (os.path.join(DIR_ROOT, split) for split in ('train', 'test'))


def _to_signed_unit_range(x):
    """Rescale a tensor with values in [0, 1] to values in [-1, 1]."""
    return x * 2.0 - 1.0


# Preprocessing shared by both splits:
#   1. resize to 224x224 with bilinear interpolation (stated explicitly),
#   2. convert to a float tensor in [0, 1],
#   3. rescale to [-1, 1].
_preprocessing_steps = [
    transforms.Resize((224,224),
                      interpolation=transforms.InterpolationMode.BILINEAR),
    transforms.ToTensor(),
    transforms.Lambda(_to_signed_unit_range),
]
transform = transforms.Compose(_preprocessing_steps)

train_ds = datasets.ImageFolder(DIR_TRAIN, transform=transform)
test_ds  = datasets.ImageFolder(DIR_TEST,  transform=transform)
# Class names as discovered by ImageFolder on the training split.
categories = train_ds.classes

In [None]:
# Loader parameters
BATCH_SIZE     = 32
SHUFFLE        = True
train_loader = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=SHUFFLE,
)

val_loader = DataLoader(
    test_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,           # validation data is usually not shuffled
)

# Quick sanity check. len(loader) is the number of *batches*, so the sample
# counts are taken from the datasets and the batch counts are shown next to them.
print(f'Number of training samples: {len(train_ds)} ({len(train_loader)} batches)')
print(f'Number of validation samples: {len(test_ds)} ({len(val_loader)} batches)')
print(f'Classes: {categories}')

In [None]:
import torch.optim as optim
# Reference PyTorch model, placed on the GPU in float32. Its freshly
# initialised state_dict is the starting point for the hand-written CUDA
# training pipeline.
model = SqueezeNetManual(num_classes=10)
model = model.to("cuda", dtype=torch.float32)
state = model.state_dict()

In [None]:
# Host-side (NumPy) copies of every parameter, plus two zero-initialised
# buffers per parameter with matching shape and dtype for the optimizer.
weights_np = {}
M_adam_np = {}
V_adam_np = {}
for _param_name, _param_tensor in state.items():
    _host_values = _param_tensor.cpu().numpy()
    weights_np[_param_name] = _host_values
    M_adam_np[_param_name] = np.zeros_like(_host_values)
    V_adam_np[_param_name] = np.zeros_like(_host_values)

In [None]:
from tqdm import tqdm
# Training driver for the hand-written CUDA SqueezeNet.
# Per epoch: upload parameters and optimizer state, run forward/backward for
# every batch, then copy everything back to the host before the device reset.
epochs = 5
step = 0
Loss_container = []
Acc_container = []
for epoch in range(epochs):
    # Debug snapshot: optimizer state and bias of the classifier, free/total GPU memory.
    print(M_adam_np["classifier.0.bias"].flatten())
    print(V_adam_np["classifier.0.bias"].flatten())
    print(weights_np["classifier.0.bias"].flatten())
    print(cuda.current_context().get_memory_info())
    print(f"\nEpoch {epoch + 1}/{epochs}")
    # Re-uploaded every epoch because the device is reset at the end of each one.
    weights_cuda = {k: cuda.to_device(v) for k, v in weights_np.items()}
    M_adam_cuda = {k: cuda.to_device(v) for k, v in M_adam_np.items()}
    V_adam_cuda = {k: cuda.to_device(v) for k, v in V_adam_np.items()}
    running_loss, running_corr, running_samples = 0.0, 0, 0
    # 1-based epoch number, consistent with the prints above and below.
    train_iter = tqdm(train_loader, desc=f"[Epoch {epoch + 1}/{epochs}] Training", leave=False)
    for imgs, labels in train_iter:
        step +=1
        x_cuda = cuda.to_device(imgs.cpu().numpy())
        y_np   = labels.cpu().numpy()
        # One-hot encode the labels for the backward pass.
        Y = np.zeros((y_np.size, 10), dtype=np.float32)
        Y[np.arange(y_np.size), y_np] = 1
        Y_cuda = cuda.to_device(Y)

        # Forward
        x_cuda_forward = squeezenet_forward_cuda(x_cuda, weights_cuda)
        X_output = run_softmax_cuda(x_cuda_forward["avgpool10"])
        # Single device-to-host copy; loss and accuracy are both computed from
        # it instead of reading device-array elements one at a time.
        output_cpu = X_output.copy_to_host()

        # Loss: summed cross-entropy, i.e. -log of the probability assigned to
        # the true class of each sample. Accumulated in float64.
        epsilon = 1e-20
        true_class_probs = output_cpu[np.arange(y_np.size), y_np].astype(np.float64)
        batch_loss = float(-np.log(true_class_probs + epsilon).sum())

        running_loss += batch_loss

        # Accuracy
        predicted = np.argmax(output_cpu, axis=1)
        running_corr += np.sum(predicted == y_np)
        running_samples += y_np.shape[0]
        memory= cuda.current_context().get_memory_info()
        # Update the progress-bar postfix with running averages.
        train_iter.set_postfix(loss=running_loss / running_samples,
                               acc=running_corr / running_samples,
                               freemem = memory)
        # NOTE(review): backward_squeezenet_cuda is declared as (..., V, M, step)
        # but receives (M_adam_cuda, V_adam_cuda) here -- confirm that the
        # helpers treat the buffers the way these names suggest.
        weights_cuda = backward_squeezenet_cuda(x_cuda,x_cuda_forward,weights_cuda,Y_cuda,
                                                M_adam_cuda,V_adam_cuda,step)
    train_loss = running_loss / running_samples
    train_acc  = running_corr / running_samples
    # Pull everything back to the host before the device is reset.
    weights_np = {k: v.copy_to_host() for k, v in weights_cuda.items()}
    M_adam_np  = {k: v.copy_to_host() for k, v in M_adam_cuda.items()}
    V_adam_np  = {k: v.copy_to_host() for k, v in V_adam_cuda.items()}
    # NOTE(review): presumably a workaround to release GPU memory between
    # epochs. The PyTorch `model` was also moved to this GPU -- confirm the
    # reset does not invalidate it if it is used afterwards.
    cuda.get_current_device().reset()
    Loss_container.append(train_loss)
    Acc_container.append(train_acc)
    print(f"Epoch {epoch + 1}/{epochs} | "
          f"Train loss: {train_loss:.4f}, acc: {train_acc:.4f}")
    print(Loss_container)
    print(Acc_container)
# NOTE(review): each keyword below is a dict, so NumPy has to store it as a
# pickled object array; loading presumably needs np.load(..., allow_pickle=True).
np.savez('model_fintuning_by_Adam.npz',
                 weights=weights_np,
                 m_adam=M_adam_np,
                 v_adam=V_adam_np)