test modules

In [1]:
%run homework_modules.ipynb

--- GlobalMaxPool2d Test ---
Input shape: (2, 3, 4, 5)
Output shape: (2, 3), Expected shape: (2, 3)
Output matches expected: True
Gradient input shape: (2, 3, 4, 5)
All zero-masked grads are zero: True
Number of non-zero grads: 6

--- GlobalAvgPool2d Test ---
Input shape: (2, 3, 4, 5)
Output shape: (2, 3), Expected shape: (2, 3)
Output matches expected: True
Gradient input shape: (2, 3, 4, 5)
Gradient input matches expected: True


In [2]:
import unittest

import numpy as np
import torch
from torch.autograd import Variable

In [3]:
class TestLayers(unittest.TestCase):
    def assertNumpyClose(self, a, b, atol=1e-6, rtol=1e-5, msg=''):
        self.assertTrue(np.allclose(a, b, atol=atol, rtol=rtol),
                        msg=f"{msg}\nExpected:\n{b}\nGot:\n{a}\nDifference:\n{a-b}\nMax Diff: {np.max(np.abs(a-b))}")

    def test_Linear(self):
        np.random.seed(42)
        torch.manual_seed(42)
        batch_size, n_in, n_out = 2, 3, 4
        for _ in range(100):
            torch_layer = torch.nn.Linear(n_in, n_out)
            custom_layer = Linear(n_in, n_out)
            custom_layer.W = torch_layer.weight.data.numpy().copy()
            custom_layer.b = torch_layer.bias.data.numpy().copy()
            layer_input = np.random.uniform(-10, 10, (batch_size, n_in)).astype(np.float32)
            next_layer_grad = np.random.uniform(-10, 10, (batch_size, n_out)).astype(np.float32)

            custom_layer_output = custom_layer.forward(layer_input)
            layer_input_var = Variable(torch.from_numpy(layer_input), requires_grad=True)
            torch_layer_output_var = torch_layer(layer_input_var)
            self.assertNumpyClose(custom_layer_output, torch_layer_output_var.data.numpy(), msg="Linear forward")

            custom_layer.zeroGradParameters()
            custom_layer_grad = custom_layer.backward(layer_input, next_layer_grad)
            torch_layer_output_var.backward(torch.from_numpy(next_layer_grad))
            torch_layer_grad_var = layer_input_var.grad
            self.assertNumpyClose(custom_layer_grad, torch_layer_grad_var.data.numpy(), msg="Linear gradInput")

            weight_grad = custom_layer.gradW
            bias_grad = custom_layer.gradb
            torch_weight_grad = torch_layer.weight.grad.data.numpy()
            torch_bias_grad = torch_layer.bias.grad.data.numpy()
            self.assertNumpyClose(weight_grad, torch_weight_grad, msg="Linear gradW")
            self.assertNumpyClose(bias_grad, torch_bias_grad, msg="Linear gradb")

    def test_SoftMax(self):
        np.random.seed(42)
        torch.manual_seed(42)
        batch_size, n_in = 2, 4
        for _ in range(100):
            torch_layer = torch.nn.Softmax(dim=1)
            custom_layer = SoftMax()
            layer_input = np.random.uniform(-10, 10, (batch_size, n_in)).astype(np.float32)
            next_layer_grad = np.random.uniform(-1, 1, (batch_size, n_in)).astype(np.float32)

            custom_layer_output = custom_layer.forward(layer_input)
            layer_input_var = Variable(torch.from_numpy(layer_input), requires_grad=True)
            torch_layer_output_var = torch_layer(layer_input_var)
            self.assertNumpyClose(custom_layer_output, torch_layer_output_var.data.numpy(), atol=1e-7, msg="SoftMax forward")

            custom_layer_grad = custom_layer.backward(layer_input, next_layer_grad)
            torch_layer_output_var.backward(torch.from_numpy(next_layer_grad))
            torch_layer_grad_var = layer_input_var.grad
            self.assertNumpyClose(custom_layer_grad, torch_layer_grad_var.data.numpy(), atol=1e-7, msg="SoftMax gradInput")

    def test_LogSoftMax(self):
        np.random.seed(42)
        torch.manual_seed(42)
        batch_size, n_in = 2, 4
        for _ in range(100):
            torch_layer = torch.nn.LogSoftmax(dim=1)
            custom_layer = LogSoftMax()
            layer_input = np.random.uniform(-10, 10, (batch_size, n_in)).astype(np.float32)
            next_layer_grad = np.random.uniform(-1, 1, (batch_size, n_in)).astype(np.float32)

            custom_layer_output = custom_layer.forward(layer_input)
            layer_input_var = Variable(torch.from_numpy(layer_input), requires_grad=True)
            torch_layer_output_var = torch_layer(layer_input_var)
            self.assertNumpyClose(custom_layer_output, torch_layer_output_var.data.numpy(), msg="LogSoftMax forward")

            custom_layer_grad = custom_layer.backward(layer_input, next_layer_grad)
            torch_layer_output_var.backward(torch.from_numpy(next_layer_grad))
            torch_layer_grad_var = layer_input_var.grad
            self.assertNumpyClose(custom_layer_grad, torch_layer_grad_var.data.numpy(), msg="LogSoftMax gradInput")

    def test_BatchNormalization(self):
        np.random.seed(42)
        torch.manual_seed(42)
        batch_size, n_in = 32, 16
        for _ in range(100):
            momentum = 0.1
            eps = BatchNormalization.EPS

            # --- Train ---
            custom_layer = BatchNormalization(n_in, momentum=momentum)
            custom_layer.train()
            torch_layer = torch.nn.BatchNorm1d(n_in, eps=eps, momentum=momentum, affine=False)
            torch_layer.train()
            custom_layer.moving_mean = torch_layer.running_mean.numpy().copy()
            custom_layer.moving_variance = torch_layer.running_var.numpy().copy()
            layer_input = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)
            next_layer_grad = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)

            custom_layer_output = custom_layer.forward(layer_input)
            layer_input_var = Variable(torch.from_numpy(layer_input), requires_grad=True)
            torch_layer_output_var = torch_layer(layer_input_var)
            self.assertNumpyClose(custom_layer_output, torch_layer_output_var.data.numpy(), atol=1e-6, msg="BN forward (train)")

            custom_layer_grad = custom_layer.backward(layer_input, next_layer_grad)
            torch_layer_output_var.backward(torch.from_numpy(next_layer_grad))
            torch_layer_grad_var = layer_input_var.grad
            self.assertNumpyClose(custom_layer_grad, torch_layer_grad_var.data.numpy(), atol=1e-5, msg="BN gradInput (train)")

            self.assertNumpyClose(custom_layer.moving_mean, torch_layer.running_mean.numpy(), atol=1e-6, msg="BN moving_mean")

            # --- Eval ---
            custom_layer.evaluate()
            torch_layer.eval()
            eval_input = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)
            eval_next_layer_grad = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)

            custom_layer_output_eval = custom_layer.forward(eval_input)
            eval_input_var = Variable(torch.from_numpy(eval_input), requires_grad=True)
            torch_layer_output_eval_var = torch_layer(eval_input_var)
            self.assertNumpyClose(custom_layer_output_eval, torch_layer_output_eval_var.data.numpy(), atol=1e-6, msg="BN forward (eval)")

            custom_layer_grad_eval = custom_layer.backward(eval_input, eval_next_layer_grad)
            if eval_input_var.grad is not None: eval_input_var.grad.zero_()
            torch_layer_output_eval_var.backward(torch.from_numpy(eval_next_layer_grad))
            torch_layer_grad_eval_var = eval_input_var.grad
            self.assertNumpyClose(custom_layer_grad_eval, torch_layer_grad_eval_var.data.numpy(), atol=1e-6, msg="BN gradInput (eval)")

    def test_Sequential_BatchNormAffine(self):
        np.random.seed(42)
        torch.manual_seed(42)
        batch_size, n_in = 32, 16
        for _ in range(100):
            momentum = 0.1
            eps = BatchNormalization.EPS

            # --- Train ---
            torch_layer = torch.nn.BatchNorm1d(n_in, eps=eps, momentum=momentum, affine=True)
            torch_layer.train()
            torch_layer.weight.data = torch.from_numpy(np.random.rand(n_in).astype(np.float32))
            torch_layer.bias.data = torch.from_numpy(np.random.rand(n_in).astype(np.float32))

            custom_layer = Sequential()
            bn_layer = BatchNormalization(n_in, momentum=momentum)
            scaling_layer = ChannelwiseScaling(n_in)
            bn_layer.moving_mean = torch_layer.running_mean.numpy().copy()
            bn_layer.moving_variance = torch_layer.running_var.numpy().copy()
            scaling_layer.gamma = torch_layer.weight.data.numpy().copy()
            scaling_layer.beta = torch_layer.bias.data.numpy().copy()
            custom_layer.add(bn_layer)
            custom_layer.add(scaling_layer)
            custom_layer.train()

            layer_input = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)
            next_layer_grad = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)

            custom_layer_output = custom_layer.forward(layer_input)
            layer_input_var = Variable(torch.from_numpy(layer_input), requires_grad=True)
            torch_layer_output_var = torch_layer(layer_input_var)
            self.assertNumpyClose(custom_layer_output, torch_layer_output_var.data.numpy(), atol=1e-6, msg="Sequential BN forward (train)")

            custom_layer.zeroGradParameters()
            custom_layer_grad = custom_layer.backward(layer_input, next_layer_grad)
            torch_layer_output_var.backward(torch.from_numpy(next_layer_grad))
            torch_layer_grad_var = layer_input_var.grad
            self.assertNumpyClose(custom_layer_grad, torch_layer_grad_var.data.numpy(), atol=1e-5, rtol=1e-4, msg="Sequential BN gradInput (train)")

            grad_params = custom_layer.getGradParameters()
            self.assertEqual(len(grad_params), 2)
            weight_grad = grad_params[0]; bias_grad = grad_params[1]
            torch_weight_grad = torch_layer.weight.grad.data.numpy()
            torch_bias_grad = torch_layer.bias.grad.data.numpy()
            self.assertNumpyClose(weight_grad, torch_weight_grad, atol=1e-5, rtol=1e-4, msg="Sequential BN gradGamma (train)")
            self.assertNumpyClose(bias_grad, torch_bias_grad, atol=1e-5, rtol=1e-4, msg="Sequential BN gradBeta (train)")

            # --- Eval ---
            custom_layer.evaluate()
            torch_layer.eval()
            eval_input = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)
            eval_next_grad = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)

            custom_layer_output_eval = custom_layer.forward(eval_input)
            eval_input_var = Variable(torch.from_numpy(eval_input), requires_grad=True)
            torch_layer_output_eval_var = torch_layer(eval_input_var)
            self.assertNumpyClose(custom_layer_output_eval, torch_layer_output_eval_var.data.numpy(), atol=1e-6, msg="Sequential BN forward (eval)")

            custom_layer_grad_eval = custom_layer.backward(eval_input, eval_next_grad)
            if eval_input_var.grad is not None: eval_input_var.grad.zero_()
            if torch_layer.weight.grad is not None: torch_layer.weight.grad.zero_()
            if torch_layer.bias.grad is not None: torch_layer.bias.grad.zero_()
            torch_layer_output_eval_var.backward(torch.from_numpy(eval_next_grad))
            torch_layer_grad_eval_var = eval_input_var.grad
            self.assertNumpyClose(custom_layer_grad_eval, torch_layer_grad_eval_var.data.numpy(), atol=1e-6, msg="Sequential BN gradInput (eval)")

    def test_Dropout(self):
        np.random.seed(42)
        batch_size, n_in = 100, 50
        for p in [0.0, 0.3, 0.5, 0.8]:
          with self.subTest(p=p):
            layer = Dropout(p)
            layer_input = np.random.uniform(1, 5, (batch_size, n_in)).astype(np.float32)
            next_layer_grad = np.random.uniform(1, 5, (batch_size, n_in)).astype(np.float32)

            # --- Train ---
            layer.train()
            layer_output = layer.forward(layer_input)
            layer_grad = layer.backward(layer_input, next_layer_grad)
            scale = 1.0 / (1.0 - p) if p < 1.0 else 0.0
            is_zero_mask_out = np.isclose(layer_output, 0)
            is_scaled_mask_out = np.isclose(layer_output, layer_input * scale)
            self.assertTrue(np.all(np.logical_or(is_zero_mask_out, is_scaled_mask_out)), msg=f"Dropout forward (train, p={p}) values")
            zero_fraction = np.mean(is_zero_mask_out)
            self.assertAlmostEqual(zero_fraction, p, delta=0.05, msg=f"Dropout forward (train, p={p}) zero fraction")
            is_zero_mask_grad = np.isclose(layer_grad, 0)
            is_scaled_mask_grad = np.isclose(layer_grad, next_layer_grad * scale)
            self.assertTrue(np.all(np.logical_or(is_zero_mask_grad, is_scaled_mask_grad)), msg=f"Dropout backward (train, p={p}) grad values")
            self.assertTrue(np.all(is_zero_mask_out == is_zero_mask_grad), msg=f"Dropout masks match (train, p={p})")

            # --- Eval ---
            layer.evaluate()
            eval_input = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)
            eval_next_grad = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)
            eval_output = layer.forward(eval_input)
            self.assertNumpyClose(eval_output, eval_input, msg=f"Dropout forward (eval, p={p})")
            eval_grad = layer.backward(eval_input, eval_next_grad)
            self.assertNumpyClose(eval_grad, eval_next_grad, msg=f"Dropout backward (eval, p={p})")

    def test_Conv2d(self):
        hyperparams = [
            {'batch_size': 2, 'in_channels': 3, 'out_channels': 4, 'height': 5, 'width': 6,'kernel_size': 3, 'stride': 1, 'padding': 1, 'bias': True},
            {'batch_size': 4, 'in_channels': 1, 'out_channels': 2, 'height': 7, 'width': 7,'kernel_size': (3,3), 'stride': 2, 'padding': (1,1), 'bias': False},
            {'batch_size': 2, 'in_channels': 2, 'out_channels': 3, 'height': 8, 'width': 6,'kernel_size': (2,3), 'stride': (1,2), 'padding': (0,1), 'bias': True},
            {'batch_size': 1, 'in_channels': 1, 'out_channels': 1, 'height': 3, 'width': 3,'kernel_size': 2, 'stride': 1, 'padding': 0, 'bias': False},
        ]
        np.random.seed(42)
        torch.manual_seed(42)
        for params in hyperparams:
              with self.subTest(params=params):
                  ks = params['kernel_size']; st = params['stride']; pd = params['padding']; b = params['bias']
                  custom_layer = Conv2d(params['in_channels'], params['out_channels'], ks, stride=st, padding=pd, bias=b, padding_mode='zeros')
                  torch_layer = torch.nn.Conv2d(params['in_channels'], params['out_channels'], ks, stride=st, padding=pd, bias=b, padding_mode='zeros')
                  custom_layer.W = torch_layer.weight.detach().numpy().copy()
                  if b: custom_layer.b = torch_layer.bias.detach().numpy().copy()
                  custom_layer.train(); torch_layer.train()
                  layer_input = np.random.randn(params['batch_size'], params['in_channels'], params['height'], params['width']).astype(np.float32)
                  input_var = torch.tensor(layer_input, requires_grad=True)

                  custom_output = custom_layer.forward(layer_input)
                  torch_output = torch_layer(input_var)
                  self.assertNumpyClose(custom_output, torch_output.detach().numpy(), atol=1e-6, msg=f"Conv2d forward {params}")

                  next_layer_grad = np.random.randn(*custom_output.shape).astype(np.float32)
                  custom_layer.zeroGradParameters()
                  custom_grad_input = custom_layer.backward(layer_input, next_layer_grad)
                  torch_output.backward(torch.tensor(next_layer_grad))
                  torch_grad_input = input_var.grad.detach().numpy()
                  self.assertNumpyClose(custom_grad_input, torch_grad_input, atol=1e-5, msg=f"Conv2d gradInput {params}")

                  custom_gradW = custom_layer.gradW
                  torch_gradW = torch_layer.weight.grad.detach().numpy()
                  self.assertNumpyClose(custom_gradW, torch_gradW, atol=1e-5, msg=f"Conv2d gradW {params}")
                  if b:
                      custom_gradb = custom_layer.gradb
                      torch_gradb = torch_layer.bias.grad.detach().numpy()
                      self.assertNumpyClose(custom_gradb, torch_gradb, atol=1e-5, msg=f"Conv2d gradb {params}")

    def test_LeakyReLU(self):
        np.random.seed(42)
        torch.manual_seed(42)
        batch_size, n_in = 2, 4
        for _ in range(100):
            slope = np.random.uniform(0.01, 0.2)
            torch_layer = torch.nn.LeakyReLU(slope)
            custom_layer = LeakyReLU(slope)
            layer_input = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)
            next_layer_grad = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)

            custom_layer_output = custom_layer.forward(layer_input)
            layer_input_var = Variable(torch.from_numpy(layer_input), requires_grad=True)
            torch_layer_output_var = torch_layer(layer_input_var)
            self.assertNumpyClose(custom_layer_output, torch_layer_output_var.data.numpy(), msg="LeakyReLU forward")

            custom_layer_grad = custom_layer.backward(layer_input, next_layer_grad)
            torch_layer_output_var.backward(torch.from_numpy(next_layer_grad))
            torch_layer_grad_var = layer_input_var.grad
            self.assertNumpyClose(custom_layer_grad, torch_layer_grad_var.data.numpy(), msg="LeakyReLU gradInput")

    def test_ELU(self):
        np.random.seed(42)
        torch.manual_seed(42)
        batch_size, n_in = 2, 4
        for _ in range(100):
            alpha = np.random.uniform(0.5, 1.5)
            torch_layer = torch.nn.ELU(alpha)
            custom_layer = ELU(alpha)
            layer_input = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)
            next_layer_grad = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)

            custom_layer_output = custom_layer.forward(layer_input)
            layer_input_var = Variable(torch.from_numpy(layer_input), requires_grad=True)
            torch_layer_output_var = torch_layer(layer_input_var)
            self.assertNumpyClose(custom_layer_output, torch_layer_output_var.data.numpy(), msg="ELU forward")

            custom_layer_grad = custom_layer.backward(layer_input, next_layer_grad)
            torch_layer_output_var.backward(torch.from_numpy(next_layer_grad))
            torch_layer_grad_var = layer_input_var.grad
            self.assertNumpyClose(custom_layer_grad, torch_layer_grad_var.data.numpy(), msg="ELU gradInput")

    def test_SoftPlus(self):
        np.random.seed(42)
        torch.manual_seed(42)
        batch_size, n_in = 2, 4
        for _ in range(100):
            torch_layer = torch.nn.Softplus()
            custom_layer = SoftPlus()
            layer_input = np.random.uniform(-10, 10, (batch_size, n_in)).astype(np.float32)
            next_layer_grad = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)

            custom_layer_output = custom_layer.forward(layer_input)
            layer_input_var = Variable(torch.from_numpy(layer_input), requires_grad=True)
            torch_layer_output_var = torch_layer(layer_input_var)
            self.assertNumpyClose(custom_layer_output, torch_layer_output_var.data.numpy(), rtol=1e-4, msg="SoftPlus forward")

            custom_layer_grad = custom_layer.backward(layer_input, next_layer_grad)
            torch_layer_output_var.backward(torch.from_numpy(next_layer_grad))
            torch_layer_grad_var = layer_input_var.grad
            self.assertNumpyClose(custom_layer_grad, torch_layer_grad_var.data.numpy(), rtol=1e-4, msg="SoftPlus gradInput")

    def test_ClassNLLCriterionUnstable(self):
        np.random.seed(42)
        torch.manual_seed(42)
        batch_size, n_in = 5, 10
        for _ in range(100):
            custom_criterion = ClassNLLCriterionUnstable()
            torch_criterion = torch.nn.NLLLoss()
            logits = np.random.uniform(-2, 2, (batch_size, n_in)).astype(np.float32)
            probs = SoftMax().forward(logits)
            probs = np.clip(probs, custom_criterion.EPS, 1. - custom_criterion.EPS)
            probs /= probs.sum(axis=-1, keepdims=True)
            target_labels = np.random.randint(0, n_in, batch_size)
            target_one_hot = np.zeros((batch_size, n_in), np.float32)
            target_one_hot[np.arange(batch_size), target_labels] = 1.0

            custom_loss = custom_criterion.forward(probs, target_one_hot)
            log_probs_torch = torch.log(torch.from_numpy(probs))
            target_labels_torch = torch.from_numpy(target_labels).long()
            torch_loss = torch_criterion(log_probs_torch, target_labels_torch)
            self.assertAlmostEqual(custom_loss, torch_loss.item(), delta=1e-6, msg="NLLUnstable forward")

            custom_grad_input = custom_criterion.backward(probs, target_one_hot)
            log_probs_torch_var = Variable(log_probs_torch.data, requires_grad=True)
            log_probs_torch_var.retain_grad()
            torch_loss_for_grad = torch_criterion(log_probs_torch_var, target_labels_torch)
            torch_loss_for_grad.backward()
            torch_grad_log_probs = log_probs_torch_var.grad.data.numpy()
            expected_torch_grad = - target_one_hot / (probs * batch_size)
            self.assertNumpyClose(custom_grad_input, expected_torch_grad, atol=1e-6, msg="NLLUnstable gradInput")

    def test_ClassNLLCriterion(self):
        np.random.seed(42)
        torch.manual_seed(42)
        batch_size, n_in = 5, 10
        for _ in range(100):
            custom_criterion = ClassNLLCriterion()
            torch_criterion = torch.nn.NLLLoss()
            logits = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)
            log_probs = LogSoftMax().forward(logits)
            target_labels = np.random.randint(0, n_in, batch_size)
            target_one_hot = np.zeros((batch_size, n_in), np.float32)
            target_one_hot[np.arange(batch_size), target_labels] = 1.0

            custom_loss = custom_criterion.forward(log_probs, target_one_hot)
            log_probs_torch = torch.from_numpy(log_probs)
            target_labels_torch = torch.from_numpy(target_labels).long()
            torch_loss = torch_criterion(log_probs_torch, target_labels_torch)
            self.assertAlmostEqual(custom_loss, torch_loss.item(), delta=1e-6, msg="NLLStable forward")

            custom_grad_input = custom_criterion.backward(log_probs, target_one_hot)
            log_probs_torch_var = Variable(log_probs_torch.data, requires_grad=True)
            torch_loss_for_grad = torch_criterion(log_probs_torch_var, target_labels_torch)
            torch_loss_for_grad.backward()
            torch_grad_input = log_probs_torch_var.grad.data.numpy()
            self.assertNumpyClose(custom_grad_input, torch_grad_input, atol=1e-7, msg="NLLStable gradInput")

    def test_MaxPool2d(self):
        hyperparams = [
            {'batch_size': 2, 'channels': 3, 'height': 5, 'width': 6, 'kernel_size': 2, 'stride': 2, 'padding': 0},
            {'batch_size': 4, 'channels': 1, 'height': 7, 'width': 7, 'kernel_size': 3, 'stride': 1, 'padding': 1},
            {'batch_size': 2, 'channels': 2, 'height': 8, 'width': 6, 'kernel_size': (2,3), 'stride': (2,1), 'padding': (1,0)},
            {'batch_size': 1, 'channels': 1, 'height': 4, 'width': 4, 'kernel_size': 4, 'stride': 4, 'padding': 0},
        ]
        np.random.seed(42)
        torch.manual_seed(42)
        for params in hyperparams:
          with self.subTest(params=params):
              ks = params['kernel_size']; st = params['stride']; pd = params['padding']
              custom_module = MaxPool2d(ks, stride=st, padding=pd)
              torch_module = torch.nn.MaxPool2d(ks, stride=st, padding=pd)
              input_np = np.random.randn(params['batch_size'], params['channels'], params['height'], params['width']).astype(np.float32)
              input_var = torch.tensor(input_np, requires_grad=True)

              custom_output = custom_module.forward(input_np)
              torch_output = torch_module(input_var)
              self.assertNumpyClose(custom_output, torch_output.detach().numpy(), atol=1e-7, msg=f"MaxPool2d forward {params}")

              next_grad = np.random.randn(*custom_output.shape).astype(np.float32)
              custom_grad = custom_module.backward(input_np, next_grad)
              torch_output.backward(torch.tensor(next_grad))
              torch_grad = input_var.grad.detach().numpy()
              self.assertNumpyClose(custom_grad, torch_grad, atol=1e-6, rtol=1e-5, msg=f"MaxPool2d gradInput {params}")

    def test_AvgPool2d(self):
        hyperparams = [
            {'batch_size': 2, 'channels': 3, 'height': 5, 'width': 6, 'kernel_size': 2, 'stride': 2, 'padding': 0},
            {'batch_size': 4, 'channels': 1, 'height': 7, 'width': 7, 'kernel_size': 3, 'stride': 1, 'padding': 1},
            {'batch_size': 2, 'channels': 2, 'height': 8, 'width': 6, 'kernel_size': (2,3), 'stride': (2,1), 'padding': (1,0)},
            {'batch_size': 1, 'channels': 1, 'height': 4, 'width': 4, 'kernel_size': 4, 'stride': 4, 'padding': 0},
        ]
        np.random.seed(42)
        torch.manual_seed(42)
        for params in hyperparams:
          with self.subTest(params=params):
              ks = params['kernel_size']; st = params['stride']; pd = params['padding']
              custom_module = AvgPool2d(ks, stride=st, padding=pd)
              torch_module = torch.nn.AvgPool2d(ks, stride=st, padding=pd)
              input_np = np.random.randn(params['batch_size'], params['channels'], params['height'], params['width']).astype(np.float32)
              input_var = torch.tensor(input_np, requires_grad=True)

              custom_output = custom_module.forward(input_np)
              torch_output = torch_module(input_var)
              self.assertNumpyClose(custom_output, torch_output.detach().numpy(), atol=1e-7, msg=f"AvgPool2d forward {params}")

              next_grad = np.random.randn(*custom_output.shape).astype(np.float32)
              custom_grad = custom_module.backward(input_np, next_grad)
              torch_output.backward(torch.tensor(next_grad))
              torch_grad = input_var.grad.detach().numpy()
              self.assertNumpyClose(custom_grad, torch_grad, atol=1e-6, rtol=1e-5, msg=f"AvgPool2d gradInput {params}")

    def test_GlobalMaxPool2d(self):
        np.random.seed(42)
        torch.manual_seed(42)
        batch_size, channels, height, width = 4, 3, 8, 6
        for _ in range(10):
          custom_module = GlobalMaxPool2d()
          torch_module = torch.nn.AdaptiveMaxPool2d((1, 1))
          input_np = np.random.randn(batch_size, channels, height, width).astype(np.float32)
          input_var = torch.tensor(input_np, requires_grad=True)

          custom_output = custom_module.forward(input_np)
          torch_output = torch_module(input_var)
          torch_output_squeezed = torch_output.squeeze(dim=(2, 3))
          self.assertNumpyClose(custom_output, torch_output_squeezed.detach().numpy(), atol=1e-7, msg="GlobalMaxPool2d forward")

          next_grad_custom = np.random.randn(*custom_output.shape).astype(np.float32)
          next_grad_torch = torch.tensor(next_grad_custom.reshape(batch_size, channels, 1, 1))
          custom_grad = custom_module.backward(input_np, next_grad_custom)
          torch_output.backward(next_grad_torch)
          torch_grad = input_var.grad.detach().numpy()
          self.assertNumpyClose(custom_grad, torch_grad, atol=1e-6, rtol=1e-5, msg="GlobalMaxPool2d gradInput")

    def test_GlobalAvgPool2d(self):
        np.random.seed(42)
        torch.manual_seed(42)
        batch_size, channels, height, width = 4, 3, 8, 6
        for _ in range(10):
          custom_module = GlobalAvgPool2d()
          torch_module = torch.nn.AdaptiveAvgPool2d((1, 1))
          input_np = np.random.randn(batch_size, channels, height, width).astype(np.float32)
          input_var = torch.tensor(input_np, requires_grad=True)

          custom_output = custom_module.forward(input_np)
          torch_output = torch_module(input_var)
          torch_output_squeezed = torch_output.squeeze(dim=(2, 3))
          self.assertNumpyClose(custom_output, torch_output_squeezed.detach().numpy(), atol=1e-7, msg="GlobalAvgPool2d forward")

          next_grad_custom = np.random.randn(*custom_output.shape).astype(np.float32)
          next_grad_torch = torch.tensor(next_grad_custom.reshape(batch_size, channels, 1, 1))
          custom_grad = custom_module.backward(input_np, next_grad_custom)
          torch_output.backward(next_grad_torch)
          torch_grad = input_var.grad.detach().numpy()
          self.assertNumpyClose(custom_grad, torch_grad, atol=1e-6, rtol=1e-5, msg="GlobalAvgPool2d gradInput")

    def test_Flatten(self):
        np.random.seed(42)
        torch.manual_seed(42)
        custom_module = Flatten()
        torch_module = torch.nn.Flatten()
        input_shapes = [(2, 3, 4, 5), (10, 1), (5, 2, 1, 3)]
        for shape in input_shapes:
          with self.subTest(shape=shape):
              input_np = np.random.randn(*shape).astype(np.float32)
              input_var = torch.tensor(input_np, requires_grad=True)

              custom_output = custom_module.forward(input_np)
              torch_output = torch_module(input_var)
              self.assertNumpyClose(custom_output, torch_output.detach().numpy(), msg=f"Flatten forward {shape}")

              next_grad = np.random.randn(*custom_output.shape).astype(np.float32)
              custom_grad = custom_module.backward(input_np, next_grad)
              torch_output.backward(torch.tensor(next_grad))
              torch_grad = input_var.grad.detach().numpy()
              self.assertNumpyClose(custom_grad, torch_grad, msg=f"Flatten gradInput {shape}")

    def test_Gelu(self):
        np.random.seed(42)
        torch.manual_seed(42)
        batch_size, n_in = 10, 5
        for _ in range(100):
          custom_module = Gelu()
          torch_module = torch.nn.GELU(approximate='none')
          input_np = np.random.randn(batch_size, n_in).astype(np.float32)
          input_var = torch.tensor(input_np, requires_grad=True)

          custom_output = custom_module.forward(input_np)
          torch_output = torch_module(input_var)
          self.assertNumpyClose(custom_output, torch_output.detach().numpy(), atol=1e-6, msg="Gelu forward") # Увеличена atol

          next_grad = np.random.randn(*custom_output.shape).astype(np.float32)
          custom_grad = custom_module.backward(input_np, next_grad)
          torch_output.backward(torch.tensor(next_grad))
          torch_grad = input_var.grad.detach().numpy()
          self.assertNumpyClose(custom_grad, torch_grad, atol=1e-6, rtol=1e-5, msg="Gelu gradInput")


# Run the tests
if __name__ == '__main__':
    # Create the verbose runner, collect every test_* method of TestLayers,
    # and keep the result object so the outcome can be inspected afterwards.
    runner = unittest.TextTestRunner(verbosity=2)
    suite = unittest.TestLoader().loadTestsFromTestCase(TestLayers)
    result = runner.run(suite)

    # Optional summary of the run (left disabled):
    # print("\n--- Test Summary ---")
    # print(f"Ran: {result.testsRun} tests")
    # print(f"Result: {'OK' if result.wasSuccessful() else 'FAILED'}")

test_AvgPool2d (__main__.TestLayers.test_AvgPool2d) ... ok
test_BatchNormalization (__main__.TestLayers.test_BatchNormalization) ... ok
test_ClassNLLCriterion (__main__.TestLayers.test_ClassNLLCriterion) ... ok
test_ClassNLLCriterionUnstable (__main__.TestLayers.test_ClassNLLCriterionUnstable) ... ok
test_Conv2d (__main__.TestLayers.test_Conv2d) ... ok
test_Dropout (__main__.TestLayers.test_Dropout) ... ok
test_ELU (__main__.TestLayers.test_ELU) ... ok
test_Flatten (__main__.TestLayers.test_Flatten) ... ok
test_Gelu (__main__.TestLayers.test_Gelu) ... ok
test_GlobalAvgPool2d (__main__.TestLayers.test_GlobalAvgPool2d) ... ok
test_GlobalMaxPool2d (__main__.TestLayers.test_GlobalMaxPool2d) ... ok
test_LeakyReLU (__main__.TestLayers.test_LeakyReLU) ... ok
test_Linear (__main__.TestLayers.test_Linear) ... ok
test_LogSoftMax (__main__.TestLayers.test_LogSoftMax) ... ok
test_MaxPool2d (__main__.TestLayers.test_MaxPool2d) ... ok
test_Sequential_BatchNormAffine (__main__.TestLayers.test_Sequent