In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

In [None]:
# DCGAN Implementation
# Discriminator
class Discriminator(nn.Module):
    """DCGAN discriminator: maps an image batch to one real/fake probability per image.

    Args:
        img_channels: number of channels of the input images.
        features_d: channel width of the first convolution; the following
            blocks use 2x, 4x and 8x this width.

    For an input of shape N x img_channels x 64 x 64 the spatial size goes
    64 -> 32 -> 16 -> 8 -> 4 -> 1, so the output has shape N x 1 x 1 x 1
    with values in [0, 1] (Sigmoid).
    """

    def __init__(self, img_channels, features_d):
        super().__init__()

        # Layers are built in input-to-output order so that parameter creation
        # (and therefore random initialisation) happens in that same order.
        # Input layer: strided conv + LeakyReLU, no BatchNorm. 64x64 -> 32x32
        stem = nn.Conv2d(img_channels, features_d, kernel_size=4, stride=2, padding=1)

        # Three downsampling blocks: 32 -> 16 -> 8 -> 4, doubling the channels each time
        widths = [features_d, features_d * 2, features_d * 4, features_d * 8]
        down_blocks = [
            self._block(c_in, c_out, 4, 2, 1)
            for c_in, c_out in zip(widths, widths[1:])
        ]

        # Output layer: a 4x4 kernel without padding turns the 4x4 map into 1x1
        head = nn.Conv2d(features_d * 8, 1, kernel_size=4, stride=2, padding=0)

        self.disc = nn.Sequential(
            stem,
            nn.LeakyReLU(0.2),
            *down_blocks,
            head,
            nn.Sigmoid(),
        )

    def _block(self, in_channels, out_channels, kernel_size, stride, padding):
        """Return a Conv2d -> BatchNorm2d -> LeakyReLU(0.2) block."""
        conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)
        norm = nn.BatchNorm2d(out_channels)
        return nn.Sequential(conv, norm, nn.LeakyReLU(0.2))

    def forward(self, x):
        """Run the image batch ``x`` through the discriminator stack."""
        return self.disc(x)


#### Explaining the Discriminator

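input: N x img_channels x 64 x 64
why 64x64? because this DCGAN architecture is built for 64x64 images, so the MNIST digits used in this notebook are resized to 64x64. MNIST is grayscale, hence img_channels = 1 (an RGB dataset such as celebA would use img_channels = 3). N is the batch size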

output: N x 1 x 1 x 1
why 1x1x1? because each Conv2d with kernel_size=4, stride=2, padding=1 halves the image size in each dimension, and the final Conv2d (kernel_size=4, stride=2, padding=0) reduces the remaining 4x4 feature map to a single value
hence 64 -> 32 -> 16 -> 8 -> 4 -> 1
hence the output is N x 1 x 1 x 1
After the Sigmoid, this value is used to classify whether the input image is real or fake

As per the DCGAN paper, we use LeakyReLU with slope 0.2 and we use BatchNorm2d after each Conv2d layer except the first one (the input layer) and the last one (the output layer)

In [None]:
# Generator

class Generator(nn.Module):
    """DCGAN generator: maps a noise tensor to an image batch.

    Args:
        z_dim: number of channels of the input noise tensor.
        img_channels: number of channels of the generated images.
        features_g: base channel width; the blocks use 16x, 8x, 4x and 2x
            this width.

    For an input of shape N x z_dim x 1 x 1 the spatial size goes
    1 -> 4 -> 8 -> 16 -> 32 -> 64, so the output has shape
    N x img_channels x 64 x 64 with values in [-1, 1] (Tanh).
    """

    def __init__(self, z_dim, img_channels, features_g):
        super().__init__()

        # Layers are built in input-to-output order so that parameter creation
        # (and therefore random initialisation) happens in that same order.
        # First block expands the 1x1 noise "image" to 4x4
        project = self._block(z_dim, features_g * 16, 4, 1, 0)

        # Three upsampling blocks: 4 -> 8 -> 16 -> 32, halving the channels each time
        widths = [features_g * 16, features_g * 8, features_g * 4, features_g * 2]
        up_blocks = [
            self._block(c_in, c_out, 4, 2, 1)
            for c_in, c_out in zip(widths, widths[1:])
        ]

        # Output layer: 32 -> 64, no BatchNorm/ReLU, followed by Tanh
        to_image = nn.ConvTranspose2d(
            features_g * 2, img_channels, kernel_size=4, stride=2, padding=1
        )

        self.gen = nn.Sequential(project, *up_blocks, to_image, nn.Tanh())

    def _block(self, in_channels, out_channels, kernel_size, stride, padding):
        """Return a ConvTranspose2d -> BatchNorm2d -> ReLU block."""
        deconv = nn.ConvTranspose2d(in_channels, out_channels, kernel_size, stride, padding)
        norm = nn.BatchNorm2d(out_channels)
        return nn.Sequential(deconv, norm, nn.ReLU())

    def forward(self, x):
        """Run the noise batch ``x`` through the generator stack."""
        return self.gen(x)


#### Explaining the Generator

input: N x z_dim x 1 x 1 (the noise vector is treated as a 1x1 image with z_dim channels)
output: N x img_channels x 64 x 64

We start with a ConvTranspose2d block (kernel_size=4, stride=1, padding=0) to convert the input noise tensor of size z_dim x 1 x 1 to a tensor of size features_g*16 x 4 x 4 (4x4 image with features_g*16 channels)
Then we use ConvTranspose2d blocks (kernel_size=4, stride=2, padding=1) that each double the image size in each dimension: 4 -> 8 -> 16 -> 32
We use BatchNorm2d after each ConvTranspose2d layer except the last one
We use ReLU activation function after each BatchNorm2d layer
The final layer uses ConvTranspose2d with kernel_size=4, stride=2, padding=1 to generate an image of size 64x64 (32 -> 64); it is not followed by BatchNorm2d or ReLU
In this case we use Tanh activation function so that the pixel values are in the range of -1 to 1

z_dim is the input noise vector size

What is the difference between Conv2d and ConvTranspose2d?
Conv2d with stride > 1 is used to reduce the image size (downsampling), as in the Discriminator, and ConvTranspose2d with stride > 1 is used to increase the image size (upsampling), as in the Generator

What is the difference between BatchNorm1d and BatchNorm2d?
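Both normalize each channel over the batch. BatchNorm1d expects inputs of shape (N, C) or (N, C, L), i.e. feature vectors or 1D data like time series data, and BatchNorm2d expects inputs of shape (N, C, H, W), i.e. 2D data like images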

What is the difference between ReLU and LeakyReLU?
ReLU is Rectified Linear Unit which is max(0,x) and LeakyReLU is max(ax,x) with a small slope a for negative inputs, e.g. max(0.01x,x) (the PyTorch default) or max(0.2x,x) (used here), where x is the input to the activation function (output of the previous layer)

In [None]:
def init_weights(model):
    """Apply the DCGAN weight initialisation to ``model`` in place.

    - Conv2d / ConvTranspose2d weights are drawn from N(0, 0.02).
    - BatchNorm2d scale weights are drawn from N(1, 0.02) and the shift
      (bias) is set to 0, so every normalisation layer starts close to the
      identity. Drawing the scale around 0 instead would multiply the
      normalised activations by ~0 at the start of training.

    Args:
        model: any ``nn.Module``; all of its sub-modules are visited.
    """
    for m in model.modules():
        if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
            nn.init.normal_(m.weight, mean=0.0, std=0.02)
        elif isinstance(m, nn.BatchNorm2d) and m.weight is not None:
            # weight/bias are None when the layer was built with affine=False
            nn.init.normal_(m.weight, mean=1.0, std=0.02)
            nn.init.constant_(m.bias, 0.0)

In [None]:
# Hyperparameters
# Fixed seed so that weight initialisation, data shuffling and noise sampling
# start from the same random state on every Restart & Run All.
seed = 42
torch.manual_seed(seed)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
lr = 2e-4          # learning rate shared by both Adam optimizers
img_channels = 1   # MNIST images are grayscale
img_size = 64      # images are resized to img_size x img_size before training
z_dim = 100        # number of channels of the generator's noise input
features_d = 64    # base channel width of the Discriminator
features_g = 64    # base channel width of the Generator
epochs = 5         # number of passes over the training set

In [None]:
# Dataset download and preprocessing

import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision import datasets

# This notebook trains on MNIST (28x28 grayscale digits).
# Alternative kept for reference: to train on an RGB dataset such as CIFAR10
# (32x32, 3 channels), load datasets.CIFAR10 instead, set img_channels = 3 and
# normalize with (0.5, 0.5, 0.5) for both mean and std.

# Preprocessing pipeline:
#   1. Resize to img_size x img_size, the input size the networks are built for
#   2. Convert the image to a tensor
#   3. Normalize with mean 0.5 and std 0.5 so the pixel values are in the range
#      of -1 to 1, matching the Tanh output range of the Generator
digits_data = datasets.MNIST(root='data/', train=True, download=True, transform=transforms.Compose([
    transforms.Resize(img_size),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
]))

# Batches of 128 images, reshuffled every epoch
dataloader = DataLoader(digits_data, batch_size=128, shuffle=True)

In [None]:
# Initialize the Discriminator and Generator and move them to the selected device
disc = Discriminator(img_channels, features_d).to(device)
gen = Generator(z_dim, img_channels, features_g).to(device)

# Weight initialization: replace PyTorch's default initial weights with the
# scheme implemented in init_weights (applied to each network in place)
init_weights(disc)
init_weights(gen)

# Initialize the optimizers: one Adam optimizer per network, so that each
# update step only changes that network's parameters.
# betas=(0.5, 0.999) lowers Adam's first-moment decay from its default of 0.9.
opt_disc = optim.Adam(disc.parameters(), lr=lr, betas=(0.5, 0.999))
opt_gen = optim.Adam(gen.parameters(), lr=lr, betas=(0.5, 0.999))

# Loss: binary cross-entropy on probabilities (the Discriminator ends in a
# Sigmoid, so its outputs are already in [0, 1])
loss_fn = nn.BCELoss()

1. What is the role of weight initialization?<br>
    Weight initialization is important because it helps in training the model faster and more effectively by preventing the gradients from vanishing or exploding during backpropagation, this means that the model can learn the patterns in the data more effectively
<br>

2. What is this rule of mean 0 and standard deviation 1?<br>
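    Initializing weights from a distribution with a mean of 0 and a fixed standard deviation (for example 1) is a simple rule of thumb in neural network weight initialization, particularly for certain types of layers like fully connected (dense) layers. It is not the same as "Xavier initialization" (also called "Glorot initialization", after the researcher who introduced it): Xavier initialization also uses a mean of 0, but it chooses the variance from the size of the layer, Var(W) = 2 / (fan_in + fan_out), so the standard deviation is usually much smaller than 1.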

    Here's an explanation of why this rule is used and its significance:

    1. **Stabilizing Gradients**: Initializing weights with a mean of 0 helps ensure that the initial outputs of neurons are centered around 0. This can help stabilize the gradients during backpropagation, preventing them from exploding or vanishing as they propagate through the network layers.

    2. **Balancing Signal Propagation**: The standard deviation sets the scale of the initial weights. It has to be chosen so that the magnitude of the signal propagated forward through the network becomes neither too large nor too small as it passes through multiple layers. A fixed standard deviation of 1 is usually too large for wide layers, which is why schemes such as Xavier initialization scale it with the layer size.

    3. **Improving Training Dynamics**: Proper weight initialization can lead to more stable and efficient training dynamics. It can help networks converge faster and achieve better generalization performance by ensuring that the weights are initialized in a way that facilitates effective learning.

    4. **Applicability to Different Layers**: Zero-mean initialization is used for fully connected, convolutional and recurrent layers alike; what changes between schemes is how the standard deviation is chosen. For example, Xavier initialization adapts the initialization scheme based on the number of input and output units of a layer to account for differences in signal propagation.

    Overall, initializing weights with a mean of 0 and a suitably chosen standard deviation is a widely adopted practice in neural network initialization because it helps address common issues related to gradient stability and signal propagation during training, contributing to more effective and efficient learning.
<br>

3. Why are we using standard deviation 0.02 for weight initialization?<br>
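    This is the value recommended in the DCGAN paper, which initializes all weights from a zero-centered normal distribution with standard deviation 0.02, and it has become a common practice in GANs. It is an empirical choice that was found to train stably; it is sometimes credited with preventing the generator from collapsing all the generated images to a single point in the image space (mode collapse), but the paper does not make that claim for the weight initialization.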

4. But why is Xaviers initialization not used in GANs?<br>
    Xavier initialization can be used in GANs - nothing in the method prevents it - but DCGAN-style implementations usually follow the paper's simpler N(0, 0.02) rule instead. Xavier initialization was derived for feedforward networks with symmetric activations such as tanh, so it is not guaranteed to be optimal for GANs, which use ReLU/LeakyReLU layers and have a different architecture and training dynamics. In GANs, the goal is to train a generator and a discriminator simultaneously, and the training process involves a minimax game between the two networks. The dynamics of GAN training, such as the competition between the generator and discriminator, can lead to different requirements for weight initialization compared to standard feedforward networks.

    In GANs, the choice of weight initialization can have a significant impact on the training dynamics and the quality of the generated samples. For example, using Xavier initialization in GANs may lead to issues such as mode collapse, where the generator produces limited and repetitive samples, or training instability, where the networks fail to converge to a stable equilibrium.

    Instead of Xavier initialization, GANs often use weight initialization strategies that were found empirically to work well for GAN training. For example, in DCGAN (Deep Convolutional GAN), a popular architecture for image generation, the weights of the generator and discriminator are initialized from a normal distribution with a mean of 0 and a standard deviation of 0.02. This initialization scheme is reported to give stable training in practice, helping the networks learn diverse and realistic image distributions.

    In summary, while Xavier initialization is a common practice for feedforward neural networks, it may not be suitable for GANs due to their unique architecture and training dynamics. GANs often require specialized weight initialization strategies to address challenges such as mode collapse and training instability, leading to improved performance and sample quality.

5. Then what is Kaiming initialization?<br>
    Kaiming initialization, also known as He initialization, is a weight initialization technique that is specifically designed for deep neural networks with rectified linear units (ReLUs) as activation functions. It addresses the issue of vanishing or exploding gradients that can occur during training of deep networks by initializing the weights in a way that helps stabilize the gradients and improve learning.

    Here are some key points about Kaiming initialization:

    1. **Designed for ReLU Activations**: Kaiming initialization is tailored for networks that use ReLU activation functions, which are commonly used in deep learning models due to their ability to mitigate the vanishing gradient problem. By initializing the weights in a way that accounts for the characteristics of ReLU activations, Kaiming initialization helps ensure that the gradients remain stable during training.

    2. **Mean and Variance Adjustment**: Kaiming initialization sets the mean of the weights to 0 and adjusts the variance based on the type of activation function used. For ReLU activations, the variance is scaled by a factor that depends on the number of input units to the layer, helping to balance the signal propagation and prevent the gradients from vanishing or exploding.

    3. **Improves Training Dynamics**: By initializing the weights in a way that is tailored to ReLU activations, Kaiming initialization can improve the training dynamics of deep networks. It helps networks converge faster, learn more effectively, and achieve better generalization performance by ensuring that the gradients are stable and the weights are initialized in a way that facilitates learning.

    4. **Applicability to Deep Networks**: Kaiming initialization is particularly well-suited for deep neural networks with many layers, where the vanishing gradient problem can be a significant challenge. By providing a principled way to initialize the weights based on the characteristics of ReLU activations, Kaiming initialization helps deep networks train more effectively and achieve better performance.

    In summary, Kaiming initialization is a weight initialization technique that is specifically designed for deep neural networks with ReLU activations. By setting the mean of the weights to 0 and adjusting the variance based on the type of activation function, Kaiming initialization helps stabilize the gradients and improve the training dynamics of deep networks, leading to more effective learning and better performance.

In [None]:
# Per-batch loss history of both networks, used later for plotting
loss_values = {
    'disc': [],
    'gen': []
}

# Set the models to training mode (BatchNorm uses batch statistics)
disc.train()
gen.train()

# Training loop: for every batch, first update the Discriminator, then the Generator
for epoch in range(epochs):
    # The class labels of the dataset are not needed, only the images
    for idx, (real, _) in enumerate(dataloader):
        real = real.to(device)
        batch_size = real.size(0)  # Actual batch size (the last batch may be smaller)

        # Generate random noise: one z_dim x 1 x 1 tensor per real image
        noise = torch.randn(batch_size, z_dim, 1, 1).to(device)

        # Generate fake images
        fake = gen(noise)

        # Discriminator forward pass, flattened to one prediction per image.
        # fake is detached so that the Discriminator update does not
        # backpropagate into the Generator.
        disc_real = disc(real).view(-1)
        disc_fake = disc(fake.detach()).view(-1)

        # Discriminator loss, loss_fn is called as (predicted, target)
        lossD_real = loss_fn(disc_real, torch.ones_like(disc_real)) # Real images are labeled as 1
        lossD_fake = loss_fn(disc_fake, torch.zeros_like(disc_fake)) # Fake images are labeled as 0
        lossD = (lossD_real + lossD_fake) / 2 # Average of the two losses

        # Backpropagation for discriminator
        disc.zero_grad()
        lossD.backward()
        opt_disc.step()

        # Generator forward pass (using updated discriminator); fake is not
        # detached here, so gradients flow back into the Generator
        output = disc(fake).view(-1)

        # Generator loss: the target is 1 ("real"), i.e. the Generator is
        # rewarded when the Discriminator classifies its images as real
        lossG = loss_fn(output, torch.ones_like(output))

        # As above, real images are labeled as 1 and fake images as 0

        # Backpropagation for generator. lossG.backward() also accumulates
        # gradients in the Discriminator's parameters; they are cleared by
        # disc.zero_grad() before the next Discriminator update.
        gen.zero_grad()
        lossG.backward()
        opt_gen.step()

        # Save losses for visualization
        loss_values['disc'].append(lossD.item())
        loss_values['gen'].append(lossG.item())

        # Progress report every 100 batches (the printed epoch index is 0-based)
        if idx % 100 == 0:
            print(f"Epoch [{epoch}/{epochs}] Batch {idx}/{len(dataloader)} Loss D: {lossD:.4f}, Loss G: {lossG:.4f}")

# Save the trained weights of both networks to the current working directory
torch.save(gen.state_dict(), 'gen.pth')
torch.save(disc.state_dict(), 'disc.pth')


In [None]:
# Plot the per-batch loss values of both networks recorded during training
plt.figure()
plt.plot(loss_values['disc'], label='Discriminator')
plt.plot(loss_values['gen'], label='Generator')
# Label the figure so it can be read on its own when the notebook is skimmed
plt.xlabel('Training iteration (batch)')
plt.ylabel('BCE loss')
plt.title('DCGAN training losses')
plt.legend()
plt.show()

In [None]:
# Generate 9 images and show them in a 3x3 grid
# (matplotlib.pyplot is already imported as plt in the first cell)

# Rebuild the Generator and load the weights saved after training.
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
gen = Generator(z_dim, img_channels, features_g).to(device)
gen.load_state_dict(torch.load('gen.pth', map_location=device))
gen.eval()  # BatchNorm uses its running statistics instead of batch statistics

# Generate random noise
noise = torch.randn(9, z_dim, 1, 1).to(device)

# Generate images
with torch.inference_mode():
    gen_images = gen(noise)
    # Display images
    plt.figure(figsize=(3,3))
    for i in range(gen_images.shape[0]):
        plt.subplot(3,3,i+1)
        # C x H x W -> H x W x C, the layout imshow expects
        img = gen_images[i].cpu().numpy().transpose(1,2,0)
        # Map the Tanh output range [-1, 1] to [0, 1]
        img = (img + 1) / 2
        if img.shape[-1] == 1:
            # Single-channel images: drop the channel axis and draw in
            # grayscale on a fixed [0, 1] scale instead of the default colormap
            plt.imshow(img.squeeze(-1), cmap='gray', vmin=0, vmax=1)
        else:
            plt.imshow(img)
        plt.axis('off')
    plt.tight_layout()
    plt.show()
