**Forward Propagation**

In [None]:
%%writefile forward_prop.cu
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <cmath>

// Logistic sigmoid activation: returns 1 / (1 + e^(-x)) in single precision.
__device__ float sigmoid(float x) {
    const float decay = expf(-x);
    return 1.0f / (1.0f + decay);
}

// Fully connected layer followed by a sigmoid activation.
// Each thread whose global index n is below output_size produces one neuron:
//   output[n] = sigmoid(biases[n] + sum_i input[i] * weights[n * input_size + i])
// so `weights` is read as output_size consecutive rows of input_size values.
// Threads with n >= output_size do nothing.
__global__ void linear_layer_and_activation(float *input, float *weights, float *biases, float *output, int input_size, int output_size) {
    const int neuron = blockIdx.x * blockDim.x + threadIdx.x;
    if (neuron >= output_size) {
        return;
    }

    // Incoming weights of this neuron.
    const float *row = weights + neuron * input_size;

    float acc = 0.0f;
    for (int k = 0; k < input_size; ++k) {
        acc += input[k] * row[k];
    }
    acc += biases[neuron];

    output[neuron] = sigmoid(acc);
}

// Reports a failed CUDA runtime call on stderr and makes main() return 1.
// Only usable inside main() because it expands to a `return 1;`.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cuda_status_ = (call);                                    \
        if (cuda_status_ != cudaSuccess) {                                    \
            std::cerr << "CUDA error at line " << __LINE__ << ": "            \
                      << cudaGetErrorString(cuda_status_) << std::endl;       \
            return 1;                                                         \
        }                                                                     \
    } while (0)

// Runs a single forward pass of a 2-2-1 sigmoid network on a fixed example
// input and prints the one output activation.
// Returns 0 on success and 1 if any CUDA call fails (device allocations are
// then left for the driver to reclaim at process exit).
int main() {
    // Define the architecture
    const int input_size = 2;
    const int hidden_size = 2;
    const int output_size = 1;

    // Initialize input data, weights and biases.
    // Each weight array stores one row of incoming weights per neuron, which
    // is the layout linear_layer_and_activation indexes.
    float host_input[input_size] = {9.0f, 9.0f}; // Example input
    float host_hidden_weights[input_size * hidden_size] = {0.15f, 0.25f, 0.20f, 0.30f};
    float host_hidden_biases[hidden_size] = {0.35f, 0.35f};
    float host_output_weights[hidden_size * output_size] = {0.4f, 0.5f};
    float host_output_biases[output_size] = {0.6f};

    // Allocate memory on the device
    float *d_input, *d_hidden_weights, *d_hidden_biases, *d_hidden_output;
    float *d_output_weights, *d_output_biases, *d_output_output;
    CUDA_CHECK(cudaMalloc((void**)&d_input, input_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_hidden_weights, input_size * hidden_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_hidden_biases, hidden_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_hidden_output, hidden_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_output_weights, hidden_size * output_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_output_biases, output_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_output_output, output_size * sizeof(float)));

    // Copy data to the device
    CUDA_CHECK(cudaMemcpy(d_input, host_input, input_size * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_hidden_weights, host_hidden_weights, input_size * hidden_size * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_hidden_biases, host_hidden_biases, hidden_size * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_output_weights, host_output_weights, hidden_size * output_size * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_output_biases, host_output_biases, output_size * sizeof(float), cudaMemcpyHostToDevice));

    // Hidden layer: one thread per hidden neuron.
    // cudaGetLastError() catches launch-configuration failures, which the
    // launch statement itself cannot report.
    linear_layer_and_activation<<<1, hidden_size>>>(d_input, d_hidden_weights, d_hidden_biases, d_hidden_output, input_size, hidden_size);
    CUDA_CHECK(cudaGetLastError());

    // Output layer: both launches go to the same stream, so this kernel only
    // starts after the hidden layer has finished writing d_hidden_output.
    linear_layer_and_activation<<<1, output_size>>>(d_hidden_output, d_output_weights, d_output_biases, d_output_output, hidden_size, output_size);
    CUDA_CHECK(cudaGetLastError());

    // Wait for both kernels and surface any error raised while they ran.
    CUDA_CHECK(cudaDeviceSynchronize());

    // Copy the result back to the host
    float host_output[output_size];
    CUDA_CHECK(cudaMemcpy(host_output, d_output_output, output_size * sizeof(float), cudaMemcpyDeviceToHost));

    // Print the result
    std::cout << "Output: " << host_output[0] << std::endl;

    // Free device memory
    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_hidden_weights));
    CUDA_CHECK(cudaFree(d_hidden_biases));
    CUDA_CHECK(cudaFree(d_hidden_output));
    CUDA_CHECK(cudaFree(d_output_weights));
    CUDA_CHECK(cudaFree(d_output_biases));
    CUDA_CHECK(cudaFree(d_output_output));

    return 0;
}


Writing forward_prop.cu


In [None]:
!nvcc forward_prop.cu -o forward_prop
!./forward_prop

Output: 0.815862


**Back Propagation**

In [None]:
%%writefile l.cu
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <cmath>

// Logistic sigmoid activation: returns 1 / (1 + e^(-x)) in single precision.
__device__ float sigmoid(float x) {
    const float decay = expf(-x);
    return 1.0f / (1.0f + decay);
}

// Sigmoid derivative expressed through the sigmoid's own output: returns
// x * (1 - x). The caller in this file passes an activation (a value already
// produced by sigmoid), not the pre-activation sum.
__device__ float sigmoid_derivative(float x) {
    const float complement = 1.0f - x;
    return x * complement;
}

// Fully connected layer followed by a sigmoid activation.
// Each thread whose global index n is below output_size produces one neuron:
//   output[n] = sigmoid(biases[n] + sum_i weights[n * input_size + i] * input[i])
// so `weights` is read as output_size consecutive rows of input_size values.
// Threads with n >= output_size do nothing.
__global__ void linear_layer_and_activation(float *input, float *weights, float *biases, float *output, int input_size, int output_size) {
    const int neuron = blockIdx.x * blockDim.x + threadIdx.x;
    if (neuron >= output_size) {
        return;
    }

    // Incoming weights of this neuron.
    const float *row = weights + neuron * input_size;

    float acc = 0.0f;
    for (int k = 0; k < input_size; ++k) {
        acc += row[k] * input[k];
    }
    acc += biases[neuron];

    output[neuron] = sigmoid(acc);
}

// Output-layer error term: delta[i] = output[i] - target[i] for i < output_size.
// For a sigmoid output trained with natural-log binary cross-entropy this
// difference is the loss gradient with respect to the neuron's pre-activation
// sum (the sigmoid derivative cancels), not with respect to the activation.
__global__ void compute_output_delta(float *output, float *target, float *delta, int output_size) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= output_size) {
        return;
    }
    delta[i] = output[i] - target[i];
}

// Propagates the following layer's deltas back to one hidden layer.
// Thread j < hidden_size accumulates
//   sum_k output_delta[k] * weights[k * hidden_size + j]   for k < output_size
// (so `weights` is read as output_size rows of hidden_size values) and
// multiplies the sum by sigmoid_derivative(hidden_output[j]).
// Threads with j >= hidden_size do nothing.
__global__ void compute_hidden_delta(float *output_delta, float *weights, float *hidden_output, float *hidden_delta, int output_size, int hidden_size) {
    const int unit = blockIdx.x * blockDim.x + threadIdx.x;
    if (unit >= hidden_size) {
        return;
    }

    // Column `unit` of the weight matrix: the weights leaving this neuron.
    const float *column = weights + unit;

    float back_error = 0.0f;
    for (int k = 0; k < output_size; ++k) {
        back_error += output_delta[k] * column[k * hidden_size];
    }

    hidden_delta[unit] = back_error * sigmoid_derivative(hidden_output[unit]);
}

// Gradient-descent step for one layer.
// Thread n < output_size updates the n-th weight row and bias in place:
//   weights[n * input_size + i] -= learning_rate * delta[n] * input[i]
//   biases[n]                   -= learning_rate * delta[n]
// Threads with n >= output_size do nothing.
__global__ void update_weights_and_biases(float *weights, float *biases, float *delta, float *input, int input_size, int output_size, float learning_rate) {
    const int neuron = blockIdx.x * blockDim.x + threadIdx.x;
    if (neuron >= output_size) {
        return;
    }

    // Scaled error shared by every weight of this neuron and by its bias.
    const float step = learning_rate * delta[neuron];

    float *row = weights + neuron * input_size;
    for (int k = 0; k < input_size; ++k) {
        row[k] -= step * input[k];
    }
    biases[neuron] -= step;
}

// Per-output binary cross-entropy, reported with base-10 logarithms.
// Thread i < output_size writes
//   loss[i] = -( y * log10(p + eps) + (1 - y) * log10(1 - p + eps) )
// with p = output[i], y = target[i]. The base-10 logs are obtained by dividing
// natural logs by ln(10). Threads with i >= output_size do nothing.
__global__ void compute_loss(float *output, float *target, float *loss, int output_size) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= output_size) {
        return;
    }

    const float tiny = 1e-10f;             // keeps logf away from log(0)
    const float ln10 = 2.302585092994046f; // natural logarithm of 10

    const float prob = output[i];
    const float label = target[i];

    const float positive_term = label * logf(prob + tiny) / ln10;
    const float negative_term = (1.0f - label) * logf(1.0f - prob + tiny) / ln10;

    loss[i] = -(positive_term + negative_term);
}

// Reports a failed CUDA runtime call on stderr and makes main() return 1.
// Only usable inside main() because it expands to a `return 1;`.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cuda_status_ = (call);                                    \
        if (cuda_status_ != cudaSuccess) {                                    \
            std::cerr << "CUDA error at line " << __LINE__ << ": "            \
                      << cudaGetErrorString(cuda_status_) << std::endl;       \
            return 1;                                                         \
        }                                                                     \
    } while (0)

// Trains a 2-2-1 sigmoid network on one fixed sample with plain gradient
// descent and prints, after every epoch, the loss measured with the freshly
// updated weights.
// Returns 0 on success and 1 if any CUDA call fails (device allocations are
// then left for the driver to reclaim at process exit).
int main() {
    // Define the architecture
    const int input_size = 2;    // Number of inputs
    const int hidden_size = 2;   // Number of neurons in the hidden layer
    const int output_size = 1;   // Number of outputs

    // Initialize input data, weights, biases, and target output
    float host_input[input_size] = {9.0f, 9.0f}; // Example input
    float host_target[output_size] = {1.0f}; // Target output for training

    // Initialize weights and biases with hard-coded values
    float host_hidden_weights[input_size * hidden_size] = {0.15f, 0.25f, 0.20f, 0.30f}; // Size = input_size * hidden_size
    float host_hidden_biases[hidden_size] = {0.35f, 0.35f}; // Size = hidden_size
    float host_output_weights[hidden_size * output_size] = {0.40f, 0.50f}; // Size = hidden_size * output_size
    float host_output_biases[output_size] = {0.60f}; // Size = output_size

    // Training parameters
    const int epochs = 100;
    const float learning_rate = 0.01f;

    // Allocate memory on the device
    float *d_input, *d_hidden_weights, *d_hidden_biases, *d_hidden_output;
    float *d_output_weights, *d_output_biases, *d_output_output;
    float *d_output_delta, *d_hidden_delta, *d_target, *d_loss;
    CUDA_CHECK(cudaMalloc((void**)&d_input, input_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_hidden_weights, input_size * hidden_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_hidden_biases, hidden_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_hidden_output, hidden_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_output_weights, hidden_size * output_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_output_biases, output_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_output_output, output_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_output_delta, output_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_hidden_delta, hidden_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_target, output_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_loss, output_size * sizeof(float)));

    // Copy data to the device
    CUDA_CHECK(cudaMemcpy(d_input, host_input, input_size * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_target, host_target, output_size * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_hidden_weights, host_hidden_weights, input_size * hidden_size * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_hidden_biases, host_hidden_biases, hidden_size * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_output_weights, host_output_weights, hidden_size * output_size * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_output_biases, host_output_biases, output_size * sizeof(float), cudaMemcpyHostToDevice));

    // Host-side staging buffer for the per-output losses, reused every epoch.
    float host_loss[output_size];

    // Training loop.
    // All kernels are launched into the same stream, so they execute in issue
    // order without explicit device-wide synchronisation; the blocking
    // cudaMemcpy at the end of each epoch waits for them and reports any
    // error raised while they ran. cudaGetLastError() after each launch
    // catches launch-configuration failures.
    for (int epoch = 0; epoch < epochs; ++epoch) {
        // Forward pass with the current weights (needed by the deltas below)
        linear_layer_and_activation<<<1, hidden_size>>>(d_input, d_hidden_weights, d_hidden_biases, d_hidden_output, input_size, hidden_size);
        CUDA_CHECK(cudaGetLastError());

        linear_layer_and_activation<<<1, output_size>>>(d_hidden_output, d_output_weights, d_output_biases, d_output_output, hidden_size, output_size);
        CUDA_CHECK(cudaGetLastError());

        // Compute deltas. The hidden delta must be computed before the output
        // weights are updated because it reads the pre-update output weights.
        compute_output_delta<<<1, output_size>>>(d_output_output, d_target, d_output_delta, output_size);
        CUDA_CHECK(cudaGetLastError());

        compute_hidden_delta<<<1, hidden_size>>>(d_output_delta, d_output_weights, d_hidden_output, d_hidden_delta, output_size, hidden_size);
        CUDA_CHECK(cudaGetLastError());

        // Update weights and biases
        update_weights_and_biases<<<1, output_size>>>(d_output_weights, d_output_biases, d_output_delta, d_hidden_output, hidden_size, output_size, learning_rate);
        CUDA_CHECK(cudaGetLastError());

        update_weights_and_biases<<<1, hidden_size>>>(d_hidden_weights, d_hidden_biases, d_hidden_delta, d_input, input_size, hidden_size, learning_rate);
        CUDA_CHECK(cudaGetLastError());

        // Recompute forward pass with updated weights and biases
        linear_layer_and_activation<<<1, hidden_size>>>(d_input, d_hidden_weights, d_hidden_biases, d_hidden_output, input_size, hidden_size);
        CUDA_CHECK(cudaGetLastError());

        linear_layer_and_activation<<<1, output_size>>>(d_hidden_output, d_output_weights, d_output_biases, d_output_output, hidden_size, output_size);
        CUDA_CHECK(cudaGetLastError());

        // Loss with the updated weights and biases: this is the only loss
        // value that is reported, so no pre-update loss is computed.
        compute_loss<<<1, output_size>>>(d_output_output, d_target, d_loss, output_size);
        CUDA_CHECK(cudaGetLastError());

        CUDA_CHECK(cudaMemcpy(host_loss, d_loss, output_size * sizeof(float), cudaMemcpyDeviceToHost));
        float total_loss = 0.0f;
        for (int i = 0; i < output_size; ++i) {
            total_loss += host_loss[i];
        }

        // Print the new loss after weight and bias update
        std::cout << "Epoch " << epoch + 1 << " -  Loss: " << total_loss << std::endl;
    }

    // Free device memory
    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_hidden_weights));
    CUDA_CHECK(cudaFree(d_hidden_biases));
    CUDA_CHECK(cudaFree(d_hidden_output));
    CUDA_CHECK(cudaFree(d_output_weights));
    CUDA_CHECK(cudaFree(d_output_biases));
    CUDA_CHECK(cudaFree(d_output_output));
    CUDA_CHECK(cudaFree(d_output_delta));
    CUDA_CHECK(cudaFree(d_hidden_delta));
    CUDA_CHECK(cudaFree(d_target));
    CUDA_CHECK(cudaFree(d_loss));
    return 0;
}

Overwriting l.cu


In [None]:
!nvcc l.cu -o l
!./l

Epoch 1 -  Loss: 0.0879486
Epoch 2 -  Loss: 0.0875177
Epoch 3 -  Loss: 0.0870906
Epoch 4 -  Loss: 0.0866673
Epoch 5 -  Loss: 0.0862476
Epoch 6 -  Loss: 0.0858316
Epoch 7 -  Loss: 0.0854191
Epoch 8 -  Loss: 0.0850103
Epoch 9 -  Loss: 0.0846049
Epoch 10 -  Loss: 0.084203
Epoch 11 -  Loss: 0.0838046
Epoch 12 -  Loss: 0.0834095
Epoch 13 -  Loss: 0.0830177
Epoch 14 -  Loss: 0.0826293
Epoch 15 -  Loss: 0.0822442
Epoch 16 -  Loss: 0.0818622
Epoch 17 -  Loss: 0.0814836
Epoch 18 -  Loss: 0.081108
Epoch 19 -  Loss: 0.0807356
Epoch 20 -  Loss: 0.0803663
Epoch 21 -  Loss: 0.0799999
Epoch 22 -  Loss: 0.0796367
Epoch 23 -  Loss: 0.0792764
Epoch 24 -  Loss: 0.078919
Epoch 25 -  Loss: 0.0785646
Epoch 26 -  Loss: 0.078213
Epoch 27 -  Loss: 0.0778643
Epoch 28 -  Loss: 0.0775184
Epoch 29 -  Loss: 0.0771753
Epoch 30 -  Loss: 0.0768349
Epoch 31 -  Loss: 0.0764973
Epoch 32 -  Loss: 0.0761624
Epoch 33 -  Loss: 0.0758301
Epoch 34 -  Loss: 0.0755004
Epoch 35 -  Loss: 0.0751734
Epoch 36 -  Loss: 0.0748489
Epoch

**Generic**

**Back Propagation Generic**

In [None]:
%%writefile values.cu
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <cmath>

#define MAX_LAYERS 10 // Define the maximum number of layers you want to support

// Logistic sigmoid activation: returns 1 / (1 + e^(-x)) in single precision.
__device__ float sigmoid(float x) {
    const float decay = expf(-x);
    return 1.0f / (1.0f + decay);
}

// Sigmoid derivative expressed through the sigmoid's own output: returns
// x * (1 - x). The caller in this file passes an activation (a value already
// produced by sigmoid), not the pre-activation sum.
__device__ float sigmoid_derivative(float x) {
    const float complement = 1.0f - x;
    return x * complement;
}

// Fully connected layer followed by a sigmoid activation, usable for any layer.
// Each thread whose global index n is below output_size produces one neuron:
//   output[n] = sigmoid(biases[n] + sum_i weights[n * input_size + i] * input[i])
// so `weights` is read as output_size consecutive rows of input_size values.
// Threads with n >= output_size do nothing.
__global__ void linear_layer_and_activation(float *input, float *weights, float *biases, float *output, int input_size, int output_size) {
    const int neuron = blockIdx.x * blockDim.x + threadIdx.x;
    if (neuron >= output_size) {
        return;
    }

    // Incoming weights of this neuron.
    const float *row = weights + neuron * input_size;

    float acc = 0.0f;
    for (int k = 0; k < input_size; ++k) {
        acc += row[k] * input[k];
    }
    acc += biases[neuron];

    output[neuron] = sigmoid(acc);
}

// Output-layer error term: delta[i] = output[i] - target[i] for i < output_size.
// For a sigmoid output trained with natural-log binary cross-entropy this
// difference is the loss gradient with respect to the neuron's pre-activation
// sum (the sigmoid derivative cancels), not with respect to the activation.
__global__ void compute_output_delta(float *output, float *target, float *delta, int output_size) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= output_size) {
        return;
    }
    delta[i] = output[i] - target[i];
}

// Propagates the following layer's deltas back to one hidden layer.
// Thread j < hidden_size accumulates
//   sum_k output_delta[k] * weights[k * hidden_size + j]   for k < output_size
// (so `weights` is read as output_size rows of hidden_size values) and
// multiplies the sum by sigmoid_derivative(hidden_output[j]).
// Threads with j >= hidden_size do nothing.
__global__ void compute_hidden_delta(float *output_delta, float *weights, float *hidden_output, float *hidden_delta, int output_size, int hidden_size) {
    const int unit = blockIdx.x * blockDim.x + threadIdx.x;
    if (unit >= hidden_size) {
        return;
    }

    // Column `unit` of the weight matrix: the weights leaving this neuron.
    const float *column = weights + unit;

    float back_error = 0.0f;
    for (int k = 0; k < output_size; ++k) {
        back_error += output_delta[k] * column[k * hidden_size];
    }

    hidden_delta[unit] = back_error * sigmoid_derivative(hidden_output[unit]);
}

// Gradient-descent step for one layer.
// Thread n < output_size updates the n-th weight row and bias in place:
//   weights[n * input_size + i] -= learning_rate * delta[n] * input[i]
//   biases[n]                   -= learning_rate * delta[n]
// Threads with n >= output_size do nothing.
__global__ void update_weights_and_biases(float *weights, float *biases, float *delta, float *input, int input_size, int output_size, float learning_rate) {
    const int neuron = blockIdx.x * blockDim.x + threadIdx.x;
    if (neuron >= output_size) {
        return;
    }

    // Scaled error shared by every weight of this neuron and by its bias.
    const float step = learning_rate * delta[neuron];

    float *row = weights + neuron * input_size;
    for (int k = 0; k < input_size; ++k) {
        row[k] -= step * input[k];
    }
    biases[neuron] -= step;
}

// Per-output binary cross-entropy, reported with base-10 logarithms.
// Thread i < output_size writes
//   loss[i] = -( y * log10(p + eps) + (1 - y) * log10(1 - p + eps) )
// with p = output[i], y = target[i]. The base-10 logs are obtained by dividing
// natural logs by ln(10). Threads with i >= output_size do nothing.
__global__ void compute_loss(float *output, float *target, float *loss, int output_size) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= output_size) {
        return;
    }

    const float tiny = 1e-10f;             // keeps logf away from log(0)
    const float ln10 = 2.302585092994046f; // natural logarithm of 10

    const float prob = output[i];
    const float label = target[i];

    const float positive_term = label * logf(prob + tiny) / ln10;
    const float negative_term = (1.0f - label) * logf(1.0f - prob + tiny) / ln10;

    loss[i] = -(positive_term + negative_term);
}

// Reports a failed CUDA runtime call on stderr and makes main() return 1.
// Only usable inside main() because it expands to a `return 1;`.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cuda_status_ = (call);                                    \
        if (cuda_status_ != cudaSuccess) {                                    \
            std::cerr << "CUDA error at line " << __LINE__ << ": "            \
                      << cudaGetErrorString(cuda_status_) << std::endl;       \
            return 1;                                                         \
        }                                                                     \
    } while (0)

// Trains a sigmoid network with a configurable number of hidden layers
// (currently one hidden layer of two neurons) on a single fixed sample and
// prints the loss for epochs 1..epochs.
// Returns 0 on success and 1 if any CUDA call fails (device allocations are
// then left for the driver to reclaim at process exit).
int main() {
    // Define the architecture
    const int input_size = 2;    // Number of inputs
    const int hidden_sizes[MAX_LAYERS] = {2};   // Number of neurons in each hidden layer
    const int num_hidden_layers = 1; // Number of hidden layers
    const int output_size = 1;   // Number of outputs

    // Initialize input data, weights, biases, and target output
    float host_input[input_size] = {9.0f, 9.0f}; // Example input
    float host_target[output_size] = {1.0f}; // Target output for training

    // Initialize weights and biases with hard-coded values.
    // Every row of host_weights / host_biases is sized for the first hidden
    // layer, so additional hidden layers must not need more values than that.
    float host_weights[MAX_LAYERS][input_size * hidden_sizes[0]] = { {0.15f, 0.25f, 0.20f, 0.30f} }; // Example
    float host_biases[MAX_LAYERS][hidden_sizes[0]] = { {0.35f, 0.35f} }; // Example
    float host_output_weights[hidden_sizes[0] * output_size] = {0.40f, 0.50f}; // Example
    float host_output_biases[output_size] = {0.60f}; // Example

    // Training parameters
    const int epochs = 100;
    const float learning_rate = 0.01f;

    // Width of the layer that feeds the output layer.
    const int last_hidden_size = hidden_sizes[num_hidden_layers-1];

    // Allocate memory on the device
    float *d_input, *d_weights[MAX_LAYERS], *d_biases[MAX_LAYERS], *d_hidden_output[MAX_LAYERS], *d_output_weights, *d_output_biases, *d_output_output;
    float *d_output_delta, *d_hidden_delta[MAX_LAYERS], *d_target, *d_loss;

    CUDA_CHECK(cudaMalloc((void**)&d_input, input_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_target, output_size * sizeof(float)));

    // Allocate and initialise weights and biases of the hidden layers
    for (int l = 0; l < num_hidden_layers; ++l) {
        // Number of values feeding layer l: the network input for the first
        // layer, the previous hidden layer otherwise.
        const int fan_in = (l == 0 ? input_size : hidden_sizes[l-1]);

        CUDA_CHECK(cudaMalloc((void**)&d_weights[l], fan_in * hidden_sizes[l] * sizeof(float)));
        CUDA_CHECK(cudaMalloc((void**)&d_biases[l], hidden_sizes[l] * sizeof(float)));
        CUDA_CHECK(cudaMalloc((void**)&d_hidden_output[l], hidden_sizes[l] * sizeof(float)));
        CUDA_CHECK(cudaMalloc((void**)&d_hidden_delta[l], hidden_sizes[l] * sizeof(float)));

        CUDA_CHECK(cudaMemcpy(d_weights[l], host_weights[l], fan_in * hidden_sizes[l] * sizeof(float), cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_biases[l], host_biases[l], hidden_sizes[l] * sizeof(float), cudaMemcpyHostToDevice));
    }
    CUDA_CHECK(cudaMalloc((void**)&d_output_weights, last_hidden_size * output_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_output_biases, output_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_output_output, output_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_output_delta, output_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_loss, output_size * sizeof(float)));

    CUDA_CHECK(cudaMemcpy(d_output_weights, host_output_weights, last_hidden_size * output_size * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_output_biases, host_output_biases, output_size * sizeof(float), cudaMemcpyHostToDevice));

    // Copy input data and target output to device
    CUDA_CHECK(cudaMemcpy(d_input, host_input, input_size * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_target, host_target, output_size * sizeof(float), cudaMemcpyHostToDevice));

    // Host-side staging buffer for the per-output losses, reused every epoch.
    float host_loss[output_size];

    // Training loop.
    // The loop runs epochs+1 times and iteration 0 prints nothing, so the
    // value printed for epoch k is the loss measured after k weight updates.
    // All kernels are launched into the same stream and therefore execute in
    // issue order; the blocking cudaMemcpy of the loss waits for them and
    // reports errors raised while they ran.
    for (int epoch = 0; epoch < epochs+1; ++epoch) {
        // Forward pass
        float *d_input_current = d_input;
        for (int l = 0; l < num_hidden_layers; ++l) {
            const int fan_in = (l == 0 ? input_size : hidden_sizes[l-1]);
            linear_layer_and_activation<<<1, hidden_sizes[l]>>>(d_input_current, d_weights[l], d_biases[l], d_hidden_output[l], fan_in, hidden_sizes[l]);
            CUDA_CHECK(cudaGetLastError());
            d_input_current = d_hidden_output[l];
        }

        linear_layer_and_activation<<<1, output_size>>>(d_input_current, d_output_weights, d_output_biases, d_output_output, last_hidden_size, output_size);
        CUDA_CHECK(cudaGetLastError());

        // Compute loss using current weights and biases
        compute_loss<<<1, output_size>>>(d_output_output, d_target, d_loss, output_size);
        CUDA_CHECK(cudaGetLastError());

        CUDA_CHECK(cudaMemcpy(host_loss, d_loss, output_size * sizeof(float), cudaMemcpyDeviceToHost));
        float total_loss = 0.0f;
        for (int i = 0; i < output_size; ++i) {
            total_loss += host_loss[i];
        }

        // Compute deltas
        compute_output_delta<<<1, output_size>>>(d_output_output, d_target, d_output_delta, output_size);
        CUDA_CHECK(cudaGetLastError());

        // Walk the hidden layers from last to first. The delta of layer l is
        // derived from the deltas and the weights of the layer that FOLLOWS
        // it: the output layer for the last hidden layer, hidden layer l+1
        // otherwise. (Passing layer l's own weights and hidden_sizes[l+1],
        // which is 0 past the last configured layer, yields all-zero deltas
        // and leaves the hidden layers untrained.)
        float *d_delta_next = d_output_delta;
        for (int l = num_hidden_layers-1; l >= 0; --l) {
            const bool feeds_output = (l == num_hidden_layers-1);
            float *d_next_weights = feeds_output ? d_output_weights : d_weights[l+1];
            const int next_size = feeds_output ? output_size : hidden_sizes[l+1];

            compute_hidden_delta<<<1, hidden_sizes[l]>>>(d_delta_next, d_next_weights, d_hidden_output[l], d_hidden_delta[l], next_size, hidden_sizes[l]);
            CUDA_CHECK(cudaGetLastError());
            d_delta_next = d_hidden_delta[l];
        }

        // Update weights and biases (all deltas are already computed, so the
        // update order between layers does not matter)
        float *d_input_for_update = d_input;
        for (int l = 0; l < num_hidden_layers; ++l) {
            const int fan_in = (l == 0 ? input_size : hidden_sizes[l-1]);
            update_weights_and_biases<<<1, hidden_sizes[l]>>>(d_weights[l], d_biases[l], d_hidden_delta[l], d_input_for_update, fan_in, hidden_sizes[l], learning_rate);
            CUDA_CHECK(cudaGetLastError());
            d_input_for_update = d_hidden_output[l];
        }

        update_weights_and_biases<<<1, output_size>>>(d_output_weights, d_output_biases, d_output_delta, d_hidden_output[num_hidden_layers-1], last_hidden_size, output_size, learning_rate);
        CUDA_CHECK(cudaGetLastError());

        // Only print loss from epoch 1 onwards
        if (epoch > 0) {
            std::cout << "Epoch " << epoch << " Loss: " << total_loss << std::endl;
        }
    }

    // Wait for the final update kernels and surface any late execution error.
    CUDA_CHECK(cudaDeviceSynchronize());

    // Clean up
    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_target));
    for (int l = 0; l < num_hidden_layers; ++l) {
        CUDA_CHECK(cudaFree(d_weights[l]));
        CUDA_CHECK(cudaFree(d_biases[l]));
        CUDA_CHECK(cudaFree(d_hidden_output[l]));
        CUDA_CHECK(cudaFree(d_hidden_delta[l]));
    }
    CUDA_CHECK(cudaFree(d_output_weights));
    CUDA_CHECK(cudaFree(d_output_biases));
    CUDA_CHECK(cudaFree(d_output_output));
    CUDA_CHECK(cudaFree(d_output_delta));
    CUDA_CHECK(cudaFree(d_loss));
    return 0;
}

Writing values.cu


In [None]:
!nvcc values.cu -o values
!./values

Epoch 1 Loss: 0.0879503
Epoch 2 Loss: 0.0875211
Epoch 3 Loss: 0.0870956
Epoch 4 Loss: 0.0866739
Epoch 5 Loss: 0.0862559
Epoch 6 Loss: 0.0858415
Epoch 7 Loss: 0.0854307
Epoch 8 Loss: 0.0850234
Epoch 9 Loss: 0.0846197
Epoch 10 Loss: 0.0842194
Epoch 11 Loss: 0.0838225
Epoch 12 Loss: 0.0834291
Epoch 13 Loss: 0.0830389
Epoch 14 Loss: 0.082652
Epoch 15 Loss: 0.0822685
Epoch 16 Loss: 0.0818881
Epoch 17 Loss: 0.081511
Epoch 18 Loss: 0.0811369
Epoch 19 Loss: 0.080766
Epoch 20 Loss: 0.0803982
Epoch 21 Loss: 0.0800334
Epoch 22 Loss: 0.0796716
Epoch 23 Loss: 0.0793128
Epoch 24 Loss: 0.0789569
Epoch 25 Loss: 0.0786039
Epoch 26 Loss: 0.0782539
Epoch 27 Loss: 0.0779066
Epoch 28 Loss: 0.0775621
Epoch 29 Loss: 0.0772204
Epoch 30 Loss: 0.0768815
Epoch 31 Loss: 0.0765452
Epoch 32 Loss: 0.0762117
Epoch 33 Loss: 0.0758808
Epoch 34 Loss: 0.0755526
Epoch 35 Loss: 0.0752268
Epoch 36 Loss: 0.0749037
Epoch 37 Loss: 0.0745832
Epoch 38 Loss: 0.074265
Epoch 39 Loss: 0.0739495
Epoch 40 Loss: 0.0736364
Epoch 41 Loss

**Back prop 2 hidden layers**

In [None]:
%%writefile back_prop_2.cu
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <cmath>

#define MAX_LAYERS 10 // Define the maximum number of layers you want to support

// Logistic sigmoid activation: returns 1 / (1 + e^(-x)) in single precision.
__device__ float sigmoid(float x) {
    const float decay = expf(-x);
    return 1.0f / (1.0f + decay);
}

// Sigmoid derivative expressed through the sigmoid's own output: returns
// x * (1 - x). The caller in this file passes an activation (a value already
// produced by sigmoid), not the pre-activation sum.
__device__ float sigmoid_derivative(float x) {
    const float complement = 1.0f - x;
    return x * complement;
}

// Fully connected layer followed by a sigmoid activation, usable for any layer.
// Each thread whose global index n is below output_size produces one neuron:
//   output[n] = sigmoid(biases[n] + sum_i weights[n * input_size + i] * input[i])
// so `weights` is read as output_size consecutive rows of input_size values.
// Threads with n >= output_size do nothing.
__global__ void linear_layer_and_activation(float *input, float *weights, float *biases, float *output, int input_size, int output_size) {
    const int neuron = blockIdx.x * blockDim.x + threadIdx.x;
    if (neuron >= output_size) {
        return;
    }

    // Incoming weights of this neuron.
    const float *row = weights + neuron * input_size;

    float acc = 0.0f;
    for (int k = 0; k < input_size; ++k) {
        acc += row[k] * input[k];
    }
    acc += biases[neuron];

    output[neuron] = sigmoid(acc);
}

// Output-layer error term: delta[i] = output[i] - target[i] for i < output_size.
// For a sigmoid output trained with natural-log binary cross-entropy this
// difference is the loss gradient with respect to the neuron's pre-activation
// sum (the sigmoid derivative cancels), not with respect to the activation.
__global__ void compute_output_delta(float *output, float *target, float *delta, int output_size) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= output_size) {
        return;
    }
    delta[i] = output[i] - target[i];
}

// Propagates the following layer's deltas back to one hidden layer.
// Thread j < hidden_size accumulates
//   sum_k output_delta[k] * weights[k * hidden_size + j]   for k < output_size
// (so `weights` is read as output_size rows of hidden_size values) and
// multiplies the sum by sigmoid_derivative(hidden_output[j]).
// Threads with j >= hidden_size do nothing.
__global__ void compute_hidden_delta(float *output_delta, float *weights, float *hidden_output, float *hidden_delta, int output_size, int hidden_size) {
    const int unit = blockIdx.x * blockDim.x + threadIdx.x;
    if (unit >= hidden_size) {
        return;
    }

    // Column `unit` of the weight matrix: the weights leaving this neuron.
    const float *column = weights + unit;

    float back_error = 0.0f;
    for (int k = 0; k < output_size; ++k) {
        back_error += output_delta[k] * column[k * hidden_size];
    }

    hidden_delta[unit] = back_error * sigmoid_derivative(hidden_output[unit]);
}

// Gradient-descent step for one layer.
// Thread n < output_size updates the n-th weight row and bias in place:
//   weights[n * input_size + i] -= learning_rate * delta[n] * input[i]
//   biases[n]                   -= learning_rate * delta[n]
// Threads with n >= output_size do nothing.
__global__ void update_weights_and_biases(float *weights, float *biases, float *delta, float *input, int input_size, int output_size, float learning_rate) {
    const int neuron = blockIdx.x * blockDim.x + threadIdx.x;
    if (neuron >= output_size) {
        return;
    }

    // Scaled error shared by every weight of this neuron and by its bias.
    const float step = learning_rate * delta[neuron];

    float *row = weights + neuron * input_size;
    for (int k = 0; k < input_size; ++k) {
        row[k] -= step * input[k];
    }
    biases[neuron] -= step;
}

// Per-output binary cross-entropy, reported with base-10 logarithms.
// Thread i < output_size writes
//   loss[i] = -( y * log10(p + eps) + (1 - y) * log10(1 - p + eps) )
// with p = output[i], y = target[i]. The base-10 logs are obtained by dividing
// natural logs by ln(10). Threads with i >= output_size do nothing.
__global__ void compute_loss(float *output, float *target, float *loss, int output_size) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= output_size) {
        return;
    }

    const float tiny = 1e-10f;             // keeps logf away from log(0)
    const float ln10 = 2.302585092994046f; // natural logarithm of 10

    const float prob = output[i];
    const float label = target[i];

    const float positive_term = label * logf(prob + tiny) / ln10;
    const float negative_term = (1.0f - label) * logf(1.0f - prob + tiny) / ln10;

    loss[i] = -(positive_term + negative_term);
}

// Reports a failed CUDA runtime call on stderr and makes main() return 1.
// Only usable inside main() because it expands to a `return 1;`.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cuda_status_ = (call);                                    \
        if (cuda_status_ != cudaSuccess) {                                    \
            std::cerr << "CUDA error at line " << __LINE__ << ": "            \
                      << cudaGetErrorString(cuda_status_) << std::endl;       \
            return 1;                                                         \
        }                                                                     \
    } while (0)

// Trains a sigmoid network with two hidden layers of two neurons each on a
// single fixed sample and prints, for every epoch, the loss measured before
// that epoch's weight update.
// Returns 0 on success and 1 if any CUDA call fails (device allocations are
// then left for the driver to reclaim at process exit).
int main() {
    // Define the architecture
    const int input_size = 2;    // Number of inputs
    const int hidden_sizes[MAX_LAYERS] = {2, 2};   // Number of neurons in each hidden layer (2 hidden layers with 2 neurons each)
    const int num_hidden_layers = 2; // Number of hidden layers
    const int output_size = 1;   // Number of outputs

    // Initialize input data, weights, biases, and target output
    float host_input[input_size] = {9.0f, 9.0f}; // Example input
    float host_target[output_size] = {1.0f}; // Target output for training

    // Initialize weights and biases for each hidden layer.
    // Every row of host_weights / host_biases is sized for the first hidden
    // layer, so later hidden layers must not need more values than that.
    float host_weights[MAX_LAYERS][input_size * hidden_sizes[0]] = { {0.15f, 0.25f, 0.20f, 0.30f}, {0.40f, 0.45f, 0.50f, 0.55f} }; // Example
    float host_biases[MAX_LAYERS][hidden_sizes[0]] = { {0.35f, 0.35f}, {0.60f, 0.60f} }; // Example
    float host_output_weights[hidden_sizes[num_hidden_layers-1] * output_size] = {0.40f, 0.50f}; // Example
    float host_output_biases[output_size] = {0.60f}; // Example

    // Training parameters
    const int epochs = 100;
    const float learning_rate = 0.01f;

    // Width of the layer that feeds the output layer.
    const int last_hidden_size = hidden_sizes[num_hidden_layers-1];

    // Allocate memory on the device
    float *d_input, *d_weights[MAX_LAYERS], *d_biases[MAX_LAYERS], *d_hidden_output[MAX_LAYERS], *d_output_weights, *d_output_biases, *d_output_output;
    float *d_output_delta, *d_hidden_delta[MAX_LAYERS], *d_target, *d_loss;

    CUDA_CHECK(cudaMalloc((void**)&d_input, input_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_target, output_size * sizeof(float)));

    // Allocate and initialise weights and biases of the hidden layers
    for (int l = 0; l < num_hidden_layers; ++l) {
        // Number of values feeding layer l: the network input for the first
        // layer, the previous hidden layer otherwise.
        const int fan_in = (l == 0 ? input_size : hidden_sizes[l-1]);

        CUDA_CHECK(cudaMalloc((void**)&d_weights[l], fan_in * hidden_sizes[l] * sizeof(float)));
        CUDA_CHECK(cudaMalloc((void**)&d_biases[l], hidden_sizes[l] * sizeof(float)));
        CUDA_CHECK(cudaMalloc((void**)&d_hidden_output[l], hidden_sizes[l] * sizeof(float)));
        CUDA_CHECK(cudaMalloc((void**)&d_hidden_delta[l], hidden_sizes[l] * sizeof(float)));

        CUDA_CHECK(cudaMemcpy(d_weights[l], host_weights[l], fan_in * hidden_sizes[l] * sizeof(float), cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_biases[l], host_biases[l], hidden_sizes[l] * sizeof(float), cudaMemcpyHostToDevice));
    }

    CUDA_CHECK(cudaMalloc((void**)&d_output_weights, last_hidden_size * output_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_output_biases, output_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_output_output, output_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_output_delta, output_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_loss, output_size * sizeof(float)));

    CUDA_CHECK(cudaMemcpy(d_output_weights, host_output_weights, last_hidden_size * output_size * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_output_biases, host_output_biases, output_size * sizeof(float), cudaMemcpyHostToDevice));

    // Copy input data and target output to device
    CUDA_CHECK(cudaMemcpy(d_input, host_input, input_size * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_target, host_target, output_size * sizeof(float), cudaMemcpyHostToDevice));

    // Host-side staging buffer for the per-output losses, reused every epoch.
    float host_loss[output_size];

    // Training loop.
    // All kernels are launched into the same stream and therefore execute in
    // issue order; the blocking cudaMemcpy of the loss waits for them and
    // reports errors raised while they ran.
    for (int epoch = 1; epoch < epochs+1; ++epoch) {
        // Forward pass
        float *d_input_current = d_input;
        for (int l = 0; l < num_hidden_layers; ++l) {
            const int fan_in = (l == 0 ? input_size : hidden_sizes[l-1]);
            linear_layer_and_activation<<<1, hidden_sizes[l]>>>(d_input_current, d_weights[l], d_biases[l], d_hidden_output[l], fan_in, hidden_sizes[l]);
            CUDA_CHECK(cudaGetLastError());
            d_input_current = d_hidden_output[l];
        }

        linear_layer_and_activation<<<1, output_size>>>(d_input_current, d_output_weights, d_output_biases, d_output_output, last_hidden_size, output_size);
        CUDA_CHECK(cudaGetLastError());

        // Compute loss using current weights and biases
        compute_loss<<<1, output_size>>>(d_output_output, d_target, d_loss, output_size);
        CUDA_CHECK(cudaGetLastError());

        CUDA_CHECK(cudaMemcpy(host_loss, d_loss, output_size * sizeof(float), cudaMemcpyDeviceToHost));
        float total_loss = 0.0f;
        for (int i = 0; i < output_size; ++i) {
            total_loss += host_loss[i];
        }

        // Compute deltas
        compute_output_delta<<<1, output_size>>>(d_output_output, d_target, d_output_delta, output_size);
        CUDA_CHECK(cudaGetLastError());

        // Walk the hidden layers from last to first; each layer's delta is
        // derived from the deltas and weights of the layer that follows it
        // (the output layer for the last hidden layer).
        float *d_delta_next = d_output_delta;
        for (int l = num_hidden_layers-1; l >= 0; --l) {
            const bool feeds_output = (l == num_hidden_layers-1);
            float *d_next_weights = feeds_output ? d_output_weights : d_weights[l+1];
            const int next_size = feeds_output ? output_size : hidden_sizes[l+1];

            compute_hidden_delta<<<1, hidden_sizes[l]>>>(d_delta_next, d_next_weights, d_hidden_output[l], d_hidden_delta[l], next_size, hidden_sizes[l]);
            CUDA_CHECK(cudaGetLastError());
            d_delta_next = d_hidden_delta[l];
        }

        // Update weights and biases. d_layer_input always points at the
        // activations that fed the layer being updated; after the loop it is
        // the last hidden layer's output, i.e. the output layer's input.
        float *d_layer_input = d_input;
        for (int l = 0; l < num_hidden_layers; ++l) {
            const int fan_in = (l == 0 ? input_size : hidden_sizes[l-1]);
            update_weights_and_biases<<<1, hidden_sizes[l]>>>(d_weights[l], d_biases[l], d_hidden_delta[l], d_layer_input, fan_in, hidden_sizes[l], learning_rate);
            CUDA_CHECK(cudaGetLastError());
            d_layer_input = d_hidden_output[l];
        }

        update_weights_and_biases<<<1, output_size>>>(d_output_weights, d_output_biases, d_output_delta, d_layer_input, last_hidden_size, output_size, learning_rate);
        CUDA_CHECK(cudaGetLastError());

        // epoch starts at 1, so every iteration reports its loss.
        std::cout << "Epoch " << epoch << " Loss: " << total_loss << std::endl;
    }

    // Wait for the final update kernels and surface any late execution error.
    CUDA_CHECK(cudaDeviceSynchronize());

    // Free device memory
    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_target));
    CUDA_CHECK(cudaFree(d_output_weights));
    CUDA_CHECK(cudaFree(d_output_biases));
    CUDA_CHECK(cudaFree(d_output_output));
    CUDA_CHECK(cudaFree(d_output_delta));
    CUDA_CHECK(cudaFree(d_loss));
    for (int l = 0; l < num_hidden_layers; ++l) {
        CUDA_CHECK(cudaFree(d_weights[l]));
        CUDA_CHECK(cudaFree(d_biases[l]));
        CUDA_CHECK(cudaFree(d_hidden_output[l]));
        CUDA_CHECK(cudaFree(d_hidden_delta[l]));
    }

    return 0;
}


Overwriting back_prop_2.cu


In [None]:
!nvcc back_prop_2.cu -o back_prop_2
!./back_prop_2

Epoch 1 Loss: 0.100841
Epoch 2 Loss: 0.100399
Epoch 3 Loss: 0.0999591
Epoch 4 Loss: 0.0995231
Epoch 5 Loss: 0.0990903
Epoch 6 Loss: 0.0986608
Epoch 7 Loss: 0.0982345
Epoch 8 Loss: 0.0978115
Epoch 9 Loss: 0.0973917
Epoch 10 Loss: 0.096975
Epoch 11 Loss: 0.0965613
Epoch 12 Loss: 0.0961508
Epoch 13 Loss: 0.0957433
Epoch 14 Loss: 0.0953388
Epoch 15 Loss: 0.0949373
Epoch 16 Loss: 0.0945388
Epoch 17 Loss: 0.0941432
Epoch 18 Loss: 0.0937505
Epoch 19 Loss: 0.0933607
Epoch 20 Loss: 0.0929737
Epoch 21 Loss: 0.0925895
Epoch 22 Loss: 0.0922081
Epoch 23 Loss: 0.0918295
Epoch 24 Loss: 0.0914536
Epoch 25 Loss: 0.0910803
Epoch 26 Loss: 0.0907098
Epoch 27 Loss: 0.0903419
Epoch 28 Loss: 0.0899767
Epoch 29 Loss: 0.089614
Epoch 30 Loss: 0.0892539
Epoch 31 Loss: 0.0888963
Epoch 32 Loss: 0.0885413
Epoch 33 Loss: 0.0881888
Epoch 34 Loss: 0.0878387
Epoch 35 Loss: 0.0874911
Epoch 36 Loss: 0.0871459
Epoch 37 Loss: 0.0868031
Epoch 38 Loss: 0.0864627
Epoch 39 Loss: 0.0861247
Epoch 40 Loss: 0.085789
Epoch 41 Loss: