<a href="https://colab.research.google.com/github/lro99/MLP_in_NP/blob/main/mlp_in_np.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the loan dataset. The path is relative, so loan_data.csv must be in the
# notebook's working directory.
df = pd.read_csv("loan_data.csv")

In [None]:
# Preview the first rows to check the column names and value formats.
df.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [None]:
#initialize parameters

def initialize_parameters(layer_dims, seed=44, activations=None):
    """
    Create the initial weights and biases for a fully connected network.

    Parameters:
    - layer_dims: Sequence of ints; layer_dims[0] is the input size and each
      following entry is the number of units in that layer.
    - seed: Value passed to np.random.seed before any weights are drawn.
      Defaults to 44, the value that was previously hard-coded.
    - activations: Optional sequence with one activation name per weight layer
      (len(layer_dims) - 1 entries). Layers named "relu" or "leaky_relu" use He
      scaling, sqrt(2 / fan_in); every other layer uses sqrt(1 / fan_in).
      When None (the default) every layer uses sqrt(1 / fan_in).

    Returns:
    - parameters: Dict with keys 'W1', 'b1', ..., where W{l} has shape
      (layer_dims[l], layer_dims[l - 1]) and b{l} is a zero column vector of
      shape (layer_dims[l], 1).

    Raises:
    - ValueError: If activations is given but its length is not
      len(layer_dims) - 1.

    Note: the previous version tried to detect ReLU layers by searching for
    "relu" inside str(layer_dims[l - 1]). Because layer_dims holds integers
    that test was never true, so sqrt(1 / fan_in) was always used. The default
    path keeps exactly that behaviour; He scaling is now opt-in via activations.
    """
    L = len(layer_dims)  # Number of layers in the network, input layer included

    if activations is not None and len(activations) != L - 1:
        raise ValueError(
            f"activations must have {L - 1} entries (one per weight layer), "
            f"got {len(activations)}"
        )

    # Seeds NumPy's global RNG so repeated calls return the same weights.
    np.random.seed(seed)

    parameters = {}

    for l in range(1, L):
        fan_in = layer_dims[l - 1]
        fan_out = layer_dims[l]

        activation = activations[l - 1] if activations is not None else None
        # He initialization doubles the variance to compensate for ReLU zeroing
        # roughly half of the pre-activations.
        gain = 2 if activation in ("relu", "leaky_relu") else 1

        parameters[f'W{l}'] = np.random.randn(fan_out, fan_in) * np.sqrt(gain / fan_in)

        # Initialize biases to zeros
        parameters[f'b{l}'] = np.zeros((fan_out, 1))

    return parameters


In [None]:

def fully_connected_layer(x, W, b):
    """
    Apply the affine map of a dense layer: W·x + b.

    Parameters:
    - x: Layer input, shape (input_size, 1).
    - W: Weights, shape (output_size, input_size).
    - b: Bias, shape (output_size, 1).

    Returns:
    - The pre-activation output, shape (output_size, 1).
    """
    return np.dot(W, x) + b


In [None]:
#relu function

def relu(z):
    """
    Rectified linear unit: element-wise max(0, z).

    Parameters:
    - z: Scalar or array of pre-activation values.

    Returns:
    - Same shape as z, with negative entries replaced by 0.
    """
    return np.maximum(0, z)


In [None]:
#sigmoid function

def sigmoid(z):
    """
    Compute the sigmoid activation 1 / (1 + exp(-z)) in a numerically stable way.

    The direct formula evaluates exp(-z), which overflows (with a
    RuntimeWarning) once z is a large negative number. Here the exponential is
    only ever taken of a non-positive value, so it stays within [0, 1]:

        z >= 0:  1 / (1 + exp(-z))
        z <  0:  exp(z) / (1 + exp(z))

    Parameters:
    - z: Input value or array. Non-floating input (e.g. ints) is converted to
      float64; floating dtypes are kept as they are.

    Returns:
    - A: Sigmoid activation of the input. Array input gives a NumPy array of
      the same shape; scalar input gives a NumPy scalar.
    """
    z = np.asarray(z)
    if not np.issubdtype(z.dtype, np.floating):
        z = z.astype(np.float64)

    # exp of a non-positive number cannot overflow.
    e = np.exp(-np.abs(z))
    A = np.where(z >= 0, 1 / (1 + e), e / (1 + e))

    # np.where returns a 0-d array for scalar input; unwrap it to a scalar.
    return A[()] if A.ndim == 0 else A


In [None]:
#forward pass through the network

def forward_pass(X, parameters):
    """
    Run the input through every layer of the network and return the output.

    Hidden layers use ReLU; the final layer uses a sigmoid.

    Parameters:
    - X: Input data of shape (input_size, m), where m is the number of examples.
    - parameters: Dictionary of weights and biases keyed 'W1', 'b1', 'W2', ...;
      the number of layers is taken to be len(parameters) // 2.

    Returns:
    - AL: Sigmoid activations of the last layer (final predictions).
    - caches: One tuple (A, W, b, Z) per hidden layer, where Z is the layer's
      pre-activation and A is its ReLU output (not the layer's input). The
      output layer does not add an entry.
    """
    num_layers = len(parameters) // 2  # one W and one b entry per layer

    caches = []
    activation = X  # the data is the "activation" fed to the first layer

    # Hidden layers: affine transform followed by ReLU.
    for idx in range(1, num_layers):
        weights = parameters[f'W{idx}']
        bias = parameters[f'b{idx}']

        pre_activation = np.dot(weights, activation) + bias
        activation = relu(pre_activation)  # becomes the input of the next layer

        caches.append((activation, weights, bias, pre_activation))

    # Output layer: affine transform followed by sigmoid.
    out_weights = parameters[f'W{num_layers}']
    out_bias = parameters[f'b{num_layers}']
    logits = np.dot(out_weights, activation) + out_bias

    return sigmoid(logits), caches


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# One-hot encode the categorical `purpose` column. The guard keeps this cell
# idempotent: after the first run `purpose` no longer exists in df, and calling
# get_dummies on it again would raise a KeyError.
if 'purpose' in df.columns:
    df = pd.get_dummies(df, columns = ['purpose'], drop_first=True)

# Target is `credit.policy`; every remaining column is used as a feature.
X = df.drop('credit.policy', axis = 1)
y = df['credit.policy']


In [None]:
# Target is `credit.policy`; every other column of df is a feature.
y = df['credit.policy']
X = df.drop(columns='credit.policy')

In [None]:
# Hold out 30% of the rows, then cut that hold-out in half: 70% train / 15% val / 15% test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=4
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Fit the scaler on the training rows only, then apply the same statistics to
# all three splits so no validation/test information leaks into the scaling.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

print(X_train.shape, X_val.shape, X_test.shape)

(6704, 18) (1437, 18) (1437, 18)


In [None]:
# Number of input features = number of columns in the scaled training matrix.
input_size = X_train.shape[1]

# One entry per layer, starting with the input layer.
# NOTE(review): the last layer has 8 sigmoid units, while the target selected
# earlier (`credit.policy`) is a single column — confirm this width is intended.
layer_dimensions = [input_size, 64, 32, 16, 8]

params = initialize_parameters(layer_dimensions)

# forward_pass expects X as (input_size, m), so the (m, input_size) training
# matrix is transposed.
AL_output, caches = forward_pass(X_train.T, params)

print(AL_output)

[[0.58651898 0.54976912 0.63920951 ... 0.55451369 0.78606606 0.61400603]
 [0.47731794 0.48858321 0.36726403 ... 0.49111386 0.20781631 0.42460883]
 [0.57397437 0.55644151 0.57882842 ... 0.55552963 0.5224471  0.55677244]
 ...
 [0.51777351 0.48866856 0.48790526 ... 0.48525794 0.59523887 0.48986283]
 [0.46660456 0.48037891 0.45131107 ... 0.48514048 0.55891116 0.48883107]
 [0.41129767 0.5387897  0.34807983 ... 0.45152648 0.5153603  0.4341979 ]]


Initializing the weights with a random seed of 42 produces different outputs than initializing them with a seed of 44. The two code boxes below show the comparison. Comparing the respective outputs shows the two models making different decisions for the same data point. For example, the first prediction of the first model is 0.449, while the first prediction of the second model is 0.575, indicating that (with a decision threshold of 0.5) the same data point would be assigned to different classes depending on which weight initialization is used.

In [None]:
# output using 42 seed for weights, 5 total layers:
# [[0.44947922 0.4379565  0.41473257 ... 0.43933837 0.42470559 0.42826027]
#  [0.49938151 0.53642777 0.59573421 ... 0.54253857 0.63319478 0.59870072]
#  [0.51068375 0.54076568 0.53287099 ... 0.50973727 0.54857595 0.59634047]
#  ...
#  [0.51560929 0.52952328 0.52186622 ... 0.45314267 0.48232319 0.54466107]
#  [0.50485357 0.51393171 0.5509693  ... 0.51478935 0.56387069 0.57108607]
#  [0.51425811 0.45130172 0.44005476 ... 0.51321277 0.53805588 0.52757862]]

In [None]:
# output using 44 seed for weights, 5 total layers:
# [[0.57514627 0.45362831 0.56301453 ... 0.5121625  0.49354015 0.57943614]
#  [0.5462321  0.55284591 0.59226709 ... 0.5149443  0.50777261 0.57796563]
#  [0.55823962 0.60953549 0.54417225 ... 0.54415799 0.53494227 0.46879427]
#  ...
#  [0.46064935 0.5368371  0.56037433 ... 0.50482341 0.4589306  0.52634649]
#  [0.45686018 0.43291895 0.35892164 ... 0.39889012 0.48998121 0.42637816]
#  [0.50426535 0.49667608 0.54044721 ... 0.53639567 0.44636753 0.40521653]]

The architecture of the model is changed from 7 total layers to 6 total layers in the two code boxes below. In the second box, the layer of 128 nodes is eliminated. The output of the model changes, but the classifications tend to be more consistent than they are between two models with different initial weights. For example, in nodes 6-8 of the output layer, most of the classifications stay the same from one model to the other.

In [None]:
# output using 44 seed for weights, 7 layers (input_size, 256, 128, 64, 32, 16, 8)
# [[0.48948346 0.48886314 0.51134798 ... 0.49744945 0.58357369 0.5439165 ]
#  [0.51626725 0.52154744 0.5385     ... 0.50638072 0.59303105 0.55473561]
#  [0.50657948 0.5239321  0.51558508 ... 0.48851091 0.50134086 0.52053463]
#  ...
#  [0.490998   0.48780191 0.49472759 ... 0.49341087 0.52701609 0.48173752]
#  [0.52307271 0.54792756 0.49586833 ... 0.52665909 0.54488522 0.50139908]
#  [0.50906957 0.484195   0.5036739  ... 0.52704451 0.51114608 0.47510923]]

In [None]:
# output using 44 seed for weights, 6 layers (input_size, 256, 64, 32, 16, 8)
# [[0.57014613 0.52382215 0.52221409 ... 0.53486156 0.56770563 0.62095855]
#  [0.48783343 0.48924647 0.45699233 ... 0.45640418 0.39047119 0.43932993]
#  [0.5330569  0.52809136 0.57969741 ... 0.57043987 0.61188944 0.65004017]
#  ...
#  [0.3303512  0.43319965 0.38166686 ... 0.37276715 0.29133827 0.39503282]
#  [0.51411731 0.56954048 0.5742481  ... 0.56137118 0.58761634 0.5961994 ]
#  [0.47622982 0.53163762 0.45910918 ... 0.4818462  0.49108354 0.45222534]]

For the code below, the only difference from the two models above is the removal of two layers. Even with fewer layers, the classifications generally stay consistent across the models.

In [None]:
# output using 44 seed for weights, 5 total layers (input_size, 64, 32, 16, 8):
# [[0.58651898 0.54976912 0.63920951 ... 0.55451369 0.78606606 0.61400603]
#  [0.47731794 0.48858321 0.36726403 ... 0.49111386 0.20781631 0.42460883]
#  [0.57397437 0.55644151 0.57882842 ... 0.55552963 0.5224471  0.55677244]
#  ...
#  [0.51777351 0.48866856 0.48790526 ... 0.48525794 0.59523887 0.48986283]
#  [0.46660456 0.48037891 0.45131107 ... 0.48514048 0.55891116 0.48883107]
#  [0.41129767 0.5387897  0.34807983 ... 0.45152648 0.5153603  0.4341979 ]]


In [None]:
# Inspect the first layer's weight matrix; initialize_parameters creates it
# with shape (layer_dimensions[1], input_size).

params['W1']


array([[-0.17692159,  0.3102684 ,  0.29371802, ...,  0.08505459,
         0.24998823, -0.02761303],
       [ 0.19461107, -0.28515579, -0.28060439, ...,  0.19450678,
         0.10045713,  0.0461128 ],
       [-0.1211774 , -0.7244754 , -0.09313989, ...,  0.34066933,
         0.01150776,  0.02129087],
       ...,
       [ 0.14806878, -0.07492891, -0.14535469, ..., -0.1178558 ,
         0.01897576,  0.27236301],
       [ 0.21098298, -0.03191905,  0.00716181, ...,  0.0200216 ,
         0.00825146, -0.14967674],
       [ 0.14951427,  0.47289721, -0.22449289, ...,  0.39311828,
         0.37948183,  0.06418796]])

In [None]:
# weights using 42 seed
# [ 0.11707665, -0.03258921,  0.15266165, ..., -0.13253244,
#         -0.23872658,  0.07406881],
#        [-0.21402333, -0.33288317,  0.34545673, ..., -0.24930486,
#          0.1938757 , -0.28775561],
#        [ 0.04922962, -0.46189868, -0.31305645, ..., -0.09076476,
#         -0.15955205,  0.14417348],
#        ...,
#        [-0.03006362,  0.35618263, -0.34204539, ...,  0.28107672,
#         -0.41921711,  0.07534266],
#        [-0.11883633, -0.01921508,  0.08194813, ..., -0.23126293,
#         -0.10946113,  0.1089088 ],
#        [ 0.18464691, -0.05928828, -0.1408345 , ..., -0.23524515,
#         -0.0905029 ,  0.05897275]]

In [None]:
# Weights using 44 seed
# [[-0.17692159,  0.3102684 ,  0.29371802, ...,  0.08505459,
#          0.24998823, -0.02761303],
#        [ 0.19461107, -0.28515579, -0.28060439, ...,  0.19450678,
#          0.10045713,  0.0461128 ],
#        [-0.1211774 , -0.7244754 , -0.09313989, ...,  0.34066933,
#          0.01150776,  0.02129087],
#        ...,
#        [-0.17510662, -0.1266536 ,  0.12091032, ..., -0.45464054,
#          0.03071322,  0.10051081],
#        [-0.18665291,  0.61820224, -0.3080887 , ..., -0.14765817,
#         -0.20549358,  0.03001243],
#        [ 0.1770988 , -0.06944502, -0.21267285, ...,  0.42746559,
#          0.22332741, -0.08536218]]