In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np 
import sklearn.datasets 
import sklearn.linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
import math
import random
from PIL import Image
import PIL

In [2]:
def sigmoid(x):
    """
    Element-wise logistic function 1 / (1 + exp(-x)).

    Arguments:
    x -- scalar or numpy array

    Returns:
    the sigmoid of x, with the same shape as x
    """
    denominator = 1 + np.exp(-x)
    return 1/denominator

def relu(x):
    """
    Rectified linear unit: element-wise max(0, x).

    Arguments:
    x -- scalar or numpy array

    Returns:
    x with every negative entry replaced by 0
    """
    return np.maximum(0, x)

def softmax(a4):
    """
    Column-wise softmax of the output-layer scores.

    Arguments:
    a4 -- numpy array of scores; normalisation runs along axis 0, so each
          column is turned into a probability distribution

    Returns:
    y_out -- array of the same shape as a4 whose columns sum to 1

    Side effects:
    Prints the input shape, the input, the output and the arg-max index of
    every column (debug output kept from the original notebook).
    """
    print(a4.shape)
    print("z4 = {}".format(a4))
    # Subtracting the per-column maximum does not change the softmax value,
    # but it keeps np.exp from overflowing to inf (and the result from
    # becoming nan) when the scores are large.
    shifted = a4 - np.max(a4, axis=0, keepdims=True)
    y_out = np.exp(shifted)
    y_out /= np.sum(y_out, axis=0, keepdims=True)
    print("y_out = {}".format(y_out))
    print("maximum_index : {}".format(np.argmax(y_out,axis=0)))
    return y_out

def softmax_cost(y_hat,y_out):
    """
    Cross-entropy between predicted probabilities and target labels.

    Arguments:
    y_hat -- predicted probabilities (a small constant is added before the log)
    y_out -- target weights multiplied element-wise with the log-probabilities;
             the training loop passes the one-hot labels here

    Returns:
    the negated SUM over all entries (not averaged over the examples)
    """
    tiny = 1e-8  # keeps log() finite when a predicted probability is exactly 0
    weighted_log_probs = y_out * np.log(y_hat + tiny)
    return -np.sum(weighted_log_probs)

In [3]:
def initialize_parameters(layer_dims):
    """
    Create He-initialised weights and zero biases for a fully connected network.

    Arguments:
    layer_dims -- list of layer sizes, input layer first

    Returns:
    parameters -- dict with, for every l in 1..len(layer_dims)-1:
                  "W<l>" of shape (layer_dims[l], layer_dims[l-1]), drawn from
                  a standard normal and scaled by sqrt(2 / layer_dims[l-1])
                  "b<l>" of shape (layer_dims[l], 1), all zeros

    Side effects:
    Re-seeds NumPy's global random generator with 3 (so results are
    reproducible) and prints every weight matrix.
    """
    np.random.seed(3)
    parameters = {}
    L = len(layer_dims) # number of layers in the network

    for l in range(1, L):
        fan_out, fan_in = layer_dims[l], layer_dims[l-1]
        w_key, b_key = 'W' + str(l), 'b' + str(l)

        parameters[w_key] = np.random.randn(fan_out, fan_in) * np.sqrt(2 / fan_in)
        print("w{} = {}".format(l, parameters[w_key]))
        parameters[b_key] = np.zeros((fan_out, 1))

        assert parameters[w_key].shape == (fan_out, fan_in)
        assert parameters[b_key].shape == (fan_out, 1)

    return parameters


def compute_cost(a3, Y):
    """
    Binary cross-entropy cost.

    Arguments:
    a3 -- predicted probabilities, same shape as Y
    Y -- target labels; Y.shape[1] is used as the number of examples

    Returns:
    cost -- sum of the element-wise losses divided by Y.shape[1]
    """
    m = Y.shape[1]

    # Loss contribution where the label is 1, then where the label is 0.
    positive_term = np.multiply(-np.log(a3), Y)
    negative_term = np.multiply(-np.log(1 - a3), 1 - Y)

    return 1./m * np.sum(positive_term + negative_term)

In [4]:
def forward_propagation(X, parameters):
    """
    Forward pass of the fixed 4-layer network:
    LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SOFTMAX.

    Every pre-activation is divided by 10 before its non-linearity is applied.

    Arguments:
    X -- input data, one example per column
    parameters -- dict holding "W1".."W4" and "b1".."b4"

    Returns:
    a4 -- softmax output of the last layer
    cache -- tuple (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3, a4, W4, b4)
             consumed by backward_propagation (z4 itself is not stored)
    """
    W1, b1 = parameters["W1"], parameters["b1"]
    W2, b2 = parameters["W2"], parameters["b2"]
    W3, b3 = parameters["W3"], parameters["b3"]
    W4, b4 = parameters["W4"], parameters["b4"]

    def scaled_affine(weights, activations, bias):
        # Affine map followed by the fixed 1/10 scaling used by every layer.
        return (np.dot(weights, activations) + bias)/10

    # Three ReLU hidden layers.
    z1 = scaled_affine(W1, X, b1)
    a1 = relu(z1)
    z2 = scaled_affine(W2, a1, b2)
    a2 = relu(z2)
    z3 = scaled_affine(W3, a2, b3)
    a3 = relu(z3)

    # Softmax output layer.
    z4 = scaled_affine(W4, a3, b4)
    a4 = softmax(z4)

    cache = (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3, a4, W4, b4)
    return a4, cache

In [5]:
def random_mini_batches(X, Y, classes, mini_batch_size = 64, seed = 0): #non default parameters must be together
    """
    Shuffle (X, Y) column-wise with a shared permutation and cut them into mini-batches.

    Arguments:
    X -- input data, one example per column
    Y -- labels, one example per column; reshaped to (classes, m) after shuffling
    classes -- number of rows Y has after the reshape
    mini_batch_size -- number of examples per mini-batch
    seed -- value used to re-seed NumPy's global random generator

    Returns:
    mini_batches -- list of (mini_batch_X, mini_batch_Y) tuples; the last one
                    is smaller when m is not a multiple of mini_batch_size
    """
    np.random.seed(seed)            # same seed -> same shuffle on every call
    m = X.shape[1]                  # number of examples

    # Shuffle the columns of X and Y with the same permutation.
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape((classes,m))

    # All the full-size mini-batches.
    num_complete_minibatches = math.floor(m/mini_batch_size)
    starts = (k*mini_batch_size for k in range(num_complete_minibatches))
    mini_batches = [
        (shuffled_X[:, begin:begin + mini_batch_size],
         shuffled_Y[:, begin:begin + mini_batch_size])
        for begin in starts
    ]

    # Remaining examples, if any, form one last smaller mini-batch.
    if m % mini_batch_size != 0:
        tail_begin = num_complete_minibatches*mini_batch_size
        mini_batches.append((shuffled_X[:, tail_begin:m], shuffled_Y[:, tail_begin:m]))

    return mini_batches

In [6]:
def backward_propagation(X, Y, cache):
    """
    Backward pass for the fixed 4-layer network (three ReLU layers + softmax output).

    Arguments:
    X -- input data, one example per column (X.shape[1] is the batch size m)
    Y -- target labels, same shape as the network output a4
    cache -- tuple produced by forward_propagation:
             (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3, a4, W4, b4)

    Returns:
    gradients -- dict with "dW<l>", "db<l>", "dz<l>" for l = 1..4 and
                 "da<l>" for l = 1..3

    NOTE(review): forward_propagation divides every pre-activation by 10; the
    gradients computed here do not include that factor -- confirm this is intended.
    """
    m = X.shape[1]
    (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3, a4, W4, b4) = cache

    def relu_mask(activation):
        # 1 where the ReLU was active, 0 elsewhere.
        return np.int64(activation > 0)

    def bias_gradient(dz):
        # Sum the per-example gradients, keeping the column-vector shape.
        return np.sum(dz, axis=1, keepdims = True)

    # Output layer: gradient of the softmax cross-entropy, scaled by 1/m.
    dz4 = 1./m * (a4 - Y)
    dW4 = np.dot(dz4, a3.T)
    db4 = bias_gradient(dz4)

    # Hidden layer 3.
    da3 = np.dot(W4.T, dz4)
    dz3 = np.multiply(da3, relu_mask(a3))
    dW3 = np.dot(dz3, a2.T)
    db3 = bias_gradient(dz3)

    # Hidden layer 2.
    da2 = np.dot(W3.T, dz3)
    dz2 = np.multiply(da2, relu_mask(a2))
    dW2 = np.dot(dz2, a1.T)
    db2 = bias_gradient(dz2)

    # Hidden layer 1.
    da1 = np.dot(W2.T, dz2)
    dz1 = np.multiply(da1, relu_mask(a1))
    dW1 = np.dot(dz1, X.T)
    db1 = bias_gradient(dz1)

    return {
        "dW4": dW4, "dz4": dz4, "db4": db4,
        "dz3": dz3, "dW3": dW3, "db3": db3, "da3": da3,
        "da2": da2, "dz2": dz2, "dW2": dW2, "db2": db2,
        "da1": da1, "dz1": dz1, "dW1": dW1, "db1": db1,
    }


In [7]:
def initialize_adam(parameters) :
    """
    Create the zero-filled moment accumulators used by the Adam optimizer.

    Arguments:
    parameters -- dict holding "W<l>" and "b<l>" for l = 1..L, where
                  L = len(parameters) // 2

    Returns:
    v -- dict with keys "dW<l>", "db<l>": zeros shaped like the matching parameter
    s -- dict with the same keys and shapes as v
    """
    num_layers = len(parameters) // 2
    v, s = {}, {}

    for layer in range(1, num_layers + 1):
        for kind in ("W", "b"):
            shape = parameters[kind + str(layer)].shape
            moment_key = "d" + kind + str(layer)
            v[moment_key] = np.zeros(shape)
            s[moment_key] = np.zeros(shape)

    return v, s

In [8]:
def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate = 0.01,
                                beta1 = 0.9, beta2 = 0.999,  epsilon = 1e-8):
    """
    Perform one Adam update step on every weight matrix and bias vector.

    Arguments:
    parameters -- dict holding "W<l>" and "b<l>" for l = 1..L; the arrays are
                  updated in place
    grads -- dict holding the gradients "dW<l>" and "db<l>"
    v -- dict of first-moment moving averages (keys as in grads), updated in place
    s -- dict of second-moment moving averages (keys as in grads), updated in place
    t -- step counter used for bias correction; must be >= 1, because t = 0
         makes the correction factors 1 - beta**t equal to zero
    learning_rate -- step size
    beta1 -- decay rate of the first-moment average
    beta2 -- decay rate of the second-moment average
    epsilon -- small constant added to the denominator to avoid division by zero

    Returns:
    parameters, v, s -- the same (mutated) dictionaries that were passed in
    """
    L = len(parameters) // 2                 # number of layers in the neural network

    # The bias-correction factors depend only on t, so compute them once.
    first_correction = 1 - beta1**t
    second_correction = 1 - beta2**t

    for l in range(1, L + 1):
        for kind in ("W", "b"):
            param_key = kind + str(l)
            grad_key = "d" + param_key
            grad = grads[grad_key]

            # Exponential moving averages of the gradient and the squared gradient.
            v[grad_key] = beta1*v[grad_key] + (1 - beta1)*grad
            s[grad_key] = beta2*s[grad_key] + (1 - beta2)*(grad**2)

            # Bias-corrected moment estimates.
            v_corrected = v[grad_key]/first_correction
            s_corrected = s[grad_key]/second_correction

            # Adam step: epsilon is added to the square root, not inside it.
            # Placing it inside the root would raise the effective epsilon to
            # sqrt(epsilon) and noticeably damp updates for small gradients.
            parameters[param_key] -= learning_rate*v_corrected/(np.sqrt(s_corrected) + epsilon)

    return parameters, v, s

In [None]:
# Training driver: read the labels, build mini-batches of image ids, load and
# resize the images of each mini-batch, then run one forward/backward/Adam step.

# NOTE(review): absolute local paths kept from the original notebook -- adjust
# them for your machine.
TRAIN_CSV_PATH = r'D:\csv_database\data\train.csv'
TRAIN_IMAGE_DIR = r"D:\csv_database\data\train\\"

df = pd.read_csv(TRAIN_CSV_PATH)
data_x = np.array(df['image_id']).reshape(1,-1)
data_y = np.array(df['category']).reshape(1,-1)
# Densify the encoder's default sparse output instead of passing `sparse=False`:
# that keyword was renamed to `sparse_output` in scikit-learn 1.2 and later
# removed, whereas `.toarray()` works on every version.
Y_out = OneHotEncoder().fit_transform(data_y.T).toarray()
Y_out = Y_out.T
# Both calls use the same default seed, so the one-hot batches and the raw-label
# batches are shuffled with the same permutation and stay aligned.
mini_batches = random_mini_batches(data_x,Y_out,classes = 102)
mini_batches_val = random_mini_batches(data_x,data_y,classes = 1)

#initialization portion
image_pixel = {}
parameters = {}
gradients = {}
v = {}
s = {}
t = 0
basewidth = 64

# 12288 = basewidth * basewidth * 3 (a 64x64 RGB image flattened to a column).
layer_dims = [12288,3000,1000,150,102]
parameters = initialize_parameters(layer_dims)
v,s = initialize_adam(parameters)
l = 0

for minibatch in mini_batches:
    (minibatch_x,minibatch_y) = minibatch

    # Load every image of the mini-batch as one flattened column.
    image_columns = []
    for i in minibatch_x[0]:
        path = TRAIN_IMAGE_DIR + str(i)
        path += ".jpg"
        # The context manager closes the file handle once the pixels are read.
        with Image.open(path) as img:
            # Force 3 channels and a fixed basewidth x basewidth size so every
            # column has exactly layer_dims[0] entries; an aspect-ratio
            # preserving resize only yields that for square RGB images.
            # Image.LANCZOS is the filter formerly exposed as Image.ANTIALIAS
            # (that alias was removed in Pillow 10).
            resized = img.convert("RGB").resize((basewidth, basewidth), Image.LANCZOS)
            image_pixel['p' + str(i)] = (np.array(resized)/255).reshape(-1,1)
        image_columns.append(image_pixel['p' + str(i)])

    # Stack once per mini-batch (instead of growing an array inside the loop),
    # then transpose so that every row is one image.
    image_x = np.hstack(image_columns).T
    print("image_x = {}".format(image_x))
    image_y = minibatch_y

    (xtemp,Y_real) = mini_batches_val[l]
    l += 1

    # The same random_state keeps both splits aligned row for row.
    x_train,x_test,y_train,y_test = train_test_split(image_x,image_y.T,test_size=0.2,random_state=42)
    x_tr,x_te,Y_real_train,Y_real_test = train_test_split(image_x,Y_real.T,test_size=0.2,random_state=42)

    # The network expects one example per column.
    x_input = x_train.T
    y_output = y_train.T

    # Forward pass, cost report, backward pass and one Adam step.
    y_hat,cache = forward_propagation(x_input,parameters)
    print("cost at minibatch {} is : {} ".format(l,softmax_cost(y_hat,y_output)))
    gradients = backward_propagation(x_input,y_output,cache)
    t += 1
    parameters,v,s = update_parameters_with_adam(parameters,gradients,v,s,t)

    print("w1 : {}".format(parameters['W1']))
    print("w2 : {}".format(parameters['W2']))
    print("w3 : {}".format(parameters['W3']))
    print("w4 : {}".format(parameters['W4']))
    