# based on the work of Samson Zhang 

https://www.youtube.com/watch?v=w8yWXqWQYmU

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv('train.csv')

In [3]:
train = np.array(train)
m,n = train.shape
np.random.shuffle(train)

train_dev = train[0:1000].T
Y_dev = train_dev[0]
X_dev = train_dev[1:n]
X_dev = X_dev / 255.


train_tra = train[1000:m].T
Y_train = train_tra[0]
X_train = train_tra[1:n]
X_train = X_train / 255.
# _,m_train = X_train.shape

In [4]:
def one_hot(Y):
    one_hot_Y = np.zeros((Y.size,Y.max()+1))
    one_hot_Y[np.arange(Y.size),Y]=1
    one_hot_Y=one_hot_Y.T
    
    return one_hot_Y

def softmax(n):
    n -= np.max(n, axis=0)
    a = np.exp(n)/np.sum(np.exp(n),axis=0)

    return a

In [5]:
def ReLU(n):
    return np.maximum(0,n)

def deriv_ReLU(n):
    return n>0

In [6]:
def get_predictions(A):
    
    return np.argmax(A, 0)

def get_accuracy(predictions, Y):
    print(predictions, Y)
    
    return np.sum(predictions == Y) / Y.size

In [7]:
def int_params():
    W1 = np.random.randn(28, 784) * np.sqrt(1. / 784)
    b1 = np.random.randn(28, 1) * np.sqrt(1. / 28)
    W2 = np.random.randn(10, 28) * np.sqrt(1. / 28)
    b2 = np.random.randn(10, 1) * np.sqrt(1. / 10)
    W3 = np.random.randn(10, 10) * np.sqrt(1. / 10)
    b3 = np.random.randn(10, 1) * np.sqrt(1. / 10)
    return W1, b1, W2, b2, W3, b3

def forward_drop(W1, b1, W2, b2, W3, b3, X):
    Z1 = W1 @ X + b1
    A1 = ReLU(Z1)
    Z2 = W2 @ A1 + b2
    A2 = ReLU(Z2)
    Z3 = W3 @ A2 + b3
    A3 = softmax(Z3)
    return Z1, Z2, Z3, A1, A2, A3

def back_prop(Z1, Z2, Z3, A1, A2, A3, Y, X, W1, W2, W3, one_hot_Y):
    m = X.shape[1]
    dZ3 = A3 - one_hot_Y
    dW3 = 1/m * (dZ3 @ A2.T)
    db3 = 1/m * np.sum(dZ3, axis=1, keepdims=True)
    dZ2 = (W3.T @ dZ3) * deriv_ReLU(Z2)
    dW2 = 1/m * (dZ2 @ A1.T)
    db2 = 1/m * np.sum(dZ2, axis=1, keepdims=True)
    dZ1 = (W2.T @ dZ2) * deriv_ReLU(Z1)
    dW1 = 1/m * (dZ1 @ X.T)
    db1 = 1/m * np.sum(dZ1, axis=1, keepdims=True)
    return dW1, dW2, dW3, db1, db2, db3

def update_params(W1, b1, W2, b2, W3, b3, dW1, dW2, dW3, db1, db2, db3, alpha):
    W1 -= alpha * dW1
    b1 -= alpha * db1
    W2 -= alpha * dW2
    b2 -= alpha * db2
    W3 -= alpha * dW3
    b3 -= alpha * db3
    return W1, b1, W2, b2, W3, b3

def gradient_descent(X, Y, iteration, alpha,W1, b1, W2, b2, W3, b3):
    #W1, b1, W2, b2, W3, b3 = int_params() #If run int_params(), the model starts over
    one_hot_Y = one_hot(Y)
    for i in range(iteration):
        Z1, Z2, Z3, A1, A2, A3 = forward_drop(W1, b1, W2, b2, W3, b3, X)
        dW1, dW2, dW3, db1, db2, db3 = back_prop(Z1, Z2, Z3, A1, A2, A3, Y, X, W1, W2, W3, one_hot_Y)
        W1, b1, W2, b2, W3, b3 = update_params(W1, b1, W2, b2, W3, b3, dW1, dW2, dW3, db1, db2, db3, alpha)
        if (i % 50 == 0):
            print('iteration: ', i)
            predictions = get_predictions(A3)
            print('Accuracy: ', get_accuracy(predictions, Y))
        if(i==iteration-1):
            print('iteration: ', i)
            predictions = get_predictions(A3)
            print('Accuracy: ', get_accuracy(predictions, Y))
    return W1, b1, W2, b2, W3, b3




# If run int_params(), the model starts over; otherwise, the model continues.

In [8]:
W1, b1, W2, b2, W3, b3 = int_params()

# It can be run in chain using different activation functions.

In [12]:
# Training the neural network
W1, b1, W2, b2, W3, b3 = gradient_descent(X_train, Y_train, 250, 0.1,W1, b1, W2, b2, W3, b3)

iteration:  0
[2 8 5 ... 1 8 9] [2 8 5 ... 1 8 9]
Accuracy:  0.9499268292682926
iteration:  50
[2 8 5 ... 1 8 9] [2 8 5 ... 1 8 9]
Accuracy:  0.9504878048780487
iteration:  100
[2 8 5 ... 1 8 9] [2 8 5 ... 1 8 9]
Accuracy:  0.9512195121951219
iteration:  150
[2 8 5 ... 1 8 9] [2 8 5 ... 1 8 9]
Accuracy:  0.9521463414634146
iteration:  200
[2 8 5 ... 1 8 9] [2 8 5 ... 1 8 9]
Accuracy:  0.9529024390243902
iteration:  249
[2 8 5 ... 1 8 9] [2 8 5 ... 1 8 9]
Accuracy:  0.9532682926829268


rule  : 0.909  tanh : 0.91 sigmoid : 0.484 leaky_relu : 0.899 swish : 0.903 ((X_train, Y_train, 500, 0.1)

rule  : 0.912  tanh : 0.91 sigmoid : 0.411 leaky_relu : 0.904  swish : 0.909 ((X_train, Y_train, 1000, 0.05)

rule  : 0.902  tanh : 0.905 sigmoid : 0.454 leaky_relu : 0.916  swish : 0.902 ((X_train, Y_train, 250, 0.2)

# Overrun ReLU and deriv_ReLU, and run the model. If run int_params(), the model starts over; otherwise, the model continues.

https://en.wikipedia.org/wiki/Activation_function

In [45]:
def ReLU(n):
    return np.maximum(0,n)
def deriv_ReLU(n):
    return n>0

In [17]:
#tanh
def ReLU(x):
    return np.tanh(x)

def deriv_ReLU(x):
    return 1 - np.tanh(x)**2

In [34]:
# #sigmoid mabe the resolt is por becouse the sigmoid is colcolatid rong in the pross
# def ReLU(x):
#     return 1 / (1 + np.exp(-x))

# def deriv_ReLU(x):
#     return ReLU(x) * (1 - ReLU(x))

In [48]:
#leaky_relu
def ReLU(x):
    return np.where(x > 0, x, x * 0.01)
def deriv_ReLU(x):
    return np.where(x > 0, 1, 0.01)


In [9]:

#swish


def ReLU(x):
    return x * 1 / (1 + np.exp(-x))
def deriv_ReLU(x):
    sigmoid_beta_x = 1 / (1 + np.exp(-x))
    return sigmoid_beta_x * (1 +  (1 - sigmoid_beta_x))


# For Kaggle submission, run the model on the test data.

In [13]:
test = pd.read_csv('test.csv')

In [14]:
def normaliz_values(x):
        return x/255
    
# Apply the function to each cell in the DataFrame, excluding the 'label' column
for column in test:
    test[column] = test[column].apply(normaliz_values)

# Display the updated DataFrame
test

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
def pred_chans(i):
    given_v = test.iloc[i,:].values
    arr = pd.DataFrame(W3@(W2@(W1@given_v+np.array(b1.T)).T+b2)+b3)
    arr.columns = [i]  # Assigning column name
    return arr

# Initialize an empty list to collect arrays
array_list = []

for i in range(len(test)):
    temp = pred_chans(i)  # Assuming for_train(i) returns a numpy array
    array_list.append(temp)

# Concatenate all arrays in the list into a single DataFrame
new_train = pd.concat(array_list, axis=1)

new_train


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27990,27991,27992,27993,27994,27995,27996,27997,27998,27999
0,1.803138,14.431058,-5.3104,1.988539,-4.153629,-2.295583,17.84041,-1.802085,13.413227,-2.303158,...,1.459928,-5.506145,-4.745354,-4.305911,6.688442,-6.236935,1.039714,-6.766006,-4.677705,-0.643619
1,-16.115436,-11.043392,-3.805502,-6.879002,-1.69279,-8.701677,-13.972314,-7.78487,-11.848082,-4.688149,...,-9.571567,-6.50954,1.603609,-7.522761,-9.227119,-10.804157,-4.06508,-3.122021,-9.811288,-14.861325
2,18.976147,2.00515,-3.365058,7.828126,1.389998,0.967574,0.636368,-6.535457,-1.897161,-6.913034,...,4.683927,-7.650922,-0.458447,-0.13472,-1.030614,-1.544964,-0.491045,-6.810785,-9.419354,9.990734
3,3.246354,-1.923742,-2.767403,-11.522607,7.509829,-2.953556,-1.271172,9.252657,3.390275,7.996748,...,6.03134,-5.360043,2.139552,-11.298102,-7.735294,-0.822285,-1.19949,11.708545,-4.583462,-0.513153
4,-7.912637,-15.817865,0.761704,7.382189,-10.73846,-1.446404,-8.288993,-6.389288,-15.303363,-4.29563,...,-11.292971,1.166386,-4.772553,8.06397,-0.168269,0.778091,-2.559379,-10.20158,6.256819,-1.928413
5,-1.34268,12.546525,0.039234,-9.364281,-0.361726,-6.602421,5.919127,1.939595,12.505329,7.189962,...,-2.498538,-0.263299,-0.181563,-11.766272,-4.170902,-6.729033,-3.96102,0.760468,-6.970487,1.4825
6,-4.382111,-2.885661,-5.324941,2.839664,-3.714597,-4.722872,-1.14772,-11.1632,-10.3806,-9.299196,...,-15.677302,9.498467,-4.189883,1.166753,-2.785782,-6.100274,-7.678114,-18.698073,-4.840397,-3.236789
7,-2.685745,-6.65758,-0.084596,10.196513,-10.791056,10.904061,-16.677064,-16.371262,-8.973629,-17.131139,...,13.711971,-23.109129,-0.903364,5.421894,11.525835,4.397718,11.508662,-6.159768,-3.753057,-9.856851
8,8.426595,3.788067,1.697823,7.052996,0.220783,3.668751,2.451224,-3.089633,2.15106,-2.328882,...,-0.050307,2.588222,0.77373,6.517533,4.329309,0.360333,0.298895,0.484318,0.837634,8.233925
9,-6.945891,-5.52208,5.632615,7.640524,-9.624253,8.877607,-6.294705,-4.7985,-1.070133,-8.561049,...,4.559761,-6.389633,0.514194,8.30434,12.480422,8.190631,9.222093,-0.260754,10.754867,-3.346185


In [16]:
def replace_max_within_column(df):
    # Find the column index of the maximum value in each row
    max_indices = df.idxmax(axis=1)
    
    # Create a new DataFrame to store the results
    result_df = pd.DataFrame({'bb': df.index})
    
    # Add the column names corresponding to the maximum values for each row
    result_df['Max_Column'] = df.columns[max_indices.values]
    
    return result_df

In [17]:
result = replace_max_within_column(new_train.T)
result=result.drop(['bb'], axis=1)

In [18]:
result = result.assign(ImageId=range(1, len(result) + 1)).rename(columns={'Max_Column': 'Label'})
result  = result[['ImageId', 'Label']]
result

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,7
4,5,3
...,...,...
27995,27996,9
27996,27997,7
27997,27998,3
27998,27999,9


In [19]:
result.to_csv("nn swish 2250 0.1.csv",index=False)

nn ReLU 1000 0.1 : 0.86 | nn tanh 1000 0.1 : 0.87 | nn swish 1000 0.1 : 0.89 | nn swish 2250 0.1 : 0.81 