# 0. NeuralNetwork class

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
def relu(x):
    return np.maximum(x,0)

def relu_derivative(x):
    y = x > 0
    return y.astype(int)

def sigmoid(x):
    return 1/(1+np.exp(x))

def sigmoid_derivative(x):
    return sigmoid(x) * (1-sigmoid(x))

def tanh(x):
    return (np.exp(x)-np.exp(-1*x))/(np.exp(x)+np.exp(-1*x))

def tanh_derivative(x):
    return 1 - (tanh(x)**2)

In [3]:
# normalize each column
df = pd.read_csv('data.csv',index_col=0)
for col in df:
    df[col] = (df[col] - df[col].min())/(df[col].max() - df[col].min())

In [4]:
data = df[['num_room', 'office_count_5000', 'culture_objects_top_25_raion',
           'ekder_all', 'floor', 'bandwidth_sports', 'park_km',
           'big_church_count_5000', 'indust_part', 'healthcare_centers_raion',
           'mosque_count_5000', 'ecology', 'big_road1_km', 'fitness_km',
           'big_market_raion', 'product_type', 'railroad_terminal_raion',
           'university_top_20_raion', 'nuclear_reactor_raion', 'cpi',
           'green_zone_part', 'work_all', 'children_school', 'radiation_raion']]
y = df[['price_doc']]

In [11]:
class NeuralNetwork:
    
    """
    A class representing a fully-connected, feed-forward neural network.
    
    Params:
        nlayers: the total number of hidden layers in the network
        nnodes: an array containing the number of nodes in each layer
        activations: an array containing the names of activation functions to use in each layer
            (note that activations[0] and activations[nlayers+2] will not be used)
            
    Notes:
        All hidden layers and the input layer also include an intercept node, not counted
        in nnodes. Intercept nodes are not connected to any nodes in the previous layer
        and have a value of 1; they are connected to all nodes in the next layer.
    """
    
    def __init__(self, nlayers, nnodes, activations):
        assert nlayers == len(nnodes) - 2
        assert nlayers == len(activations) - 2
        self.nlayers = nlayers
        self.nnodes = nnodes
        self.activations = activations
        self.weights = self.initialize_weights()
        self.z = []
        self.h = []
        
    def initialize_weights(self):
        """ 
        Randomly initialize all weights to numbers in [-0.25,0.25].
        
        Returns:
            An array of length nlayers+1 where element i is an array of length 
            (nnodes[i]+1)*nnodes[i+1] containing random weights for each edge between
            layers i and i+1, including weights corresponding to the intercept node.
        """
        weights = []
        for i in range(self.nlayers+1):
            weights_i = []
            for j in range((self.nnodes[i]+1)*self.nnodes[i+1]):
                weights_i.append(np.random.uniform(-0.25,0.25))
            weights.append(weights_i)
        return weights
    
    def chunker(self, seq, size):
        return (seq[pos:pos + size] for pos in range(0, len(seq), size))
    
    def forward_prop(self, input_data):
        """ 
        Propagates input matrix x through the neural network, saving intermediate values
        of z and h. Adds intercept node to each layer.
        
        Params:
            inputs: an array of arrays of input values
        
        Returns:
            an array of output values
        """
        assert len(input_data[0]) == self.nnodes[0]
        self.z=[]
        inputs=[0]
        z=[]
        inputs[0] = np.repeat(1,len(input_data))
        for i in range(len(input_data[0])):
            add = np.array([j[i] for j in input_data])
            inputs.append(add)
            z.append(add)
        self.z.append(z)
        self.h = self.z.copy()
        for i in range(self.nlayers+1):
            new_nodes = [0] * self.nnodes[i+1]
            p = 0
            #go through the weights for each input node one group at a time 
            for w in self.chunker(self.weights[i], len(new_nodes)):
                for j in range(len(w)):
                    # add the value corresponding to the jth node in the next layer
                    new_nodes[j]= new_nodes[j]+ w[j]*inputs[p]
                p += 1
            self.z.append(new_nodes)
            self.h.append([self.activations[i+1](j) for j in new_nodes])
            inputs = self.h[i+1].copy()
            inputs.insert(0,np.repeat(1,len(inputs[0])))
        return self.z[self.nlayers+1]
    
    def back_prop(self, y_pred, y, rate, derivs):
        """ 
        Performs one iteration of backpropagation.
        
        Params:
            y_pred: the predicted y value
            y: the true y value
            rate: the learning rate
            derivs: array of the derivatives of each activation function
                (note again that derivs[0] and derivs[nlayers+2] will not be used))
        
        Returns:
            an array of updated weights
        """
        deltas = y_pred - y[0]
        new_weights = []
        for layer in range(self.nlayers,-1,-1):
            i=0
            new_w = []
            n = len(self.h[layer][0])
            h_vals = self.h[layer].copy()
            h_vals.insert(0,np.repeat(1,n))
            for h in h_vals:
                for d in deltas:
                    #sum weight changes across observations
                    changes = np.sum(h*d)
                    old_w = self.weights[layer][i]
                    new_w.append(old_w - rate*changes)
                    i+=1
            new_weights.insert(0,new_w)
            i=len(self.z[layer+1])
            new_deltas = []
            for z in self.z[layer]:
                new_d=0
                for d in deltas:
                    new_d += derivs[layer](z)*d*self.weights[layer][i]
                    i+=1
                new_deltas.append(new_d)
            deltas = new_deltas
        self.weights = new_weights
                
    def gradient_descent(self, data, y_val, rate, batch_size, derivs, tol):
        """ 
        Performs stochastic gradient descent to train the weights of the neural network.
        
        Params:
            data: the full dataset, not including the target variable
            y_val: the target column
            rate: the learning rate
            batch_size: the number of observations in each batch
            derivs: array of the derivatives of each activation function
                (note again that derivs[0] and derivs[nlayers+2] will not be used))
            tol: tolerance (difference in MSEs to stop at)
        
        Returns:
            an array of weights of optimal neural network
        """
        diff = 100000
        new_MSE = 0
        while diff > tol:
            old_MSE = new_MSE
            new_MSE = 0
            prev = 0
            while prev < len(data):
                nxt = prev+batch_size
                if nxt > len(data):
                    nxt = len(data)
                xs = data.values[prev:nxt]
                ys = y_val.values[prev:nxt]
                prev = nxt
                y_pred = self.forward_prop(xs)
                self.back_prop(y_pred,ys,rate,derivs)
                new_MSE += np.sum((self.forward_prop(xs)[0]-ys[0])**2)
            new_MSE = new_MSE/len(data)
            diff = abs(old_MSE - new_MSE)
            print("MSE = "+str(new_MSE))
        return new_MSE

## 1. Exploration of nnodes

In [69]:
nodes = [30,25,20,15,10,5]
act = [tanh, sigmoid, relu]
der = [tanh_derivative, sigmoid_derivative, relu_derivative]

for node in nodes:
    for (f,d) in zip(act,der):
        nnodes = [24]
        activations = [relu]
        derivs = [relu_derivative]
        nnodes.append(node)
        activations.append(f)
        derivs.append(d)
        nnodes.append(1)
        activations.append(relu)
        derivs.append(relu_derivative)
        print("1 layer | " + str(node) + " nodes | " + str(f))
        nn = NeuralNetwork(nlayers=1, nnodes=nnodes, activations=activations)
        print("\t MSE = " + str(nn.gradient_descent(data,y,0.001,100,derivs,0.000001)))

1 layer | 30 nodes | <function tanh at 0x7f12ea42cf28>
	 MSE = 0.00015857060495233754
1 layer | 30 nodes | <function sigmoid at 0x7f12ea1e4c80>
	 MSE = 0.00014449458373616648
1 layer | 30 nodes | <function relu at 0x7f12ea17cf28>
	 MSE = 0.0012775878677628895
1 layer | 25 nodes | <function tanh at 0x7f12ea42cf28>
	 MSE = 0.0001294302115504122
1 layer | 25 nodes | <function sigmoid at 0x7f12ea1e4c80>
	 MSE = 9.050155408528709e-05
1 layer | 25 nodes | <function relu at 0x7f12ea17cf28>
	 MSE = 0.0014611746033805063
1 layer | 20 nodes | <function tanh at 0x7f12ea42cf28>
	 MSE = 0.00017197283771894212
1 layer | 20 nodes | <function sigmoid at 0x7f12ea1e4c80>
	 MSE = 8.623543771199027e-05
1 layer | 20 nodes | <function relu at 0x7f12ea17cf28>
	 MSE = 0.0014883837122360473
1 layer | 15 nodes | <function tanh at 0x7f12ea42cf28>
	 MSE = 0.000216297387367162
1 layer | 15 nodes | <function sigmoid at 0x7f12ea1e4c80>
	 MSE = 0.00047199819345060325
1 layer | 15 nodes | <function relu at 0x7f12ea17c

## 2. Random & Grid Search

In [6]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
import sys
import random

def random_search(X, y, num_models=50, 
                  layer_options=[1,2,3,4,5], 
                  nnodes_options=[5, 8, 10, 15, 20, 25, 30, 40]):
    X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=0,test_size=0.2)
    results = []
    act_options = [(tanh,tanh_derivative), (sigmoid,sigmoid_derivative), (relu,relu_derivative)]
    best_mse = sys.maxsize
    best_model = None
    for model in range(num_models):
        nlayers = random.choice(layer_options)
        nnodes = [24]
        activations = [relu]
        derivs = [relu_derivative]
        for layer in range(nlayers):
            nnodes.append(random.choice(nnodes_options))
            activ = random.choice(act_options)
            activations.append(activ[0])
            derivs.append(activ[1])
        nnodes.append(1)
        activations.append(relu)
        derivs.append(relu_derivative)
        nn = NeuralNetwork(nlayers=nlayers, nnodes=nnodes, activations=activations)
        nn.gradient_descent(X_train,y_train,0.001,100,derivs,0.000001)
        y_pred = nn.forward_prop(X_test.values)[0]
        mse = mean_squared_error(y_test.values, y_pred)
        r2 = r2_score(y_test.values, y_pred)
        result = formulate_random_result(nlayers, nnodes, activations, derivs, mse, r2)
        results.append(result)
        if mse < best_mse:
            best_model = result
            best_mse = mse
        del nn
    print("selected best model: "+str(best_model))
    return results, generate_model(best_model)

def grid_search(X, y, model, learning_rates=[.00001, .0001, .001], batch_sizes=[10, 30, 50, 100, 1000]):
    X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=1)
    results = []
    best_mse = sys.maxsize
    best_params = None
    nlayers, nnodes, activations = model
    for learning_rate in learning_rates:
        for batch_size in batch_sizes:
            nn = NeuralNetwork(nlayers=nlayers, nnodes=nnodes, activations=activations)
            nn.gradient_descent(X_train,y_train,learning_rate,batch_size,derivs,0.00001)
            y_pred = nn.forward_prop(X_test)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_true, y_pred)
            result = formulate_grid_result(nlayers, nnodes, activations, mse, r2)
            results.append(result)
            if mse < best_mse:
                best_params = result
                best_mse = mse
            del nn
    print("selected best hyperparameters: " + str(best_params))
    return results, generate_params(best_params)

def cross_validate_new(X, y, model, folds, params):
        sp = len(X)/folds
        err = []
        nlayers, nnodes, activations = model
        learning_rate, batch_size = params
        for i in range(folds):
            lower = int(sp * i)
            upper = int(sp * (i+1))
            test_x = X.iloc[lower:upper]
            test_y = y.iloc[lower:upper]
            train_x = pd.concat((X.iloc[:lower], X.iloc[upper:]))
            train_y = pd.concat((y.iloc[:lower], y.iloc[upper:]))    
            nn = NeuralNetwork(nlayers=nlayers, nnodes=nnodes, activations=activations)
            nn.gradient_descent(X_train,y_train,learning_rate,batch_size,derivs,0.0000001)
            y_pred = nn.forward_prop(X_test)
            mse = mean_squared_error(y_test.values, y_pred)
            err.append(mse)
        cur_err = sum(err)/len(err)
        print("average MSE: " + str(cur_err))

def cv_pipeline(X, y):
    print("running random search... ")
    model = random_search(X, y)[1]
    print("running grid search...")
    params = grid_search(X, y, model)[1]
    nlayers, nnodes, activations = model
    learning_rate, batch_size = params
    scores = cross_validate_new(X, y, model, 5, params)
    return scores

def generate_model(model_dict):
    nlayers = model_dict["nlayers"]
    nnodes = model_dict["nnodes"]
    activations = model_dict["activations"]
    return nlayers, nnodes, activations

def generate_params(param_dict):
    learning_rate = param_dict["nlayers"]
    batch_size = param_dict["nnodes"]
    return learning_rate, batch_size


def formulate_random_result(nlayers, nnodes, activations, derivs, mse, r2):
    ret = {}
    model = {}
    model["nlayers"] = nlayers
    model["nnodes"] = nnodes
    model["activations"] = activations
    model["derivs"] = derivs
    ret["model"] = model
    ret["mse"] = mse
    ret["r2"] = r2
    return ret

def formulate_grid_result(learning_rate, batch_size, mse, r2):
    ret = {}
    params = {}
    params["learning_rate"] = learning_rate
    params["batch_size"] = batch_size
    ret["params"] = params
    ret["mse"] = mse
    ret["r2"] = r2
    return ret

In [None]:
model = random_search(data, y)

{'model': {'nlayers': 2, 'nnodes': [24, 15, 40, 1], 'activations': [<function relu at 0x7f321d41f400>, <function tanh at 0x7f321d42a048>, <function relu at 0x7f321d41f400>, <function relu at 0x7f321d41f400>], 'derivs': [<function relu_derivative at 0x7f321d41fe18>, <function tanh_derivative at 0x7f321d42a0d0>, <function relu_derivative at 0x7f321d41fe18>, <function relu_derivative at 0x7f321d41fe18>]}, 'mse': 0.002019139509255252, 'r2': -0.1251376385187679}
{'model': {'nlayers': 3, 'nnodes': [24, 30, 15, 20, 1], 'activations': [<function relu at 0x7f321d41f400>, <function sigmoid at 0x7f321d41fea0>, <function tanh at 0x7f321d42a048>, <function sigmoid at 0x7f321d41fea0>, <function relu at 0x7f321d41f400>], 'derivs': [<function relu_derivative at 0x7f321d41fe18>, <function sigmoid_derivative at 0x7f321d41ff28>, <function tanh_derivative at 0x7f321d42a0d0>, <function sigmoid_derivative at 0x7f321d41ff28>, <function relu_derivative at 0x7f321d41fe18>]}, 'mse': 0.0021092775500712447, 'r2':

## 3. Optimize final model

In [32]:
nn = NeuralNetwork(nlayers=1, nnodes=[24,20,1], activations=[relu,sigmoid,relu])
nn.gradient_descent(data,y,0.001,100,[relu_derivative,sigmoid_derivative,relu_derivative],0.00000001)

MSE = 0.0016641961140239996
MSE = 0.0015084091124151662
MSE = 0.00127252048509998
MSE = 0.0010310213345415958
MSE = 0.0008381873175138492
MSE = 0.0007042955911045414
MSE = 0.0006180388486895496
MSE = 0.000564223313480639
MSE = 0.000530704366050108
MSE = 0.0005093730278224895
MSE = 0.0004952498343780691
MSE = 0.0004854002835832352
MSE = 0.0004781261258760767
MSE = 0.00047244791403138343
MSE = 0.00046779621651777135
MSE = 0.00046383382340152896
MSE = 0.00046035529635739584
MSE = 0.000457230735758699
MSE = 0.0004543743727156162
MSE = 0.000451726957656301
MSE = 0.0004492457792425078
MSE = 0.00044689890185746045
MSE = 0.0004446617458038312
MSE = 0.0004425149819328598
MSE = 0.00044044317688094424
MSE = 0.0004384338784378596
MSE = 0.00043647696844846995
MSE = 0.00043456418571033575
MSE = 0.0004326887623460601
MSE = 0.0004308451397332488
MSE = 0.0004290287427200345
MSE = 0.00042723579809596164
MSE = 0.00042546318757691136
MSE = 0.0004237083282160365
MSE = 0.000421969074880261
MSE = 0.000420243

1.4572546201332004e-05