In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import re
import csv
import math
import time
import pickle
import os
from tqdm import tqdm
from scipy.stats import loguniform
from scipy.special import xlogy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from mpl_toolkits.axes_grid1.inset_locator import mark_inset
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
import matplotlib.gridspec as gridspec

In [None]:
# Plot settings: draw only the left and bottom axis spines on every figure.
mpl.rcParams.update({
    'axes.spines.left': True,
    'axes.spines.right': False,
    'axes.spines.top': False,
    'axes.spines.bottom': True,
})

In [None]:
def new_test(X, grad_check=False):
    """Reseed NumPy's global RNG and build a fresh parameter dictionary.

    Parameters
    ----------
    X : object with a ``shape`` attribute
        Only ``X.shape[1]`` is read; it becomes the size of the input layer.
    grad_check : bool, default False
        When equal to ``True`` the layout is ``[n_features, 2, 1]``
        (presumably to keep numerical gradient checking cheap); otherwise
        it is ``[n_features, 500, 500, 1]``.

    Returns
    -------
    params : dict
        Result of ``parameter_init(layers, zero=False, he=True)``.
    layers : list of int
        The layer sizes that were used.
    """
    # Fixed seed so repeated calls return the same parameters.
    np.random.seed(42)

    n_features = X.shape[1]
    hidden_sizes = [2] if grad_check == True else [500, 500]
    layers = [n_features, *hidden_sizes, 1]

    return parameter_init(layers, zero=False, he=True), layers

In [None]:
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, applied element-wise."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator
def relu(z):
    """Rectified linear unit: element-wise ``max(0, z)``."""
    lower_bound = 0
    return np.maximum(lower_bound, z)
def softmax(z):
    """Softmax over the last axis of ``z``.

    Each row of a 2-D ``(samples, classes)`` input is normalised on its own,
    so every row sums to 1. A 1-D input is normalised over all of its
    elements. The per-row maximum is subtracted before exponentiating,
    which leaves the result unchanged mathematically but avoids overflow
    (and the resulting NaNs) for large logits.

    Parameters
    ----------
    z : array-like
        Logits.

    Returns
    -------
    numpy.ndarray
        Probabilities with the same shape as ``z``.
    """
    z = np.asarray(z, dtype=float)
    # A 0-d input has no axis to reduce over; reduce over everything instead.
    axis = -1 if z.ndim else None

    shifted = z - np.max(z, axis=axis, keepdims=True)
    expo = np.exp(shifted)
    return expo / np.sum(expo, axis=axis, keepdims=True)
def inv_relu(z):
    """Derivative of ReLU with respect to its input: 1 where ``z > 0``, else 0."""
    is_positive = z > 0
    return is_positive * 1
def inv_sigmoid(z):
    """Derivative of the sigmoid with respect to ``z``: ``s(z) * (1 - s(z))``."""
    activation = sigmoid(z)
    return np.multiply(activation, 1 - activation)

In [None]:
def encoder(y, binary=True):
    """One-hot encode a single label column with ``OneHotEncoder``.

    Parameters
    ----------
    y : object with a ``shape`` attribute
        Labels; reshaped to one column of ``y.shape[0]`` rows before encoding.
    binary : bool, default True
        When equal to ``True`` the encoder is built with ``drop='first'``,
        so the first category gets no indicator column.

    Returns
    -------
    numpy.ndarray
        Dense array produced by ``fit_transform(...).toarray()``.
    """
    extra_options = {'drop': 'first'} if binary == True else {}
    one_hot = OneHotEncoder(handle_unknown='error', **extra_options)

    label_column = np.array(y).reshape(y.shape[0], 1)

    return one_hot.fit_transform(label_column).toarray()

In [None]:
def parameter_init(layers, zero=False, he=False):
    """Create weight and bias arrays for every pair of consecutive layers.

    Parameters
    ----------
    layers : sequence of int
        Layer sizes, input layer first.
    zero : bool, default False
        When equal to ``True`` every weight and bias is zero and ``he`` is
        ignored.
    he : bool, default False
        When equal to ``True`` the random weights are additionally multiplied
        by ``sqrt(2 / fan_in)``. Note this is applied on top of the ``0.01``
        factor, not instead of it.

    Returns
    -------
    dict
        Keys ``'w1', 'b1', 'w2', 'b2', ...``; ``w{i}`` has shape
        ``(layers[i-1], layers[i])`` and ``b{i}`` has shape ``(1, layers[i])``.
    """
    params = {}

    for idx, (fan_in, fan_out) in enumerate(zip(layers[:-1], layers[1:]), start=1):
        w_key, b_key = f'w{idx}', f'b{idx}'

        if zero == True:
            params[w_key] = np.zeros((fan_in, fan_out))
            params[b_key] = np.zeros((1, fan_out))
            continue

        # Weights are drawn before biases so seeded runs keep their values.
        weights = np.random.randn(fan_in, fan_out) *0.01
        biases = np.random.randn(1, fan_out)

        if he == True:
            weights *= np.sqrt(2/fan_in)

        params[w_key] = weights
        params[b_key] = biases

    return params

In [None]:
def dict_to_vec(dictionary):
    """Flatten a parameter or gradient dictionary into one column vector.

    Keys starting with ``w``/``dw`` are treated as weights and keys starting
    with ``b``/``db`` as biases. Entries are emitted in ascending layer order
    (taken from the trailing digits of each key) as ``w1, b1, w2, b2, ...``,
    regardless of the dictionary's insertion order. ``vec_to_dict`` assumes
    exactly this layout, so an out-of-order gradient dictionary can no longer
    misalign the two. Each array is flattened column-major (``'F'``).

    Parameters
    ----------
    dictionary : dict of str -> numpy.ndarray

    Returns
    -------
    dict_vector : numpy.ndarray, shape (n_values, 1)
    shapes : list of (w_shape, b_shape) tuples, one per layer.

    Raises
    ------
    ValueError
        If the number of weight entries differs from the number of bias
        entries (previously the surplus was silently dropped or raised an
        ``IndexError``).
    """
    def layer_index(key):
        # Keys without a numeric suffix sort last, keeping their given order.
        suffix = re.search(r'\d+$', key)
        return int(suffix.group()) if suffix else float('inf')

    w_keys = sorted((k for k in dictionary if re.match(r'd{0,1}w', k)), key=layer_index)
    b_keys = sorted((k for k in dictionary if re.match(r'd{0,1}b', k)), key=layer_index)

    if len(w_keys) != len(b_keys):
        raise ValueError(
            f'expected one bias per weight, got {len(w_keys)} weight and '
            f'{len(b_keys)} bias entries')

    # The leading empty float column keeps the result shape (0, 1) for an
    # empty dictionary and lets everything be concatenated in a single call.
    pieces = [np.empty([0, 1])]
    shapes = []

    for w_key, b_key in zip(w_keys, b_keys):
        w = dictionary[w_key]
        b = dictionary[b_key]

        pieces.append(w.flatten('F').reshape(-1, 1))
        pieces.append(b.flatten('F').reshape(-1, 1))

        shapes.append((w.shape, b.shape))

    return np.concatenate(pieces), shapes

In [None]:
def vec_to_dict(vector, param_shapes, params):
    """Rebuild a per-layer dictionary from a flat column vector.

    Inverse of ``dict_to_vec``: the vector is consumed front to back as
    ``w1, b1, w2, b2, ...`` and each chunk is reshaped column-major.

    Parameters
    ----------
    vector : numpy.ndarray
        Flat values, laid out layer by layer.
    param_shapes : sequence of (w_shape, b_shape)
        Target shapes for layer ``i`` at index ``i - 1``.
    params : dict
        Template dictionary. Only its length and key names are used: for each
        layer the first key matching ``d*w{i}`` / ``d*b{i}`` names the output
        entry.

    Returns
    -------
    dict
        New dictionary with the template's key names and the reshaped values.
    """
    key_pool = dict.copy(params)
    rebuilt = {}
    offset = 0

    for layer in range(1, int(len(params) / 2 + 1)):
        layer_shapes = param_shapes[layer - 1]

        w_shape = layer_shapes[0]
        w_count = w_shape[0] * w_shape[1]

        b_shape = layer_shapes[1]
        b_count = b_shape[0] * b_shape[1]

        w_key = [name for name in key_pool.keys() if re.match(f'd*w{layer}$', name)][0]
        b_key = [name for name in key_pool.keys() if re.match(f'd*b{layer}$', name)][0]

        # Advance a cursor through the vector instead of re-slicing it.
        w_end = offset + w_count
        rebuilt[w_key] = vector[offset: w_end].reshape(w_shape, order = 'F')

        b_end = w_end + b_count
        rebuilt[b_key] = vector[w_end: b_end].reshape(b_shape, order = 'F')

        offset = b_end

    return rebuilt

In [None]:
def cost_func(A, y, lambda_value=0.1, regularization=False, params=None):
    """Binary cross-entropy cost, optionally with an L2 penalty.

    Parameters
    ----------
    A : numpy.ndarray
        Predicted probabilities.
    y : numpy.ndarray
        Targets; ``y.shape[0]`` is used as the sample count ``m``.
    lambda_value : float, default 0.1
        L2 strength; only used when ``regularization`` is truthy.
    regularization : bool, default False
        Add ``lambda_value / (2 * m) * sum(w_i ** 2)`` over all layers.
    params : dict, optional
        Parameter dictionary with keys ``'w1', 'b1', ...`` that supplies the
        weights for the penalty. When omitted, the module-level variable
        ``params`` is used, which is what this function always did before
        the argument existed.

    Returns
    -------
    float
        The cost.

    Raises
    ------
    NameError
        If ``regularization`` is requested, ``params`` is not passed and no
        module-level ``params`` exists.
    """
    m = y.shape[0]

    # xlogy(0, 0) is 0, so a prediction of exactly 0 or 1 that matches its
    # label contributes 0 instead of NaN. forward_prop does the same.
    cost = (-1 / m) * np.sum(xlogy(y, A) + xlogy(1 - y, 1 - A))

    if not regularization:
        return cost

    if params is None:
        # Backward compatibility with callers that rely on the global.
        params = globals().get('params')
        if params is None:
            raise NameError("name 'params' is not defined")

    reg_sums = 0
    for i in range(1, len(params) // 2 + 1):
        reg_sums += np.sum(params['w' + str(i)] ** 2)

    return cost + (lambda_value / (2 * m)) * reg_sums

In [None]:
# def cost_func(A, y, lambda_value=0.1):
#     reg_sums = 0
#     m=y.shape[0]

#     cost = (-1 / m) * np.sum(np.multiply(np.log(A), y) + np.multiply(np.log(1 - A), (1 - y)))
    
#     for i in range(1, int(len(params)/2+1)):
#         reg_sums += np.sum(params['w' + str(i)]**2)

#     cost += (lambda_value/(2*m)) * reg_sums
        

#     return l2_reg

In [None]:
def forward_prop_wrong(X, params, sig=True):
    """Earlier forward pass, kept next to ``forward_prop``.

    Unlike ``forward_prop`` this reads the layer count from the module-level
    variable ``layers`` rather than from ``params``, and it does not compute
    a cost.

    Parameters
    ----------
    X : numpy.ndarray
        Input batch.
    params : dict
        Weights and biases keyed ``'w1', 'b1', ...``.
    sig : bool, default True
        When equal to ``True`` the output layer uses ``sigmoid``; otherwise
        ``softmax``.

    Returns
    -------
    A : numpy.ndarray
        Output-layer activations.
    cache : list of (w, b, z, A) tuples, one per layer, input side first.
    """
    def dense(inputs, index):
        # Affine step of one layer: inputs @ w + b.
        w = params['w' + str(index)]
        b = params['b' + str(index)]
        return w, b, np.matmul(inputs, w) + b

    n_layers = len(layers)
    last = str(n_layers - 1)
    cache = []

    # Layer 1 always runs; the range adds hidden layers 2..n-2 and is empty
    # for networks with three layers or fewer, so no extra guard is needed.
    A = X
    for index in [1, *range(2, n_layers - 1)]:
        w, b, z = dense(A, index)
        A = relu(z)
        cache.append((w, b, z, A))

    # Output layer
    w, b, z = dense(A, last)
    A = sigmoid(z) if sig == True else softmax(z)
    cache.append((w, b, z, A))

    return A, cache

In [None]:
def forward_prop(X, y, params, lambda_value):
    """Forward pass plus L2-regularised binary cross-entropy cost.

    All layers except the last use ``relu``; the last uses ``sigmoid``. The
    number of layers is ``len(params) // 2``. A network with a single layer
    (no hidden layers) is supported; previously that case failed because the
    activation variable was only bound inside the hidden-layer loop.

    Parameters
    ----------
    X : numpy.ndarray
        Input batch; rows are samples.
    y : numpy.ndarray
        Targets; ``y.shape[0]`` is used as the sample count ``m``.
    params : dict
        Weights and biases keyed ``'w1', 'b1', ...``.
    lambda_value : float
        L2 strength.

    Returns
    -------
    AL : numpy.ndarray
        Output-layer activations.
    cache : list of (w, b, z, A) tuples, one per layer, input side first.
    cost : float
        Cross-entropy plus ``lambda_value / (2 * m) * sum(w_i ** 2)``.
    """
    n_layers = len(params) // 2
    m = y.shape[0]
    cache = []

    # Start from the inputs. Nothing below mutates its input, so no copies
    # of X or of the activations are needed.
    A = X

    # Hidden layers
    for i in range(1, n_layers):
        w = params['w' + str(i)]
        b = params['b' + str(i)]

        z = np.matmul(A, w) + b
        A = relu(z)

        cache.append((w, b, z, A))

    # Last layer
    w = params['w' + str(n_layers)]
    b = params['b' + str(n_layers)]

    z = np.matmul(A, w) + b
    AL = sigmoid(z)

    cache.append((w, b, z, AL))

    # xlogy(0, 0) is 0, which covers the cases where AL is exactly 0 or 1.
    cost_nonreg = (-1 / m) * np.sum(xlogy(y, AL) + xlogy(1 - y, 1 - AL))

    reg_sums = 0
    for i in range(1, n_layers + 1):
        reg_sums += np.sum(params['w' + str(i)] ** 2)

    l2_reg = (lambda_value / (2 * m)) * reg_sums

    cost = cost_nonreg + l2_reg

    return AL, cache, cost

In [None]:
def back_prop_wrong(A, X, y, layers, cache, lambda_value=0.1, regularization=False):
    """Earlier backward pass, kept next to ``back_prop``.

    Computes the same per-layer gradient expressions as ``back_prop`` but
    makes the L2 term optional through ``regularization``.
    NOTE(review): the ``_wrong`` suffix suggests this version was superseded;
    the reason is not visible in this function.

    Parameters
    ----------
    A : output-layer activations.
    X : input batch (used for the first layer's weight gradient).
    y : targets; ``y.shape[0]`` is used as the sample count ``m``.
    layers : layer sizes; only ``len(layers)`` is read.
    cache : list of ``(w, b, z, a)`` tuples, one per layer, input side first.
    lambda_value : L2 strength, applied only when ``regularization == True``.
    regularization : toggles the L2 term.

    Returns
    -------
    dict
        Gradients ordered ``dw1, db1, dw2, db2, ...``.

    Needs at least two layers: it reads ``a{last - 1}`` from the cache and
    ``dz2`` for the first layer.
    """
    grads = {}
    cache_dict = {}
    m = y.shape[0]
    last = len(layers) - 1

    # `switch` multiplies every L2 term below, turning it on (1) or off (0).
    if regularization == True:
        switch = 1
    else:
        switch = 0

    # Re-key the cached tuples as w{i}/b{i}/z{i}/a{i} (1-based); values are copied.
    for i in range(len(cache)):
        cache_dict['w' + str(i + 1)] = np.copy(cache[i][0])
        cache_dict['b' + str(i + 1)] = np.copy(cache[i][1])
        cache_dict['z' + str(i + 1)] = np.copy(cache[i][2])
        cache_dict['a' + str(i + 1)] = np.copy(cache[i][3])

    # Back prop for very last layer
    grads['dz' + str(last)] = A - y 

    grads['db' + str(last)] = (1 / m) * np.sum(grads['dz' + str(last)], axis=0, keepdims=True)
    grads['dw' + str(last)] = (1 / m) * np.matmul(cache_dict['a' + str(last - 1)].T, grads['dz' + str(last)]) + (
                (lambda_value / m) * cache_dict['w' + str(last)]) * switch


    # Back prop for middle layers
    for i in reversed(range(2, last)):
        grads['dz' + str(i)] = np.multiply(
            np.matmul(grads['dz' + str(i + 1)], cache_dict['w' + str(i + 1)].T),
            inv_relu(cache_dict['z' + str(i)]))
        grads['db' + str(i)] = (1 / m) * np.sum(grads['dz' + str(i)], axis=0, keepdims=True)
        grads['dw' + str(i)] = (1 / m) * np.matmul(cache_dict['a' + str(i - 1)].T, grads['dz' + str(i)]) + (
                    (lambda_value / m) * cache_dict['w' + str(i)]) * switch

    # Back prop for first layer
    grads['dz1'] = np.multiply(
        np.matmul(grads['dz2'], cache_dict['w2'].T),
        inv_relu(cache_dict['z1']))
    grads['db1'] = (1 / m) * np.sum(grads['dz1'], axis=0, keepdims=True)
    grads['dw1'] = (1 / m) * np.matmul(X.T, grads['dz1']) + ((lambda_value / m) * cache_dict['w1']) * switch

    # The dz terms are intermediates only; drop them from the result.
    for i in range(1, len(layers)):
        del (grads['dz' + str(i)])

    # Entries were inserted last layer first with db before dw; reversing the
    # insertion order yields dw1, db1, dw2, db2, ...
    grads_new = {}
    for key in reversed(grads):
        grads_new[key] = grads[key]

    return grads_new 

In [None]:
def back_prop(AL, X, y, layers, cache, lambda_value): 
    """Backward pass for a ReLU network with a sigmoid output and L2 penalty.

    Works for any number of layers, including a single-layer network
    (previously that case raised ``KeyError`` because the code looked up a
    previous-layer activation and ``dz2`` that do not exist). The cache is
    read directly instead of being deep-copied on every call.

    Parameters
    ----------
    AL : numpy.ndarray
        Output-layer activations.
    X : numpy.ndarray
        Input batch; acts as the activation feeding layer 1.
    y : numpy.ndarray
        Targets; ``y.shape[0]`` is used as the sample count ``m``.
    layers : sequence
        Layer sizes; only ``len(layers)`` is read.
    cache : list of (w, b, z, a) tuples
        One per layer, input side first, as returned by ``forward_prop``.
    lambda_value : float
        L2 strength.

    Returns
    -------
    dict
        Gradients ordered ``dw1, db1, dw2, db2, ...``.
    """
    m = y.shape[0]
    last = len(layers) - 1

    per_layer = {}

    # Walk from the output layer back to layer 1, carrying only the current dz.
    dz = AL - y
    for i in range(last, 0, -1):
        w = cache[i - 1][0]

        if i < last:
            # Propagate through the next layer's weights and this layer's ReLU.
            w_next = cache[i][0]
            z = cache[i - 1][2]
            dz = np.multiply(np.matmul(dz, w_next.T), inv_relu(z))

        # Layer 1 is fed by the inputs, every other layer by the activation below it.
        a_prev = X if i == 1 else cache[i - 2][3]

        db = (1 / m) * np.sum(dz, axis=0, keepdims=True)
        dw = (1 / m) * np.matmul(a_prev.T, dz) + ((lambda_value / m) * w)

        per_layer[i] = (dw, db)

    # Emit in ascending layer order, dw before db, which is the layout that
    # dict_to_vec / vec_to_dict and the parameter updates expect.
    grads = {}
    for i in range(1, last + 1):
        dw, db = per_layer[i]
        grads['dw' + str(i)] = dw
        grads['db' + str(i)] = db

    return grads

In [None]:
def grad_check_wrong(X, A, y, params, grads, epsilon=1e-7):
    """Earlier numerical gradient check, kept next to ``grad_check``.

    Perturbs every parameter by ``+/- epsilon``, evaluates
    ``forward_prop_wrong`` followed by ``cost_func`` (called without its
    regularisation flag), and compares the central-difference gradient with
    ``grads``.

    Parameters
    ----------
    X, y : inputs and targets passed on to the forward pass and the cost.
    A : unused.
    params : dict of weights and biases.
    grads : dict of analytic gradients.
    epsilon : float, default 1e-7
        Perturbation size, also used as the pass/fail threshold.

    Returns
    -------
    float
        ``||approx - grads|| / (||approx|| + ||grads||)``.
    """
    theta, shapes = dict_to_vec(params.copy())
    analytic, _ = dict_to_vec(grads)
    numeric = np.zeros(theta.shape)

    def cost_at(position, shift):
        # Cost with a single entry of the flat parameter vector moved by `shift`.
        nudged = np.copy(theta)
        nudged[position] += shift
        activations, _ = forward_prop_wrong(X, vec_to_dict(nudged, shapes, params), sig=True)
        return cost_func(activations, y, lambda_value =0.1)

    for position in range(len(theta)):
        upper = cost_at(position, epsilon)
        lower = cost_at(position, -epsilon)
        numeric[position] = (upper - lower) / (2 * epsilon)

    num = np.linalg.norm(numeric - analytic)
    den = np.linalg.norm(numeric) + np.linalg.norm(analytic)
    diff = num / den

    if not diff > epsilon:
        print('All good!')
    else:
        print('Something might be wrong!')

    return diff

In [None]:
def grad_check(X, y, params, grads, lambda_value, epsilon=1e-7):
    """Compare analytic gradients with central finite differences.

    Every entry of the flattened parameter vector is perturbed by
    ``+/- epsilon`` and the cost returned by ``forward_prop`` is evaluated at
    both points, so the loop runs two forward passes per parameter.

    Parameters
    ----------
    X, y : inputs and targets passed to ``forward_prop``.
    params : dict of weights and biases.
    grads : dict of analytic gradients.
    lambda_value : float
        L2 strength passed to ``forward_prop``.
    epsilon : float, default 1e-7
        Perturbation size, also used as the pass/fail threshold.

    Returns
    -------
    diff : float
        ``num / den``.
    num : float
        ``||approx - grads||``.
    den : float
        ``||approx|| + ||grads||``.
    """
    params_vector, param_shapes = dict_to_vec(params.copy())
    grads_vector, _ = dict_to_vec(grads)
    grads_approx = np.zeros(params_vector.shape)

    def perturbed_cost(position, delta):
        # Cost with a single entry of the flat parameter vector moved by `delta`.
        shifted = np.copy(params_vector)
        shifted[position] += delta
        _, _, cost = forward_prop(X, y, vec_to_dict(shifted, param_shapes, params), lambda_value)
        return cost

    for position in tqdm(range(len(params_vector)), desc='grad check', colour='black'):
        upper = perturbed_cost(position, epsilon)
        lower = perturbed_cost(position, -epsilon)
        grads_approx[position] = (upper - lower) / (2 * epsilon)

    num = np.linalg.norm(grads_approx - grads_vector)
    den = np.linalg.norm(grads_approx) + np.linalg.norm(grads_vector)
    diff = num / den

    if not diff > epsilon:
        print(f'All good! Difference is = {diff}')
    else:
        print('Something might be wrong!')

    return diff, num, den

In [None]:
def update_params(params, grads, learning_rate=0.01):
    """One plain gradient-descent step; returns a new dictionary.

    Parameters
    ----------
    params : dict
        Weights and biases keyed ``'w1', 'b1', ...``. Not modified.
    grads : dict
        Gradients keyed ``'dw1', 'db1', ...``.
    learning_rate : float, default 0.01

    Returns
    -------
    dict
        ``param - learning_rate * grad`` for every weight and bias.
    """
    n_layers = int(len(params) / 2)
    stepped = {}

    for layer in range(1, n_layers + 1):
        for kind in ('w', 'b'):
            name = f'{kind}{layer}'
            stepped[name] = params[name] - learning_rate * grads['d' + name]

    return stepped

In [None]:
def predict(X, y, parameters, lambda_value, threshold=0.5):
    """Run the network and score its thresholded predictions against ``y``.

    Parameters
    ----------
    X, y : inputs and targets passed to ``forward_prop``.
    parameters : dict of weights and biases.
    lambda_value : float
        L2 strength passed to ``forward_prop`` (affects ``cost`` only).
    threshold : float, default 0.5
        An activation must exceed this to be predicted as 1.

    Returns
    -------
    outcome : numpy.ndarray of 0/1
        Per row, only the largest activation can be 1, and only if it
        exceeds ``threshold``.
    accuracy : float
        Fraction of samples whose whole row is predicted correctly. For a
        single output column this equals the previous element-wise ratio;
        with several columns the previous ratio counted every matching cell
        but divided by the row count, so it could exceed 1.
    cost : float
        Cost returned by ``forward_prop``.
    """
    A, _, cost = forward_prop(X, y, parameters, lambda_value)

    #https://stackoverflow.com/questions/56321906/make-all-the-elemant-zero-except-max-n-element-in-each-row-in-numpy-2d-array
    # Zero everything except the n_max largest activations in each row.
    n_max = 1

    A = A * (A >= np.sort(A, axis=1)[:, [-n_max]]).astype(int)

    outcome = (A>threshold)*1

    matches = np.asarray(y == outcome)
    if matches.ndim > 1:
        # A sample is correct only when all of its columns match.
        matches = matches.all(axis=1)
    accuracy = np.mean(matches)

    return outcome, accuracy, cost

In [None]:
def f_score(outcome, y):
    """F1 score, precision and recall for binary 0/1 predictions.

    A ratio whose denominator is zero (no predicted positives, no actual
    positives, or precision + recall == 0) is reported as 0.0 instead of
    producing NaN and a runtime warning.

    Parameters
    ----------
    outcome : array-like of 0/1
        Predicted labels.
    y : array-like of 0/1
        True labels, same shape as ``outcome``.

    Returns
    -------
    score : float
        ``2 * precision * recall / (precision + recall)``.
    precision : float
        ``TP / (TP + FP)``.
    recall : float
        ``TP / (TP + FN)``.
    """
    outcome = np.asarray(outcome)
    y = np.asarray(y)

    def safe_ratio(numerator, denominator):
        # Undefined ratios collapse to 0.0 rather than NaN.
        return numerator / denominator if denominator else np.float64(0.0)

    false_neg = ((y == 1) & (outcome == 0)).sum()
    false_pos = ((y == 0) & (outcome == 1)).sum()
    true_pos = ((y == 1) & (outcome == 1)).sum()

    precision = safe_ratio(true_pos, true_pos + false_pos)
    recall = safe_ratio(true_pos, true_pos + false_neg)

    score = 2 * safe_ratio(precision * recall, precision + recall)

    return score, precision, recall

In [None]:
def adam_init(params):
    """Create zero-filled first- and second-moment dictionaries for Adam.

    Parameters
    ----------
    params : dict
        Weights and biases keyed ``'w1', 'b1', ...``; only their first two
        dimensions are read.

    Returns
    -------
    v : dict
        Zeros keyed ``'dw1', 'db1', ...`` shaped like the matching parameter.
    s : dict
        Same layout as ``v``, with separate arrays.
    """
    v, s = {}, {}
    n_layers = int(len(params)/2)

    for layer in range(1, n_layers + 1):
        for kind in ('w', 'b'):
            shape = params[kind + str(layer)].shape
            moment_shape = (shape[0], shape[1])
            grad_key = 'd' + kind + str(layer)

            v[grad_key] = np.zeros(moment_shape)
            s[grad_key] = np.zeros(moment_shape)

    return v,s

In [None]:
def update_params_adam(params, grads, v, s, t, learning_rate=0.01, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8):
    """One Adam step. Updates ``params``, ``v`` and ``s`` in place.

    Parameters
    ----------
    params : dict
        Weights and biases keyed ``'w1', 'b1', ...``; entries are replaced.
    grads : dict
        Gradients keyed ``'dw1', 'db1', ...``.
    v, s : dict
        Running first and second moments (see ``adam_init``); entries are
        replaced, which is how the caller sees the new state.
    t : int
        Step counter used for bias correction. ``t = 0`` makes the
        correction term zero.
    learning_rate, beta1, beta2, epsilon : float
        Adam hyper-parameters.

    Returns
    -------
    dict
        The same ``params`` object that was passed in.
    """
    # Bias-correction denominators are identical for every layer.
    v_correction = 1 - np.power(beta1, t)
    s_correction = 1 - np.power(beta2, t)

    n_layers = int(len(params)/2)

    for layer in range(1, n_layers + 1):
        for kind in ('w', 'b'):
            p_key = kind + str(layer)
            g_key = 'd' + p_key
            grad = grads[g_key]

            v[g_key] = beta1 * v[g_key] + (1 - beta1) * grad
            s[g_key] = beta2 * s[g_key] + (1 - beta2) * np.power(grad, 2)

            v_hat = v[g_key] / v_correction
            s_hat = s[g_key] / s_correction

            params[p_key] = params[p_key] - (
                learning_rate * v_hat / (np.sqrt(s_hat) + epsilon))

    return params

In [None]:
def batch_maker(X, y, batch_size, seed):
    """Shuffle ``X`` and ``y`` together and cut them into mini-batches.

    Reseeds NumPy's global RNG with ``seed`` before drawing the permutation.

    Parameters
    ----------
    X, y : numpy.ndarray
        2-D arrays with the same number of rows.
    batch_size : int
        Rows per batch; the final batch holds the remainder when the row
        count is not a multiple of ``batch_size``.
    seed : int
        Passed to ``np.random.seed``.

    Returns
    -------
    list of (batch_X, batch_y) tuples
    """
    np.random.seed(seed)

    m = X.shape[0]
    order = np.random.permutation(m)

    X_shuffled, y_shuffled = X[order, :], y[order, :]

    n_full = math.floor(m / batch_size)
    starts = [k * batch_size for k in range(n_full)]

    all_batches = [
        (X_shuffled[start: start + batch_size, :], y_shuffled[start: start + batch_size, :])
        for start in starts
    ]

    # Left-over rows form one final, smaller batch.
    if m % batch_size != 0:
        tail = n_full * batch_size
        all_batches.append((X_shuffled[tail:, :], y_shuffled[tail:, :]))

    return all_batches

In [None]:
def update_lr_exp(learning_rate, epoch_num, decay_rate):
    """Scale ``learning_rate`` by ``1 / (1 + decay_rate * epoch_num)``.

    Despite the name, the formula is an inverse-time decay rather than an
    exponential one.
    """
    decay_factor = 1 / (1 + decay_rate * epoch_num)

    return decay_factor * learning_rate


def update_lr_time(learning_rate, epoch_num, decay_rate, interval):
    """Step-wise decay: like ``update_lr_exp`` but changing only every ``interval`` epochs.

    The epoch number is replaced by ``floor(epoch_num / interval)``, so the
    learning rate stays constant within each block of ``interval`` epochs.
    """
    completed_intervals = math.floor(epoch_num / interval)
    decay_factor = 1 / (1 + decay_rate * completed_intervals)

    return decay_factor * learning_rate