In [1]:
import numpy as np
from proj1_helpers import *
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import sweetviz as sv

def gaussian_scaling(x):
    mean_ = np.mean(x, axis=0)
    x_scaled = x - mean_
    std_ = np.std(x, axis=0)
    x_scaled = x_scaled / std_
    return x_scaled, mean_, std_


def col_na_omit(x, tol=1):
    index = sum(np.isnan(x))/(x.shape[0]) > tol
    x = x[:, ~index]
    return x


def row_na_omit(y, x):
    index = np.isnan(x).any(axis=1)
    x_no_na = x[~index]
    y_no_na = y[~index]
    return y_no_na, x_no_na


def outliers(y, x, clean_data, quantile=3):
    print(f"*****************************************************")
    print(f"                                                     ")
    print(f"WARNING : Outliers must be removed before imputation.")
    print(f"                                                     ")
    print(f"IGNORE : RuntimeWarning: invalid value encountered...",
          "\n", "This is due to NAs in the data set.")
    print(f"                                                     ")
    print(f"*****************************************************")
    m = np.mean(clean_data, axis=0)
    s = np.std(clean_data, axis=0)
    lower = m - quantile * s
    upper = m + quantile * s

    for i in tqdm(range(clean_data.shape[1])):

        index = clean_data[:, i] < lower[i]
        clean_data = np.delete(clean_data, index*1, axis=0)

        index = x[:, i] < lower[i]
        x = np.delete(x, index*1, axis=0)
        y = np.delete(y, index*1, axis=0)

        index = clean_data[:, i] > upper[i]
        clean_data = np.delete(clean_data, index*1, axis=0)

        index = x[:, i] > upper[i]
        x = np.delete(x, index*1, axis=0)
        y = np.delete(y, index*1, axis=0)

    return y, x, clean_data


def mean_imputation(na_data, clean_data):
    null, mean_x, std_x = gaussian_scaling(clean_data)
    na_data = (na_data - mean_x)/std_x
    na_data = np.nan_to_num(na_data, nan=0)
    return(na_data)


def median_imputation(na_data, clean_data):
    null, mean_x, std_x = gaussian_scaling(clean_data)
    na_data = (na_data - mean_x)/std_x
    median = np.median(null, axis=0)
    for i in range(na_data.shape[1]):
        na_data[:, i] = np.nan_to_num(na_data[:, i], nan=median[i])
    return(na_data)


def skewness(x):
    mean_x = np.mean(x, axis=0)
    std_x = np.std(x, axis=0)
    kurt = np.mean(((x - mean_x)/std_x)**3, axis=0)
    return kurt


def min_max_scaling(x):
    min_ = np.min(x, axis=0)
    max_ = np.max(x, axis=0)
    x_scaled = (x - min_)/(max_ - min_)
    return x_scaled, min_, max_


def split_data(x, y, ratio, seed=1):
    """split the dataset based on the split ratio."""
    # set seed
    np.random.seed(seed)
    # generate random indices
    num_row = len(y)
    indices = np.random.permutation(num_row)
    index_split = int(np.floor(ratio * num_row))
    index_tr = indices[: index_split]
    index_te = indices[index_split:]
    # create split
    x_tr = x[index_tr]
    x_te = x[index_te]
    y_tr = y[index_tr]
    y_te = y[index_te]
    return x_tr, x_te, y_tr, y_te


def pairwise(p, q):
    return np.sqrt(np.sum((p[:, np.newaxis, :]-q[np.newaxis, :, :])**2, axis=2))


def random_sample(x, length):
    num_row = x.shape[0]
    indices = np.random.permutation(num_row)
    sample = x[indices][:length]
    return sample


def stochastic_kmeans_imputation(na_data, clean_data, neighbors=10, length=1000):
    clean_data, mean_x, std_x = gaussian_scaling(clean_data)
    na_data = (na_data - mean_x)/std_x
    sample = random_sample(x=clean_data, length=length)
    for i in tqdm(range(na_data.shape[0])):
        condition = np.isnan(na_data[i, :])
        if len(condition) > 0:
            index = np.where(condition)
            candidate = np.delete(na_data[i], index)
            neighborhood = np.delete(sample, index, axis=1)
            distances = pairwise(candidate.reshape(
                (1, len(candidate))), neighborhood)
            nearest_index = np.argsort(distances)[0][:neighbors]
            na_data[i, index] = np.mean(sample[nearest_index], axis=0)[index]
    return(na_data)


def batch_iter(y, tx, batch_size, num_batches=1, shuffle=True):
    data_size = len(y)

    if shuffle:
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_y = y[shuffle_indices]
        shuffled_tx = tx[shuffle_indices]
    else:
        shuffled_y = y
        shuffled_tx = tx
    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        end_index = min((batch_num + 1) * batch_size, data_size)
        if start_index != end_index:
            yield shuffled_y[start_index:end_index], shuffled_tx[start_index:end_index]


def sigmoid(t):
    return 1.0 / (1 + np.exp(-t))


def calculate_loss(y, tx, w):
    pred = sigmoid(tx.dot(w))
    loss = (1/len(y)) * y.T.dot(np.log(pred)) + (1 - y).T.dot(np.log(1 - pred))
    return (1/len(y))*np.squeeze(- loss)


def calculate_gradient(y, tx, w):
    y = np.array(y)
    tx = np.array(tx)
    w = np.array(w)
    pred = sigmoid(tx.dot(w))
    grad = tx.T.dot(pred - y)
    return (1/len(y))*grad


def gradient_descent(y, tx, w, max_iter, gamma, eps=1e-4):
    counter = 0
    grad = calculate_gradient(y, tx, w)
    b = np.linalg.norm(grad)
    while b > eps:
        counter += 1
        grad = calculate_gradient(y, tx, w)
        w -= gamma * grad
        b = np.linalg.norm(grad)
        print(f"Gradient norm = {b}\r", end="")
        if counter == max_iter:
            print("reached max_iter")
            break
    return calculate_loss(y, tx, w), w, np.linalg.norm(grad)


def stochastic_gradient_descent(y, tx, w, batch_size, max_iter, gamma, eps=1e-4):
    counter = 0
    grad = calculate_gradient(y, tx, w)
    b = np.linalg.norm(grad)
    while b > eps:
        for y_batch, tx_batch in batch_iter(y, tx, batch_size=batch_size, num_batches=1):
            # compute a stochastic gradient and loss
            grad = calculate_gradient(y_batch, tx_batch, w)
            b = np.linalg.norm(grad)
            # update w through the stochastic gradient update
            w = w - gamma * grad
            #print(f"Gradient norm = {b}\r", end="")
            counter += 1
        if counter == max_iter:
            print("reached max_iter")
            break
    return calculate_loss(y, tx, w), w, np.linalg.norm((1/len(y))*calculate_gradient(y, tx, w))


def is_cat(x, length=10):
    """Check if an array is categorical"""
    boolean_index = list([])
    if x.shape == (len(x),):
        if len(set(x)) < length:
            boolean = True
        else:
            boolean = False
            boolean_index.append(boolean)
    else:
        for i in range(x.shape[1]):
            if len(set(x[:, i])) < length:
                boolean = True
            else:
                boolean = False
            boolean_index.append(boolean)
    return np.array(boolean_index)


def build_poly(x, degree, pairwise_interaction=True, intercept=False):
    null, x_rNa = row_na_omit(x[:, 0], x)
    cat_index = is_cat(x_rNa)
    categorical_variables = x[:, cat_index]
    continuous_variables = x[:, ~cat_index]
    augmented_x = continuous_variables
    if degree > 1:
        for i in range(2, degree+1):
            augmented_x = np.c_[augmented_x, np.power(continuous_variables, i)]
    if pairwise_interaction:
        for j in tqdm(range(continuous_variables.shape[1])):
            for k in range(continuous_variables.shape[1]):
                if j >= k:
                    continue
                else:
                    augmented_x = np.c_[augmented_x, np.multiply(
                        continuous_variables[:, j], continuous_variables[:, k])]
    if intercept:
        inter = np.ones((x.shape[0], 1))
        augmented_x = np.c_[np.ones((x.shape[0], 1)), augmented_x]
    augmented_x = np.c_[augmented_x, categorical_variables]
    return augmented_x

def threshold(y, fitted_probabilities, step = 0.01):
    """find the best threshold for classification"""
    candidates = np.arange(0.2, 0.8, step)
    thresholds = list([])
    accuracies = list([])
    for i in tqdm(candidates):
        prediction = (fitted_probabilities>i)*1
        accuracy = 1 - sum(np.abs(prediction - y))/len(y)
        thresholds.append(i)
        accuracies.append(accuracy)
    index = accuracies.index(max(accuracies))
    return thresholds[index]

# Pen reg functions

In [2]:
def logit_loss(y, tx, w, lambda_=0):
    """ Log-loss for logistic regression """
    return np.sum(np.log(1. + np.exp(tx.dot(w))) - y * tx.dot(w)) + lambda_ * w[np.newaxis, :].dot(w[:, np.newaxis])[0, 0]


def sigmoid(x):
    """ Logistic function """
    return 1 / (1 + np.exp(-x))


def logit_gradient(y, tx, w, lambda_=0):
    """ Gradient for logistic loss """
    return (1/len(y))*tx.T.dot(sigmoid(tx.dot(w)) - y) + 2. * lambda_ * w


def logit_stochastic_gradient_descent(y, tx, initial_w, max_iters, gamma, lambda_=0, batch_size=1, eps=1e-4, modulo=100):
    """ Stochastic gradient descent algorithm """
    for n_iter in range(max_iters):
        indices = np.random.permutation(y.shape[0])
        stochastic_tx = tx[indices[:batch_size], :]
        stochastic_y = y[indices[:batch_size]]
        grad = logit_gradient(stochastic_y, stochastic_tx, initial_w, lambda_)
        initial_w = initial_w - gamma * grad

        if (n_iter % modulo) == 0:
            full_grad = logit_gradient(y, tx, initial_w, lambda_)
            full_norm = np.linalg.norm(full_grad, np.inf)
            print(f"Gradient norm = {full_norm}\r", end="")
            if full_norm < eps:
                print(f"                                                \r", end="")
                print(f"Stopping criteria reached")
                break

    loss = logit_loss(y, tx, initial_w, lambda_)
    w = initial_w
    grad_norm = np.linalg.norm(logit_gradient(
        y, tx, initial_w, lambda_), np.inf)

    return loss, w, grad_norm


def logit_gradient_descent(y, tx, initial_w, max_iters, gamma, lambda_=0, eps=1e-4):
    """ logit gradient descent algorithm """

    for n_iter in range(max_iters):

        grad = logit_gradient(y, tx, initial_w, lambda_)
        initial_w = initial_w - gamma * grad
        grad_norm = max(abs(grad))
        print(
            f"Gradient norm = {round(grad_norm, 5)} --- {round((n_iter/(max_iters+1))*100, 2)}%         \r", end="")

        if grad_norm < eps:
            print(f"                                                \r", end="")
            print(f"Stopping criteria reached")
            break

    loss = logit_loss(y, tx, initial_w, lambda_)
    w = initial_w

    return loss, w, grad_norm


def compute_mse_loss(y, tx, w):
    """ Calculate the MSE loss """
    return np.sum((y - tx.dot(w))**2) / (2. * y.shape[0])


def compute_mae_loss(y, tx, w):
    """ Calculate the MAE loss """
    return np.sum(np.abs(y - tx.dot(w)) / (2. * y.shape[0]))


def compute_logistic_loss(y, tx, w, lambda_=0):
    """ Log-loss for logistic regression """
    return np.sum(np.log(1. + np.exp(tx.dot(w))) - y * tx.dot(w)) + \
        lambda_ * w[np.newaxis, :].dot(w[:, np.newaxis])[0, 0]


def least_squares_GD(y, tx, initial_w, max_iters, gamma):
    """ Linear regression using gradient descent """
    return stochastic_gradient_descent(y, tx, initial_w, max_iters, gamma,
                                       compute_mse_loss, compute_mse_gradient)


def least_squares_SGD(y, tx, initial_w, max_iters, gamma):
    """ Linear regression using stochastic gradient descent (mini-batch-size 1) """
    return stochastic_gradient_descent(y, tx, initial_w, max_iters, gamma,
                                       compute_mse_loss, compute_mse_gradient,
                                       batch_size=1)


def least_squares(y, tx):
    """ Least squares regression using normal equations """
    try:
        w = np.linalg.solve(tx.T.dot(tx), tx.T.dot(y))
    except np.linalg.linalg.LinAlgError:
        w = np.linalg.lstsq(tx.T.dot(tx), tx.T.dot(y))[0]
    loss = compute_mse_loss(y, tx, w)
    return w, loss


def ridge_regression(y, tx, lambda_):
    """ Ridge regression using normal equations """
    w = np.linalg.solve(tx.T.dot(tx) + lambda_ *
                        (2. * y.shape[0]) * np.eye(tx.shape[1]), tx.T.dot(y))
    loss = compute_mse_loss(y, tx, w) + \
        lambda_ * w[np.newaxis, :].dot(w[:, np.newaxis])[0, 0]
    return w, loss


def logistic_regression(y, tx, initial_w, max_iters, gamma):
    """ Logistic regression using gradient descent """
    return stochastic_gradient_descent(y, tx, initial_w, max_iters, gamma,
                                       compute_logistic_loss,
                                       compute_logistic_gradient)


def reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma):
    """ Regularized logistic regression using gradient descent """
    return stochastic_gradient_descent(y, tx, initial_w, max_iters, gamma,
                                       compute_logistic_loss,
                                       compute_logistic_gradient, lambda_=lambda_)

In [3]:
y , x, ids = load_csv_data("data/train.csv")
x[np.where(x == -999)] = np.nan
y[np.where(y == -1)] = 0

In [4]:
y , x, ids = load_csv_data("data/train.csv")
x[np.where(x == -999)] = np.nan
y[np.where(y == -1)] = 0

y_rNA, x_rNA = row_na_omit(y, x)

x_median = median_imputation(x, x_rNA)

x_augmented = build_poly(x = x_median, degree = 5, pairwise_interaction = True)
x_rNA_augmented = build_poly(x = x_rNA, degree = 5, pairwise_interaction = True)


null, mean_, std_ = gaussian_scaling(x_rNA_augmented)
x_augmented = (x_augmented - mean_) / std_

def eigen(A):
    """Simple eigenvalue decomposition and sorting by importance"""
    eigenValues, eigenVectors = np.linalg.eig(A)
    idx = np.argsort(eigenValues)
    eigenValues = eigenValues[idx]
    eigenVectors = eigenVectors[:,idx]
    return (eigenValues, eigenVectors)

cov = np.cov(x_augmented, rowvar=False)
eigenvalues, cov_eigenvectors = eigen(cov)

x_ort = x_augmented.dot(cov_eigenvectors.real[:,sum(eigenvalues<1e-12):len(eigenvalues)])

HBox(children=(FloatProgress(value=0.0, max=29.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=29.0), HTML(value='')))




In [5]:
Penalty = 4.892e-4

w_OLS, loss = ridge_regression(y = y, tx = x_ort, lambda_ = Penalty)

In [7]:
loss, w1, grad_norm = logit_gradient_descent(y = y,
                                            tx = x_ort,
                                            initial_w = w_OLS,
                                            max_iters = 100000,
                                            gamma = 0.01,
                                            lambda_ = Penalty,
                                            eps = 0.05)

Stopping criteria reached                       


In [8]:
loss, w2, grad_norm = logit_gradient_descent(y = y,
                                            tx = x_ort,
                                            initial_w = w1,
                                            max_iters = 100000,
                                            gamma = 0.01,
                                            lambda_ = Penalty,
                                            eps = 0.04)

Stopping criteria reached                       


In [9]:
loss, w3, grad_norm = logit_gradient_descent(y = y,
                                            tx = x_ort,
                                            initial_w = w2,
                                            max_iters = 100000,
                                            gamma = 0.01,
                                            lambda_ = Penalty,
                                            eps = 0.03)

Stopping criteria reached                       


In [10]:
loss, w4, grad_norm = logit_gradient_descent(y = y,
                                            tx = x_ort,
                                            initial_w = w3,
                                            max_iters = 100000,
                                            gamma = 0.01,
                                            lambda_ = Penalty,
                                            eps = 0.02)

Stopping criteria reached                       


In [11]:
loss, w5, grad_norm = logit_gradient_descent(y = y,
                                            tx = x_ort,
                                            initial_w = w4,
                                            max_iters = 100000,
                                            gamma = 0.01,
                                            lambda_ = Penalty,
                                            eps = 0.01)

Stopping criteria reached                       


In [12]:
loss, w6, grad_norm = logit_gradient_descent(y = y,
                                            tx = x_ort,
                                            initial_w = w5,
                                            max_iters = 100000,
                                            gamma = 0.01,
                                            lambda_ = Penalty,
                                            eps = 0.001)

Gradient norm = 0.00154 --- 9.97%         

KeyboardInterrupt: 

In [None]:
Wopt = pd.DataFrame(w6)
Wopt

In [None]:
Wopt.to_csv("Wopt_ort_aug_clean_data_pol5_pairwise.csv", index = False)

In [14]:
W = [w_OLS, w1, w2, w3, w4, w5]
for c, w in enumerate(W):
    thresh = threshold(y, sigmoid(x_ort@w))
    pred = sigmoid(x_ort@w)
    pred = (pred>thresh)*1
    print(f"{c} -- In sample performance : {1 - sum(np.abs(pred - y))/len(y)}")

HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))


0 -- In sample performance : 0.764552


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))


1 -- In sample performance : 0.728728


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))


2 -- In sample performance : 0.72966


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))


3 -- In sample performance : 0.733032


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))


4 -- In sample performance : 0.7398359999999999


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))


5 -- In sample performance : 0.74942


In [16]:
B = np.genfromtxt("data/W.csv", delimiter=",", skip_header=1)

In [19]:
x_ort.shape

(250000, 484)

In [18]:
thresh = threshold(y, sigmoid(x_ort@B))
pred = sigmoid(x_ort@B)
pred = (pred>thresh)*1
print(f"{c} -- In sample performance : {1 - sum(np.abs(pred - y))/len(y)}")

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 533 is different from 484)