# 1) Preprocessing

## Categorizing Jets

## Dropping Noisy Features

## Dropping features using Density Plots

## Dropping features with zero-variance

## Standardize Data with Median

In [None]:
def standardize(x):
    """Center the data on its median and scale it by its standard deviation.

    The median and the standard deviation are both computed over every
    element of ``x`` (no axis is given), so one global shift and one global
    scale are applied.

    Returns:
        A tuple ``(scaled, median_x, std_x)``: the transformed data, the
        median that was subtracted, and the standard deviation of the
        centered data that was divided out.
    """
    median_x = np.median(x)
    centered = x - median_x
    # The spread is measured on the median-centered values.
    std_x = np.std(centered)
    return centered / std_x, median_x, std_x

# 2) Processing Data

## Load Data

In [1]:
def load_data(train_path, test_path):
    """Load the training and test CSV files into NumPy arrays.

    Both files are read as comma-separated text with one header row, which
    is skipped. Column 0 is read as the sample id, column 1 as the label and
    every remaining column as a feature.

    Args:
        train_path: path of the training CSV file.
        test_path: path of the test CSV file.

    Returns:
        A tuple ``(x_train, x_test, y_train, id_train, id_test)``:
        the feature matrices are float arrays, ``y_train`` is a float column
        vector with ``'s'`` mapped to 1 and ``'b'`` mapped to 0, and the id
        arrays are float column vectors.

    Raises:
        ValueError: if a feature, an id, or a training label other than
            ``'s'``/``'b'`` cannot be converted to a float.
    """
    # Read everything as text first: the label column is non-numeric.
    data_train = np.genfromtxt(train_path, delimiter=',', dtype=str, skip_header=1)
    data_test = np.genfromtxt(test_path, delimiter=',', dtype=str, skip_header=1)

    # ``np.float`` was removed from NumPy; ``np.float64`` is the same dtype.
    x_train = data_train[:, 2:].astype(np.float64)
    x_test = data_test[:, 2:].astype(np.float64)

    # Replace the labels with numeric *strings* so the array stays a pure
    # string array until the single float conversion below.
    labels = data_train[:, 1]
    labels = np.where(labels == 's', '1', labels)
    labels = np.where(labels == 'b', '0', labels)
    y_train = labels.astype(np.float64).reshape(-1, 1)

    id_train = data_train[:, 0].astype(np.float64).reshape(-1, 1)
    id_test = data_test[:, 0].astype(np.float64).reshape(-1, 1)

    return x_train, x_test, y_train, id_train, id_test

## Split Ratio

In [None]:
def create_local_test_set(x, y, ratio, seed=1):
    """Randomly split ``x`` and ``y`` into a training part and a held-out part.

    ``ratio`` is the fraction of samples assigned to the training part; with
    ``ratio=0.8`` the first 80% of a random permutation go to training and
    the remaining samples form the local test set. The global NumPy random
    generator is seeded with ``seed`` so the split is reproducible.

    Returns:
        ``(train_x, train_y, our_test_x, our_test_y)``.
    """
    np.random.seed(seed)

    n_samples = len(x)
    cut = int(n_samples * ratio)

    # One shared permutation keeps x and y rows aligned.
    shuffled = np.random.permutation(n_samples)
    train_idx, test_idx = shuffled[:cut], shuffled[cut:]

    return x[train_idx], y[train_idx], x[test_idx], y[test_idx]

## Feature Expansion

In [None]:
def build_poly(x, degree):
    """Build the polynomial basis ``[x**0, x**1, ..., x**degree]`` column by column.

    The result has ``x.shape[0]`` rows and ``degree + 1`` columns; column
    ``j`` holds ``x`` raised element-wise to the power ``j``.
    """
    basis = np.empty((x.shape[0], degree + 1))
    # Each row of the transpose is a writable view onto one column of `basis`.
    for exponent, column in enumerate(basis.T):
        column[:] = np.power(x, exponent)
    return basis

## Regularization

## Cross Validation

In [None]:
def build_k_indices(y, k_fold, seed):
    """Return a ``(k_fold, fold_size)`` array of shuffled row indices.

    ``fold_size`` is ``int(len(y) / k_fold)``; when the number of rows is not
    a multiple of ``k_fold`` the trailing indices of the permutation are left
    out. The global NumPy random generator is seeded with ``seed``.
    """
    n_rows = y.shape[0]
    fold_size = int(n_rows / k_fold)
    np.random.seed(seed)
    shuffled = np.random.permutation(n_rows)
    # Keep only the indices that fill complete folds, one fold per row.
    return shuffled[:k_fold * fold_size].reshape(k_fold, fold_size)

def cross_validation(y, x, k_indices, k, lambda_, degree):
    """Fit ridge regression on all folds but ``k`` and score it on fold ``k``.

    Args:
        y: targets, indexed by the row indices in ``k_indices``.
        x: inputs, indexed the same way as ``y``.
        k_indices: array with one row of sample indices per fold.
        k: index of the fold that is held out as the test fold.
        lambda_: regularization strength passed to ``ridge_regression``.
        degree: polynomial degree passed to ``build_polys``.

    Returns:
        ``(loss_tr, loss_te)``: the value of ``compute_rmse`` on the training
        folds and on the held-out fold, both using the weights fitted on the
        training folds.
    """
    # NOTE(review): `build_polys`, `ridge_regression` and `compute_rmse` are
    # not defined in the visible part of the file; confirm `build_polys` is
    # the intended helper and not a typo for `build_poly`.
    x_test = x[k_indices[k]]
    y_test = y[k_indices[k]]

    # Every fold except the k-th one, flattened into a single index vector.
    train_indices = np.delete(k_indices, k, axis=0).flatten()
    x_train = x[train_indices]
    y_train = y[train_indices]

    Fi_train = build_polys(x_train, degree)
    Fi_test = build_polys(x_test, degree)

    # Only the training folds are used for fitting; the previous extra fit on
    # the test fold was never used and has been dropped.
    weights, _ = ridge_regression(y_train, Fi_train, lambda_)

    loss_tr = compute_rmse(y_train, Fi_train, weights)
    loss_te = compute_rmse(y_test, Fi_test, weights)
    return loss_tr, loss_te

In [None]:
# Grid search over the ridge penalty and the polynomial degree with k-fold
# cross-validation.
seed = 1
# NOTE(review): the original value was the bare integer 7 marked "Range",
# which cannot be iterated; degrees 1..7 is assumed here -- confirm the
# intended range (e.g. whether degree 0 should be included).
degrees = range(1, 8)
k_fold = 4
lambdas = np.logspace(-4, 0, 30)
# split data in k fold
k_indices = build_k_indices(y, k_fold, seed)
# Mean train/test RMSE for every (lambda, degree) pair, stored flat in loop
# order: entry i_lambda * len(degrees) + i_degree.
rmse_tr_f = []
rmse_te_f = []
for lambda_ in lambdas:
    for degree in degrees:
        rmse_tr = []
        rmse_te = []
        for k in range(k_fold):
            rmse_train, rmse_test = cross_validation(y, x, k_indices, k, lambda_, degree)
            rmse_tr.append(rmse_train)
            rmse_te.append(rmse_test)
        # Average over the folds inside the degree loop so that no degree's
        # result is overwritten before it is recorded.
        rmse_tr_f.append(np.mean(rmse_tr))
        rmse_te_f.append(np.mean(rmse_te))

# 3) Apply Regularized Logistic Regression

In [None]:
def calculate_regularized_loss(y, tx, w, lambda_):
    """Compute the L2-penalized negative log-likelihood of logistic regression.

    The loss is ``sum(log(1 + exp(tx @ w)) - y * (tx @ w))`` plus
    ``lambda_ / 2 * ||w||^2``.

    Args:
        y: targets, broadcastable against ``tx @ w``.
        tx: design matrix.
        w: weight vector.
        lambda_: regularization strength.

    Returns:
        The scalar penalized loss.
    """
    scores = tx @ w
    # logaddexp(0, z) == log(1 + exp(z)) but stays finite for large z, where
    # exp(z) alone would overflow to inf.
    log_likelihood_terms = np.logaddexp(0, scores) - y * scores
    penalty = lambda_ * np.linalg.norm(w) ** 2 / 2
    return np.sum(log_likelihood_terms) + penalty

def calculate_regularized_gradient(y, tx, w, lambda_):
    """Return the gradient of the L2-penalized logistic loss with respect to ``w``.

    Computed as ``tx.T @ (sigmoid(tx @ w) - y) + lambda_ * w``.
    """
    residual = sigmoid(tx @ w) - y
    data_term = tx.T @ residual
    return data_term + lambda_ * w

def calculate_regularized_hessian(y, tx, w, lambda_):
    """Return the Hessian of the L2-penalized logistic loss with respect to ``w``.

    The Hessian is ``tx.T @ S @ tx + lambda_ * I`` where ``S`` is diagonal
    with entries ``sigmoid(z_i) * (1 - sigmoid(z_i))`` and ``z = tx @ w``.

    Args:
        y: targets (unused; kept for a signature parallel to the loss and
            gradient helpers).
        tx: design matrix with one row per sample.
        w: weight vector.
        lambda_: regularization strength.

    Returns:
        A square matrix with ``tx.shape[1]`` rows and columns.
    """
    prob = sigmoid(tx @ w)
    # Only the diagonal of S is needed, so keep it as one column and scale
    # the rows of tx with it instead of materializing an N x N matrix.
    s_diag = np.reshape(prob * (1 - prob), (-1, 1))
    # The penalty lambda_/2 * ||w||^2 contributes lambda_ on the diagonal
    # only, not to every entry of the Hessian.
    return tx.T @ (s_diag * tx) + lambda_ * np.eye(tx.shape[1])

def regularized_logistic_regression(y, tx, w, lambda_):
    """Return ``(loss, gradient)`` of the penalized logistic loss at ``w``.

    The Hessian is not computed or returned here.
    """
    return (
        calculate_regularized_loss(y, tx, w, lambda_),
        calculate_regularized_gradient(y, tx, w, lambda_),
    )

def learning_by_regularized_gradient(y, tx, w, gamma, lambda_):
    """Take a single gradient-descent step on the penalized logistic loss.

    Returns:
        ``(loss, w_new)`` where ``loss`` is evaluated at the incoming ``w``
        (before the step) and ``w_new = w - gamma * gradient``.
    """
    current_loss, grad = regularized_logistic_regression(y, tx, w, lambda_)
    return current_loss, w - gamma * grad

In [None]:
def reg_logistic_regression(y, tx, lambda_, w_initial, max_iters, gamma):
    """Run gradient descent on the L2-penalized logistic regression loss.

    Args:
        y: targets.
        tx: design matrix.
        lambda_: regularization strength.
        w_initial: starting weight vector.
        max_iters: number of gradient-descent steps to perform.
        gamma: step size.

    Returns:
        ``(loss, w)``: the final weights and the loss reported by the last
        step, which is evaluated just before that step's update. If
        ``max_iters`` is not positive, the loss at ``w_initial`` and
        ``w_initial`` itself are returned.
    """
    w = w_initial
    if max_iters <= 0:
        # No step is taken; still return a well-defined loss instead of
        # failing on an unbound variable.
        return calculate_regularized_loss(y, tx, w, lambda_), w

    # Use the `max_iters` argument, not a module-level `max_iter` global.
    for _ in range(max_iters):
        # get loss and update w.
        loss, w = learning_by_regularized_gradient(y, tx, w, gamma, lambda_)
    return loss, w

In [None]:
# Hyper-parameters and launch of the regularized logistic regression fit.
max_iter = 100  # number of gradient-descent steps passed to the fit below
threshold = 0.001  # NOTE(review): not used anywhere in this cell
gamma = 0.001  # step size
lambda_ = 0.1  # regularization strength
losses = []  # NOTE(review): never appended to in this cell
# NOTE(review): the initial weights are sized from `tx_train`, but the fit
# below is called with `y` and `tx` -- confirm which pair is intended.
w = np.zeros((tx_train.shape[1], 1))
loss, weights = reg_logistic_regression(y, tx, lambda_, w, max_iter, gamma)

# 4) Predict Labels

In [None]:
def our_predict_labels(weights, data):
    """Turn the linear scores ``data @ weights`` into class labels.

    Scores strictly greater than 0.5 become 1 and scores less than or equal
    to 0.5 become -1. The labels are written into the score array, so the
    result keeps the score array's shape and dtype.
    """
    scores = np.dot(data, weights)
    # Both masks are taken from the raw scores before anything is overwritten.
    is_negative = scores <= 0.5
    is_positive = scores > 0.5
    scores[is_negative] = -1
    scores[is_positive] = 1
    return scores

In [None]:
# Predict labels for the submission set and for the locally held-out set.
# NOTE(review): `predict_labels`, `best_w`, `tx_test` and `our_tx_test` are
# not defined in the visible cells; the training cell stores its result in
# `weights`, not `best_w` -- confirm which weights are meant here.
y_predict = predict_labels(best_w, tx_test)
our_y_predict = our_predict_labels(best_w, our_tx_test)

## Calculate OUR Accuracy

In [None]:
# NOTE(review): this cell looks unfinished. `np.where()` is called without
# arguments, and `true_num` / `total_num` are not defined in the visible
# cells, so it cannot run as written. Presumably it should count the entries
# of `our_y_predict` that match the held-out labels -- confirm, and check
# that both use the same label encoding (0/1 vs -1/1).
true_pred = np.where()
accuracy = (true_num/total_num)*100

# 5) CSV Submission

In [None]:
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)

    The file gets an ``Id,Prediction`` header followed by one row per
    (id, prediction) pair, both written as integers. ``ids`` and ``y_pred``
    may be flat sequences or column vectors.
    """
    # Imported locally so the function does not depend on which notebook
    # cells were executed before it is called.
    import csv

    import numpy as np

    # Flatten first: iterating an (n, 1) array yields 1-element arrays, and
    # converting those with int() is deprecated in recent NumPy versions.
    flat_ids = np.ravel(ids)
    flat_pred = np.ravel(y_pred)

    # newline='' lets the csv module control line endings, which avoids
    # blank rows between records on platforms that translate newlines.
    with open(name, 'w', newline='') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(flat_ids, flat_pred):
            writer.writerow({'Id': int(r1), 'Prediction': int(r2)})

In [None]:
import csv
# Write the predictions for the submission set to disk.
# NOTE(review): the output name has no ".csv" extension -- confirm that this
# is intended.
create_csv_submission(id_test,y_predict,"fourth try")