In [1]:
import numpy as np
from implementations import *
from cross_validation import *
from data_preprocessing import *
from proj1_helpers import *
import math

In [84]:
def jet_qui_marche(x):
    """
    Build boolean row masks that group the samples by jet number.

    The jet number is read from column index 22 (the 23rd column) of `x`.
    Jet values 2 and 3 are merged into one group.

    Args:
        x: 2-D array whose column 22 holds the jet number.

    Returns:
        dict mapping a group id to a boolean mask over the rows of `x`:
        0 -> jet == 0, 1 -> jet == 1, 2 -> jet == 2 or jet == 3.
    """
    jet_column = x[:, 22]
    two_or_three = (jet_column == 2) | (jet_column == 3)

    return {0: jet_column == 0, 1: jet_column == 1, 2: two_or_three}


def remove_features_qui_marche(tX):
    """
    Drop column 22 and every constant column.

    Column index 22 is removed first. After that, each column whose entries
    all equal the value found in the first row is removed too (for instance
    a column filled only with -999).

    Args:
        tX: 2-D feature array with at least one row and more than 22 columns.

    Returns:
        A new array without column 22 and without constant columns.
    """
    without_col_22 = np.delete(tX, 22, axis=1)
    first_row = without_col_22[0, :]
    is_constant = np.all(without_col_22 == first_row, axis=0)
    return without_col_22[:, ~is_constant]


def replace_aberrant_values_qui_marche(tX):
    '''Replace the aberrant value (-999) of each feature by that feature's mean.

    The mean of a column is computed over its entries that differ from -999.
    A column made only of -999 has no observed value to average; it is filled
    with 0 (the same fallback as replace_missing_data_mean_qui_marche)
    instead of NaN.

    Args:
        tX: 2-D feature array; it is not modified.

    Returns:
        A copy of `tX` in which every -999 has been replaced.
    '''
    tX_repl_feat = np.copy(tX)

    for j in range(tX_repl_feat.shape[1]):
        missing = tX_repl_feat[:, j] == -999
        if not missing.any():
            continue  # nothing to replace in this column

        observed = tX_repl_feat[~missing, j]
        # Guard the empty slice: its mean would be NaN (with a RuntimeWarning).
        fill_value = observed.mean() if observed.size else 0
        tX_repl_feat[missing, j] = fill_value

    return tX_repl_feat


def replace_missing_data_mean_qui_marche(tx):
    """
    Fill every -999 entry with the mean of its column.

    The column mean ignores the -999 entries. When a column holds nothing
    but -999, the value 0 is used instead.

    Args:
        tx: 2-D feature array; it is not modified.

    Returns:
        A copy of `tx` with the -999 entries replaced.
    """
    filled = np.copy(tx)

    for col in range(filled.shape[1]):
        missing = filled[:, col] == -999
        observed = filled[~missing, col]
        filled[missing, col] = observed.mean() if observed.size else 0

    return filled


def standardize_qui_marche(x):
    """
    Center each column of `x` and scale it to unit standard deviation.

    Columns whose standard deviation is zero (constant columns) are only
    centered, so they come out as zeros instead of the NaN that a division
    by zero would produce.

    Args:
        x: array of samples (rows) by features (columns).

    Returns:
        A new standardized array; `x` itself is not modified.
    """
    centered_data = x - np.mean(x, axis=0)
    std = np.std(centered_data, axis=0)

    # Use 1 as divisor wherever the spread is zero, to avoid 0 / 0.
    safe_std = np.where(std > 0, std, 1.0)

    return centered_data / safe_std


def leur_standardize_qui_marche(x):
    """
    Standardize a 2-D data set column by column.

    Every column is centered on its mean. Columns with a strictly positive
    standard deviation are then divided by it; the other columns are left
    centered only.

    Args:
        x: 2-D array of samples (rows) by features (columns).

    Returns:
        A new standardized array; the input is not modified.
    """
    centered = x - np.mean(x, axis=0)
    spread = np.std(centered, axis=0)
    scalable = spread > 0
    centered[:, scalable] = centered[:, scalable] / spread[scalable]

    return centered


def preprocess_data(tX):
    """
    Run the preprocessing chain on a feature matrix.

    The steps are applied in this order:
      1. remove_features_qui_marche
      2. replace_aberrant_values_qui_marche
      3. leur_standardize_qui_marche

    Args:
        tX: 2-D feature array.

    Returns:
        The array returned by the last step.
    """
    pipeline = (
        remove_features_qui_marche,
        replace_aberrant_values_qui_marche,
        leur_standardize_qui_marche,
    )
    for step in pipeline:
        tX = step(tX)

    return tX

def predict_labels_qui_marche(weights, data):
    """
    Turn linear scores into class labels.

    Args:
        weights: weight vector.
        data: data matrix, multiplied by `weights` with np.dot.

    Returns:
        Array of the scores `data . weights` in which every score <= 0 is
        replaced by -1 and every score > 0 is replaced by 1.
    """
    scores = np.dot(data, weights)
    non_positive = scores <= 0
    positive = scores > 0
    scores[non_positive] = -1
    scores[positive] = 1

    return scores

def predict_merge(tX_test, weights, y_pred, indices):
    """
    Predict the labels of one subset and write them into `y_pred`.

    Note the argument order: the data comes first here, whereas
    predict_labels_qui_marche takes the weights first.

    Args:
        tX_test: data matrix of the subset.
        weights: weight vector used for the prediction.
        y_pred: array receiving the predictions; it is modified in place.
        indices: index (e.g. boolean mask) selecting the entries of `y_pred`
            that belong to the subset.

    Returns:
        The same `y_pred` object, after the update. The updated array is
        also printed.
    """
    subset_labels = predict_labels_qui_marche(weights, tX_test)
    y_pred[indices] = subset_labels
    print(y_pred)

    return y_pred


def build_poly_qui_marche(x, degree):
    """
    Polynomial basis expansion of `x` for powers 0 up to `degree`.

    Args:
        x: input data; each power is applied element-wise.
        degree: highest power to include.

    Returns:
        2-D array made of a column of ones followed by `x**1`, `x**2`, ...,
        `x**degree` placed side by side.
    """
    bias = np.ones((len(x), 1))
    powers = [np.power(x, deg) for deg in range(1, degree + 1)]

    # One stacking call instead of growing the matrix at every degree.
    return np.column_stack([bias] + powers)


def ridge_regression_qui_marche(y, tx, lambda_):
    """
    Ridge regression solved through the normal equations.

    Solves (tx^T tx + 2 * N * lambda_ * I) w = tx^T y, where N is the number
    of rows of `tx` and I is the identity matrix.

    Args:
        y: target vector.
        tx: 2-D feature matrix.
        lambda_: regularization strength.

    Returns:
        (w, loss): the solution `w` and compute_loss(y, tx, w). The
        regularization term is not added to the returned loss.
    """
    scale = 2 * tx.shape[0] * lambda_
    regularizer = scale * np.identity(tx.shape[1])

    lhs = tx.T.dot(tx) + regularizer
    rhs = tx.T.dot(y)
    w = np.linalg.solve(lhs, rhs)

    return w, compute_loss(y, tx, w)

In [57]:
# Boolean row masks of the training features, one per jet group.
# NOTE(review): `tX` is assigned by the data-loading cell that appears further
# down in this notebook, so that cell has to be run before this one.
jet_tX = jet_qui_marche(tX)

In [69]:
# Preprocess only the rows of jet group 1.
# NOTE(review): `See` is not used by any other visible cell — presumably an
# exploratory leftover; confirm before removing.
See = preprocess_data(tX[jet_tX[1]])

[[ 1.60937e+02  6.87680e+01  1.03235e+02 ...  7.25000e-01  1.15800e+00
   4.62260e+01]
 [-9.99000e+02  1.62172e+02  1.25953e+02 ...  2.05300e+00 -2.02800e+00
   4.42510e+01]
 [ 1.54916e+02  1.04180e+01  9.47140e+01 ... -7.15000e-01 -1.72400e+00
   3.06380e+01]
 ...
 [-9.99000e+02  7.82560e+01  7.96990e+01 ... -8.52000e-01 -7.06000e-01
   7.89840e+01]
 [ 1.33457e+02  7.75400e+01  8.89890e+01 ... -1.23400e+00  2.52100e+00
   7.09690e+01]
 [ 1.05457e+02  6.05260e+01  7.58390e+01 ...  1.80000e+00 -1.66000e-01
   4.19920e+01]]
[[160.937       68.768      103.235      ...   0.725        1.158
   46.226     ]
 [122.18210934 162.172      125.953      ...   2.053       -2.028
   44.251     ]
 [154.916       10.418       94.714      ...  -0.715       -1.724
   30.638     ]
 ...
 [122.18210934  78.256       79.699      ...  -0.852       -0.706
   78.984     ]
 [133.457       77.54        88.989      ...  -1.234        2.521
   70.969     ]
 [105.457       60.526       75.839      ...   1.8       

In [64]:
# Load the training set. The path lives in its own variable instead of
# temporarily reusing DATA_TEST_PATH for the training file.
print('loading training data'+"\n")
DATA_TRAIN_PATH = '../data/train.csv'
y,tX,ids = load_csv_data(DATA_TRAIN_PATH)
print('training data loaded'+"\n")

# Load the test set; the first value returned by load_csv_data is ignored.
print('loading test data'+"\n")
DATA_TEST_PATH = '../data/test.csv'
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)
print('test data loaded'+"\n")

loading training data

training data loaded

loading test data

test data loaded



In [86]:
# Load the training set. The path lives in its own variable instead of
# temporarily reusing DATA_TEST_PATH for the training file.
print('loading training data'+"\n")
DATA_TRAIN_PATH = '../data/train.csv'
y,tX,ids = load_csv_data(DATA_TRAIN_PATH)
print('training data loaded'+"\n")

# Load the test set; the first value returned by load_csv_data is ignored.
print('loading test data'+"\n")
DATA_TEST_PATH = '../data/test.csv'
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)
print('test data loaded'+"\n")

# generate the dictionaries of row masks for each jet number subset
jet_tX = jet_qui_marche(tX)
jet_tX_test = jet_qui_marche(tX_test)

# Hyper-parameters per jet group: position k is used for jet group k.
lambdas = [0.00001, 0.001, 0.0001]
degrees = [11, 12, 12]

weights = []

# Predictions for the whole test set, filled in group by group below.
y_pred = np.zeros(tX_test.shape[0])

# get the weights of ridge regression run on each train subset
for i in range(len(jet_tX)):
    
    # preprocess every train subset
    preprocessed_tX = preprocess_data(tX[jet_tX[i]])
    
    # build polynomial expansion on train subset
    tX_poly = build_poly_qui_marche(preprocessed_tX, degrees[i])
    
    # run ridge regression on train subset
    w, loss = ridge_regression_qui_marche(y[jet_tX[i]], tX_poly, lambdas[i])
    
    weights.append(w)
    
# generate the predictions on each test subset using the computed weights
for j in range(len(jet_tX_test)):
    
    # preprocess every test subset
    # NOTE(review): preprocess_data is applied to the test subset on its own,
    # so the removed columns, the fill means and the standardization come from
    # the test rows rather than from the matching training subset.
    preprocessed_tX_test = preprocess_data(tX_test[jet_tX_test[j]])
    
    # build polynomial expansion on test subset
    tX_test_poly = build_poly_qui_marche(preprocessed_tX_test, degrees[j])
    
    # use weights to compute the predictions (predict_merge also prints y_pred)
    y_pred = predict_merge(tX_test_poly, weights[j], y_pred, jet_tX_test[j])
    
    
OUTPUT_PATH = '../data/submission_splitt.csv'
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

loading training data

training data loaded

loading test data

test data loaded

[-1.  0. -1. ...  1.  0. -1.]
[-1. -1. -1. ...  1. -1. -1.]
[-1. -1. -1. ...  1. -1. -1.]
