In [2]:
import numpy as np
import matplotlib.pyplot as plt
from extra_helpers import *
from feature_importance import *
from proj1_helpers import *
from data_processing import *
from implementations import *
from objective_functions import *
from run_functions import *

## Removing constant features and imputing

In [3]:
## 1 - Load the response vector, the data matrix and the ids from the training CSV.
y, tX, ids = load_csv_data("data/train.csv")

## 2 - Replace every "-1" in the response vector by "0", which is the convention
##     used in the major part of the scientific literature.
##     (A boolean mask selects the same entries as np.where would.)
y[y == -1] = 0

## 3 - Move the categorical feature (PRI_jet_num) to the last column of the data
##     matrix with the rearrange_continuous_categorical_features() helper.
tX = rearrange_continuous_categorical_features(tX)

## The categorical feature now sits in the last column: isolate that column and,
## for each possible case (PRI_jet_num = 0, 1, 2 or 3), collect the indexes of the
## rows that belong to it.
categories = tX[:, -1]
zeros_index, one_index, two_index, three_index = (
    np.flatnonzero(categories == jet_num) for jet_num in (0, 1, 2, 3)
)

## The same cleaning pipeline is applied to each PRI_jet_num subset, so it lives
## in one helper instead of four copy-pasted stanzas.
def _clean_jet_subset(row_index, missing_value=-999):
    """Build the cleaned feature matrix for one PRI_jet_num subset.

    Steps, in order:
      1. select the rows `row_index` of the global `tX` (integer-array
         indexing, so the result is a copy and `tX` is left untouched);
      2. find the columns that are constant within the subset and drop them;
      3. replace every `missing_value` entry by np.nan;
      4. pass the result through median_imputation().

    Parameters
    ----------
    row_index : array of int
        Row indexes of `tX` belonging to the subset.
    missing_value : number, optional
        Placeholder that marks a missing entry in the raw data (default -999).

    Returns
    -------
    (features, null_var_index)
        `features` is whatever median_imputation() returns for the subset;
        `null_var_index` holds the indexes (in the original column numbering)
        of the constant columns that were removed.
    """
    subset = tX[row_index, :]
    # A column is constant exactly when max == min. Testing `np.std(...) == 0`
    # is an exact float comparison that can miss a constant non-integer column,
    # because rounding in the mean leaves a tiny non-zero standard deviation.
    # The check runs while the placeholder is still a plain number, so columns
    # that are entirely missing count as constant and are dropped too.
    null_var_index = np.flatnonzero(np.ptp(subset, axis=0) == 0)
    subset = np.delete(subset, null_var_index, axis=1)
    subset[subset == missing_value] = np.nan
    return median_imputation(subset), null_var_index


zeros, null_var_index_zero = _clean_jet_subset(zeros_index)
y_zero = y[zeros_index]

ones, null_var_index_one = _clean_jet_subset(one_index)
y_one = y[one_index]

two, null_var_index_two = _clean_jet_subset(two_index)
y_two = y[two_index]

three, null_var_index_three = _clean_jet_subset(three_index)
y_three = y[three_index]

## Percentage of Higgs boson signals

In [7]:
## For each PRI_jet_num subset, print the percentage of responses equal to 1.
## A single loop replaces four copy-pasted print statements; the printed text is
## unchanged, including the blank line between (but not after) the subsets.
for jet_value, jet_labels in enumerate((y_zero, y_one, y_two, y_three)):
    if jet_value > 0:
        print("")
    # count_nonzero returns a plain int, so the arithmetic below is the same
    # int/int division and rounding as before.
    signal_count = np.count_nonzero(jet_labels == 1)
    signal_percentage = round(100*signal_count/len(jet_labels), 2)
    print("Percentage of Higgs boson signals when PRI jet num = {} : {}%".format(jet_value, signal_percentage))

Percentage of Higgs boson signals when PRI jet num = 0 : 25.51%

Percentage of Higgs boson signals when PRI jet num = 1 : 35.73%

Percentage of Higgs boson signals when PRI jet num = 2 : 51.08%

Percentage of Higgs boson signals when PRI jet num = 3 : 30.37%


## Skewness of the features for each PRI jet num value

In [8]:
## Print the output of skewness() for each PRI_jet_num subset.
## A single loop replaces four copy-pasted print statements; the printed text is
## unchanged, including the blank line between (but not after) the subsets.
for jet_value, jet_features in enumerate((zeros, ones, two, three)):
    if jet_value > 0:
        print("")
    print(f"Skewness when PRI jet num = {jet_value} : \n\n {skewness(jet_features)}")

Skewness when PRI jet num = 0 : 

 [ 3.32210970e+00  6.75710692e-01  3.00318472e+00  4.91895516e+01
 -7.14605318e-01  4.91895519e+01  4.65605776e+00  1.03876731e+00
  1.81590440e+00  4.12353948e+00  3.19948264e-02  2.14761578e-02
  2.85321143e+00  4.74951711e-02 -4.51553490e-02  2.76416777e+01
  2.29980818e-02  1.05309562e+00]

Skewness when PRI jet num = 1 : 

 [ 3.93743706e+00  1.36595160e+00  3.56960446e+00  2.90229506e+00
 -7.96101507e-02  2.10482402e+00  3.22864384e+00  2.07951502e+00
 -3.52990443e-01  3.30608143e+00  1.30196484e-02  1.40723733e-02
  2.72262348e+00  8.76503388e-03 -5.03918342e-02  2.80466277e+00
  9.49795155e-04  1.81118494e+00  3.33666979e+00  6.52163323e-03
  1.80845587e-02  3.33666972e+00]

Skewness when PRI jet num = 2 : 

 [ 4.70155483e+00  2.03161213e+00  4.62046124e+00  1.64401738e+00
  4.38950974e-01  2.32664185e+00 -2.60638936e-01  3.01166130e-01
  3.43361519e+00  2.11196422e+00  2.55295640e+00 -1.00600705e+00
 -7.39741802e-02  3.12669617e+00  7.88549329e

## Showing the distribution of missing values across PRI jet num states

In [9]:
## We isolate again the four datasets corresponding to the different cases of
## PRI_jet_num, this time keeping every column and doing no imputation.
## For each of them we transform all -999 values to np.nan.
## NOTE: this overwrites the imputed `zeros`, `ones`, `two` and `three` built in
## the first preprocessing cell; the cells below see the NaN-marked versions.
def _subset_with_nan(row_index, missing_value=-999):
    """Return the rows `row_index` of `tX` with `missing_value` replaced by np.nan.

    Integer-array indexing returns a copy, so the in-place replacement below
    does not alter `tX` itself.
    """
    subset = tX[row_index, :]
    subset[subset == missing_value] = np.nan
    return subset


zeros = _subset_with_nan(zeros_index)
y_zero = y[zeros_index]

ones = _subset_with_nan(one_index)
y_one = y[one_index]

two = _subset_with_nan(two_index)
y_two = y[two_index]

three = _subset_with_nan(three_index)
y_three = y[three_index]

In [10]:
## Print, for each PRI_jet_num subset, the number of np.nan entries per column.
## The count uses the vectorised `.sum(axis=0)` rather than the builtin sum(),
## which would walk the 2-D array one row at a time in Python; the resulting
## per-column counts are the same.
## A single loop replaces four copy-pasted print statements; the printed text is
## unchanged, including the blank line between (but not after) the subsets.
for jet_value, jet_features in enumerate((zeros, ones, two, three)):
    if jet_value > 0:
        print("")
    missing_per_column = np.isnan(jet_features).sum(axis=0)
    print(f"Missing values when PRI jet num = {jet_value} : \n\n {missing_per_column}")

Missing values when PRI jet num = 0 : 

 [26123     0     0     0 99913 99913 99913     0     0     0     0     0
 99913     0     0     0     0     0     0     0     0     0 99913 99913
 99913 99913 99913 99913     0     0]

Missing values when PRI jet num = 1 : 

 [ 7562     0     0     0 77544 77544 77544     0     0     0     0     0
 77544     0     0     0     0     0     0     0     0     0     0     0
     0 77544 77544 77544     0     0]

Missing values when PRI jet num = 2 : 

 [2952    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]

Missing values when PRI jet num = 3 : 

 [1477    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


## Showing that the set of significant features depends on the state of PRI jet num

In [11]:
## Print the output of significant_features() for each PRI_jet_num subset.
## A single loop replaces four copy-pasted print statements; the printed text is
## unchanged, including the blank line between (but not after) the subsets.
## NOTE(review): `zeros`, `ones`, `two` and `three` are at this point the
## NaN-marked, non-imputed subsets from the missing-values cell above, not the
## imputed ones from the first preprocessing cell - confirm this is the input
## significant_features() is meant to receive.
jet_subsets = (
    (y_zero, zeros),
    (y_one, ones),
    (y_two, two),
    (y_three, three),
)
for jet_value, (jet_labels, jet_features) in enumerate(jet_subsets):
    if jet_value > 0:
        print("")
    print(f"Significant features at 50% when PRI jet num = {jet_value} : \n\n {significant_features(jet_labels, jet_features)}")

  test = t_mean/np.sqrt(t_var)


Significant features at 50% when PRI jet num = 0 : 

 [0 1 1 1 0 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 0 0 0 0 0 0 0]

Significant features at 50% when PRI jet num = 1 : 

 [0 1 1 1 0 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 1 0]

Significant features at 50% when PRI jet num = 2 : 

 [0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 0 1 1 0]

Significant features at 50% when PRI jet num = 3 : 

 [0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 0 0 1 0 1 1 1 0 1 0]
