In [1]:
import numpy as np
from helpers import *
import matplotlib.pyplot as plt
np.random.seed(0)

In [2]:
yb, input_data, ids = load_csv_data('train.csv', sub_sample=False)

In [3]:
yb

array([ 1., -1., -1., ...,  1., -1., -1.])

In [4]:
ids

array([100000, 100001, 100002, ..., 349997, 349998, 349999])

# Processing data
1. add bias in data
2. split column "mn ’PRI jet num'
3. replace -999 with average of whole train
4. split data 0.8:0.2
5. normalize
6. balance data

In [5]:
data = np.c_[np.ones((input_data.shape[0],1)),input_data]
jet_num = np.zeros((len(data),4))
for i in range(4):
    row = np.where(data[:,23] == i)[0]
    jet_num[row,i] = 1
data[:,:0]
data = np.c_[data[:,:23],data[:,24:]]
data = np.c_[data,jet_num]
# data[:5,21:25]

In [6]:
pos = np.where(data == -999)
data[data == -999] = np.nan
c_mean = np.nanmean(data,axis=0)
data[np.isnan(data)] = c_mean[pos[1]]

In [7]:
yb[yb>0] = 0
yb[yb<0] = 1
yb

array([0., 1., 1., ..., 0., 1., 1.])

In [8]:
# s = 0;b = 1 at first
# s = 1;b = -1 now from the helpers
num_samples = data.shape[0]

fraction_train = 0.8
np.random.seed(0)
rinds = np.random.permutation(num_samples)

d_train = data[rinds[:int(num_samples * fraction_train)]] 
yb_train = yb[rinds[:int(num_samples * fraction_train)]]  

d_test = data[rinds[int(num_samples * fraction_train):]] 
yb_test = yb[rinds[int(num_samples * fraction_train):]]  

In [9]:
d_train

array([[  1.        , 128.224     ,  76.169     , ...,   0.        ,
          0.        ,   0.        ],
       [  1.        , 121.85852836,  79.879     , ...,   0.        ,
          0.        ,   0.        ],
       [  1.        , 103.989     ,  60.866     , ...,   1.        ,
          0.        ,   0.        ],
       ...,
       [  1.        , 121.85852836,  67.554     , ...,   0.        ,
          0.        ,   0.        ],
       [  1.        , 125.188     ,  88.344     , ...,   0.        ,
          0.        ,   0.        ],
       [  1.        ,  72.741     ,  95.563     , ...,   1.        ,
          0.        ,   0.        ]])

In [10]:
# Normalize
mean = np.mean(d_test, axis=0)
sdt = np.std(d_test)
data_train = (d_train - mean) / sdt
data_test = ( d_test - mean) / sdt

#problem imbalance classes
#add additional weight to features which sample was represented in the minority
#np.count_nonzero(labels == 0) # 85667 - multiply weight for these samples to 2
#np.count_nonzero(labels == 1) # 164333 - multiply weight for these samples to 1

imbalance = yb_train.copy()
imbalance[np.where(imbalance == 0)] = 2
imbalance


array([2., 1., 1., ..., 1., 1., 1.])

## Train

In [11]:
def sigmoid(pred):
    """ Sigmoid function
    
    Args:
        pred (np.array): Input data of shape (N, ) 
        
    Returns:
        np.array: Probabilites of shape (N, ), where each value is in [0, 1].
    """
    pos = pred >= 0
    neg = pred < 0
    pred[pos] = 1 / (1 + np.exp(-pred[pos]))
    pred[neg] = 1 - 1 / (1 + np.exp(pred[neg]))
    return pred

def loss_logistic(data, labels, w, alpha): 
    """ Logistic regression loss function for binary classes
    
    Args:
        data (np.array): Dataset of shape (N, D).
        labels (np.array): Labels of shape (N, ).
        w (np.array): Weights of logistic regression model of shape (D, )
        alpha (int) : regularization factor
    Returns:
        int: Loss of logistic regression.
    """
    
    return np.mean((-np.log(sigmoid(-data@w)) - labels*data.dot(w))*imbalance) + (alpha/2)*np.linalg.norm(w)**2

def logistic_regression_classify(data, w):
    """ Classification function for binary class logistic regression. 
    
    Args:
        data (np.array): Dataset of shape (N, D).
        w (np.array): Weights of logistic regression model of shape (D, )
    Returns:
        np.array: Label assignments of data of shape (N, )
    """
    #### find predictions and threshold.
    predictions = sigmoid(data.dot(w)) 
    predictions[predictions<0.5]=0
    predictions[predictions>=0.5]=1        
    return predictions

def gradient_logistic(data, labels, w, alpha):
    """ Logistic regression gradient function for binary classes
    
    Args:
        data (np.array): Dataset of shape (N, D).
        labels (np.array): Labels of shape (N, ).
        w (np.array): Weights of logistic regression model of shape (D, )
        alpha (int) : regularization factor
    Returns:
        np. array: Gradient array of shape (D, )
    """
    return data.T.dot((sigmoid(data.dot(w))-labels)*imbalance) / data.shape[0] + alpha*w

def accuracy(labels_tr, labels_pred):
    """ Computes accuracy.
    
    Args:
        labels_gt (np.array): GT labels of shape (N, ).
        labels_pred (np.array): Predicted labels of shape (N, ).
        
    Returns:
        float: Accuracy, in range [0, 1].
    """
    
    return np.sum(labels_tr == labels_pred) / labels_tr.shape[0]

def logistic_regression_train(data, labels, max_iters=10, lr=0.001, alpha=1e-3):
    """ Training function for binary class logistic regression. 
    
    Args:
        data (np.array): Dataset of shape (N, D).
        labels (np.array): Labels of shape (N, ).
        max_iters (integer): Maximum number of iterations. Default:10
        lr (float): The learning rate of  the gradient step. Default:0.001
        alpha (int) : regularization factor
        
    Returns:
        np.array: weights of shape(D, )
    """

    #initialize the weights randomly according to a Gaussian distribution
    weights = np.random.normal(0., 0.01, [data.shape[1],])
    for it in range(max_iters):
        ########## find gradient and do a gradient step
        xx = data @ weights
        gradient = gradient_logistic(data, labels, weights,alpha)
        weights = weights - lr*gradient
        ##################################
        predictions = logistic_regression_classify(data, weights)
        print(f'loss: {loss_logistic(data, labels, weights,alpha) : .5f}, acc: {accuracy(labels, predictions): .5f}')
        
    return weights, loss_logistic(data, labels, weights,alpha)

In [12]:
weights, loss = logistic_regression_train(data_train, yb_train, max_iters=100, lr=5e-4,alpha=1e-7)

loss:  0.92544, acc:  0.62918
loss:  0.92538, acc:  0.62938
loss:  0.92531, acc:  0.62971
loss:  0.92525, acc:  0.62993
loss:  0.92518, acc:  0.63019
loss:  0.92512, acc:  0.63047
loss:  0.92505, acc:  0.63078
loss:  0.92499, acc:  0.63108
loss:  0.92493, acc:  0.63128
loss:  0.92486, acc:  0.63158
loss:  0.92480, acc:  0.63187
loss:  0.92473, acc:  0.63206
loss:  0.92467, acc:  0.63228
loss:  0.92461, acc:  0.63239
loss:  0.92454, acc:  0.63260
loss:  0.92448, acc:  0.63287
loss:  0.92442, acc:  0.63315
loss:  0.92435, acc:  0.63339
loss:  0.92429, acc:  0.63357
loss:  0.92423, acc:  0.63377
loss:  0.92417, acc:  0.63396
loss:  0.92410, acc:  0.63408
loss:  0.92404, acc:  0.63428
loss:  0.92398, acc:  0.63462
loss:  0.92392, acc:  0.63475
loss:  0.92386, acc:  0.63496
loss:  0.92379, acc:  0.63513
loss:  0.92373, acc:  0.63516
loss:  0.92367, acc:  0.63539
loss:  0.92361, acc:  0.63552
loss:  0.92355, acc:  0.63567
loss:  0.92349, acc:  0.63579
loss:  0.92343, acc:  0.63599
loss:  0.9

## Test

In [13]:
weights, loss

(array([-0.00201736, -0.00502268,  0.01094692, -0.01781726, -0.02185558,
        -0.01154768, -0.02055961,  0.00703959,  0.00636955,  0.01129834,
        -0.02018703,  0.00267791, -0.00360718,  0.0052221 ,  0.01782481,
        -0.00438853, -0.00424051,  0.00963502,  0.00152777, -0.00694184,
         0.00103654,  0.013164  , -0.01991231,  0.00703761,  0.02435765,
        -0.01250688,  0.02185152, -0.0025418 ,  0.00755125,  0.00844345,
         0.01914765, -0.01168945,  0.00505958,  0.00237452]),
 0.9196352490949115)

In [14]:
yb_te, data_test, ids_te = load_csv_data('test.csv', sub_sample=False)
test = np.c_[np.ones((data_test.shape[0],1)),data_test]
jet_num = np.zeros((len(test),4))
for i in range(4):
    row = np.where(test[:,23] == i)[0]
    jet_num[row,i] = 1
test = np.c_[test[:,:23],test[:,24:]]
test = np.c_[test,jet_num]
pos = np.where(test == -999)
test[test == -999] = np.nan
c_mean = np.nanmean(test,axis=0)
test[np.isnan(test)] = c_mean[pos[1]]

In [15]:
test.shape

(568238, 34)

In [16]:
norm_test = (test - mean) / sdt
test_predict = logistic_regression_classify(norm_test, weights)
test_predict[test_predict == 1] = -1
test_predict[test_predict == 0] = 1
create_csv_submission(ids_te, test_predict, "submission.csv")