In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [3]:
from proj1_helpers import *

DATA_TRAIN_PATH = 'train.csv' # TODO: download train data and supply path here 
y, tX, ids = load_csv_data(DATA_TRAIN_PATH, sub_sample=False)

print(y.shape)
print(tX.shape)

(250000,)
(250000, 30)


## Cleaning and standarization of x

In [4]:
def clean_tx(tx, threshold):
    row, col = tx.shape[0], tx.shape[1]
    empty = np.zeros(col)
    for i in range(col):
        count = np.count_nonzero(tx[:,i] == -999)
        empty[i] = count/row    
    
    return tx[:,(empty <= threshold)]
        
print(clean_tx(tX, 0.15).shape)

(250000, 19)


In [5]:
def standardize(x):
    """Standardize the original data set."""
    mean_x = np.mean(x, axis=0)
    std_x = np.std(x, axis=0)
    return (x - mean_x) / std_x

In [6]:
clean_tX = clean_tx(tX, 0.15)
std_tX = standardize(clean_tX)

In [7]:
row = std_tX.shape[0]
model_data = np.c_[np.ones(row), std_tX]

## Change of y

In [35]:
model_y = np.where(y == -1, 0, y)

# Do your thing crazy machine learning thing here :) ...


In [8]:
initial_w = np.zeros(model_data.shape[1])
gamma = 0.7
max_iters = 100

In [23]:
def least_squares_SGD(y, tx, initial_w, max_iters, gamma):
    w = initial_w
    loss = 0
    for i in range(max_iters):
        for batch_y, batch_x in batch_iter(y, tx):
            gradient,_ = compute_gradient(batch_y, batch_x, w)
            w = w - gamma*gradient
            loss = compute_mse(y - (batch_x @ w))
            print("Stochastic Gradient Descent({bi}/{ti}): loss={l}".format(
              bi=i, ti=max_iters - 1, l=loss))
    return w, loss

least_squares_SGD(y, model_data, initial_w, max_iters, gamma)

Stochastic Gradient Descent(0/99): loss=58.09307362488482
Stochastic Gradient Descent(1/99): loss=573.4605838780876
Stochastic Gradient Descent(2/99): loss=4508.326370049001
Stochastic Gradient Descent(3/99): loss=113277.33810345783
Stochastic Gradient Descent(4/99): loss=3837.1114137663158
Stochastic Gradient Descent(5/99): loss=59815.58030304029
Stochastic Gradient Descent(6/99): loss=319477.0153872045
Stochastic Gradient Descent(7/99): loss=630024.5558809399
Stochastic Gradient Descent(8/99): loss=301379.67805480555
Stochastic Gradient Descent(9/99): loss=22449361.06468936
Stochastic Gradient Descent(10/99): loss=32241410.263458643
Stochastic Gradient Descent(11/99): loss=1043376602.8759407
Stochastic Gradient Descent(12/99): loss=873055059.6739109
Stochastic Gradient Descent(13/99): loss=2985145246.767544
Stochastic Gradient Descent(14/99): loss=9024491665.861437
Stochastic Gradient Descent(15/99): loss=350702098.8435756
Stochastic Gradient Descent(16/99): loss=23447558.99928449
St

(array([[ 6.21018206e+37],
        [ 3.43355266e+37],
        [-1.80582317e+37],
        [-4.16438683e+37],
        [-1.46138710e+37],
        [-2.06904252e+37],
        [-3.77905647e+37],
        [ 1.84761256e+37],
        [-3.18490000e+37],
        [-2.54143238e+37],
        [-7.24006731e+37],
        [-1.92296985e+36],
        [-1.61175212e+36],
        [-8.69293661e+37],
        [-7.41157630e+37],
        [-6.82152543e+36],
        [ 2.00451494e+37],
        [-4.05825689e+37],
        [-4.41870115e+37],
        [-3.84090473e+37]]),
 2.916512589639328e+77)

## Logistic regression

In [36]:
initial_w = np.zeros(model_data.shape[1])
gamma = 0.7
max_iters = 100

In [44]:
def batch_iter(y, tx, batch_size=1, num_batches=1, shuffle=True):
    """
    Generate a minibatch iterator for a dataset.
    Takes as input two iterables (here the output desired values 'y' and the input data 'tx')
    Outputs an iterator which gives mini-batches of `batch_size` matching elements from `y` and `tx`.
    Data can be randomly shuffled to avoid ordering in the original data messing with the randomness of the minibatches.
    Example of use :
    for minibatch_y, minibatch_tx in batch_iter(y, tx, 32):
        <DO-SOMETHING>
    """
    data_size = len(y)

    if shuffle:
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_y = y[shuffle_indices]
        shuffled_tx = tx[shuffle_indices]
    else:
        shuffled_y = y
        shuffled_tx = tx
    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        end_index = min((batch_num + 1) * batch_size, data_size)
        if start_index != end_index:
            yield shuffled_y[start_index:end_index], shuffled_tx[start_index:end_index]



In [38]:
def sigmoid(x):
    return 1/(1 + np.exp(-x))

In [39]:
def calculate_loss(y, tx, w):
    sig = sigmoid(tx @ w)
    loss = y.T @ np.log(sig) + (1 - y).T @ np.log(1 - sig)
    return -loss

In [40]:
def gradient_computation(y, tx, w):
    sig = sigmoid(tx @ w)
    grad = tx.T @ (sig.flatten() - y)
    return grad

In [41]:
def regression(y, tx, w, gamma):
    loss = calculate_loss(y, tx, w)
    grad = gradient_computation(y, tx, w)
    w = w.flatten() - gamma*grad
    return w, loss

In [46]:
def logistic_regression(y, tx, initial_w, max_iters, gamma):
    w = initial_w
    losses = []
    threshold = 1e-8
    for iter in range(max_iters):
        for batch_y, batch_x in batch_iter(y, tx):
            w, loss = regression(batch_y, batch_x, w, gamma)
        
        
        #if iter % 100 == 0:
           # print("Current iteration={i}, loss={l}".format(i=iter, l=loss))
        # converge criterion
            losses.append(loss)
        #if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            #break
    # visualization
    #visualization(y, x, mean_x, std_x, w, "classification_by_logistic_regression_gradient_descent")
    #print("loss={l}".format(l=calculate_loss(y, tx, w)))
    return w, losses[-1]
        
initial_w = np.zeros((model_data.shape[1], 1))
w_final, loss_final = logistic_regression(model_y, model_data, initial_w, max_iters, gamma)
print(loss_final)
#print(model_data.shape)

9.896730967031678e-07


In [48]:
def prediction(y, tx, w):
    pred = sigmoid(tx @ w)
    pred = np.where(pred > 0.5, 1, pred)
    pred = np.where(pred <= 0.5, 0, pred)
    count = 0
    for i, j in zip(y, pred):
        if i == j:
            count += 1
    return count/len(y)
    
prediction(model_y, model_data, w_final)

0.6585

## Generate predictions and save ouput in csv format for submission:

In [10]:
DATA_TEST_PATH = '' # TODO: download train data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [31]:
OUTPUT_PATH = '' # TODO: fill in desired name of output file for submission
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)