In [1]:
import os

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from proj1_helpers import *
from helpers import batch_iter

In [2]:
# Location of the training CSV. Set the DATA_TRAIN_PATH environment variable to point the
# notebook at your own copy; the fallback is the original absolute path, which only exists
# on the machine the notebook was written on.
DATA_TRAIN_PATH = os.environ.get(
    'DATA_TRAIN_PATH',
    'D:/dev/EPFL/Machine Learning/Project 1/Repo/data/train.csv',
)

# Array form of the data set (load_csv_data comes from the proj1_helpers star import).
Y, X, ids = load_csv_data(DATA_TRAIN_PATH)

# The same file as a DataFrame, used further down for column names and exploratory plots.
df = pd.read_csv(DATA_TRAIN_PATH)

In [3]:
MISSING_VAL = -999  # sentinel value that marks a missing entry

def clean_data(x):
    """
    Replace missing entries with the mean of the observed entries of the same column.

    An entry counts as missing when it equals MISSING_VAL (within 1e-8).

    Args:
        x: 2-D array-like of shape (n_samples, n_features).

    Returns:
        A new floating-point ndarray of the same shape with the missing entries imputed.
        The input is never modified. A column in which every entry is missing has no
        observed mean and comes back filled with NaN.
    """
    # np.array copies, so the caller's data stays untouched (the previous version wrote the
    # means straight into the argument).
    cleaned = np.array(x)
    # Means are fractional in general; an integer array would silently truncate them.
    if not np.issubdtype(cleaned.dtype, np.floating):
        cleaned = cleaned.astype(np.float64)

    # Boolean mask of the missing entries, computed once for the whole matrix.
    missing = np.abs(cleaned - MISSING_VAL) < 1e-8

    for col in range(cleaned.shape[1]):
        col_missing = missing[:, col]
        # Mean over the observed values only, written into the missing slots of that column.
        cleaned[col_missing, col] = np.mean(cleaned[~col_missing, col])

    return cleaned

def normalize(x, axis=None):
    """Center the data on its mean and scale it by its standard deviation.

    Args:
        x: array-like of numbers.
        axis: axis along which mean and standard deviation are taken. The default, None,
            uses a single mean and a single standard deviation computed over all entries.
            Pass axis=0 to standardize every column of a 2-D array separately.

    Returns:
        A tuple (normalized, mean_x, std_x). With axis=None the statistics are scalars;
        otherwise they keep the reduced axis with length 1 so they broadcast against x.
        std_x is the standard deviation actually measured, which may be 0.
    """
    # Keep the reduced axis only when one was requested, so the default still yields scalars.
    keepdims = axis is not None
    mean_x = np.mean(x, axis=axis, keepdims=keepdims)
    centered = x - mean_x
    std_x = np.std(centered, axis=axis, keepdims=keepdims)
    # Constant data has zero spread; dividing by 1 there returns zeros instead of NaN (0/0).
    safe_std = np.where(std_x == 0, 1.0, std_x)
    return centered / safe_std, mean_x, std_x

def clean_and_normalize(x):
    """Run clean_data on x, then normalize on the result.

    Returns whatever normalize returns: (normalized data, mean, standard deviation).
    """
    cleaned = clean_data(x)
    return normalize(cleaned)
    
# Smoke test on a tiny array that contains no MISSING_VAL entries, so only the
# normalization step has a visible effect.
x, mean_x, std_x = clean_and_normalize(np.array([[1, 1],[2, 2], [1.5, 1.5]], dtype=np.float64))

x

array([[-1.22474487, -1.22474487],
       [ 1.22474487,  1.22474487],
       [ 0.        ,  0.        ]])

In [4]:
# Keep only the first element of the returned tuple (the normalized data); the mean and
# standard deviation are discarded here.
# NOTE(review): this rebinds X, so the raw matrix loaded above is no longer reachable by name.
X = clean_and_normalize(X)[0]

X

array([[ 0.9292726 ,  0.04457539,  0.51509592, ..., -0.46918386,
        -0.50704195,  0.67478267],
       [ 1.15822487,  0.21896718,  0.5702067 , ..., -0.48194091,
        -0.48183633, -0.01074939],
       [ 0.75999168,  1.17081027,  0.80171682, ..., -0.48194091,
        -0.48183633, -0.03087583],
       ..., 
       [ 0.59285022,  0.13497623,  0.29102494, ..., -0.48194091,
        -0.48183633, -0.0538964 ],
       [ 0.48578774, -0.28450975,  0.21941557, ..., -0.48194091,
        -0.48183633, -0.4818202 ],
       [ 0.75999168,  0.25960731,  0.2399904 , ..., -0.48194091,
        -0.48183633, -0.4818202 ]])

In [None]:
# Column index 1 of the processed matrix, first 1000 rows.
plt.plot(X[:1000, 1])
plt.show()

In [None]:
# Summary statistics of the CSV exactly as read from disk (no cleaning applied to df).
df.describe()

In [None]:
# Wrap the processed matrix X in a DataFrame, reusing the CSV's column names minus
# "Id" and "Prediction".
df1 = pd.DataFrame(X, columns=df.columns.drop(["Id", "Prediction"]))

In [None]:
# Summary statistics of the processed features held in df1.
df1.describe()

In [None]:
# Percentage of missing values (-999) in each column other than "Id" and "Prediction"
(df.drop(columns=["Id", "Prediction"]).eq(-999).sum() / len(df) * 100).round(2)

In [None]:
# One line plot per numeric column of the raw CSV (first 1000 rows), with values that are
# not greater than -999 blanked out so the missing-value sentinel does not flatten the axes.
# Only numeric columns are kept: comparing a string column against -999 raises a TypeError.
# The mask is built on the 1000 plotted rows rather than on the whole frame.
(
    df.head(1000)
    .select_dtypes(include="number")
    .pipe(lambda frame: frame.where(frame > -999))
    .plot(subplots=True, kind='line', figsize=(12, 200))
)
plt.show()

In [None]:
# One line plot per column of df1, restricted to its first 1000 rows.
df1.iloc[:1000].plot(kind='line', subplots=True, figsize=(12, 200))
plt.show()

In [None]:
# Box plot of every numeric column except "Id" (first 1000 rows), with values that are not
# greater than -999 blanked out. Only numeric columns are kept: comparing a string column
# against -999 raises a TypeError. The mask is built on the plotted rows only.
(
    df.drop(columns=["Id"])
    .head(1000)
    .select_dtypes(include="number")
    .pipe(lambda frame: frame.where(frame > -999))
    .plot(kind='box', figsize=(14, 100))
)
plt.show()

In [None]:
def sigmoid(t):
    """Logistic function 1 / (1 + exp(-t)), evaluated element-wise without overflow.

    Args:
        t: scalar or array-like of real numbers.

    Returns:
        Values in [0, 1] with the same shape as t (a NumPy scalar for scalar input).
    """
    t = np.asarray(t)
    # exp(-|t|) never exceeds 1, so it cannot overflow, unlike exp(-t) for very negative t.
    decay = np.exp(-np.abs(t))
    # t >= 0: 1 / (1 + exp(-t)).  t < 0: the algebraically equal exp(t) / (1 + exp(t)).
    result = np.where(t >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves real arrays as they are.
    return result[()]


def compute_log_likelihood(y, tx, w):
    """Negative log-likelihood of logistic regression, summed over all samples.

    Computes sum_i [ log(1 + exp(tx_i . w)) - y_i * (tx_i . w) ].

    Args:
        y: targets, shape (N,) or (N, 1).
            NOTE(review): this formula is the one for targets coded as 0/1 - confirm that
            the labels passed in use that coding.
        tx: design matrix, shape (N, D).
        w: weights, shape (D,) or (D, 1).

    Returns:
        The summed loss: a scalar when y and w are both 1-D, an array of shape (1,) when
        either of them is a column.
    """
    y = np.asarray(y)
    y_est = np.dot(tx, w)

    # Pad trailing axes so y and y_est have the same rank; sample i of one then pairs with
    # sample i of the other (e.g. y of shape (N,) against predictions of shape (N, 1)).
    rank = max(y.ndim, y_est.ndim)
    y = y.reshape(y.shape + (1,) * (rank - y.ndim))
    y_est = y_est.reshape(y_est.shape + (1,) * (rank - y_est.ndim))

    # logaddexp(0, z) equals log(1 + exp(z)) but stays finite for large z, where exp(z)
    # alone overflows to inf.
    return np.sum(np.logaddexp(0, y_est) - y * y_est, axis=0)

def compute_log_likelihood_penalized(y, tx, w, lambda_):
    """Negative log-likelihood of logistic regression plus an L2 penalty on the weights.

    Computes sum_i [ log(1 + exp(tx_i . w)) - y_i * (tx_i . w) ] + lambda_ * w^T w.

    Args:
        y: targets, shape (N,) or (N, 1).
            NOTE(review): this formula is the one for targets coded as 0/1 - confirm that
            the labels passed in use that coding.
        tx: design matrix, shape (N, D).
        w: weights, shape (D,) or (D, 1).
        lambda_: penalty strength multiplying w^T w.

    Returns:
        The penalized loss: a scalar for 1-D y and w, an array of shape (1, 1) when w is a
        (D, 1) column.
    """
    y = np.asarray(y)
    y_est = np.dot(tx, w)

    # Pad trailing axes so y and y_est have the same rank; sample i of one then pairs with
    # sample i of the other (e.g. y of shape (N,) against predictions of shape (N, 1)).
    rank = max(y.ndim, y_est.ndim)
    y = y.reshape(y.shape + (1,) * (rank - y.ndim))
    y_est = y_est.reshape(y_est.shape + (1,) * (rank - y_est.ndim))

    # logaddexp(0, z) equals log(1 + exp(z)) but stays finite for large z, where exp(z)
    # alone overflows to inf.
    loss = np.sum(np.logaddexp(0, y_est) - y * y_est, axis=0)
    return np.add(loss, lambda_ * w.transpose().dot(w))

def compute_gradient_log_likelihood(y, tx, w):
    """Gradient of the logistic-regression loss with respect to w: tx^T (sigmoid(tx w) - y)."""
    # Difference between the predicted probabilities and the targets, one entry per sample.
    residual = sigmoid(np.dot(tx, w)) - y
    return np.dot(tx.transpose(), residual)

def compute_gradient_log_likelihood_penalized(y, tx, w, lambda_):
    """Gradient of the L2-penalized logistic-regression loss with respect to w.

    The data term is tx^T (sigmoid(tx w) - y); the penalty lambda_ * w^T w adds 2 * lambda_ * w.
    """
    residual = sigmoid(np.dot(tx, w)) - y
    data_term = np.dot(tx.transpose(), residual)
    penalty_term = (2 * lambda_) * w
    return data_term + penalty_term

def logistic_regression_SGD(y, tx, initial_w, batch_size, max_iters, gamma, lambda_=0.1):
    """Fit L2-penalized logistic regression with mini-batch stochastic gradient descent.

    Args:
        y: targets, handed to batch_iter together with tx.
        tx: design matrix, one sample per row.
        initial_w: starting weights.
        batch_size: number of samples requested from batch_iter for each step.
        max_iters: maximum number of gradient steps.
        gamma: step size.
        lambda_: L2 penalty strength. Defaults to 0.1, the value that used to be hard-coded
            in the body, so existing six-argument calls behave as before.

    Returns:
        The weights after the last step that was taken.
    """
    w = initial_w
    losses = []

    # Print floating-point warnings while training, but only inside this block: the earlier
    # np.seterr call left the setting switched on for the rest of the session.
    with np.errstate(all='print'):
        for n_iter in range(max_iters):
            # Ask batch_iter for a single shuffled mini-batch.
            y_batch, tx_batch = next(batch_iter(y, tx, batch_size, num_batches=1, shuffle=True))

            # Loss and gradient are both evaluated at the weights before the update.
            grad = compute_gradient_log_likelihood_penalized(y_batch, tx_batch, w, lambda_)
            loss = compute_log_likelihood_penalized(y_batch, tx_batch, w, lambda_)

            w = np.subtract(w, np.multiply(gamma, grad))
            losses.append(loss)

            if n_iter % 10 == 0:
                print("Current iteration={i}, the loss={l}".format(i=n_iter, l=loss))

            # Stop early once two consecutive mini-batch losses are practically identical.
            if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < 1e-8:
                break

    return w

# Design matrix: a leading column of ones followed by the columns of X.
tx = np.column_stack((np.ones(X.shape[0]), X))
# All-zero starting weights, stored as a column vector with one entry per column of tx.
initial_w = np.zeros((tx.shape[1], 1))
# Y as an (N, 1) column.
Y = Y.reshape(-1, 1)
ww = logistic_regression_SGD(Y, tx, initial_w, batch_size=250, max_iters=100, gamma=1e-8)

ww


In [None]:
def gen(n):
    """Yield the integers 1, 2, ..., n in order (nothing when n < 1)."""
    yield from range(1, n + 1)
        
# Exercise the generator: prints 1 through 5, one number per line.
for i in gen(5):
    print(i)

In [5]:
import logistic_regression as lr

# Design matrix: a leading column of ones followed by the columns of X.
tx = np.column_stack((np.ones(X.shape[0]), X))
# All-zero starting weights, stored as a column vector with one entry per column of tx.
initial_w = np.zeros((tx.shape[1], 1))
# Y as an (N, 1) column.
Y = Y.reshape(-1, 1)
# Train with the implementation that lives in the logistic_regression module.
ww = lr.logistic_regression_SGD(Y, tx, initial_w, 250, 100, 1e-8, 0.1)

ww

Current iteration=0, the loss=[[ 173.28679514]]
Current iteration=10, the loss=[[ 173.21021542]]
Current iteration=20, the loss=[[ 173.16435761]]
Current iteration=30, the loss=[[ 173.10587582]]
Current iteration=40, the loss=[[ 172.97204096]]
Current iteration=50, the loss=[[ 172.95048784]]
Current iteration=60, the loss=[[ 172.87980481]]
Current iteration=70, the loss=[[ 172.80809548]]
Current iteration=80, the loss=[[ 172.803401]]
Current iteration=90, the loss=[[ 172.53083483]]


array([[ -2.05302262e-04],
       [ -1.54855014e-04],
       [ -3.39342506e-05],
       [ -7.20150841e-05],
       [  7.32317275e-06],
       [  9.43150388e-05],
       [ -5.85027547e-04],
       [  9.98724575e-05],
       [  9.39671750e-05],
       [  5.83058739e-05],
       [ -1.89884806e-04],
       [  9.55094679e-05],
       [  9.99402458e-05],
       [  9.80513185e-05],
       [  3.10243819e-05],
       [  9.89181725e-05],
       [  9.89187151e-05],
       [ -4.64893772e-07],
       [  9.89843176e-05],
       [  9.88625697e-05],
       [  1.35959786e-05],
       [  9.90343176e-05],
       [ -2.98437868e-04],
       [  9.71860424e-05],
       [ -6.77071484e-05],
       [  9.89160245e-05],
       [  9.89433543e-05],
       [ -2.27648851e-05],
       [  9.89569763e-05],
       [  9.89281484e-05],
       [ -2.26067453e-05]])