In [1]:
import numpy as np
import matplotlib.pyplot as plt
from costs import*
from gradient_descent import*
from plots import gradient_descent_visualization
from logistic_regression import*
from helpers import*

## Loading Training and Testing Data

In [2]:
def load_data(train_path, test_path):
    data_train = np.genfromtxt(train_path, delimiter=',', dtype='str', skip_header=1, usecols=[])
    data_test = np.genfromtxt(test_path, delimiter=',', dtype='str', skip_header=1, usecols=[])
    
    x_train = data_train[:, 2:].astype(np.float)
    x_test = data_test[:, 2:].astype(np.float)
    
    y_train = data_train[:, 1]
    y_train = np.where(y_train =='s', 1, y_train)
    y_train = np.where(y_train =='b', -1, y_train).astype(np.float)
    y_train = np.reshape(y_train, (y_train.shape[0], 1))
    
    id_train = data_train[:, 0].astype(np.float)
    id_train = np.reshape(id_train, (id_train.shape[0], 1))
    id_train = id_train
    id_test = data_test[:, 0].astype(np.float)
    id_test = np.reshape(id_test, (id_test.shape[0], 1))
    id_test = id_test
    
    return x_train, x_test, y_train, id_train, id_test

## Replacing Outliers with Mean Feature Value

In [3]:
def fixing_outliers(x):
    outliers_indicies = np.where(x == -999.0)
    x_cleaned = np.delete(x, outliers_indicies[0], 0)
    mean_cleaned_cols = np.mean(x_cleaned, axis=0)
    x[outliers_indicies] = np.take(mean_cleaned_cols, outliers_indicies[1])
    return x

## Standardize Data

In [4]:
def standardize(x):
    """Standardize the original data set."""
    mean_x = np.mean(x)
    x = x - mean_x
    std_x = np.std(x)
    x = x / std_x
    return x, mean_x, std_x

## Apply PCA

In [5]:
def apply_pca(x):
    mean_x = np.mean(x)
    x_c = x - mean_x
    x_cov = np.cov(x_c.T)
    eig_values, eig_vectors = np.linalg.eig(x_cov)
    explained_variances = []
    for i in range(len(eig_values)):
        explained_variances.append(eig_values[i] / np.sum(eig_values))
        if np.sum(explained_variances) >= 0.96:
            break
    print(np.sum(explained_variances), '\n', explained_variances)
    selected_vectors = eig_vectors[:len(explained_variances)]
    x_projected = x_c@selected_vectors.T
    return x_projected

## Apply Previous Functions

In [None]:
#Loading Dataset
data_train_path = "Train.csv"
data_test_path = "test.csv"
x_train, x_test, y_train, id_train, id_test = load_data(data_train_path, data_test_path)

#Replace each -999 with feature mean value
x_train = fixing_outliers(x_train)
x_test = fixing_outliers(x_test)

#Standardize data
x_train, mean_train, std_train = standardize(x_train)
x_test, mean_test, std_test = standardize(x_test)

#Apply PCA
x_train_projected = apply_pca(x_train)
x_test_projected = apply_pca(x_test)
print(x_test_projected.shape)

#Adding Offset
tx_train = np.c_[np.ones(x_train_projected.shape[0]), x_train_projected]
tx_test = np.c_[np.ones(x_test_projected.shape[0]), x_test_projected]

In [10]:
#Apply Logistic Regression
##Define the parameters of the algorithm
max_iter = 100
threshold = 0.001
gamma = 0.001
lambda_ = 0.1
losses = []
w = np.zeros((tx_train.shape[1], 1))

##Start the logistic regression
for iter in range(max_iter):
        # get loss and update w.
        loss, w = learning_by_gradient_descent(y_train, tx_train, w, gamma)
        print("Iteration "+str(iter)+" "+"Loss = "+str(loss))
print("loss={l}".format(l=calculate_loss(y_train, tx_train, w)))

Iteration 0 Loss = 173286.79513998624


  return 1/(1+np.exp(-t))


Iteration 1 Loss = -114170424.32169206
Iteration 2 Loss = -151678615.17798772
Iteration 3 Loss = -189186806.03428337
Iteration 4 Loss = -226694996.8905791
Iteration 5 Loss = -264203187.74687472
Iteration 6 Loss = -301711378.60317034
Iteration 7 Loss = -339219569.45946604
Iteration 8 Loss = -376727760.31576174
Iteration 9 Loss = -414235951.1720573
Iteration 10 Loss = -451744142.028353
Iteration 11 Loss = -489252332.88464874
Iteration 12 Loss = -526760523.7409442
Iteration 13 Loss = -564268714.59724
Iteration 14 Loss = -601776905.4535357
Iteration 15 Loss = -639285096.3098313
Iteration 16 Loss = -676793287.1661272
Iteration 17 Loss = -714301478.0224228
Iteration 18 Loss = -751809668.8787184
Iteration 19 Loss = -789317859.7350144
Iteration 20 Loss = -826826050.5913098
Iteration 21 Loss = -864334241.4476055
Iteration 22 Loss = -901842432.3039008
Iteration 23 Loss = -939350623.1601968
Iteration 24 Loss = -976858814.0164924
Iteration 25 Loss = -1014367004.872788
Iteration 26 Loss = -10518751