In [None]:
import numpy as np
import matplotlib.pyplot as plt
from costs import*
from gradient_descent import*
from plots import gradient_descent_visualization
from logistic_regression import*
from helpers import*

# I) Loading Training and Testing Data

In [None]:
def load_data(train_path, test_path):
    """Load the training and test CSV files and split them into arrays.

    Both files are read as strings with the header row skipped. Column 0 is
    used as the event id, column 1 as the label and every remaining column as
    a feature.

    Args:
        train_path: path of the training CSV file.
        test_path: path of the test CSV file.

    Returns:
        A tuple ``(x_train, x_test, y_train, id_train, id_test)`` where
        ``x_*`` are float feature matrices (columns 2 onward), ``y_train`` is
        a float column vector with 's' mapped to 1.0 and 'b' mapped to 0.0,
        and ``id_*`` are float column vectors built from column 0.

    Raises:
        ValueError: if a value cannot be converted to float (for instance a
            training label that is neither 's', 'b' nor a numeric string).
    """
    # np.atleast_2d keeps a file with a single data row two-dimensional so
    # that the column slicing below still works.
    data_train = np.atleast_2d(
        np.genfromtxt(train_path, delimiter=',', dtype='str', skip_header=1))
    data_test = np.atleast_2d(
        np.genfromtxt(test_path, delimiter=',', dtype='str', skip_header=1))

    # The builtin float replaces the np.float alias removed in NumPy 1.24.
    x_train = data_train[:, 2:].astype(float)
    x_test = data_test[:, 2:].astype(float)

    # Encode the labels while they are still strings so that no int/str
    # promotion is needed; the copy keeps data_train untouched.
    labels = data_train[:, 1].copy()
    labels[labels == 's'] = '1'
    #labels[labels == 'b'] = '-1'
    labels[labels == 'b'] = '0'
    y_train = labels.astype(float).reshape(-1, 1)

    id_train = data_train[:, 0].astype(float).reshape(-1, 1)
    id_test = data_test[:, 0].astype(float).reshape(-1, 1)

    return x_train, x_test, y_train, id_train, id_test

# Here x_test is the unlabeled "data" that we feed to the model to produce our predictions.

In [None]:
def create_local_test_set(x, y, ratio, seed=1):
    """
    Randomly partition (x, y) into a training part and a local test part.

    `ratio` is the fraction of samples that goes to the training part: with
    ratio = 0.8, the first 80% of a random permutation of the row indices is
    used for training and the remaining rows for local testing.

    The global NumPy random generator is re-seeded with `seed`, so the same
    seed always yields the same partition.

    Returns train_x, train_y, our_test_x, our_test_y.
    """
    np.random.seed(seed)

    n_samples = len(x)
    n_train = int(n_samples * ratio)

    # A single permutation is cut in two so the parts never overlap.
    shuffled = np.random.permutation(n_samples)
    train_rows = shuffled[:n_train]
    holdout_rows = shuffled[n_train:]

    return x[train_rows], y[train_rows], x[holdout_rows], y[holdout_rows]

## Dealing with Outliers

In [None]:
def fixing_outliers(x, missing_value=-999.0):
    """Replace missing-value sentinels by the median of their feature.

    The median of each column is computed from that column's own valid
    entries. (Taking it only from rows that contain no sentinel at all throws
    away valid data and yields NaN as soon as every row has a sentinel.)

    Args:
        x: 2-D array of shape (n_samples, n_features). It is modified in
            place and also returned.
        missing_value: sentinel marking a missing entry (default -999.0).

    Returns:
        `x`, with every sentinel replaced by its column median. A column made
        only of sentinels has no valid median and is left unchanged.
    """
    missing = x == missing_value
    if not missing.any():
        return x

    # Columns holding at least one real value; only those get a median.
    has_valid = ~missing.all(axis=0)
    medians = np.full(x.shape[1], missing_value, dtype=float)
    if has_valid.any():
        # Mask sentinels with NaN so nanmedian ignores them column by column.
        observed = np.where(missing[:, has_valid], np.nan, x[:, has_valid])
        medians[has_valid] = np.nanmedian(observed, axis=0)

    rows, cols = np.nonzero(missing)
    x[rows, cols] = medians[cols]
    return x

## Standardize Data

In [None]:
def standardize(x, axis=None):
    """Standardize a data set by centering on the median and scaling by the std.

    Args:
        x: array to standardize; it is not modified.
        axis: axis along which the median and standard deviation are
            computed. With the default ``None`` a single median/std pair is
            computed over the whole array. Pass ``axis=0`` on a
            (n_samples, n_features) matrix to standardize every feature
            separately.

    Returns:
        A tuple ``(x_standardized, median_x, std_x)``. ``median_x`` and
        ``std_x`` are scalars when ``axis`` is None and arrays otherwise.
        Where the standard deviation is zero the centered values are returned
        unscaled (all zeros) instead of NaN.
    """
    #mean_x = np.mean(x, axis=axis)
    median_x = np.median(x, axis=axis)
    # Re-insert the reduced axis so the statistics broadcast against x.
    center = median_x if axis is None else np.expand_dims(median_x, axis)
    centered = x - center

    std_x = np.std(centered, axis=axis)
    # Dividing by 1 where std is 0 avoids 0/0 -> NaN on constant data.
    safe_std = np.where(std_x == 0, 1.0, std_x)
    scale = safe_std if axis is None else np.expand_dims(safe_std, axis)

    #return centered / scale, mean_x, std_x
    return centered / scale, median_x, std_x

## Apply PCA

In [None]:
def apply_pca(x, threshold=0.96):
    """Project `x` onto its leading principal components.

    The components are the eigenvectors of the feature covariance matrix,
    ordered by decreasing eigenvalue. The smallest number of leading
    components whose cumulative explained-variance ratio reaches `threshold`
    is kept.

    Args:
        x: 2-D array of shape (n_samples, n_features).
        threshold: minimum cumulative explained-variance ratio to retain
            (default 0.96).

    Returns:
        Array of shape (n_samples, n_components): the feature-wise centered
        data expressed in the basis of the retained components.

    Side effects:
        Prints the retained cumulative ratio and the individual ratios.
    """
    x = np.asarray(x, dtype=float)
    # Center every feature on its own mean (not on the global mean).
    x_c = x - np.mean(x, axis=0)
    x_cov = np.atleast_2d(np.cov(x_c.T))

    # The covariance matrix is symmetric: eigh returns real eigenpairs.
    # They come in ascending order, so flip them to descending.
    eig_values, eig_vectors = np.linalg.eigh(x_cov)
    order = np.argsort(eig_values)[::-1]
    # Tiny negative eigenvalues are round-off noise; clip them to zero.
    eig_values = np.clip(eig_values[order], 0.0, None)
    eig_vectors = eig_vectors[:, order]

    total_variance = np.sum(eig_values)
    if total_variance > 0:
        ratios = eig_values / total_variance
        # First position where the cumulative ratio reaches the threshold.
        reached = int(np.searchsorted(np.cumsum(ratios), threshold))
        n_components = min(reached + 1, len(ratios))
    else:
        # Constant data: no variance to rank, keep every direction.
        ratios = np.zeros_like(eig_values)
        n_components = len(ratios)

    explained_variances = ratios[:n_components].tolist()
    print(np.sum(explained_variances), '\n', explained_variances)

    # Eigenvectors are the COLUMNS of eig_vectors.
    selected_vectors = eig_vectors[:, :n_components]
    x_projected = x_c @ selected_vectors
    return x_projected

# II) Apply Previous Functions

In [None]:
## THIS IS GASSER'S ORIGINAL VERSION
########################################################################################

#Loading Dataset
#Louise_path = '/Users/louiseplacidet/Desktop/Machine Learning/Project 1/Git_ML_P1/Data/'

#data_train_path = Louise_path + "Train.csv"
#data_train_path = "Train.csv"
#data_test_path = Louise_path + "test.csv"
#x_train, x_test, y_train, id_train, id_test = load_data(data_train_path, data_test_path)

#Replace each -999 with feature median value
#x_train = fixing_outliers(x_train)
#x_test = fixing_outliers(x_test)

#Standardize data
#x_train, mean_train, std_train = standardize(x_train)
#x_test, mean_test, std_test = standardize(x_test)

#Apply PCA
#x_train_projected = apply_pca(x_train)
#x_test_projected = apply_pca(x_test)
#print(x_test_projected.shape)

#Adding Offset
#tx_train = np.c_[np.ones(x_train_projected.shape[0]), x_train_projected]
#tx_test = np.c_[np.ones(x_test_projected.shape[0]), x_test_projected]

In [None]:
## LOUISE'S MODIFIED VERSION
########################################################################################

#Loading Dataset
# NOTE(review): absolute, machine-specific path; the notebook only runs where this folder exists.
Louise_path = '/Users/louiseplacidet/Desktop/Machine Learning/Project 1/Git_ML_P1/Data/'

data_train_path = Louise_path + "Train.csv"
#data_train_path = "Train.csv"
data_test_path = Louise_path + "test.csv"
x_train, x_test, y_train, id_train, id_test = load_data(data_train_path, data_test_path)

# Getting our test local set:
x_train, y_train, x_our_test, y_our_test = create_local_test_set(x_train, y_train, 0.8,seed=1)

#Replace each -999 with feature median value
x_train = fixing_outliers(x_train)
# The local test set must go through the same cleaning as the training set.
x_our_test = fixing_outliers(x_our_test)

#Standardize data
x_train, mean_train, std_train = standardize(x_train)
# Reuse the statistics returned for the training set on the local test set.
x_our_test = (x_our_test - mean_train) / std_train

#Apply PCA
x_train_projected = apply_pca(x_train)
# NOTE(review): PCA is fitted separately on the local test set, so its basis (and possibly
# its number of components) can differ from the training one; ideally the training basis
# would be reused here.
our_x_test_projected = apply_pca(x_our_test)

#Adding Offset
tx_train = np.c_[np.ones(x_train_projected.shape[0]), x_train_projected]
our_tx_test = np.c_[np.ones(our_x_test_projected.shape[0]), our_x_test_projected]

In [None]:
# Show the (rows, columns) of the preprocessed training features.
print("shape of x_train: ({},{})".format(x_train.shape[0], x_train.shape[1]))

In [None]:
# Show the (rows, columns) of the training labels.
print("shape of y_train: ({},{})".format(y_train.shape[0], y_train.shape[1]))

In [None]:
# Show the (rows, columns) of the local test features.
print("shape of x_our_test: ({},{})".format(x_our_test.shape[0], x_our_test.shape[1]))

In [None]:
# Show the (rows, columns) of the local test labels.
print("shape of y_our_test: ({},{})".format(y_our_test.shape[0], y_our_test.shape[1]))

In [None]:
# Show the (rows, columns) of the training design matrix (offset column included).
print("shape of tx_train: ({},{})".format(tx_train.shape[0], tx_train.shape[1]))

In [None]:
# Show the (rows, columns) of the local test design matrix (offset column included).
print("shape of our_tx_test: ({},{})".format(our_tx_test.shape[0], our_tx_test.shape[1]))

## Apply Logistic Regression

In [None]:
#Apply Logistic Regression
##Define the parameters of the algorithm
max_iter = 100
threshold = 0.001  # NOTE(review): defined but not used by the loop below
gamma = 0.001
lambda_ = 0.1      # NOTE(review): not used by the unregularized loop below
losses = []
w = np.zeros((tx_train.shape[1], 1))

##Start the logistic regression
# The loop variable is called n_iter so the builtin iter() is not shadowed
# for the rest of the kernel session.
for n_iter in range(max_iter):
    # get loss and update w.
    loss, w = learning_by_gradient_descent(y_train, tx_train, w, gamma)
    # keep the loss history so it can be inspected or plotted afterwards
    losses.append(loss)
    print("Iteration "+str(n_iter)+" "+"Loss = "+str(loss))
print("loss={l}".format(l=calculate_loss(y_train, tx_train, w)))

## Apply Regularized Logistic Regression

In [None]:
def penalized_logistic_regression(y, tx, w, lambda_):
    """Return the L2-penalized logistic loss and its gradient at `w`.

    The penalty is ``lambda_ * ||w||^2``, whose gradient is
    ``2 * lambda_ * w``. The gradient term is unchanged from before; only the
    reported loss is corrected so that it is the function actually being
    minimized.

    Args:
        y: labels.
        tx: design matrix.
        w: current weights.
        lambda_: regularization strength.

    Returns:
        (loss, gradient), computed with `calculate_loss` and
        `calculate_gradient` plus the penalty terms.
    """
    penalty = lambda_ * np.sum(w**2)
    loss = calculate_loss(y, tx, w) + penalty
    gradient = calculate_gradient(y, tx, w) + 2 * lambda_ * w

    return loss, gradient

In [None]:
def learning_by_penalized_gradient_descent(y, tx, w, gamma, lambda_):
    """Perform one gradient-descent step on the penalized logistic loss.

    Returns the loss evaluated at the incoming `w` (before the update)
    together with the updated weights ``w - gamma * gradient``.
    """
    current_loss, grad = penalized_logistic_regression(y, tx, w, lambda_)
    updated_w = w - gamma * grad
    return current_loss, updated_w

In [None]:
#Apply Regularized Logistic Regression
##Define the parameters of the algorithm
max_iter = 100
threshold = 0.001  # NOTE(review): defined but not used by the loop below
gamma = 0.00001
lambda_ = 0.5
w = np.zeros((tx_train.shape[1], 1))

##Start the regularized logistic regression
ws = []
losses = []

# The loop variable is called n_iter so the builtin iter() is not shadowed.
for n_iter in range(max_iter):
    # get loss and update w.
    # The returned loss is evaluated before the update, w is the value after it.
    loss, w = learning_by_penalized_gradient_descent(y_train, tx_train, w, gamma, lambda_)
    losses.append(loss)
    ws.append(w)
    print("Iteration "+str(n_iter)+" "+"Loss = "+str(loss))

# argmin gives the position of the smallest loss directly; looking the minimum
# up again with list.index() can fail on NaN or array-valued losses.
abs_losses = np.abs(losses)
best_loss = np.min(abs_losses)
index_best = int(np.argmin(abs_losses))
best_w = ws[index_best]

print("loss={l}".format(l=calculate_loss(y_train, tx_train, w)))
print("best_loss = "+str(best_loss))

## Newton Logistic Regression

In [None]:
def calculate_hessian(y, tx, w):
    """Return the Hessian of the loss function with respect to parameters w.

    Computes ``tx.T @ S @ tx`` where S is the diagonal matrix with entries
    ``sigmoid(tx @ w) * (1 - sigmoid(tx @ w))``. S is never built explicitly:
    a dense (n_samples, n_samples) matrix is mostly zeros and does not fit in
    memory for large data sets, so the rows of tx are scaled instead.

    Args:
        y: labels (unused; kept for a signature parallel to the loss and
            gradient helpers).
        tx: design matrix of shape (n_samples, n_features).
        w: current weights.

    Returns:
        The (n_features, n_features) Hessian matrix.
    """
    prob = sigmoid(tx @ w)
    # One diagonal entry of S per sample, flattened to shape (n_samples,).
    s_diag = np.ravel(prob * (1 - prob))

    # Scaling row i of tx by s_diag[i] is the same as computing S @ tx.
    hessian = tx.T @ (tx * s_diag[:, np.newaxis])
    return hessian

In [None]:
def newton_logistic_regression(y,tx,w):
    """Return the loss, gradient and Hessian of the logistic loss at `w`.

    Thin wrapper that evaluates, in this order, `calculate_loss`,
    `calculate_gradient` and `calculate_hessian` on the same arguments.
    """
    return (
        calculate_loss(y, tx, w),
        calculate_gradient(y, tx, w),
        calculate_hessian(y, tx, w),
    )

In [None]:
def learning_by_newton_method(y, tx, w, gamma = 1):
    """Perform one (damped) Newton step on the logistic loss.

    Args:
        y: labels.
        tx: design matrix.
        w: current weights.
        gamma: step size applied to the Newton direction (1 = full step).

    Returns:
        (loss, w): the loss evaluated at the incoming weights and the updated
        weights ``w - gamma * H^-1 @ gradient``.

    Raises:
        numpy.linalg.LinAlgError: if the Hessian is singular.
    """
    # newton_logistic_regression is the helper that returns all three
    # quantities unpacked here.
    loss, gradient, hessian = newton_logistic_regression(y, tx, w)
    # Solve H d = gradient rather than forming the explicit inverse:
    # cheaper and numerically more stable.
    w = w - gamma * np.linalg.solve(hessian, gradient)

    return loss, w

In [None]:
#Apply Logistic Regression with Newton Method
##Define the parameters of the algorithm
max_iter = 100
threshold = 0.001  # NOTE(review): defined but not used by the loop below
# NOTE(review): this is the same step size as the gradient-descent cell; Newton's
# method is normally run with a step close to 1 -- confirm this value is intended.
gamma = 0.00001
lambda_ = 0.5      # NOTE(review): not used by the Newton loop below
w = np.zeros((tx_train.shape[1], 1))

##Start the logistic regression
ws = []
losses = []

# The loop variable is called n_iter so the builtin iter() is not shadowed.
for n_iter in range(max_iter):
    # get loss and update w.
    loss, w = learning_by_newton_method(y_train, tx_train, w, gamma)
    losses.append(loss)
    ws.append(w)
    print("Iteration "+str(n_iter)+" "+"Loss = "+str(loss))

# argmin gives the position of the smallest loss directly; looking the minimum
# up again with list.index() can fail on NaN or array-valued losses.
abs_losses = np.abs(losses)
best_loss = np.min(abs_losses)
index_best = int(np.argmin(abs_losses))
best_w = ws[index_best]

print("loss={l}".format(l=calculate_loss(y_train, tx_train, w)))
print("best_loss = "+str(best_loss))

# III) Testing our Model

## Predicting labels

In [None]:
def predict_labels(weights, data):
    """Turn the linear scores ``data . weights`` into -1 / +1 class labels.

    Scores that are zero or negative become -1, strictly positive scores
    become 1. The labels are written into the score array, which is returned.
    """
    scores = np.dot(data, weights)

    # Both masks are taken from the raw scores before anything is overwritten.
    non_positive = scores <= 0
    positive = scores > 0
    scores[non_positive] = -1
    scores[positive] = 1

    return scores

In [None]:
def our_predict_labels(weights, data, threshold=0.5):
    """Generates class predictions given weights, and a test data matrix.

    The model is a logistic regression, so `threshold` is a cut on the
    predicted probability ``sigmoid(data . weights)``. Because the sigmoid is
    monotonic, ``sigmoid(s) > threshold`` is the same as
    ``s > log(threshold / (1 - threshold))``. The comparison is therefore done
    on the linear score against that cutoff (0 for the default threshold of
    0.5) and not against the probability threshold itself.

    Args:
        weights: model weights.
        data: design matrix.
        threshold: probability above which the positive class is predicted;
            must lie strictly between 0 and 1 (default 0.5).

    Returns:
        Array of labels: 1 where the predicted probability exceeds
        `threshold`, -1 otherwise.

    Raises:
        ValueError: if `threshold` is not strictly between 0 and 1.
    """
    if not 0.0 < threshold < 1.0:
        raise ValueError("threshold must be strictly between 0 and 1")

    # Logit of the probability threshold, expressed on the score scale.
    cutoff = np.log(threshold / (1.0 - threshold))

    y_pred = np.dot(data, weights)
    # Both masks are taken from the raw scores before anything is overwritten.
    below = y_pred <= cutoff
    above = y_pred > cutoff
    y_pred[below] = -1
    y_pred[above] = 1

    return y_pred

In [None]:
# Predict labels for the local test design matrix with the best weights kept by the last training cell that was run.
our_y_predict = our_predict_labels(best_w, our_tx_test)

## Testing our Model with our Test Data

In [None]:
# The loss on the local test set is evaluated on the true labels and on the
# same design matrix (PCA projection + offset column) the weights apply to.
our_test_loss = calculate_loss(y_our_test, our_tx_test, best_w)

In [None]:
# Accuracy is the fraction of predictions that match the true labels.
# y_our_test is encoded as {0, 1} by load_data while our_predict_labels
# returns {-1, 1}, so the labels are mapped to {-1, 1} before comparing.
expected_labels = np.where(y_our_test == 1, 1, -1)
accuracy = np.mean(our_y_predict == expected_labels)
accuracy

## Creating submission csv

In [None]:
import csv

In [None]:
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)

    ids and y_pred may be flat sequences or column vectors; both are
    flattened and written as integers, one (Id, Prediction) row per event.
    """
    # Flatten first so int() receives scalars: converting 1-element arrays
    # (the rows of a column vector) with int() is deprecated in NumPy.
    flat_ids = np.ravel(ids)
    flat_pred = np.ravel(y_pred)

    # newline='' lets the csv module control line endings, as its
    # documentation requires (otherwise blank rows appear on Windows).
    with open(name, 'w', newline='') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(flat_ids, flat_pred):
            writer.writerow({'Id':int(r1),'Prediction':int(r2)})

In [None]:
# y_predict was never computed in the notebook: build it from the official test
# set with the same steps used for the training data, then write the submission.
# NOTE(review): apply_pca is fitted on the test set here, so its basis (and possibly its
# number of components) can differ from the training one -- confirm before submitting.
x_test_std = (fixing_outliers(x_test) - mean_train) / std_train
tx_test = np.c_[np.ones(x_test_std.shape[0]), apply_pca(x_test_std)]
y_predict = predict_labels(best_w, tx_test)
create_csv_submission(id_test,y_predict,"first try")