# Implementation of ridge regression

In [None]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from proj1_helpers import *
%load_ext autoreload
%autoreload 2

### Load the training data into feature matrix, class labels, and event ids:

In [None]:
# Location of the training CSV (relative path, resolved against the current working directory).
DATA_TRAIN_PATH = 'data/train.csv' 
# Unpack the class labels, the feature matrix and the event ids.
# NOTE(review): load_csv_data is presumably provided by the proj1_helpers star import - confirm.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

### Now let us turn to the proper machine learning side of the problem

The run ridge regression function here computes everything we need in our problem. It works the following way:
- We define the parameters for our run, basically the degrees and the lambdas we want.
- The cross_validation function performs a k-fold cross-validation (takes a lot of time) and then returns the best degree of polynomial along with the associated lambda. 
- Then we just have to re-run the ridge regression once more with those parameters to get the optimal weights.
- The final step is to run the model on the testing data set, do our prediction and save the result

VERY IMPORTANT NOTE :
- The sanitation and standardization of the data are part of our modelling process; they should hence be included in the cross-validation process, as they differ for each sample that we consider.

In [None]:
import numpy as np
from costs import compute_mse
from build_polynomial import build_poly
from plots import cross_validation_visualization
from helpers import build_k_indices
from helpers import sanitize_NaN
from helpers import standardize

In [None]:
def ridge_regression(y, tx, lamb):
    """Solve the ridge-regression normal equations and return the weights.

    The weight vector w is the solution of
        (tx^T tx + lamb * I) w = tx^T y
    where I is the identity matrix with one row per column of ``tx``.

    y    : target values, one per row of ``tx``.
    tx   : 2-D feature matrix.
    lamb : regularisation strength that multiplies the identity matrix.
    """
    n_features = tx.shape[1]
    gram = tx.T.dot(tx)
    penalty = lamb * np.eye(n_features)
    moment = tx.T.dot(y)
    # Solve the linear system directly instead of forming an explicit inverse.
    return np.linalg.solve(gram + penalty, moment)

In [None]:
def cross_validation(y,tX,degrees,lambdas,k_fold,seed):
    """
        Computes the k-fold cross_validation of ridge regression for every
        (degree, lambda) pair and returns the best result for each polynomial degree.
        Note that we give the RAW data to the cross_validation, without any transformation on them.

        y        : labels, indexed consistently with the rows of tX.
        tX       : feature matrix.
        degrees  : sequence of polynomial degrees to try.
        lambdas  : sequence of regularisation strengths to try.
        k_fold   : number of folds.
        seed     : seed forwarded to build_k_indices for the fold split.

        Returns (rmse_best, rmse_best_lambda): two arrays with one entry per degree,
        holding the lowest averaged test error and the lambda that achieved it.
        Progress is printed and the error curves of each degree are plotted.
    """

    # split data in k fold
    k_indices = build_k_indices(y, k_fold, seed)

    # cross validation:
    rmse_best = np.zeros(len(degrees))
    rmse_best_lambda = np.zeros(len(degrees))
    for j, degree in enumerate(degrees):

        print('\n Testing for a polynomial of degree ', degree)
        # Training and testing errors for each lambda, so we are able to visualize them afterwards.
        rmse_tr = np.zeros(len(lambdas))
        rmse_te = np.zeros(len(lambdas))

        for i, lambda_ in enumerate(lambdas):
            print('lambda=',round(lambda_,6),end=", ")

            # This is where the k-fold cross-validation is computed: we sum the
            # per-fold errors and then average them.
            loss_tr_tot = 0
            loss_te_tot = 0
            # cross_validation_rr holds out fold number k - 1, so k has to run
            # over 1..k_fold. Starting at 0 would evaluate the last fold twice
            # and add k_fold + 1 terms to a sum that is divided by k_fold.
            for k in range(1, k_fold + 1):
                loss_tr_tmp, loss_te_tmp = cross_validation_rr(y, tX, k_indices, k, lambda_, degree)
                loss_tr_tot += loss_tr_tmp
                loss_te_tot += loss_te_tmp

            rmse_tr[i] = loss_tr_tot / k_fold
            rmse_te[i] = loss_te_tot / k_fold
            print('RMSE_BEST_VALUE : ',rmse_te[i])

        # Keep the lambda with the lowest averaged test error for this degree.
        best_idx = int(np.argmin(rmse_te))
        rmse_best[j] = rmse_te[best_idx]
        rmse_best_lambda[j] = lambdas[best_idx]
        cross_validation_visualization(lambdas, rmse_tr, rmse_te)

    print('\nBest error :',rmse_best)
    print('Best lambda :',rmse_best_lambda)
    return rmse_best, rmse_best_lambda

In [None]:

def cross_validation_rr(y, x, k_indices, k, lambda_, degree):
    """Return the (train, test) loss of ridge regression for one step of the k-fold cross validation.

    y         : labels, indexable by the entries of ``k_indices``.
    x         : raw feature matrix (not sanitized, not standardized).
    k_indices : sequence of index arrays, one per fold.
    k         : 1-based fold number; fold ``k - 1`` is held out as the test set.
    lambda_   : regularisation strength passed to ridge_regression.
    degree    : polynomial degree passed to build_poly.

    Raises ValueError when fewer than two folds are given, and IndexError
    when ``k`` is larger than the number of folds.
    """
    n_folds = len(k_indices)
    if n_folds < 2:
        raise ValueError("cross-validation needs at least two folds")

    # Resolve the held-out fold to a non-negative position before filtering.
    # Without this, k == 0 selects the last fold through negative indexing
    # while the "!= k - 1" filter matches nothing, so the test fold would
    # also end up in the training set.
    test_fold = k - 1
    if test_fold < 0:
        test_fold += n_folds

    # get k'th subgroup in test, others in train:
    test_idx = k_indices[test_fold]
    train_idx = np.concatenate(
        [fold for fold_no, fold in enumerate(k_indices) if fold_no != test_fold]
    )
    x_test = np.array(x[test_idx])
    y_test = np.array(y[test_idx])
    x_train = x[train_idx]
    y_train = y[train_idx]

    # We sanitize and standardize the training data here, and apply the same
    # median, mean and standard deviation to the testing data.
    x_train, median_train = sanitize_NaN(x_train)
    x_test, _ = sanitize_NaN(x_test, median_train)

    x_train, mean_tr, std_tr = standardize(x_train)
    x_test, _, _ = standardize(x_test, mean_tr, std_tr)

    # form data with polynomial degree:
    x_train_poly = build_poly(x_train, degree)
    x_test_poly = build_poly(x_test, degree)

    # ridge regression:
    w_rr = ridge_regression(y_train, x_train_poly, lambda_)

    # calculate the loss for train and test data:
    # NOTE(review): sqrt(2 * mse) presumably undoes a 1/2 factor inside
    # compute_mse to give an RMSE - confirm against costs.compute_mse.
    loss_tr = np.sqrt(2 * compute_mse(y_train, x_train_poly, w_rr))
    loss_te = np.sqrt(2 * compute_mse(y_test, x_test_poly, w_rr))
    return loss_tr, loss_te

In [None]:
#from ridge_regression import cross_validation, ridge_regression
from helpers import standardize
from helpers import sanitize_NaN
def run_ridge_regression_sanitized(y, tX):
    """ridge regression running script. works on the RAW data

    Steps:
      1. cross-validate on the raw data to pick the polynomial degree and lambda
         (sanitation and standardization happen inside every fold);
      2. sanitize, standardize and polynomially expand the full training set,
         then fit the final weights with the selected parameters;
      3. push the test set through the same transformation, reusing the
         training median, mean and standard deviation;
      4. predict the labels and write the submission file.

    y  : training labels.
    tX : raw training feature matrix.

    Returns nothing; the predictions are written to OUTPUT_PATH.
    """
    # define parameters for our run
    seed = 1
    degrees = np.array([3])
    k_fold = 4
    lambdas = np.logspace(-3,2,5)

    # The cross-validation receives the raw data: it sanitizes and
    # standardizes each training fold on its own.
    rmse, best_lambdas = cross_validation(y, tX, degrees, lambdas, k_fold, seed)

    # cross_validation returns one (error, lambda) pair per degree;
    # keep the degree with the lowest error.
    best = int(np.argmin(rmse))
    degree = degrees[best]
    lambda_ = best_lambdas[best]

    # Rebuild exactly the features the parameters were tuned on, otherwise
    # the selected lambda does not apply to the final fit.
    tX, median_tr = sanitize_NaN(tX)
    tX, mean_tr, std_tr = standardize(tX)
    weights = ridge_regression(y, build_poly(tX, degree), lambda_)

    DATA_TEST_PATH = 'data/test.csv'  # Download test data and supply path here
    _, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)
    # sanitize_NaN and standardize return tuples; keep only the transformed
    # matrix and reuse the statistics of the training set.
    tX_test, _ = sanitize_NaN(tX_test, median_tr)
    tX_test, _, _ = standardize(tX_test, mean_tr, std_tr)

    OUTPUT_PATH = 'data/output_sanitized_normalization_test.csv' # Fill in desired name of output file for submission
    y_pred = predict_labels(weights, build_poly(tX_test, degree))
    create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

# Launch the whole run (cross-validation, final fit, submission file) on the training labels and features loaded earlier.
run_ridge_regression_sanitized(y,tX)

TODO : 
- train on the data using the median, variance and mean that we computed beforehand.
- do the same for the cross_validation