# Import and Constract Dataset

In [134]:
# Import
import numpy as np
import scipy.optimize
from scipy.optimize import minimize
import pandas as pd

In [135]:
# Construct Data
root_dir: str = "data//"
file_ext: str = ".csv"
delimiter: str = ","


def get_path(file_name: str) -> str:
    return root_dir + file_name + file_ext


def save_csv(file_name, data: np.ndarray, delimiter):
    frame = pd.DataFrame(data)
    frame.to_csv(get_path(file_name), index=False, sep=delimiter,
                 float_format="%.17f")


def read_csv(file_name, delimiter) -> np.ndarray:
    df = pd.read_csv(get_path(file_name), delimiter=delimiter, float_precision="round_trip")
    if df.values.shape[1] == 1:
        return df.values.T
    return df.values


def construct_dataset():
    m = 150  # data rows
    d = 75  # feature dimensions
    X: np.ndarray = np.random.rand(m, d)

    theta = np.zeros(shape=(d, 1))

    theta[:10] = np.array([10 if np.random.randint(0, 2) == 0 else -10 for _ in range(10)]).reshape((10, 1))

    epsilon = np.random.normal(loc=0, scale=0.1, size=(m, 1))

    y: np.ndarray = np.dot(X, theta) + epsilon

    save_csv("X_train", X[:80], delimiter=delimiter)
    save_csv("X_validation", X[80:100], delimiter=delimiter)
    save_csv("X_test", X[100:], delimiter=delimiter)
    save_csv("Y_train", y[:80], delimiter=delimiter)
    save_csv("Y_validation", y[80:100], delimiter=delimiter)
    save_csv("Y_test", y[100:], delimiter=delimiter)

In [136]:
# Reload data
construct_dataset()
X_train: np.ndarray = read_csv("X_train", delimiter=delimiter)
X_valid: np.ndarray = read_csv("X_validation", delimiter=delimiter)
X_test: np.ndarray = read_csv("X_test", delimiter=delimiter)
y_train: np.ndarray = read_csv("Y_train", delimiter=delimiter)
y_valid: np.ndarray = read_csv("Y_validation", delimiter=delimiter)
y_test: np.ndarray = read_csv("Y_test", delimiter=delimiter)

# Ridge Regression

In [137]:
def ridge_regression_test():
    (N, D) = X_train.shape

    def ridge(Lambda):
        def ridge_obj(obj_theta):
            return ((np.linalg.norm(np.dot(X_train, obj_theta) - y_train)) ** 2) \
                   / (2 * N) + Lambda * (np.linalg.norm(obj_theta)) ** 2

        return ridge_obj

    def compute_loss(theta):
        return ((np.linalg.norm(np.dot(X_valid, theta) - y_valid)) ** 2) / (2 * N)

    w = np.random.rand(D)

    min_lambda = 0
    min_loss = 1e100
    min_opt_result: scipy.optimize.OptimizeResult
    for i in range(-5, 6):
        Lambda = 10 ** i
        w_opt = minimize(ridge(Lambda), w)
        loss = compute_loss(w_opt.x)
        if loss < min_loss:
            min_loss = loss
            min_opt_result = w_opt
            min_lambda = Lambda
        print("lambda: ", Lambda, "loss: ", loss)
    print("----------------")
    print("Choose lambda: ", min_lambda, "min loss", min_loss)
    print("theta result is: \n", min_opt_result.x)

    true_zero_count = 0
    thresh_hold = 1e-3
    small_count = 0
    for ele in min_opt_result.x:
        if ele == 0:
            true_zero_count = true_zero_count + 1
        elif ele <= thresh_hold:
            small_count = small_count + 1
    print("True zero count is {0}, values smaller than {1} is {2}"
          .format(true_zero_count, thresh_hold, small_count))


construct_dataset()
ridge_regression_test()

lambda:  1e-05 loss:  0.009513123344617864
lambda:  0.0001 loss:  0.14582335587930637
lambda:  0.001 loss:  0.815006522048313
lambda:  0.01 loss:  1.9264346592704147
lambda:  0.1 loss:  5.194430634934523
lambda:  1 loss:  8.110925613622788
lambda:  10 loss:  9.514414393989966
lambda:  100 loss:  14.590096622253947
lambda:  1000 loss:  16.04179128127179
lambda:  10000 loss:  16.209789852424393
lambda:  100000 loss:  16.226848278192513
----------------
Choose lambda:  1e-05 min loss 0.009513123344617864
theta result is: 
 [ 9.74533006e+00  9.92172264e+00 -9.95730131e+00 -9.90629162e+00
  9.99360982e+00 -1.00330474e+01  1.00189008e+01 -9.97777891e+00
  9.93426813e+00  9.94365176e+00 -2.01581492e-01  5.89248823e-02
 -1.12624707e-01  2.49848184e-02 -2.44866553e-02  2.59537571e-02
  7.81554090e-02 -3.44079984e-02 -5.24288483e-02 -9.48556869e-02
  1.85144774e-01 -4.78952863e-02 -1.12704776e-01 -7.76567829e-02
  8.66276413e-03  7.45913372e-02  1.23171200e-01 -6.87315535e-02
 -1.41875503e-01  1