# Import and Construct Dataset

In [33]:
# Import
import numpy as np
from scipy.optimize import minimize
import pandas as pd

In [34]:
# Construct Data
root_dir: str = "data//"
file_ext: str = ".csv"
delimiter: str = ","


def get_path(file_name: str) -> str:
    return root_dir + file_name + file_ext


def save_csv(file_name, data: np.ndarray, delimiter):
    frame = pd.DataFrame(data)
    frame.to_csv(get_path(file_name), index=False, sep=delimiter,
                 float_format="%.17f")


def read_csv(file_name, delimiter) -> np.ndarray:
    df = pd.read_csv(get_path(file_name), delimiter=delimiter, float_precision="round_trip")
    if df.values.shape[1] == 1:
        return df.values.T
    return df.values


def construct_dataset():
    m = 150  # data rows
    d = 75  # feature dimensions
    X: np.ndarray = np.random.rand(m, d)

    theta = np.zeros(shape=(d, 1))

    theta[:10] = np.array([10 if np.random.randint(0, 2) == 0 else -10 for _ in range(10)]).reshape((10, 1))

    epsilon = np.random.normal(loc=0, scale=0.1, size=(m, 1))

    y: np.ndarray = np.dot(X, theta) + epsilon

    save_csv("X_train", X[:80], delimiter=delimiter)
    save_csv("X_validation", X[80:100], delimiter=delimiter)
    save_csv("X_test", X[100:], delimiter=delimiter)
    save_csv("Y_train", y[:80], delimiter=delimiter)
    save_csv("Y_validation", y[80:100], delimiter=delimiter)
    save_csv("Y_test", y[100:], delimiter=delimiter)

In [35]:
# Reload data
construct_dataset()
X_train: np.ndarray = read_csv("X_train", delimiter=delimiter)
X_valid: np.ndarray = read_csv("X_validation", delimiter=delimiter)
X_test: np.ndarray = read_csv("X_test", delimiter=delimiter)
y_train: np.ndarray = read_csv("Y_train", delimiter=delimiter)
y_valid: np.ndarray = read_csv("Y_validation", delimiter=delimiter)
y_test: np.ndarray = read_csv("Y_test", delimiter=delimiter)

# Ridge Regression

In [39]:
def ridge_regression_test():
    num_train_record, num_feature = X_train.shape
    num_test_record = X_test.shape[0]

    def ridge(Lambda):
        def ridge_obj(obj_theta):
            return ((np.linalg.norm(np.dot(X_train, obj_theta) - y_train)) ** 2) \
                   / (2 * num_train_record) + Lambda * (np.linalg.norm(obj_theta)) ** 2

        return ridge_obj

    def compute_loss(theta):
        return ((np.linalg.norm(np.dot(X_test, theta) - y_test)) ** 2) / (2 * num_test_record)

    w = np.random.rand(num_feature)

    min_lambda = 0
    min_loss = 1e100
    optimized_theta: np.ndarray
    for i in range(-8, 3):
        Lambda = 10 ** i
        w_opt = minimize(ridge(Lambda), w)
        loss = compute_loss(w_opt.x)
        if loss < min_loss:
            min_loss = loss
            optimized_theta = w_opt.x
            min_lambda = Lambda
        print("lambda: {0} \t\t loss: {1}".format(Lambda, loss))
    print("----------------")
    print("Choose lambda: {0} \t\t min loss: {1}".format(min_lambda, min_loss))

    true_zero_count = 0
    thresh_hold = 1e-3
    small_count = 0
    for ele in optimized_theta:
        if ele == 0:
            true_zero_count = true_zero_count + 1
        elif ele <= thresh_hold:
            small_count = small_count + 1
    print("True zero component number is {0}, "
          "component smaller than {1} is {2}, "
          "over {3} component"
          .format(true_zero_count,
                  thresh_hold,
                  small_count,
                  optimized_theta.shape[0]))

    print("----------------")
    print("final theta by ridge regression is: \n", optimized_theta)


construct_dataset()
ridge_regression_test()

lambda: 1e-08 		 loss: 0.08048407694370177
lambda: 1e-07 		 loss: 0.08023960436634042
lambda: 1e-06 		 loss: 0.07787943428958276
lambda: 1e-05 		 loss: 0.061802164480979534
lambda: 0.0001 		 loss: 0.21914404285072464
lambda: 0.001 		 loss: 2.3591711093171335
lambda: 0.01 		 loss: 9.658110118037714
lambda: 0.1 		 loss: 22.93264906232866
lambda: 1 		 loss: 33.780164993451635
lambda: 10 		 loss: 47.612650707601304
lambda: 100 		 loss: 72.8741049243079
----------------
Choose lambda: 1e-05 		 min loss: 0.061802164480979534
True zero component number is 0, component smaller than 0.001 is 35, over 75 component
----------------
final theta by ridge regression is: 
 [ 10.12527254   9.98356212 -10.15981968 -10.14783454  10.12806298
   9.98983187  -9.87809839  10.0450749   -9.89277173   9.82648974
   0.02086788   0.05158837   0.1812598   -0.20810032   0.05579024
  -0.14806439  -0.11312448   0.26128307   0.17724048   0.17177662
  -0.19187157  -0.13850719   0.12796015   0.10403308  -0.03432035
  -