In [1]:
import numpy as np
from matplotlib import pyplot as plt
import csv
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# sklearn imports for crossval, model and losses

In [2]:
def plot_samples(X, y):
    """Draw the samples (X, y) as a scatter plot on a newly created figure.

    A fresh figure/axes pair is created, every sample is drawn as a small
    marker, both axes are labelled and the layout is tightened. Nothing is
    returned.
    """
    figure, axes = plt.subplots()
    axes.scatter(X, y, s=4, label="Samples")
    axes.set(xlabel="X", ylabel="y")
    figure.tight_layout()

In [3]:
def plot_samples_and_poly_predictions(X, y, model, n):
    """
    Scatter the samples (X, y) and overlay the curve predicted by ``model``.

    The curve is evaluated on 100 evenly spaced points between the smallest
    and the largest value of X (X must be 1-D). Before ``model.predict`` is
    called, those points are expanded into polynomial features of degree n in
    descending order (x^n, x^(n-1), ..., x^0).
    """
    figure, axes = plt.subplots()
    axes.scatter(X, y, s=4, label="Samples")

    # Evaluation grid over the observed X range, turned into a Vandermonde
    # matrix with n + 1 columns (highest power first).
    grid = np.linspace(np.min(X), np.max(X), 100)
    grid_features = np.vander(grid, N=n + 1)
    predictions = model.predict(grid_features)

    axes.plot(grid, predictions, linewidth=2, label="Model")
    axes.set(xlabel="X", ylabel="y")
    axes.legend()

    figure.tight_layout()

In [4]:
def load_dataset_csv(data_path):
    """Reads from a csv file assuming two columns (2D) representing X and y
    of a regression problem.

    The first line of the file is treated as a header and skipped. Blank
    lines are ignored. An empty file yields two empty arrays.

    Parameters
    ----------
    data_path : path-like
        Location of the csv file; passed straight to ``open``.

    Returns
    -------
    X, y : numpy.ndarray
        1-D float arrays built from the first and second column respectively.

    Raises
    ------
    IndexError
        If a non-blank row has fewer than two columns.
    ValueError
        If a cell in the first two columns cannot be converted to float.
    """
    X = []
    y = []

    with open(data_path, newline='') as f:
        csv_reader = csv.reader(f)

        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(csv_reader, None)

        for row in csv_reader:
            # csv.reader yields an empty list for a blank line (for example
            # trailing newlines at the end of the file); there is nothing to
            # read from it.
            if not row:
                continue
            X.append(row[0])
            y.append(row[1])

    X = np.array(X, dtype=float)
    y = np.array(y, dtype=float)

    return X, y

In [5]:
def kfold_crossval(regressor, X, y, k=10):
    """Function to evaluate a scikit learn model in 10-fold shuffled
    split cross validation. 
    """
    # Estimate model performance for given regularization parameter
    shuffled_kfold = ShuffleSplit(n_splits=10, train_size=0.9, random_state=42)
    val_losses = np.empty((k,))
    train_losses = np.empty((k,))
    
    for i, (train_index, val_index) in enumerate(shuffled_kfold.split(X)):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]
        
        regressor.fit(X_train, y_train)
        y_val_predict = regressor.predict(X_val)
        y_train_predict = regressor.predict(X_train)
        
        val_losses[i] = mean_squared_error(y_val, y_val_predict)
        train_losses[i] = mean_squared_error(y_train, y_train_predict)
    
    return np.mean(val_losses), np.mean(train_losses)

In [68]:
def regularization_path(X, y, alphas):
    """Placeholder for sampling the regularization path of a linear,
    l2 regularized model on X, y over the regularization parameters
    ``alphas``.

    Not implemented yet: the function performs no work and returns None.
    """
    # Planned steps:
    #   1. For every alpha (regularization parameter):
    #        - train the regularized model
    #        - estimate its performance with k-fold cross-validation
    #   2. Plot the regularization path
    #   3. Return the best alpha or model
    return None

In [None]:
# For each dataset:
    # Plot data
    # Find best regularized model
    # Plot regularization path
    # Plot best model fit