In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#import sys
#sys.path.insert(0,"../src/")

from FrankeFunction import FrankeFunction
from linear_regression import *
from bootstrap import *
from cross_validation import *
from stat_tools import *
import unit_tests

# Make sure things are working as expected
unit_tests.OLS_unit_test()
unit_tests.OLS_SVD_unit_test()
unit_tests.Ridge_unit_test()

# Generate test data
n = 300 # Number of data points

x_dat = np.random.uniform(0, 1, n)
y_dat = np.random.uniform(0, 1, n)
z_dat = FrankeFunction(x_dat, y_dat)

# Set up the scaler
scaler = StandardScaler()



In [5]:
X = design_matrix(x_dat, y_dat, 6)
# Split data, but don't shuffle. OK since data is already randomly sampled! 
# Fasilitates a direct comparrison of the clean & Noisy data
X_train, X_test, z_train, z_test = train_test_split(X, z_dat, test_size = 0.2, shuffle=False)

# Normalize data sets
X_train = scaler.fit_transform(X_train)
X_train[:, 0] = np.ones(X_train.shape[0])
X_test = scaler.fit_transform(X_test)
X_test[:, 0] = np.ones(X_test.shape[0])

z_bootstrap, z_model_train, z_model_test = bootstrap(
    X_train, 
    X_test, 
    z_train, 
    z_test,
    bootstraps = 100,
    regression = linear_regression.OLS_SVD
)

In [8]:
print(z_bootstrap.shape)
print(z_model_train.shape)
print(z_model_test.shape)
print(z_test.shape)

(100, 240)
(100, 240)
(100, 60)
(60,)


In [20]:
print(MSE(z_test,z_model_test))

mse = 0
bias_sq = 0
var = 0
for i in range(100):
    mse += np.mean((z_test - z_model_test[i])**2)
    var += np.mean((z_model_test - np.mean(z_model_test[i]))**2)
    bias_sq += np.mean((z_test - np.mean(z_model_test[i]))**2)

mse /= 100
bias_sq /= 100
var /= 100
print(mse)
print(var)
print(bias_sq)

0.0034787026763840267
0.003478702676384026
0.08007819005762337
0.08114069088013887


In [None]:
def p2_analysis(x, y, z, degrees, N_bootstraps):
    
    columns = [
        "MSE train",
        "MSE test",
        "R2 train",
        "R2 test",
        "Bias train",
        "Bias test",
        "Variance train",
        "Variance test",
    ]
    
    df = pd.DataFrame(columns=columns)
    
    for i, deg in enumerate(degrees):
        X = design_matrix(x, y, deg)
        # Split data, but don't shuffle. OK since data is already randomly sampled! 
        # Fasilitates a direct comparrison of the clean & Noisy data
        X_train, X_test, z_train, z_test = train_test_split(X, z, test_size = 0.2, shuffle=False)
        
        # Normalize data sets
        X_train = scaler.fit_transform(X_train)
        X_train[:, 0] = np.ones(X_train.shape[0])
        X_test = scaler.fit_transform(X_test)
        X_test[:, 0] = np.ones(X_test.shape[0])
        
        output = bootstrap(
            X_train, 
            X_test, 
            z_train, 
            z_test,
            bootstraps = N_bootstraps,
            regression = linear_regression.OLS_SVD
        )

        df = df.append(pd.DataFrame(output, index=[deg]))
    
    return df

degrees = np.arange(1, 20)
N_bootstraps = 100
data_bootstrap = p2_analysis(x_dat, y_dat, z_dat, degrees, N_bootstraps)

plt.figure(figsize=(8,4))
degrees = np.arange(1, 20) 
#plt.semilogy(degrees, data_bootstrap["MSE train"],"o--", label = "MSE TRAIN")
plt.semilogy(degrees, data_bootstrap["MSE test"],"o--", label = "MSE TEST")
plt.semilogy(degrees, data_bootstrap["Variance test"],"o--", label = "VAR TEST")
#plt.semilogy(degrees, data_bootstrap["Variance train"],"o--", label = "VAR TRAIN")
plt.semilogy(degrees, data_bootstrap["Bias test"],"o--", label = "BIAS^2 TEST")
#plt.semilogy(degrees, data_bootstrap["Bias train"],"o--", label = "BIAS^2 TEST")
plt.xlabel("Model Complexity")
plt.ylabel("MSE")
plt.title("Bootstrap")
plt.legend(loc="best")

In [None]:
data_bootstrap