We are going to explore data sampled from a true (usually unknown) function that relates house price (y in dollars) to house size (x in square feet).

Let's assume the true function is a simple curve:

$y = a \log(b x^2) + c$, defined for $x, y \in \mathbb{R}^{+}$ (positive real numbers).

And assume that our measurements carry normally distributed noise with standard deviation d, so our measured data is sampled from this data generating model:

$y = a \log(b x^2) + c + \varepsilon, \quad \varepsilon \sim N(0, d^2)$

Because of this measurement noise, even with a perfect model in our hypothesis set we could not predict y exactly from a fixed set of data - this is known as the irreducible error.  We will always have some uncertainty.  

Typically in machine learning we work with a fixed sample of data of size n.  For demonstration let's fix n at 20 and generate 20 random points from our data generating model.  

In [1]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

def fit(n=20):
    """Sample noisy data from the generating curve and fit polynomial models.

    Draws ``n`` points with ``x = np.random.random(n) * 20`` and
    ``y = log(10 * x**2) + 2`` plus Gaussian noise of scale 1, splits them
    50:50 into train and test frames, and fits polynomial regressions of
    degree 1 through 10 to the training frame.

    Parameters
    ----------
    n : int
        Total number of points to sample before the train/test split.

    Returns
    -------
    dict
        ``'truth'``: ``{'x': grid, 'y': noise-free curve on the grid}``;
        ``'fit'``: list of each model's predictions on the grid;
        ``'models'``: list of fitted pipelines (index ``degree - 1``);
        ``'data'``: ``{'train': DataFrame, 'test': DataFrame}``;
        ``'predict_train'`` / ``'predict_test'``: lists of each model's
        predictions on the train / test ``x`` values.
    """
    degrees = range(1, 11)

    # build train and test data
    x = np.random.random(n) * 20
    y = np.log(10 * x ** 2) + 2 + np.random.normal(scale=1, size=n)
    # Build from a dict rather than zip(): zip() is a one-shot iterator on
    # Python 3, which not every pandas version accepts as DataFrame data.
    df = pd.DataFrame({'x': x, 'y': y}, columns=['x', 'y'])
    df_train, df_test = train_test_split(df, test_size=0.5, random_state=72)

    # ground truth on a regular grid
    xt = np.arange(0, 20, 0.1)
    # xt[0] == 0, so the first truth value is log(0) == -inf; the value is
    # kept as before and only the divide-by-zero warning is silenced.
    with np.errstate(divide='ignore'):
        yt = np.log(10 * xt ** 2) + 2

    def as_column(values):
        # scikit-learn expects a 2-D (n_samples, n_features) array.
        # Series.reshape no longer exists in pandas, so reshape the
        # underlying ndarray instead.
        return np.asarray(values).reshape(-1, 1)

    # function to fit model
    def poly(degree, X, y):
        model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
        return model.fit(X, y)

    train_X = as_column(df_train.x)
    test_X = as_column(df_test.x)
    grid_X = as_column(xt)

    # fit models of degree 1 through 10
    m = [poly(d, train_X, df_train.y) for d in degrees]
    f = [model.predict(grid_X) for model in m]
    p_test = [model.predict(test_X) for model in m]
    p_train = [model.predict(train_X) for model in m]
    return {
        'truth': {'x': xt, 'y': yt},
        'fit': f,
        'models': m,
        'data': {'train': df_train, 'test': df_test},
        'predict_train': p_train,
        'predict_test': p_test,
    }


For machine learning we further divide this data into a training set and a test set - in this case a 50%:50% split. We fit our models to the training data and hold back the test data to measure their out-of-sample fit.

Let's plot the training data and overlay the true data generating curve:

In [11]:
from bokeh.plotting import figure, output_notebook, show
from bokeh.models import Range1d
from bokeh.io import gridplot
from bokeh.charts import Bar

def plot_truth(fit):
    """Return a figure of the training points with the true curve overlaid.

    ``fit`` is the dict returned by ``fit()``; only ``fit['data']['train']``
    and ``fit['truth']`` are read.
    """
    train = fit['data']['train']
    truth = fit['truth']

    chart = figure(plot_width=300, plot_height=300)
    chart.circle(x=train.x, y=train.y, size=10, color="navy", alpha=0.2)
    chart.line(x=truth['x'], y=truth['y'], line_width=2)
    # Fixed axes so this figure lines up with the per-degree fit plots.
    chart.x_range = Range1d(0, 20)
    chart.y_range = Range1d(0, 12)
    return chart

def plot_fit(degree, fit):
    """Return a one-row grid: the degree-``degree`` fit over train and test data.

    ``fit`` is the dict returned by ``fit()``. Both panels draw the same
    fitted curve (``fit['fit'][degree - 1]`` on the truth grid) in red; the
    left panel scatters the training points, the right panel the test points.
    """
    grid_x = fit['truth']['x']
    curve = fit['fit'][degree - 1]

    panels = []
    for split, label in (('train', 'Train'), ('test', 'Test')):
        points = fit['data'][split]
        panel = figure(plot_width=300, plot_height=300,
                       title="Degree " + str(degree) + " " + label)
        panel.circle(x=points.x, y=points.y, size=10, color="navy", alpha=0.2)
        panel.line(x=grid_x, y=curve, color='red')
        # Same fixed axes on every panel so degrees can be compared by eye.
        panel.y_range = Range1d(0, 12)
        panel.x_range = Range1d(0, 20)
        panels.append(panel)

    return gridplot([panels])
    
# Send bokeh output to the notebook, fit on 100 sampled points, then show
# the train/test panels for a selection of polynomial degrees.
output_notebook()
f = fit(100)
#show(plot_truth(f))
for _degree in (1, 2, 3, 4, 5, 6, 10):
    show(plot_fit(_degree, f))



<bokeh.io._CommsHandle at 0x7fa0c50eaa90>

In [10]:
def plot_error(fit):
    """Return a figure of train (blue) and test (red) error by polynomial degree.

    ``fit`` is the dict returned by ``fit()``. For each stored model the
    error is the sum of squared differences between its predictions and the
    observed ``y`` values.

    NOTE: the figure title says "MSE", but the plotted quantity is the sum
    of squared errors (it is not divided by the number of points).
    """
    # Read everything from the ``fit`` argument; the previous version ignored
    # it and used the notebook-global ``f``.
    train_y = fit['data']['train'].y
    test_y = fit['data']['test'].y
    mse_train = [sum((pred - train_y) ** 2) for pred in fit['predict_train']]
    mse_test = [sum((pred - test_y) ** 2) for pred in fit['predict_test']]
    # Models are stored in degree order starting at degree 1.
    x = list(range(1, len(mse_train) + 1))
    p = figure(plot_width=300, plot_height=300, title="Test vs Train MSE")
    p.line(x=x, y=mse_train, color='blue')
    p.line(x=x, y=mse_test, color='red')
    p.y_range = Range1d(0, 100)
    return p


# Send bokeh output to the notebook and render the train/test error curves
# for the fit results held in ``f``.
output_notebook()
show(plot_error(f))



<bokeh.io._CommsHandle at 0x7fa0c595d650>