In [3]:
import scipy.stats as stats

In [142]:
# Imports come first so the model registry below can reference the estimator
# classes when the notebook is run top-to-bottom on a fresh kernel.
import altair as alt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# --- Synthetic data configuration (consumed by generate_data) ---
n = 500          # number of samples drawn

mu = 100         # mean of the normal distribution X is sampled from
sigma = 10       # standard deviation of that distribution

slope = 5        # coefficient of the linear term in y
intercept = 20   # constant term in y

noise = 10       # standard deviation of the additive noise term

# Polynomial degree for the feature expansion.
# NOTE(review): fit_models calls gen_pipe without forwarding this value and
# relies on gen_pipe's default (also 2) -- confirm whether it should be passed.
poly_features = 2

# Regressors to compare, keyed by the short label used in charts.
models = {'lr': LinearRegression(fit_intercept=True),
          'ridge': Ridge(fit_intercept=True, alpha=1)}

split = 0.2      # fraction of the data held out as the test set

In [157]:
def gen_pipe(regressor, poly_features=2):
    """Build a two-step pipeline: polynomial expansion followed by a regressor.

    Parameters
    ----------
    regressor : estimator
        The estimator placed in the final ``'regressor'`` step.
    poly_features : int, default 2
        Degree handed to ``PolynomialFeatures`` in the ``'poly'`` step.

    Returns
    -------
    Pipeline
        An unfitted pipeline with steps named ``'poly'`` and ``'regressor'``.
    """
    steps = [
        ('poly', PolynomialFeatures(degree=poly_features)),
        ('regressor', regressor),
    ]
    return Pipeline(steps)

def generate_data(mu, sigma, n,  slope, intercept, noise, split, random_state=None):
    """Draw a synthetic quadratic regression data set and split it into train/test.

    ``X`` is sampled from a normal distribution with mean ``mu`` and standard
    deviation ``sigma``; the target is
    ``y = slope * X + 0.5 * X**2 + intercept + epsilon``.

    Parameters
    ----------
    mu, sigma : float
        Mean and standard deviation of the distribution ``X`` is drawn from.
    n : int
        Number of samples to draw.
    slope, intercept : float
        Linear coefficient and constant term of the target.
    noise : float
        Standard deviation of the additive noise ``epsilon``.
    split : float
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int, numpy.random.RandomState or None, default None
        Seed (or generator) shared by the sampling and the split so a run can
        be reproduced. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the ``X`` arrays are
        column vectors of shape ``(-1, 1)``.
    """
    # One shared generator: passing the same integer seed to each draw
    # separately would make X and epsilon perfectly correlated.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    X = stats.norm(mu, sigma).rvs(n, random_state=rng)

    # NOTE(review): the noise is centred on 2 rather than 0, which shifts the
    # effective intercept by 2 -- confirm this offset is intentional.
    epsilon = stats.norm(2, noise).rvs(n, random_state=rng)

    # NOTE(review): the quadratic coefficient (0.5) is hard-coded here.
    y = slope * X + .5 * X**2 + intercept + epsilon

    X_train, X_test, y_train, y_test = train_test_split(
        X.reshape(-1, 1), y, test_size=split, random_state=rng
    )
    return X_train, X_test, y_train, y_test



def fit_models(models, X_train, y_train):
    """Fit every regressor inside a polynomial pipeline and collect its coefficients.

    Parameters
    ----------
    models : dict
        Maps a label to an unfitted regressor; each is wrapped by ``gen_pipe``
        (using its default polynomial degree).
    X_train, y_train : array-like
        Training features and targets passed to ``Pipeline.fit``.

    Returns
    -------
    fits : dict
        Label -> fitted pipeline.
    coefs : dict
        Label -> ``{'slope': ..., 'intercept': ...}`` read from the fitted
        regressor step.
    """
    fits = {name: gen_pipe(reg).fit(X_train, y_train) for name, reg in models.items()}
    print(X_train.shape)

    coefs = {}
    for name, pipe in fits.items():
        regressor = pipe.named_steps['regressor']
        # PolynomialFeatures emits a constant (bias) column first by default,
        # so coef_[0] belongs to that column and the linear-term coefficient
        # is coef_[1]. The intercept comes from intercept_, not from coef_.
        coefs[name] = {
            'slope': regressor.coef_[1],
            'intercept': regressor.intercept_,
        }

    return fits, coefs

def score_models(fits, X_train, X_test, y_train, y_test):
    """Score each fitted model on the training and the test partition.

    Parameters
    ----------
    fits : dict
        Label -> fitted object exposing ``score(X, y)``.
    X_train, X_test, y_train, y_test
        The partitions forwarded to ``score``.

    Returns
    -------
    tuple of dict
        ``(train_scores, test_scores)``; each maps every label to its score
        and additionally carries a ``'data'`` entry (``'train'`` / ``'test'``)
        naming the partition.
    """
    def _score_partition(X, y, label):
        # Model scores first, partition tag last, to keep the key order stable.
        scored = {}
        for name, fitted in fits.items():
            scored[name] = fitted.score(X, y)
        scored['data'] = label
        return scored

    on_train = _score_partition(X_train, y_train, 'train')
    on_test = _score_partition(X_test, y_test, 'test')
    return on_train, on_test

def chart_fits(models, model_coefs, x_range=(-10, 10), num_points=10):
    """Plot each fitted model's predictions over an evenly spaced grid.

    Parameters
    ----------
    models : dict
        Label -> fitted object exposing ``predict``.
    model_coefs : dict
        Only its keys are used, to choose which models are drawn.
    x_range : tuple of (float, float), default (-10, 10)
        Lower and upper bound of the prediction grid.
        NOTE(review): the default range may lie far from the values the models
        were trained on -- confirm it is the region you want to inspect.
    num_points : int, default 10
        Number of grid points.

    Returns
    -------
    alt.Chart
        Line chart of ``y`` against ``X``, coloured by model label.
    """
    # The grid does not depend on the model, so build it once.
    grid = np.linspace(x_range[0], x_range[1], num_points)
    grid_col = grid.reshape(-1, 1)

    frames = []
    for model in model_coefs:
        print(grid_col.shape)
        y = models[model].predict(grid_col)
        print(y.shape)
        frames.append(pd.DataFrame({'X': grid, 'y': y, 'model': model}))

    # DataFrame.append was removed in pandas 2.0; concatenate once instead of
    # growing the frame inside the loop. pd.concat rejects an empty list, so
    # fall back to an empty frame when there is nothing to draw.
    data = pd.concat(frames) if frames else pd.DataFrame()

    chart = alt.Chart(data).mark_line().encode(
        x='X',
        y='y',
        color='model'
    )

    return chart

def chart_scores(scores):
    """Draw a bar chart comparing train and test scores for each model.

    Parameters
    ----------
    scores : sequence of dict
        Records accepted by ``pd.DataFrame``; each must contain a ``'data'``
        key naming the partition, with the remaining keys being model labels
        mapped to their scores.

    Returns
    -------
    alt.Chart
        Bar chart with one column facet per model label and one bar per
        partition.
    """
    # Reshape to long form once: one row per (partition, model) pair, with the
    # model label in 'variable' and the score in 'value'.
    long_scores = pd.DataFrame(scores).melt(id_vars='data')

    chart = alt.Chart(long_scores).mark_bar().encode(
        x='data:N',
        y='value',
        color='data',
        column='variable:N'
    )
    return chart

# Generate the data, fit every configured model, score the fits, then chart
# the fitted curves. `scores` is only defined part-way through this cell, so
# it is charted by the chart_scores call in the following cell rather than at
# the top of this one (where it would be undefined on a fresh kernel).
X_train, X_test, y_train, y_test = generate_data(mu, sigma, n, slope, intercept, noise, split)

fits, coefs = fit_models(models, X_train, y_train)

scores = score_models(fits, X_train, X_test, y_train, y_test)

chart = chart_fits(fits, coefs)
chart

(400, 1)
(10, 1)
(10,)
(10, 1)
(10,)


In [102]:
def chart_scores(scores):
    """Draw a bar chart comparing train and test scores for each model.

    Parameters
    ----------
    scores : sequence of dict
        Records accepted by ``pd.DataFrame``; each must contain a ``'data'``
        key naming the partition, with the remaining keys being model labels
        mapped to their scores.

    Returns
    -------
    alt.Chart
        Bar chart with one column facet per model label and one bar per
        partition.
    """
    # Reshape to long form once: one row per (partition, model) pair, with the
    # model label in 'variable' and the score in 'value'.
    long_scores = pd.DataFrame(scores).melt(id_vars='data')

    chart = alt.Chart(long_scores).mark_bar().encode(
        x='data:N',
        y='value',
        color='data',
        column='variable:N'
    )
    return chart

# Render the train/test score comparison; `scores` is the (train, test) pair
# returned by score_models in the cell above.
chart_scores(scores)