# Class Session 3 - Model Selection

Let's start by loading the data and looking at the first few rows.

In [1]:
import pandas as pd

# Read the CSV and shuffle every row in a single step; the fixed
# random_state keeps the shuffled order the same on each run.
file_name = 'bike-data.csv'
df = pd.read_csv(file_name).sample(frac=1, random_state=42)

# Preview the first rows of the shuffled frame.
df.head()

Unnamed: 0,instant,season,mnth,holiday,weekday,workingday,weathersit,temp,hum,windspeed,count
193,194,3,7,0,4,1,1,0.715833,0.529583,0.146775,7446
33,34,1,2,0,5,1,1,0.313333,0.526667,0.178496,4151
15,16,1,1,1,1,0,1,0.19,0.5225,0.231358,2298
310,311,4,11,0,2,1,1,0.280833,0.567083,0.173513,5686
57,58,1,2,0,1,1,1,0.366667,0.490833,0.268033,4322


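The code below fits a linear regression model that uses a single column (`temp`), expanded into multiple polynomial features (`temp`, `temp²`, …, up to `n_terms`) created from that column. A decision tree regressor can optionally be fitted on the same column for comparison.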

In [3]:
from ipywidgets import interact
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
import ipywidgets as widgets
import numpy as np

def add_columns(X, n_terms, column="temp"):
    """Return a copy of ``X`` extended with polynomial powers of one column.

    For every degree ``i`` from 2 up to ``n_terms`` (inclusive), a column
    named ``x{i}`` holding ``X[column] ** i`` is appended. When ``n_terms``
    is 1 or less no column is added and a plain copy of ``X`` is returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Input features. It is copied, never modified in place.
    n_terms : int
        Highest polynomial degree to generate.
    column : str, default "temp"
        Name of the column whose powers are computed. The default keeps the
        behaviour of the earlier two-argument form of this function.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with the extra ``x2`` ... ``x{n_terms}`` columns.

    Raises
    ------
    KeyError
        If ``column`` is missing from ``X`` and ``n_terms`` is at least 2.
    """
    new_X = X.copy()
    for degree in range(2, n_terms + 1):
        # Always read from the untouched input so each power is computed
        # from the raw values, not from a previously generated column.
        new_X[f"x{degree}"] = X[column] ** degree
    return new_X

def _mean_abs_error(model, X, y):
    """Return the mean absolute error of ``model.predict(X)`` against ``y``.

    ``y`` must be a pandas Series so that the difference supports ``.abs()``.
    """
    return (model.predict(X) - y).abs().mean()


def run_simple_regression(n_terms, min_samples, plot_train, plot_test, plot_tree, plot_linear):
    """Fit the selected models on ``temp`` and plot them against ``count``.

    The module-level ``df`` is split by position: the first 300 rows are the
    training set and the remaining rows are the test set. Depending on the
    flags, a decision tree and/or a polynomial linear regression is fitted on
    the training set, its prediction curve is drawn over the observed
    temperature range, and its mean absolute error (MAE) is printed.

    Parameters
    ----------
    n_terms : int
        Highest power of ``temp`` passed to ``add_columns`` for the linear
        model.
    min_samples : int
        ``min_samples_leaf`` of the decision tree.
    plot_train, plot_test : bool
        Whether to scatter-plot the corresponding split. Each flag also
        decides whether the MAE on that split is printed for every fitted
        model.
    plot_tree, plot_linear : bool
        Whether to fit the corresponding model and draw its prediction curve.

    Returns
    -------
    None
        The figure is left open on the current matplotlib backend.
    """
    n_train = 300  # number of leading rows of df used for fitting

    # Split data by position.
    df_train = df.iloc[:n_train]
    df_test = df.iloc[n_train:]
    X_train = df_train[["temp"]].copy()
    y_train = df_train["count"]
    X_test = df_test[["temp"]].copy()
    y_test = df_test["count"]

    # Temperature range covered by either split, rounded to two decimals.
    min_val = round(min(X_test.temp.min(), X_train.temp.min()), 2)
    max_val = round(max(X_test.temp.max(), X_train.temp.max()), 2)

    # Plot data
    fig, ax = plt.subplots(figsize=(8, 7), dpi=80)
    if plot_train:
        ax.scatter(X_train.temp, y_train, label="Train", color="orange")
    if plot_test:
        ax.scatter(X_test.temp, y_test, label="Test", color="blue")

    # Dense grid of temperatures on which the prediction curves are drawn.
    X_grid = pd.DataFrame(np.arange(min_val, max_val, 0.001), columns=["temp"])

    # Tree model
    if plot_tree:
        tree_model = DecisionTreeRegressor(min_samples_leaf=min_samples, random_state=42)
        tree_model = tree_model.fit(X_train, y_train)
        ax.plot(X_grid.temp, tree_model.predict(X_grid), color="green", label="Tree prediction")
        if plot_train:
            mae = _mean_abs_error(tree_model, X_train, y_train)
            print(f"MAE of Tree Model (train): {mae:.2f}")
        if plot_test:
            mae = _mean_abs_error(tree_model, X_test, y_test)
            print(f"MAE of Tree Model (test): {mae:.2f}")

    # Linear model
    if plot_linear:
        # The polynomial features get their own frames so the plain `temp`
        # frames used by the tree model are never overwritten.
        X_train_linear = add_columns(X_train, n_terms)
        X_test_linear = add_columns(X_test, n_terms)
        X_grid_linear = add_columns(X_grid, n_terms)
        linear_model = LinearRegression()
        linear_model = linear_model.fit(X_train_linear, y_train)
        ax.plot(X_grid_linear.temp, linear_model.predict(X_grid_linear),
                color="red", label="Linear prediction")
        if plot_train:
            mae = _mean_abs_error(linear_model, X_train_linear, y_train)
            print(f"MAE of Linear Model (train): {mae:.2f}")
        if plot_test:
            mae = _mean_abs_error(linear_model, X_test_linear, y_test)
            print(f"MAE of Linear Model (test): {mae:.2f}")

    ax.set_xlabel("Temperature")
    ax.set_ylabel("Bike demand")
    # Only draw a legend when something labelled was plotted; with every flag
    # switched off there are no handles to show.
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        ax.legend()
    # Optional: uncomment to draw a reference line at 6000 or to save the figure.
    #ax.axhline(y=6000, color='black', linestyle='--')
    #fig.savefig("bike_demand.png", dpi=300)
        

# Wire run_simple_regression to interactive controls. The two integer
# arguments get explicit sliders; the four flags are passed as plain booleans,
# which interact turns into widgets itself (checkboxes, per the ipywidgets
# documentation). The trailing semicolon suppresses the cell's return value.
interact(
    run_simple_regression,
    # Polynomial degree of the linear model: 1..10, starting at 1.
    n_terms=widgets.IntSlider(value=1, min=1, max=10, step=1),
    # Tree leaf size: 1..101 in steps of 5, starting at 100.
    min_samples=widgets.IntSlider(value=100, min=1, max=101, step=5),
    plot_train=True,
    plot_test=False,
    plot_linear=True,
    plot_tree=False,
);

interactive(children=(IntSlider(value=1, description='n_terms', max=10, min=1), IntSlider(value=100, descripti…