# Grocery Problem: Model Selection

In this notebook, we train several candidate models on the data and use cross validation to find the best model for the job.

# Imports

In [2]:
import time
import pickle
import numpy as np
import matplotlib as plt
import pandas as pd
from datetime import date

from IPython.display import display # extract a feature record from each date
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

from multiprocessing import Pool, cpu_count

## Load Data
We reload the preprocessed data from the previous notebooks.

In [3]:
# Read the training inputs and targets back from the .npz archive on disk.
# Both arrays are pulled out while the file handle is still open.
with open("data/groceries/groceries_dataset.npz", "rb") as f:
    dataset = np.load(f)
    train_ins = dataset["train_ins"]
    train_outs = dataset["train_outs"]

In [4]:
# Sanity check: report how many training examples were loaded.
n_loaded = len(train_ins)
print(f"Loaded {n_loaded} of training examples")

Loaded 10000000 of training examples


## Validation-Train Split
We split the data into training and validation subsets so that we can fit models on the training set and evaluate them on the validation set. We do not shuffle because the data is a time series.

We include 200,000 examples in our validation set and leave the rest for training.

In [None]:
# Ordered hold-out split: shuffling is disabled because the data is a time
# series. The validation set holds 200,000 rows (the same value as int(2e+5)).
# NOTE: this rebinds train_ins / train_outs, so re-running the cell without
# reloading the data shrinks the training set again.
train_ins, valid_ins, train_outs, valid_outs = train_test_split(
    train_ins,
    train_outs,
    test_size=200_000,
    shuffle=False,
)

In [None]:
# Report the size of each subset produced by the split.
n_train, n_valid = len(train_ins), len(valid_ins)
print(f"{n_train} training examples, {n_valid}  validation examples")

9800000 training examples, 200000  validation examples


## Model Selection
We try a selection of models:
- linear regression with SGD
- RandomForest
- Neural Network

and perform cross validation using mean squared log error to determine the best performing model

We first train the models:

In [None]:
models = {
    "linear regression": SGDRegressor(verbose=1),
    "random forest": RandomForestRegressor(verbose=1, n_jobs=-1)
}


for name, model in models.items():
    print(f"training {name}...")
    %time model.fit(train_ins, train_outs)