In [1]:
import pandas as pd
import numpy as np

import dask
import dask_ml

from sklearn.linear_model import LinearRegression, Ridge, Lasso, Huber
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import KFold, train_test_split

In [2]:
def calc_train_error(X_train, y_train, model):
    '''Return the in-sample mean squared error (MSE) of an already-fit model.

    Parameters
    ----------
    X_train : features to score on; passed unchanged to ``model.predict``.
    y_train : true target values corresponding to ``X_train``.
    model : already-fit estimator exposing ``predict``.

    Returns
    -------
    The result of ``mean_squared_error(y_train, predictions)``. This is the
    MSE itself, not its square root.
    '''
    predictions = model.predict(X_train)
    return mean_squared_error(y_train, predictions)
    
def calc_validation_error(X_test, y_test, model):
    '''Return the out-of-sample mean squared error (MSE) of an already-fit model.

    Parameters
    ----------
    X_test : held-out features; passed unchanged to ``model.predict``.
    y_test : true target values corresponding to ``X_test``.
    model : already-fit estimator exposing ``predict``.

    Returns
    -------
    The result of ``mean_squared_error(y_test, predictions)``. This is the
    MSE itself, not its square root.
    '''
    predictions = model.predict(X_test)
    return mean_squared_error(y_test, predictions)
    
def calc_metrics(X_train, y_train, X_test, y_test, model):
    '''Fit ``model`` on the training split, then score it on both splits.

    Returns a ``(train_error, validation_error)`` tuple: the value of
    ``calc_train_error`` on the training data followed by the value of
    ``calc_validation_error`` on the held-out data. In this file both
    helpers return the mean squared error (not its square root).
    '''
    # The model is fit in place; callers keep a reference to the fitted object.
    model.fit(X_train, y_train)
    return (
        calc_train_error(X_train, y_train, model),
        calc_validation_error(X_test, y_test, model),
    )

In [3]:
def cross_validate_errors(modeldata, modeltarget, model, log=False, n_splits=10, random_state=15):
    """
    Run shuffled K-fold cross-validation and print the mean train / validation error.

    For every fold the model is refit through ``calc_metrics`` on the training
    rows and scored on the held-out rows. The per-fold errors are averaged and
    the two means are printed on a single line (nothing is printed per fold).

    Parameters
    ----------
    modeldata : pandas object holding the features; rows are picked with ``.iloc``.
    modeltarget : pandas object holding the target, row-aligned with ``modeldata``.
    model : estimator accepted by ``calc_metrics``; it is refit on every fold.
    log : when truthy, each fold's errors are transformed as ``e**-1 - 1``
        before being averaged.
    n_splits : number of folds (default 10, the previously hard-coded value).
    random_state : seed for the fold shuffle (default 15, the previously
        hard-coded value).

    Returns
    -------
    None. The summary line is printed, not returned.
    """
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    train_errors = []
    val_errors = []
    for train_idx, val_idx in folds.split(modeldata, modeltarget):
        train_e, val_e = calc_metrics(
            modeldata.iloc[train_idx],
            modeltarget.iloc[train_idx],
            modeldata.iloc[val_idx],
            modeltarget.iloc[val_idx],
            model,
        )
        if log:
            # NOTE(review): ``e**-1 - 1`` is the reciprocal minus one. If the
            # intent was to undo a log-type transform of the target, this is
            # probably not the right inverse -- confirm before relying on it.
            train_e = train_e ** -1 - 1
            val_e = val_e ** -1 - 1

        train_errors.append(train_e)
        val_errors.append(val_e)

    terror = np.mean(train_errors)
    verror = np.mean(val_errors)
    print(str(round(terror, 6)) + ' Training Error, ' + str(round(verror, 6)) + ' Valid error')

In [4]:
def train_validate_errors(modeldata, modeltarget, model, val_size=0.1, random=42):
    '''
    Hold out a test split, carve a validation split out of the remainder, then
    fit ``model`` and print its training and validation errors.

    ``val_size`` is used as ``test_size`` for both splits and ``random`` as the
    ``random_state`` for both. The test portion of the first split is discarded
    without being used. One summary line is printed; the return value is None.
    '''
    split_options = dict(shuffle=True, test_size=val_size, random_state=random)

    # First split: keep only the train+validation halves (positions 0 and 2 of
    # the returned sequence); the test halves are dropped immediately.
    X_trainval, y_trainval = train_test_split(modeldata, modeltarget, **split_options)[::2]

    # Second split: same fraction and seed, applied to what is left.
    X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, **split_options)

    train_e, val_e = calc_metrics(X_train, y_train, X_val, y_val, model)

    summary = str(round(train_e, 6)) + ' Training Error, ' + str(round(val_e, 6)) + ' Valid error'
    return print(summary)
    
    

In [5]:
# Load the pickled dataset and keep only rows whose `datetime` year is after 2013.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source; the absolute path is also specific to one machine.
data = (
    pd.read_pickle('/home/michael/Documents/Projects/divvydataproject/data/fulldataagg.pkl')
    .loc[lambda frame: frame['datetime'].dt.year > 2013]
)

In [6]:
# Preview the first rows of the filtered frame.
data.head()

Unnamed: 0,datetime,to_station_id,temp,precip,trips,location_soldierfield,location_unitedcenter,location_uscellular,location_wrigley
318445,2014-01-01,135,10.0,0.04,1.0,0.0,0.0,0.0,0.0
318446,2014-01-01,135,10.0,0.0,1.0,0.0,0.0,0.0,0.0
318447,2014-01-01,135,10.0,0.0,1.0,0.0,0.0,0.0,0.0
318448,2014-01-01,339,10.0,0.04,1.0,0.0,0.0,0.0,0.0
318449,2014-01-01,339,10.0,0.0,1.0,0.0,0.0,0.0,0.0


In [7]:
# Features: temperature, precipitation and the four `location_*` columns.
modeldata = data.loc[:, [
    'temp',
    'precip',
    'location_soldierfield',
    'location_unitedcenter',
    'location_uscellular',
    'location_wrigley',
]]
# Target is kept as a one-column DataFrame (not a Series).
modeltarget = data.loc[:, ['trips']]
# Drop the reference to the full frame now that the model inputs are extracted.
del data

In [8]:
# Sweep the Lasso regularisation strength. For each value, print the
# cross-validated errors and then the coefficients of a fit on the full data.
# The list gets its own name (`alphas`) so the loop variable no longer rebinds
# the list it iterates over. Larger values (1e1, 1e2) are currently disabled.
alphas = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]
for alpha in alphas:
    lasso = Lasso(alpha=alpha, fit_intercept=True, random_state=77)
    print('Alpha: '+str(alpha))
    cross_validate_errors(modeldata, modeltarget, lasso)
    # Refit on all rows to inspect which coefficients are shrunk to zero.
    print(lasso.fit(modeldata,modeltarget).coef_)

Alpha: 1e-06
4.077142 Training Error, 4.077146 Valid error
[ 0.01772624 -0.72503658  1.29436491 -0.11329335 -0.60870187  7.27292922]
Alpha: 1e-05
4.077154 Training Error, 4.077158 Valid error
[ 0.01772587 -0.72170777  0.44404845 -0.0523417  -0.50686152  7.17533817]
Alpha: 0.0001
4.077311 Training Error, 4.077314 Valid error
[ 0.01772232 -0.68830802  0.         -0.         -0.          6.19938229]
Alpha: 0.001
4.082461 Training Error, 4.082462 Valid error
[ 0.01768985 -0.35531971  0.         -0.         -0.          0.        ]
Alpha: 0.01
4.083514 Training Error, 4.083514 Valid error
[ 0.0176072 -0.         0.        -0.        -0.         0.       ]
Alpha: 0.1
4.083536 Training Error, 4.083537 Valid error
[ 0.01739978 -0.          0.         -0.         -0.          0.        ]
Alpha: 1
4.085818 Training Error, 4.085819 Valid error
[ 0.01532558 -0.          0.         -0.         -0.          0.        ]


In [9]:
# Lasso on standardized features: StandardScaler and Lasso are chained in a
# pipeline, so the scaler is refit together with the model on each CV fold.
# NOTE(review): alpha=1e6 is far above the 1e-6..1 values used in the alpha
# sweep in this notebook -- confirm it is not meant to be 1e-6.
lasso = Lasso(alpha=1e6, fit_intercept=True, random_state=77)
ss = StandardScaler()
m1 = make_pipeline(ss, lasso)
cross_validate_errors(modeldata, modeltarget, m1)

4.999059 Training Error, 4.999059 Valid error


In [11]:
# Unregularised baseline: ordinary least squares on standardized features,
# evaluated with the same cross-validation helper as the Lasso runs.
lin = LinearRegression(fit_intercept=True, n_jobs = -1)
ss = StandardScaler()
# Rebinds `m1`, replacing any pipeline previously stored under that name.
m1 = make_pipeline(ss, lin)
cross_validate_errors(modeldata, modeltarget, m1)

4.80802 Training Error, 4.808026 Valid error


In [8]:
# Lasso on degree-2 polynomial features (no scaling step in this pipeline).
# NOTE(review): the recorded output of this cell is a MemoryError, so as
# written it does not complete on the full dataset. alpha=1e6 also looks
# suspiciously large -- confirm the intended value.
lasso = Lasso(alpha=1e6, fit_intercept=True, random_state=77)

m1 = make_pipeline(PolynomialFeatures(2), lasso)
cross_validate_errors(modeldata, modeltarget, m1)

MemoryError: 