In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re

In [12]:
from pandas.plotting import scatter_matrix

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone

%matplotlib inline
import matplotlib.pyplot as plt

In [3]:
from clean_data import clean_df

In [21]:
def rmsle(actual, predictions):
    log_diff = np.log(predictions+1) - np.log(actual+1)
    return np.sqrt(np.mean(log_diff**2))

In [4]:
auction_train = pd.read_csv('data/Train.csv', low_memory=False)

In [5]:
clean_auction = clean_df(auction_train)

In [8]:
clean_auction.drop(columns=['MachineID', 'ModelID', 'datasource'], inplace=True)

In [9]:
clean_auction.head()

Unnamed: 0,SalePrice,YearMade,MachineHoursCurrentMeter,state,Ripper: None or Unspecified,Ripper: Yes,Ripper: Multi Shank,Ripper: Single Shank,ProductSize: Medium,ProductSize: Large / Medium,...,ProductSize: Large,ProductSize: Compact,HorsePower,Vehicle Type: Track Excavators,Vehicle Type: Track Type Tractors,Vehicle Type: Backhoe Loaders,Vehicle Type: Wheel Loader,Vehicle Type: Skid Steer Loaders,Vehicle Type: Motor Graders,yearsold
0,66000,2004.0,68.0,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,120.0,0.0,0.0,0.0,1.0,0.0,0.0,2006
1,57000,1996.0,4640.0,37,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,175.0,0.0,0.0,0.0,1.0,0.0,0.0,2004
2,10000,2001.0,2838.0,36,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,165.216455,0.0,0.0,0.0,0.0,1.0,0.0,2004
3,38500,2001.0,3486.0,48,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,165.216455,1.0,0.0,0.0,0.0,0.0,0.0,2011
4,11000,2007.0,722.0,36,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,165.216455,0.0,0.0,0.0,0.0,1.0,0.0,2009


In [33]:
clean_auction['SalePrice'] = np.log(clean_auction['SalePrice'])

In [34]:
clean_auction.head()

Unnamed: 0,SalePrice,YearMade,MachineHoursCurrentMeter,state,Ripper: None or Unspecified,Ripper: Yes,Ripper: Multi Shank,Ripper: Single Shank,ProductSize: Medium,ProductSize: Large / Medium,...,ProductSize: Large,ProductSize: Compact,HorsePower,Vehicle Type: Track Excavators,Vehicle Type: Track Type Tractors,Vehicle Type: Backhoe Loaders,Vehicle Type: Wheel Loader,Vehicle Type: Skid Steer Loaders,Vehicle Type: Motor Graders,yearsold
0,11.09741,2004.0,68.0,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,120.0,0.0,0.0,0.0,1.0,0.0,0.0,2006
1,10.950807,1996.0,4640.0,37,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,175.0,0.0,0.0,0.0,1.0,0.0,0.0,2004
2,9.21034,2001.0,2838.0,36,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,165.216455,0.0,0.0,0.0,0.0,1.0,0.0,2004
3,10.558414,2001.0,3486.0,48,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,165.216455,1.0,0.0,0.0,0.0,0.0,0.0,2011
4,9.305651,2007.0,722.0,36,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,165.216455,0.0,0.0,0.0,0.0,1.0,0.0,2009


In [35]:
y = np.array(clean_auction['SalePrice'])
X = np.array(clean_auction.drop(columns='SalePrice'))

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [46]:
model = Pipeline([('standardize', StandardScaler()),
                   ('regressor', Ridge())])

In [47]:
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardize',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('regressor',
                 Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                       max_iter=None, normalize=False, random_state=None,
                       solver='auto', tol=0.001))],
         verbose=False)

In [48]:
y_hat_train = model.predict(X_train)
y_hat_test = model.predict(X_test)

In [49]:
print('Training error: {}'.format(rmsle(y_train, y_hat_train)))
print('Testing error: {}'.format(rmsle(y_test, y_hat_test)))

Training error: 0.03681898138545605
Testing error: 0.036751365890830084


In [50]:
n_folds = 5

kf = KFold(n_splits=n_folds)

test_cv_errors, train_cv_errors = np.empty(n_folds), np.empty(n_folds)

for idx, (train, test) in enumerate(kf.split(X_train)):
    # Split into train and test
    X_cv_train, y_cv_train = X_train[train], y_train[train]
    X_cv_test, y_cv_test = X_train[test], y_train[test]

    # Fit data onto model
    model.fit(X_cv_train, y_cv_train)
    
    # Measure performance
    y_hat_train = model.predict(X_cv_train)
    y_hat_test = model.predict(X_cv_test)
    
    # Calclate the error metrics
    train_cv_errors[idx] = rmsle(y_cv_train, y_hat_train)
    test_cv_errors[idx] = rmsle(y_cv_test, y_hat_test)
    
print("Training CV error: {:2.2f}".format(train_cv_errors.mean()))
print("Test CV error: {:2.2f}".format(test_cv_errors.mean()))

Training CV error: 0.04
Test CV error: 0.04


In [None]:
nalphas = 50
min_alpha_exp = -3
max_alpha_exp = 1.5
nfeatures = 6
coefs = np.zeros((nalphas, nfeatures))
alphas = np.logspace(min_alpha_exp, max_alpha_exp, nalphas)
for i, alpha in enumerate(alphas):
    #model = Pipeline([('standardize', StandardScaler()),
    #                  ('lasso', Lasso(alpha=alpha))])
    model = Lasso(alpha=alpha)
    model.fit(X, y)
    #coefs[i] = model.steps[1][1].coef_
    coefs[i] = model.coef_