In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.api as sm

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [2]:
# if necessary, download 'US' library dependency
#!pip install US
from clean_data import *


In [3]:
# Helper functions
def summary_model(X, y, label='scatter'):
    """Fit an OLS regression of ``y`` on ``X`` plus an intercept; return its summary.

    Parameters
    ----------
    X : features passed to ``sm.add_constant`` before fitting.
    y : regression target passed to ``sm.OLS``.
    label : accepted for call compatibility; not used by this function.

    Returns
    -------
    The object produced by calling ``.summary()`` on the fitted OLS results.
    """
    design = sm.add_constant(X)
    return sm.OLS(y, design).fit().summary()

def plot_model(X, y, label='Residual Plot'):
    """Scatter the studentized residuals of an OLS fit against its fitted values.

    An intercept column is added to ``X`` before fitting so that the model
    being diagnosed has the same specification as the one ``summary_model``
    reports (``sm.OLS`` does not add an intercept on its own).

    Parameters
    ----------
    X : features; a constant column is prepended via ``sm.add_constant``.
    y : regression target.
    label : legend label for the scatter series.

    Returns
    -------
    None. The figure is drawn on the current pyplot axes and shown.
    """
    X = sm.add_constant(X)
    model = sm.OLS(y, X).fit()
    student_resids = model.outlier_test()['student_resid']
    # In-sample predictions; equivalent to model.predict(X) on the design matrix
    y_hats = model.fittedvalues

    plt.scatter(y_hats, student_resids, alpha=.35, label=label)
    plt.xlabel('Fitted values')
    plt.ylabel('Studentized residuals')
    plt.legend()
    plt.show()

In [4]:
# Load the training data from the project's data directory.
# low_memory=False asks pandas to parse the whole file at once instead of in
# chunks, which avoids mixed-dtype inference within a column.
df = pd.read_csv('data/Train.csv', low_memory=False)


In [5]:
# Clean the data with the helper script.
# The cleaning function is called through its module because the result is
# stored under the same name (`clean_df`) that the star-import gave the
# function; calling the bare name would fail on a second run of this cell
# ("'DataFrame' object is not callable").
import clean_data
clean_df = clean_data.clean_df(df)

In [6]:
# Preview the first rows of the cleaned frame
clean_df.head()

Unnamed: 0,SalePrice,MachineID,ModelID,datasource,YearMade,MachineHoursCurrentMeter,state,Ripper: None or Unspecified,Ripper: Yes,Ripper: Multi Shank,...,ProductSize: Large,ProductSize: Compact,HorsePower,Vehicle Type: Track Excavators,Vehicle Type: Track Type Tractors,Vehicle Type: Backhoe Loaders,Vehicle Type: Wheel Loader,Vehicle Type: Skid Steer Loaders,Vehicle Type: Motor Graders,yearsold
0,66000,999089,3157,121,2004.0,68.0,1,0.0,0.0,0.0,...,0.0,0.0,120.0,0.0,0.0,0.0,1.0,0.0,0.0,2006
1,57000,117657,77,121,1996.0,4640.0,37,0.0,0.0,0.0,...,0.0,0.0,175.0,0.0,0.0,0.0,1.0,0.0,0.0,2004
2,10000,434808,7009,121,2001.0,2838.0,36,0.0,0.0,0.0,...,0.0,0.0,165.216455,0.0,0.0,0.0,0.0,1.0,0.0,2004
3,38500,1026470,332,121,2001.0,3486.0,48,0.0,0.0,0.0,...,0.0,0.0,165.216455,1.0,0.0,0.0,0.0,0.0,0.0,2011
4,11000,1057373,17311,121,2007.0,722.0,36,0.0,0.0,0.0,...,0.0,0.0,165.216455,0.0,0.0,0.0,0.0,1.0,0.0,2009


Unnamed: 0,MachineID,ModelID,datasource,YearMade,MachineHoursCurrentMeter,state,Ripper: None or Unspecified,Ripper: Yes,Ripper: Multi Shank,Ripper: Single Shank,...,ProductSize: Large,ProductSize: Compact,HorsePower,Vehicle Type: Track Excavators,Vehicle Type: Track Type Tractors,Vehicle Type: Backhoe Loaders,Vehicle Type: Wheel Loader,Vehicle Type: Skid Steer Loaders,Vehicle Type: Motor Graders,yearsold
0,999089,3157,121,2004.0,68.000000,1,0.0,0.0,0.0,0.0,...,0.0,0.0,120.000000,0.0,0.0,0.0,1.0,0.0,0.0,2006
1,117657,77,121,1996.0,4640.000000,37,0.0,0.0,0.0,0.0,...,0.0,0.0,175.000000,0.0,0.0,0.0,1.0,0.0,0.0,2004
2,434808,7009,121,2001.0,2838.000000,36,0.0,0.0,0.0,0.0,...,0.0,0.0,165.216455,0.0,0.0,0.0,0.0,1.0,0.0,2004
3,1026470,332,121,2001.0,3486.000000,48,0.0,0.0,0.0,0.0,...,0.0,0.0,165.216455,1.0,0.0,0.0,0.0,0.0,0.0,2011
4,1057373,17311,121,2007.0,722.000000,36,0.0,0.0,0.0,0.0,...,0.0,0.0,165.216455,0.0,0.0,0.0,0.0,1.0,0.0,2009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401120,1840702,21439,149,2005.0,3457.955353,24,0.0,0.0,0.0,0.0,...,0.0,0.0,165.216455,1.0,0.0,0.0,0.0,0.0,0.0,2011
401121,1830472,21439,149,2005.0,3457.955353,24,0.0,0.0,0.0,0.0,...,0.0,0.0,165.216455,1.0,0.0,0.0,0.0,0.0,0.0,2011
401122,1887659,21439,149,2005.0,3457.955353,24,0.0,0.0,0.0,0.0,...,0.0,0.0,165.216455,1.0,0.0,0.0,0.0,0.0,0.0,2011
401123,1903570,21435,149,2005.0,3457.955353,12,0.0,0.0,0.0,0.0,...,0.0,0.0,165.216455,1.0,0.0,0.0,0.0,0.0,0.0,2011


In [16]:
# Separate the target values (SalePrice -> y) from the feature columns (X)
y = clean_df['SalePrice']
X = clean_df.drop('SalePrice', axis=1)

# Fit the OLS baseline on all features; its summary is this cell's output
summary_model(X, y)

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.585
Model:,OLS,Adj. R-squared:,0.585
Method:,Least Squares,F-statistic:,24600.0
Date:,"Fri, 17 Apr 2020",Prob (F-statistic):,0.0
Time:,12:32:20,Log-Likelihood:,-4421900.0
No. Observations:,401125,AIC:,8844000.0
Df Residuals:,401101,BIC:,8844000.0
Df Model:,23,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-8.052e+05,7514.560,-107.152,0.000,-8.2e+05,-7.9e+05
MachineID,-0.0064,5.81e-05,-110.521,0.000,-0.007,-0.006
ModelID,0.0052,0.004,1.279,0.201,-0.003,0.013
datasource,129.3491,2.937,44.039,0.000,123.592,135.106
YearMade,1563.5934,3.669,426.118,0.000,1556.402,1570.785
MachineHoursCurrentMeter,-0.0045,0.001,-3.127,0.002,-0.007,-0.002
state,-14.8631,1.422,-10.453,0.000,-17.650,-12.076
Ripper: None or Unspecified,9576.4724,347.934,27.524,0.000,8894.532,1.03e+04
Ripper: Yes,2.77e+04,398.532,69.496,0.000,2.69e+04,2.85e+04

0,1,2,3
Omnibus:,67000.364,Durbin-Watson:,1.074
Prob(Omnibus):,0.0,Jarque-Bera (JB):,223367.24
Skew:,0.846,Prob(JB):,0.0
Kurtosis:,6.241,Cond. No.,1.29e+21


In [None]:
# Residual plot (studentized residuals vs. predictions) for the OLS fit on X, y
plot_model(X, y)

In [None]:
# Plot a 100-bin histogram of the target, SalePrice (y)
y.hist(bins=100)
plt.show()

In [None]:

# K-fold cross-validation of a ridge regression: record train and test RMSE
# for every fold.
n_folds = 10
# Fixed seed so the shuffled fold assignment is the same on every run
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
test_cv_errors, train_cv_errors = np.empty(n_folds), np.empty(n_folds)

for idx, (train, test) in enumerate(kf.split(X)):
    # kf.split yields positional row indices, so rows are selected with .iloc
    # (plain X[train] on a DataFrame would be interpreted as column labels).
    X_fold_train, y_fold_train = X.iloc[train], y.iloc[train]
    X_fold_test, y_fold_test = X.iloc[test], y.iloc[test]

    model = Ridge(alpha=.5)
    model.fit(X_fold_train, y_fold_train)
    train_pred = model.predict(X_fold_train)
    test_pred = model.predict(X_fold_test)

    train_cv_errors[idx] = np.sqrt(mean_squared_error(y_fold_train, train_pred))
    test_cv_errors[idx] = np.sqrt(mean_squared_error(y_fold_test, test_pred))

train_cv_errors, test_cv_errors