In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold , cross_val_score, train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error


In [2]:
# Read the training set and discard the two identifier columns, which are
# not used as features.
df = (
    pd.read_csv('bpp_training_data.csv', index_col=False)
    .drop(columns=['SalesID', 'MachineID'])
)

# Re-order the columns so that the target, SalePrice, is the last one.
feature_names = [name for name in df.columns if name != 'SalePrice']
df = df[feature_names + ['SalePrice']]

# X holds every column except the target; Y is the target column.
X = df[feature_names]
Y = df['SalePrice']

df

Unnamed: 0,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,fiModelDesc,fiBaseModel,fiSecondaryDesc,fiModelSeries,...,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls,saleYear,saleMonth,saleDay,saleDayOfYear,SalePrice
0,8434,132,17,1974,17,0,4593,1744,0,0,...,1,8,6,0,0,1989,1,17,17,9500.0
1,10150,132,31,1980,31,0,1820,559,0,0,...,0,0,0,4,2,1989,1,31,31,14000.0
2,4139,132,31,1978,31,0,2348,713,59,0,...,1,8,6,0,0,1989,1,31,31,50000.0
3,8591,132,31,1980,31,0,1819,558,0,0,...,0,0,0,4,2,1989,1,31,31,16000.0
4,4089,132,31,1984,31,0,2119,683,21,0,...,1,6,5,0,0,1989,1,31,31,22000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,6272,136,78,1000,78,0,4215,1500,0,0,...,0,0,0,0,0,2009,3,19,78,16500.0
299996,12919,136,78,1000,78,0,4216,1501,0,0,...,0,0,0,0,0,2009,3,19,78,17000.0
299997,9580,136,78,2005,78,0,4514,1684,0,0,...,0,0,0,0,0,2009,3,19,78,11500.0
299998,11398,132,78,2005,78,3,1202,354,75,0,...,1,6,6,0,0,2009,3,19,78,32000.0


In [3]:
# Hold out 30% of the rows for evaluation. The fixed random_state makes the
# split, and every metric computed from it, reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

In [4]:
# Baseline model: ordinary least squares fitted on the training split.
lin_reg = linear_model.LinearRegression().fit(X_train, Y_train)

# Evaluate on the held-out split. scikit-learn metrics expect the ground truth
# first and the predictions second: (y_true, y_pred).
lin_reg_pred = lin_reg.predict(X_test)
MAE_lin = mean_absolute_error(Y_test, lin_reg_pred)
score_lin = lin_reg.score(X_test, Y_test)  # reported below as "R2 Score"

In [5]:
# L1-regularised linear regression (Lasso), evaluated on the held-out split.
# scikit-learn metrics expect (y_true, y_pred), in that order.
lasso_reg = linear_model.Lasso(alpha=50, max_iter=10000).fit(X_train, Y_train)
lasso_reg_pred = lasso_reg.predict(X_test)
MAE_lasso = mean_absolute_error(Y_test, lasso_reg_pred)
score_lasso = lasso_reg.score(X_test, Y_test)  # reported below as "R2 Score"

# L2-regularised linear regression (Ridge), evaluated the same way.
ridge_reg = linear_model.Ridge(alpha=50, max_iter=10000).fit(X_train, Y_train)
ridge_reg_pred = ridge_reg.predict(X_test)
MAE_ridge = mean_absolute_error(Y_test, ridge_reg_pred)
score_ridge = ridge_reg.score(X_test, Y_test)  # reported below as "R2 Score"

In [6]:
# 10-fold cross-validation of each model on the full data set (X, Y).
# A, B and C hold the per-fold scores for the plain, L1 and L2 models.
# NOTE(review): an integer `cv` presumably yields consecutive, unshuffled folds,
# and the displayed frame appears to be ordered by sale date, so each fold
# would be a contiguous time block. Confirm that this is intended.
A, B, C = (
    cross_val_score(estimator, X, Y, cv=10)
    for estimator in (lin_reg, lasso_reg, ridge_reg)
)

# Collapse each set of fold scores into a single mean score per model.
cv_score_lin, cv_score_lasso, cv_score_ridge = (
    fold_scores.mean() for fold_scores in (A, B, C)
)

In [7]:
# Summary table: one row per model, one column per metric. Building the frame
# in a single constructor call (rather than filling an empty frame cell by
# cell) keeps the metric columns numeric and avoids nine near-identical lines.
metrics = pd.DataFrame(
    {
        "Mean Absolute Error": [MAE_lin, MAE_lasso, MAE_ridge],
        "R2 Score": [score_lin, score_lasso, score_ridge],
        "CV Score": [cv_score_lin, cv_score_lasso, cv_score_ridge],
    },
    index=[
        "Linear Regression",
        "Linear Regression with L1",
        "Linear Regression with L2",
    ],
)

metrics.index.name = "Model"
metrics

Unnamed: 0_level_0,Mean Absolute Error,R2 Score,CV Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Linear Regression,12524.966689,0.405558,0.385364
Linear Regression with L1,12632.090103,0.398492,0.378885
Linear Regression with L2,12529.705832,0.405514,0.385373
