In [1]:
import pandas as pd
import numpy as np

In [2]:
# get data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
train.shape, test.shape

((1460, 81), (1459, 80))

## Step 1: Preprocessing

In [3]:
# remove target y from training data
y = train.pop('SalePrice')
train_id = test.pop('Id')
test_id = train.pop('Id')
train.shape, test.shape

((1460, 79), (1459, 79))

In [4]:
# data preprocessing: remove NaNS, dummify categorical columns
data = pd.concat([train,test])
data = data.fillna("NA")
data = pd.get_dummies(data)

In [5]:
#data.head(10)

In [6]:
X = data.iloc[:len(y),:]
test_data = data.iloc[len(y):,:]
X.shape, test_data.shape

((1460, 5058), (1459, 5058))

In [7]:

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1022, 5058) (1022,)
(438, 5058) (438,)


In [8]:
# data preprocessing: Scaling
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
test_data = ss.transform(test_data)  

## Step2: Building the Model

### Model 1: Basic Linear Regression

In [9]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train,y_train)
model.score(X_train,y_train), model.score(X_test, y_test)

(1.0, -3.4603416799443565)

The model here is overfit, it performs perfectly on the training data, but on the test data its performance is worse than the baseline model. This is expected because the number of features is too large. We need to do regularized regression to solve the overfitting problem.

### Model 2: LassoCV

In [10]:
# create and fit the model
from sklearn.linear_model import LassoCV

lasso_cv = LassoCV()
lasso_cv.fit(X_train, y_train)

LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False)

In [11]:
# get the score
lasso_cv.score(X_train, y_train), lasso_cv.score(X_test, y_test)

(0.9786201345190817, 0.8835524732378173)

In [12]:
# predicted house prices using our LassoCV model

y_hat = lasso_cv.predict(test_data)
y_hat[:10]

array([118414.27552593, 161386.37301616, 187318.57398658, 195029.13525644,
       205781.10369332, 170111.89188944, 178318.34574767, 161425.53971796,
       196021.79455658, 115258.04815788])

In [13]:
# number of features kept is 369
# number of features eliminated is 4689
sum(lasso_cv.coef_!=0),sum(lasso_cv.coef_==0)

(493, 4565)

In [19]:
lasso_cv.alpha_ , lasso_cv.cv

(598.5042885597512, None)

In [23]:
lasso_cv.selection

'cyclic'

In [24]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics

# Perform 5-fold cross validation
scores = cross_val_score(lasso_cv, X_train, y_train, cv=3)

In [25]:
print("Cross-validated scores:", scores)

Cross-validated scores: [0.77800648 0.81293111 0.88503975]


In [26]:
print("Mean of Ccoss-validated scores:", scores.mean())

Mean of Ccoss-validated scores: 0.8253257792344875


### Model 3: RidgeCV

In [29]:
# create and fit the model
from sklearn.linear_model import RidgeCV

ridge_cv = RidgeCV().fit(X_train, y_train)

In [40]:
ridge_cv.score(X_train, y_train),ridge_cv.score(X_test, y_test)

(0.9999810015957754, 0.785894404882588)

In [None]:
# Cross Validation
from sklearn.model_selection import cross_val_score, cross_val_predict

scores = cross_val_score(lasso_cv,X_train, y_train,cv=5)
print(scores),print(scores.mean())

In [None]:
# Prediction
# cross_val_predict()