In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')

In [2]:
colors = ["#FF0B04", "#F1BE48",
           "#B9975B", "#8B5B29",
           "#524727",
         ]
sns.set_palette(sns.color_palette(colors))

## Data processing

In [3]:
train = pd.read_csv('train_clean.csv', index_col=0)
test = pd.read_csv('test_clean.csv', index_col=0)

In [4]:
cat_feats = train.select_dtypes(['object','bool']).columns.to_list()
cat_feats = cat_feats + ['YearBuilt','KitchenQual','GarageCars', 'BedroomAbvGr']

In [5]:
num_cols = train.select_dtypes(['float64','int64']).columns.to_list()
num_cols.remove('SalePrice')
num_cols.remove('PID')

## Simple linear model

In [6]:
def simple_linear_model_score(train, test, cols, target):
    ols = linear_model.LinearRegression()
    X = train[cols]
    cat_feats = X.select_dtypes(['object','bool']).columns.to_list()
    transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), cat_feats)], remainder='passthrough')
    X = transformer.fit_transform(X)
    y = np.log(train[target])
    
    ols.fit(X, y)
    train_score = ols.score(X, y)
    
    X_tst = test[cols]
    X_tst = transformer.transform(X_tst)
    y_tst = np.log(test[target])
    test_score = ols.score(X_tst, y_tst)
    
    return train_score, test_score

The kitchen sink model of every feature over learns. And has a negative test score

In [7]:
simple_linear_model_score(train, test, cat_feats+num_cols, 'SalePrice')

(0.9577940526767014, -50366487498.91418)

The numerical kitchen sink model is very successful

In [8]:
simple_linear_model_score(train, test, num_cols, 'SalePrice')

(0.9290445015300175, 0.8849212267144508)

The numerical kitchen sink model gets better when we selectively pick our favorite categorical features

In [9]:
fav_cat = ['YearBuilt', 'ExterQual', 'Neighborhood', 'KitchenQual']
simple_linear_model_score(train, test, num_cols+fav_cat, 'SalePrice')

(0.9418781919970982, 0.8991659160993932)

We can see that as we add more and more numerical features our model gets better.

In [10]:
k=100
cols = train.corr().nlargest(k, 'SalePrice')['SalePrice'].index
cols = cols[1:]
for num_features in range(1,len(cols)):
    print(cols[num_features])
    print(simple_linear_model_score(train, test, cols[1:1+num_features], 'SalePrice'))

GrLivArea
(0.5361242929283243, 0.5052486795731859)
ExterQual
(0.7034962809602388, 0.6644905975921633)
TotalBsmtSF
(0.7792598241555013, 0.7229657933399387)
KitchenQual
(0.7974247390402703, 0.7489550650148744)
1stFlrSF
(0.7975069805128663, 0.7497354661616471)
GarageArea
(0.8182544807598195, 0.779205489561503)
GarageCars
(0.8229613212881246, 0.7862240548493213)
BsmtQual
(0.8367316241545875, 0.8093688234853456)
YearBuilt
(0.8496799637760847, 0.8201252110298075)
FullBath
(0.8515427358211811, 0.8194532379214836)
GarageFinish
(0.8546721816101726, 0.8239315804336104)
FireplaceQu
(0.862791589110423, 0.8355690508933211)
MasVnrArea
(0.863014924192381, 0.8357178472479138)
TotRmsAbvGrd
(0.8634271653730479, 0.8341419606401707)
YearRemodAdd
(0.8695869148482882, 0.8386413512424243)
Fireplaces
(0.8735302259811796, 0.8403365999459429)
BsmtFinSF1
(0.8805143918287985, 0.8482424745087166)
HeatingQC
(0.8819825209355477, 0.8512038129217325)
BsmtExposure
(0.8837381955399547, 0.8524812209245907)
LotFrontage
(0

## Problem section

Can do lasso feature selction. Train a linear model with those selected features. But I get a dimension mismatch when trying to score the test dataset. 

In [11]:
scaler = StandardScaler(with_mean=False)

In [12]:
ols = linear_model.LinearRegression()

In [13]:
X = train.drop(['SalePrice', 'TotalBsmtSF'],axis=1)

transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), cat_feats)], remainder='passthrough')
X = transformer.fit_transform(X)

X = scaler.fit_transform(X)

y = np.log(train['SalePrice'])

lasso = linear_model.Lasso(alpha=0.05)

selector = SelectFromModel(estimator=lasso)

X = selector.fit_transform(X, y)

ols.fit(X,y)
ols.score(X,y)


0.9036774202856266

In [14]:
X_tst = test.drop(['SalePrice', 'TotalBsmtSF'],axis=1)
X_tst = transformer.transform(X_tst)
y_tst = np.log(test['SalePrice'])
X_tst = selector.fit_transform(X_tst, y_tst)



The problem is that the number of features between X and X_tst are not equal 11=/= 18

In [15]:
data_shapes = {'X.shape': X.shape, 'y.shape':y.shape, 
               'X_tst.shape':X_tst.shape, 'y_tst.shape':y_tst.shape}
for data_shape in data_shapes.keys():
    print(f'{data_shape}: {data_shapes[data_shape]}')

X.shape: (1871, 11)
y.shape: (1871,)
X_tst.shape: (624, 18)
y_tst.shape: (624,)


In [16]:
ols.score(X_tst,y_tst)

ValueError: dimension mismatch