In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')

In [2]:
colors = ["#FF0B04", "#F1BE48",
           "#B9975B", "#8B5B29",
           "#524727",
         ]
sns.set_palette(sns.color_palette(colors))

In [3]:
train = pd.read_csv('train_clean.csv', index_col=0)

In [4]:
cols_na = train.loc[:,train.isna().any(axis=0)].columns.to_list()
cols_na

[]

In [5]:
cat_feats = train.select_dtypes(['object','bool']).columns.to_list()

In [7]:
cat_feats = cat_feats + ['YearBuilt','KitchenQual','GarageCars', 'BedroomAbvGr']

In [10]:
ols = linear_model.LinearRegression()

In [11]:
# Note on why TotalBsmt... later
X = train.drop(['SalePrice', 'TotalBsmtSF'],axis=1)
transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), cat_feats)], remainder='passthrough')
X = transformer.fit_transform(X)
y = np.log(train['SalePrice'])
ols.fit(X, y)
ols.score(X, y)

0.8445232571694062

In [12]:
# 0.9580850723384223

In [13]:
# 0.9531732835572826

In [14]:
test = pd.read_csv('test_clean.csv', index_col=0)

In [15]:
test.head()

Unnamed: 0,PID,GrLivArea,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,903401020,1820,184000,MSSubClass_50,RL,60.0,9120,Pave,2,Reg,...,0,0,0,GdPrv,,0,6,2008,WD,Normal
1,909100080,1296,104000,MSSubClass_30,RL,67.0,4853,Pave,0,Reg,...,0,0,0,MnPrv,,0,5,2010,WD,Normal
2,531385020,1434,189000,MSSubClass_20,RL,65.0,8529,Pave,0,IR1,...,0,0,0,,,0,4,2009,WD,Normal
3,907130110,1499,187000,MSSubClass_60,RL,65.0,12438,Pave,0,IR1,...,0,0,0,,,0,8,2006,WD,Normal
4,924152030,1797,231000,MSSubClass_60,RL,74.0,12961,Pave,0,Reg,...,0,0,0,,,0,3,2010,WD,Normal


In [16]:
cols_na = test.loc[:,test.isna().any(axis=0)].columns.to_list()
cols_na

[]

In [17]:
X_tst = test.drop(['SalePrice','TotalBsmtSF'],axis=1)
X_tst = transformer.transform(X_tst)
y_tst = np.log(test['SalePrice'])
ols.score(X_tst, y_tst)

0.8044723701956606

In [18]:
# 0.9533403078902463

In [19]:
X = train[['GrLivArea']]
y = np.log(train['SalePrice'])
ols.fit(X, y)
ols.score(X, y)

0.5361242929283243

In [20]:
X_tst = test[['GrLivArea']]
y_tst = np.log(test['SalePrice'])
ols.score(X_tst, y_tst)

0.5052486795731859

In [21]:
num_cols = train.select_dtypes(['float64','int64']).columns.to_list()

In [22]:
num_cols

['PID',
 'GrLivArea',
 'SalePrice',
 'LotFrontage',
 'LotArea',
 'Alley',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'ExterQual',
 'ExterCond',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'HeatingQC',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Fireplaces',
 'FireplaceQu',
 'GarageYrBlt',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'PoolQC',
 'MiscVal',
 'MoSold',
 'YrSold']

In [23]:
num_cols.remove('SalePrice')
X = train.loc[:,num_cols]
y = np.log(train['SalePrice'])
ols.fit(X, y)
ols.score(X, y)

0.9290816310107534

In [24]:
X_t = test.loc[:,num_cols]
y_t = np.log(test['SalePrice'])
ols.score(X_t, y_t)

0.8848845580368826

In [25]:
X = train.loc[:,num_cols+['Neighborhood']]
transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), ['Neighborhood'])], remainder='passthrough')
X = transformer.fit_transform(X)
y = np.log(train['SalePrice'])
ols.fit(X, y)
ols.score(X, y)

0.9420283906232196

In [26]:
X = test.loc[:,num_cols+['Neighborhood']]
X = transformer.transform(X)
y = np.log(test['SalePrice'])
ols.score(X, y)

0.8995024460606353

In [27]:
def simple_linear_model_score(train, test, cols, target):
    ols = linear_model.LinearRegression()
    X = train[cols]
    cat_feats = X.select_dtypes(['object','bool']).columns.to_list()
    transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), cat_feats)], remainder='passthrough')
    X = transformer.fit_transform(X)
    y = np.log(train[target])
    
    ols.fit(X, y)
    train_score = ols.score(X, y)
    
    X_tst = test[cols]
    X_tst = transformer.transform(X_tst)
    y_tst = np.log(test[target])
    test_score = ols.score(X_tst, y_tst)
    
    return train_score, test_score

In [28]:
train_score, test_score = simple_linear_model_score(train,test,['GrLivArea'],'SalePrice')
print(train_score)
print(test_score)

0.5361242929283243
0.5052486795731859


In [29]:
fav_cat = ['YearBuilt', 'ExterQual', 'Neighborhood', 'KitchenQual']

In [30]:
simple_linear_model_score(train, test, num_cols + fav_cat, 'SalePrice')

(0.9420285807108666, 0.8994953295914224)

In [31]:
top_9 = ['OverallQual', 'GrLivArea', 'ExterQual', 'KitchenQual',
       'TotalBsmtSF', '1stFlrSF', 'GarageArea', 'GarageCars', 'BsmtQual']
simple_linear_model_score(train, test, top_9, 'SalePrice')

(0.8691553437921314, 0.8278855701677815)

In [32]:
k=100
cols = train.corr().nlargest(k, 'SalePrice')['SalePrice'].index
cols = cols[1:]
for num_features in range(1,len(cols)):
    print(cols[num_features])
    print(simple_linear_model_score(train, test, cols[1:1+num_features], 'SalePrice'))

GrLivArea
(0.5361242929283243, 0.5052486795731859)
ExterQual
(0.7034962809602388, 0.6644905975921633)
TotalBsmtSF
(0.7792598241555013, 0.7229657933399387)
KitchenQual
(0.7974247390402703, 0.7489550650148744)
1stFlrSF
(0.7975069805128663, 0.7497354661616471)
GarageArea
(0.8182544807598195, 0.779205489561503)
GarageCars
(0.8229613212881246, 0.7862240548493213)
BsmtQual
(0.8367316241545875, 0.8093688234853456)
YearBuilt
(0.8496799637760847, 0.8201252110298075)
FullBath
(0.8515427358211811, 0.8194532379214836)
GarageFinish
(0.8546721816101726, 0.8239315804336104)
FireplaceQu
(0.862791589110423, 0.8355690508933211)
MasVnrArea
(0.863014924192381, 0.8357178472479138)
TotRmsAbvGrd
(0.8634271653730479, 0.8341419606401707)
YearRemodAdd
(0.8695869148482882, 0.8386413512424243)
Fireplaces
(0.8735302259811796, 0.8403365999459429)
BsmtFinSF1
(0.8805143918287985, 0.8482424745087166)
HeatingQC
(0.8819825209355477, 0.8512038129217325)
BsmtExposure
(0.8837381955399547, 0.8524812209245907)
LotFrontage
(0

In [33]:
scaler = StandardScaler(with_mean=False)

In [34]:
X = train.drop(['SalePrice', 'TotalBsmtSF'],axis=1)

transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), cat_feats)], remainder='passthrough')
X = transformer.fit_transform(X)

X = scaler.fit_transform(X)

y = np.log(train['SalePrice'])

lasso = linear_model.Lasso(alpha=0.1)

lasso.fit(X, y)
lasso.score(X, y)

0.735520076397353

In [35]:
X = train.drop(['SalePrice', 'TotalBsmtSF'],axis=1)

transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), cat_feats)], remainder='passthrough')
X = transformer.fit_transform(X)

X = scaler.fit_transform(X)

y = np.log(train['SalePrice'])

lasso = linear_model.Lasso(alpha=0.1)

selector = SelectFromModel(estimator=lasso)

X = selector.fit_transform(X, y)

ols.fit(X,y)
ols.score(X,y)

0.8596901489956692

In [36]:
X = train.drop(['SalePrice', 'TotalBsmtSF'],axis=1)

transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), cat_feats)], remainder='passthrough')
X = transformer.fit_transform(X)

feat_names = transformer.get_feature_names()

X = scaler.fit_transform(X)

y = np.log(train['SalePrice'])

lasso = linear_model.Lasso(alpha=0.1)

selector = SelectFromModel(estimator=lasso)

X = selector.fit_transform(X, y)

mask = selector.get_support()

lasso_feats = [a for a, b in zip(feat_names, mask) if b]
simple_linear_model_score(train, test, lasso_feats, 'SalePrice')


(0.8596901489956691, 0.818522716203596)

we have feature selector. 

In [37]:
len(lasso_feats)

5

## work here lasso test alpha

at a stopping point of confusion. I don't know how to do a transformer on the test dataset well enough with this lasso.

In [38]:
X = train.drop(['SalePrice', 'TotalBsmtSF'],axis=1)

transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), cat_feats)], remainder='passthrough')
X = transformer.fit_transform(X)

feat_names = transformer.get_feature_names()

X = scaler.fit_transform(X)

y = np.log(train['SalePrice'])

lasso = linear_model.Lasso(alpha=0.05)

selector = SelectFromModel(estimator=lasso)

X = selector.fit_transform(X, y)

mask = selector.get_support()

lasso_feats=[a for a, b in zip(feat_names, mask) if b]

X_tst = test.drop(['SalePrice', 'TotalBsmtSF'],axis=1)
X_tst = transformer.transform(X_tst)
y_test = np.log(test['SalePrice'])
selector.fit_transform(X_tst, y_test)
 
    

<624x18 sparse matrix of type '<class 'numpy.float64'>'
	with 6806 stored elements in Compressed Sparse Row format>

In [45]:
lasso_feats

['Cat__x1_RM',
 'GrLivArea',
 'OverallQual',
 'YearRemodAdd',
 'ExterQual',
 'BsmtQual',
 'BsmtFinSF1',
 '1stFlrSF',
 'Fireplaces',
 'GarageFinish',
 'GarageArea']

In [44]:
# X_tst = test.drop(['SalePrice', 'TotalBsmtSF'],axis=1)
# X_tst = transformer.transform(X_tst)
# y_test = np.log(test['SalePrice'])
# X_tst = selector.fit_transform(X_tst, y_test)

ols.fit(X,y)
ols.score(X,y)
# ols.score(X_tst,y_test)
# lasso.fit(X, y)
# lasso.score(X, y)
# lasso.score(X_tst,y_test)


0.9628237038843959

In [40]:
X = train.drop(['SalePrice', 'TotalBsmtSF'],axis=1)

transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), cat_feats)], remainder='passthrough')
X = transformer.fit_transform(X)

X = scaler.fit_transform(X)

y = np.log(train['SalePrice'])

lasso = linear_model.Lasso(alpha=0.05)

selector = SelectFromModel(estimator=lasso)

X = selector.fit_transform(X, y)

ols.fit(X,y)
ols.score(X,y)

0.9036774202856266

In [41]:
X = train.drop(['SalePrice', 'TotalBsmtSF'],axis=1)

transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), cat_feats)], remainder='passthrough')
X = transformer.fit_transform(X)

X = scaler.fit_transform(X)

y = np.log(train['SalePrice'])

lasso = linear_model.Lasso(alpha=0.05)

lasso.fit(X,y)
lasso.score(X,y)

0.8552536493265849

In [42]:
cat_feats[20]

'BsmtFinType2'