In [52]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')

In [3]:
colors = ["#FF0B04", "#F1BE48",
           "#B9975B", "#8B5B29",
           "#524727",
         ]
sns.set_palette(sns.color_palette(colors))

In [4]:
train = pd.read_csv('train_clean.csv', index_col=0)

In [6]:
cols_na = train.loc[:,train.isna().any(axis=0)].columns.to_list()
cols_na

[]

In [10]:
cat_feats = train.select_dtypes(['object','bool']).columns.to_list()

In [47]:
cat_feats = cat_feats + ['YearBuilt','KitchenQual']

In [43]:
sorted(cat_feats)

['BldgType',
 'BsmtFinType1',
 'BsmtFinType2',
 'CentralAir',
 'Condition1',
 'Condition2',
 'Electrical',
 'Exterior1st',
 'Exterior2nd',
 'Fence',
 'Foundation',
 'Functional',
 'GarageType',
 'Heating',
 'HouseStyle',
 'LandContour',
 'LandSlope',
 'LotConfig',
 'LotShape',
 'MSSubClass',
 'MSZoning',
 'MasVnrType',
 'MiscFeature',
 'Neighborhood',
 'RoofMatl',
 'RoofStyle',
 'SaleCondition',
 'SaleType',
 'Street',
 'Utilities']

In [14]:
train['GrLivArea'].dtypes

dtype('int64')

In [44]:
sorted(train.select_dtypes(['float64','int64']).columns.to_list())

['1stFlrSF',
 '2ndFlrSF',
 '3SsnPorch',
 'Alley',
 'BedroomAbvGr',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtFullBath',
 'BsmtHalfBath',
 'BsmtQual',
 'BsmtUnfSF',
 'EnclosedPorch',
 'ExterCond',
 'ExterQual',
 'FireplaceQu',
 'Fireplaces',
 'FullBath',
 'GarageArea',
 'GarageCars',
 'GarageCond',
 'GarageFinish',
 'GarageQual',
 'GarageYrBlt',
 'GrLivArea',
 'HalfBath',
 'HeatingQC',
 'KitchenAbvGr',
 'KitchenQual',
 'LotArea',
 'LotFrontage',
 'LowQualFinSF',
 'MasVnrArea',
 'MiscVal',
 'MoSold',
 'OpenPorchSF',
 'OverallCond',
 'OverallQual',
 'PID',
 'PavedDrive',
 'PoolArea',
 'PoolQC',
 'SalePrice',
 'ScreenPorch',
 'TotRmsAbvGrd',
 'TotalBsmtSF',
 'WoodDeckSF',
 'YearBuilt',
 'YearRemodAdd',
 'YrSold']

In [18]:
ols = linear_model.LinearRegression()

In [20]:
# Note on why TotalBsmt... later
X = train.drop(['SalePrice', 'TotalBsmtSF'],axis=1)
transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), cat_feats)], remainder='passthrough')
X = transformer.fit_transform(X)
y = np.log(train['SalePrice'])
ols.fit(X, y)
ols.score(X, y)

0.9580849775963552

In [13]:
# 0.9580850723384223

In [17]:
# 0.9531732835572826

In [21]:
test = pd.read_csv('test_clean.csv', index_col=0)

In [23]:
test.head()

Unnamed: 0,PID,GrLivArea,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,903401020,1820,184000,MSSubClass_50,RL,60.0,9120,Pave,2,Reg,...,0,0,0,GdPrv,,0,6,2008,WD,Normal
1,909100080,1296,104000,MSSubClass_30,RL,67.0,4853,Pave,0,Reg,...,0,0,0,MnPrv,,0,5,2010,WD,Normal
2,531385020,1434,189000,MSSubClass_20,RL,65.0,8529,Pave,0,IR1,...,0,0,0,,,0,4,2009,WD,Normal
3,907130110,1499,187000,MSSubClass_60,RL,65.0,12438,Pave,0,IR1,...,0,0,0,,,0,8,2006,WD,Normal
4,924152030,1797,231000,MSSubClass_60,RL,74.0,12961,Pave,0,Reg,...,0,0,0,,,0,3,2010,WD,Normal


In [22]:
cols_na = test.loc[:,test.isna().any(axis=0)].columns.to_list()
cols_na

[]

In [25]:
X_tst = test.drop(['SalePrice','TotalBsmtSF'],axis=1)
X_tst = transformer.transform(X_tst)
y_tst = np.log(test['SalePrice'])
ols.score(X_tst, y_tst)

-21318036.003695462

In [26]:
# 0.9533403078902463

In [27]:
X = train[['GrLivArea']]
y = np.log(train['SalePrice'])
ols.fit(X, y)
ols.score(X, y)

0.5361242929283243

In [28]:
X_tst = test[['GrLivArea']]
y_tst = np.log(test['SalePrice'])
ols.score(X_tst, y_tst)

0.5052486795731859

In [29]:
num_cols = train.select_dtypes(['float64','int64']).columns.to_list()

In [30]:
X = train.loc[:,num_cols]
y = np.log(train['SalePrice'])
ols.fit(X, y)
ols.score(X, y)

0.9625822935301528

In [31]:
X = test.loc[:,num_cols]
y = np.log(test['SalePrice'])
ols.score(X, y)

0.9543194655331502

In [33]:
X = train.loc[:,num_cols+['Neighborhood']]
transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), ['Neighborhood'])], remainder='passthrough')
X = transformer.fit_transform(X)
y = np.log(train['SalePrice'])
ols.fit(X, y)
ols.score(X, y)

0.9686600434413519

In [34]:
X = test.loc[:,num_cols+['Neighborhood']]
X = transformer.transform(X)
y = np.log(test['SalePrice'])
ols.score(X, y)

0.9607616402528316

In [48]:
def simple_linear_model_score(train, test, cols, target):
    ols = linear_model.LinearRegression()
    X = train[cols]
    cat_feats = X.select_dtypes(['object','bool']).columns.to_list()
    transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), cat_feats)], remainder='passthrough')
    X = transformer.fit_transform(X)
    y = np.log(train[target])
    
    ols.fit(X, y)
    train_score = ols.score(X, y)
    
    X_tst = test[cols]
    X_tst = transformer.transform(X_tst)
    y_tst = np.log(test[target])
    test_score = ols.score(X_tst, y_tst)
    
    return train_score, test_score

In [36]:
train_score, test_score = simple_linear_model_score(train,test,['GrLivArea'],'SalePrice')
print(train_score)
print(test_score)

0.5361242929283243
0.5052486795731859


In [45]:
fav_cat = ['YearBuilt', 'ExterQual', 'Neighborhood', 'KitchenQual']

In [50]:
simple_linear_model_score(train, test, num_cols + fav_cat, 'SalePrice')

(0.9686600419133471, 0.9607610209654872)

In [56]:
scaler = StandardScaler(with_mean=False)

In [58]:
X = train.drop(['SalePrice', 'TotalBsmtSF'],axis=1)

transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), cat_feats)], remainder='passthrough')
X = transformer.fit_transform(X)

X = scaler.fit_transform(X)

y = np.log(train['SalePrice'])

lasso = linear_model.Lasso(alpha=0.1)

lasso.fit(X, y)
lasso.score(X, y)

0.7361493786950961

In [62]:
X = train.drop(['SalePrice', 'TotalBsmtSF'],axis=1)

transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), cat_feats)], remainder='passthrough')
X = transformer.fit_transform(X)

X = scaler.fit_transform(X)

y = np.log(train['SalePrice'])

lasso = linear_model.Lasso(alpha=0.1)

selector = SelectFromModel(estimator=lasso)

X = selector.fit_transform(X, y)

ols.fit(X,y)
ols.score(X,y)

0.8604843360147527

In [67]:
X = train.drop(['SalePrice', 'TotalBsmtSF'],axis=1)

transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), cat_feats)], remainder='passthrough')
X = transformer.fit_transform(X)

feat_names = transformer.get_feature_names()

X = scaler.fit_transform(X)

y = np.log(train['SalePrice'])

lasso = linear_model.Lasso(alpha=0.1)

selector = SelectFromModel(estimator=lasso)

X = selector.fit_transform(X, y)

mask = selector.get_support()

[a for a, b in zip(feat_names, mask) if b]


['GrLivArea',
 'OverallQual',
 'BsmtQual',
 '1stFlrSF',
 'GarageCars',
 'GarageArea']

In [69]:
X = train.drop(['SalePrice', 'TotalBsmtSF'],axis=1)

transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), cat_feats)], remainder='passthrough')
X = transformer.fit_transform(X)

feat_names = transformer.get_feature_names()

X = scaler.fit_transform(X)

y = np.log(train['SalePrice'])

lasso = linear_model.Lasso(alpha=0.001)

selector = SelectFromModel(estimator=lasso)

X = selector.fit_transform(X, y)

mask = selector.get_support()

[a for a, b in zip(feat_names, mask) if b]

['Cat__x0_1Fam',
 'Cat__x0_Twnhs',
 'Cat__x1_ALQ',
 'Cat__x1_GLQ',
 'Cat__x1_LwQ',
 'Cat__x1_Rec',
 'Cat__x1_Unf',
 'Cat__x2_BLQ',
 'Cat__x2_GLQ',
 'Cat__x2_Rec',
 'Cat__x3_N',
 'Cat__x4_Artery',
 'Cat__x4_Feedr',
 'Cat__x4_Norm',
 'Cat__x4_PosN',
 'Cat__x4_RRAe',
 'Cat__x4_RRNe',
 'Cat__x5_Artery',
 'Cat__x5_Feedr',
 'Cat__x5_PosA',
 'Cat__x5_PosN',
 'Cat__x7_BrkComm',
 'Cat__x7_BrkFace',
 'Cat__x7_CBlock',
 'Cat__x7_HdBoard',
 'Cat__x7_PreCast',
 'Cat__x7_Stucco',
 'Cat__x7_Wd Sdng',
 'Cat__x8_AsbShng',
 'Cat__x8_AsphShn',
 'Cat__x8_CBlock',
 'Cat__x8_CmentBd',
 'Cat__x8_ImStucc',
 'Cat__x8_Plywood',
 'Cat__x8_PreCast',
 'Cat__x8_VinylSd',
 'Cat__x8_Wd Shng',
 'Cat__x9_GdPrv',
 'Cat__x9_GdWo',
 'Cat__x9_MnWw',
 'Cat__x10_BrkTil',
 'Cat__x10_PConc',
 'Cat__x11_Maj1',
 'Cat__x11_Maj2',
 'Cat__x11_Mod',
 'Cat__x11_Typ',
 'Cat__x12_2Types',
 'Cat__x12_Attchd',
 'Cat__x12_Basment',
 'Cat__x12_CarPort',
 'Cat__x13_Floor',
 'Cat__x13_GasW',
 'Cat__x13_Grav',
 'Cat__x13_Wall',
 'Cat__x14_1.5

In [None]:
selector = SelectFromModel(estimator=lasso)
X = selector.fit_transform(X, y)

In [None]:
transformer = ColumnTransformer([("Cat", OneHotEncoder(drop='first'), cats)], remainder='passthrough')
X = transformer.fit_transform(X)
X = scaler.fit_transform(X)
y = train['return_flag']

clf.set_params(penalty='l1', C=0.01, random_state=42, solver='saga', max_iter=10000)
selector = SelectFromModel(estimator=clf)
X = selector.fit_transform(X, y)

clf = LogisticRegression()
clf.fit(X,y)
clf.score(X,y)