# **MLR log-linear sfs model:**

log-linear model with numeric feautures reduced by SequentialFeatureSelector  

In [2]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector

# Import raw data, if needed for comparison.
ames_raw = pd.read_csv('Ames_Housing_Price_Data.csv', index_col=0).reset_index()
ames_raw.drop(columns=['index'], inplace=True)
ames_raw.head(2)

# Import cleaned data for modeling
ames = pd.read_pickle('ames_clean.pkl')

In [3]:
# define numerical features
numerical_features = ames.select_dtypes(include=['float64', 'int64'])

# drop target from features
numerical_features.drop(columns=['SalePrice'], axis=1, inplace=True)

# define numerical features list
numerical_features_list = numerical_features.columns

# print number of numerical features and head of df
print(f'number of numerical features (without target): {len(numerical_features_list)}')
numerical_features.head(2)

number of numerical features (without target): 31


Unnamed: 0,GrLivArea,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal
0,856,68.0,7890,1939,1950,0.0,238.0,0.0,618.0,856.0,...,1939,2.0,399.0,0,0,0,0,166,0,0
1,1049,42.0,4235,1984,1984,149.0,552.0,393.0,104.0,1049.0,...,1984,1.0,266.0,0,105,0,0,0,0,0


In [4]:
# define categorical features
categorical_features = ames.select_dtypes(include=['object'])

# define categorical features list
categorical_features_list = categorical_features.columns

# print number of categorical features and head of df
print(f'number of categorical features: {len(categorical_features_list)}')
categorical_features.head(2)

number of categorical features: 48


Unnamed: 0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,MoSold,YrSold,SaleType,SaleCondition
0,30,RL,Pave,,Reg,Lvl,AllPub,Corner,Gtl,SWISU,...,TA,TA,Y,,,,3,2010,WD,Normal
1,120,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Edwards,...,TA,TA,Y,,,,2,2009,WD,Normal


In [5]:
# define features for modeling
X = numerical_features.join(categorical_features)
y = ames['SalePrice']
X.head(2)

Unnamed: 0,GrLivArea,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,MoSold,YrSold,SaleType,SaleCondition
0,856,68.0,7890,1939,1950,0.0,238.0,0.0,618.0,856.0,...,TA,TA,Y,,,,3,2010,WD,Normal
1,1049,42.0,4235,1984,1984,149.0,552.0,393.0,104.0,1049.0,...,TA,TA,Y,,,,2,2009,WD,Normal


### set tolerance
Set the tolerance of the sfs to stop feature selection once the model has ceased to improve significantly.

Double check this: *(Running the default tolerance = None returned 15 features)*

In [7]:
# define transformer to apply sfs to numeric features
num_transformer = Pipeline(steps=[
    ('sfs', SequentialFeatureSelector(
        estimator = LinearRegression(),
        n_features_to_select = 'auto',
        direction = 'forward',
        cv = 5,
        scoring = 'r2',
        tol = 0.005
    ))
])

In [8]:
# define transformer for categorical features
cat_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
])

In [9]:
# define combined preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', num_transformer, numerical_features_list),
        ('onehot', cat_transformer, categorical_features_list)
])

In [10]:
# fit the model
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])

In [11]:
# create storage
results = {}

# fit and evaluate the model on all data and score
model.fit(X, np.log(y))
cv_scores = cross_val_score(model, X, np.log(y))
mean_cv_score = float(round(cv_scores.mean(), 4))

# evaluate on train-test for overfitting analysis
X_train, X_test, y_train, y_test = train_test_split(X, np.log(y), test_size=0.2, random_state = 0)

model.fit(X_train, y_train)
train_score = (cross_val_score(model, X_train, y_train)).mean()
test_score = (cross_val_score(model, X_test, y_test)).mean()
diff = train_score - test_score

results = {
    'mean-cv on X and y': round(mean_cv_score, 4),
    'train score': round(train_score, 4),
    'test score': round(test_score, 4),
    'train-test gap': round(diff, 4),
    'scenario': 'log-linear sfs-numeric and full categorical model',
    'algorithm': 'MLR'
}

print(f'cv scores', cv_scores)



cv scores [0.86838893 0.91768971 0.9090295  0.92537296 0.91082164]




In [12]:
results

{'mean-cv on X and y': 0.9063,
 'train score': 0.8974,
 'test score': 0.7656,
 'train-test gap': 0.1319,
 'scenario': 'log-linear sfs-numeric and full categorical model',
 'algorithm': 'MLR'}

In [13]:
results_df = pd.DataFrame([results])

In [14]:
# get the features selected from sfs
sfs = model.named_steps['preprocessor'].named_transformers_['numerical'].named_steps['sfs']
sfs

In [15]:
sfs.get_feature_names_out()

array(['GrLivArea', 'YearBuilt', 'YearRemodAdd', 'BsmtUnfSF',
       'TotalBsmtSF', 'KitchenAbvGr', 'Fireplaces', 'GarageArea'],
      dtype=object)

In [16]:
# define log-linear sfs reduced features: for default tolerance
sfs_features = ['GrLivArea', 'YearBuilt', 'YearRemodAdd', 'BsmtUnfSF',
       'TotalBsmtSF', 'KitchenAbvGr', 'Fireplaces', 'GarageArea']
print(f'number of numeric features kept by sfs: {len(sfs_features)}')
sfs_features

number of numeric features kept by sfs: 8


['GrLivArea',
 'YearBuilt',
 'YearRemodAdd',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'KitchenAbvGr',
 'Fireplaces',
 'GarageArea']

In [17]:
results_df

Unnamed: 0,mean-cv on X and y,train score,test score,train-test gap,scenario,algorithm
0,0.9063,0.8974,0.7656,0.1319,log-linear sfs-numeric and full categorical model,MLR


In [18]:
results_df.to_csv('log_linear_sfs_full.csv')

# Resources
[Return To Top](#Contents)

**Dean De Cock paper and original data:**

- [Ames, Iowa: Alternative to the Boston Housing Data as an
End of Semester Regression Project](https://jse.amstat.org/v19n3/decock.pdf)

- [DataDocumentation.txt](https://jse.amstat.org/v19n3/decock/DataDocumentation.txt)

- [Ames Data Dictionary on Github](https://github.com/Padre-Media/dataset/blob/main/Ames%20Data%20Dictionary.txt)