# **MLR log-linear 8/cat model:**

log-linear model with 8 numeric feautures reduced by SequentialFeatureSelector with **tol=0.005**, combined with the top categorical features scored against them.  

In [2]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector

# Import raw data, if needed for comparison.
ames_raw = pd.read_csv('Ames_Housing_Price_Data.csv', index_col=0).reset_index()
ames_raw.drop(columns=['index'], inplace=True)
ames_raw.head(2)

# Import cleaned data for modeling
ames = pd.read_pickle('ames_clean.pkl')

In [3]:
# define numerical features list as determined in Ames_MLR_LogLinear_sfs.ipynb
# with default sfs tol=None
# sfs_numerical_features_list = ['GrLivArea', 'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd',
#        'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', 'LowQualFinSF',
#        'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces', 'GarageCars',
#        'GarageArea', 'ScreenPorch']

# with sfs tol=0.005
sfs_numerical_features_list = ['GrLivArea', 'YearBuilt', 'YearRemodAdd', 'BsmtUnfSF',
       'TotalBsmtSF', 'KitchenAbvGr', 'Fireplaces', 'GarageArea']

# define numerical features
sfs_numerical_features = ames[sfs_numerical_features_list]

# print number of numerical features and head of df
print(f'number of sfs-reduced numerical features (without target): {len(sfs_numerical_features_list)}')
sfs_numerical_features.head(2)

number of sfs-reduced numerical features (without target): 8


Unnamed: 0,GrLivArea,YearBuilt,YearRemodAdd,BsmtUnfSF,TotalBsmtSF,KitchenAbvGr,Fireplaces,GarageArea
0,856,1939,1950,618.0,856.0,1,1,399.0
1,1049,1984,1984,104.0,1049.0,1,0,266.0


In [4]:
# define categorical features
categorical_features = ames.select_dtypes(include=['object'])

# define categorical features list
categorical_features_list = categorical_features.columns

# print number of categorical features and head of df
print(f'number of categorical features: {len(categorical_features_list)}')
categorical_features.head(2)

number of categorical features: 48


Unnamed: 0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,MoSold,YrSold,SaleType,SaleCondition
0,30,RL,Pave,,Reg,Lvl,AllPub,Corner,Gtl,SWISU,...,TA,TA,Y,,,,3,2010,WD,Normal
1,120,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Edwards,...,TA,TA,Y,,,,2,2009,WD,Normal


### Score categorical features individually

Score the sfs-reduced numeric feature set with each categorical feature, individually, to see which ones are the most predictive

In [6]:
# define features for modeling
X = sfs_numerical_features
y = ames['SalePrice']
X.head(2)

Unnamed: 0,GrLivArea,YearBuilt,YearRemodAdd,BsmtUnfSF,TotalBsmtSF,KitchenAbvGr,Fireplaces,GarageArea
0,856,1939,1950,618.0,856.0,1,1,399.0
1,1049,1984,1984,104.0,1049.0,1,0,266.0


In [7]:
# Baseline score for sfs-reduced numeric features, no categorical features
baseline_score = cross_val_score(LinearRegression(), X, y, cv=5).mean()
baseline_score

# with tol=None: 0.8227024972249994

0.808555129023512

In [8]:
categorical_improvements = {}
for cat_feature in categorical_features_list:
    # Test numeric feature set + this categorical
    test_features = sfs_numerical_features_list + [cat_feature]
    
    # preprocess this combination
    preprocessor = ColumnTransformer([
        ('num', 'passthrough', sfs_numerical_features_list),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), [cat_feature])
    ])
    
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])
    
    score = cross_val_score(model, ames[test_features], y, cv=5).mean()
    improvement = score - baseline_score
    
    categorical_improvements[cat_feature] = improvement
    print(f"{cat_feature}: {score:.4f} (+{improvement:.4f})")

MSSubClass: 0.8159 (+0.0073)
MSZoning: 0.8082 (+-0.0004)
Street: 0.8084 (+-0.0002)
Alley: 0.8085 (+-0.0001)
LotShape: 0.8093 (+0.0007)
LandContour: 0.8144 (+0.0058)
Utilities: 0.8086 (+0.0000)
LotConfig: 0.8091 (+0.0006)




LandSlope: 0.8088 (+0.0002)
Neighborhood: 0.8471 (+0.0386)
Condition1: 0.8095 (+0.0009)
Condition2: 0.8046 (+-0.0039)
BldgType: 0.8105 (+0.0019)
HouseStyle: 0.8079 (+-0.0006)
OverallQual: 0.8780 (+0.0694)
OverallCond: 0.8146 (+0.0061)
RoofStyle: 0.8096 (+0.0011)
RoofMatl: 0.8117 (+0.0031)
Exterior1st: 0.8141 (+0.0056)




Exterior2nd: 0.8127 (+0.0041)
MasVnrType: 0.8098 (+0.0012)
ExterQual: 0.8442 (+0.0356)
ExterCond: 0.8079 (+-0.0007)
Foundation: 0.8158 (+0.0072)
BsmtQual: 0.8415 (+0.0329)
BsmtCond: 0.8128 (+0.0042)
BsmtExposure: 0.8223 (+0.0137)
BsmtFinType1: 0.8194 (+0.0109)
BsmtFinType2: 0.8155 (+0.0070)
Heating: 0.8099 (+0.0013)
HeatingQC: 0.8131 (+0.0045)
CentralAir: 0.8093 (+0.0008)
Electrical: 0.8092 (+0.0007)
KitchenQual: 0.8417 (+0.0331)




Functional: 0.8157 (+0.0071)
FireplaceQu: 0.8163 (+0.0077)
GarageType: 0.8108 (+0.0022)
GarageFinish: 0.8127 (+0.0041)
GarageQual: 0.8105 (+0.0019)
GarageCond: 0.8100 (+0.0014)
PavedDrive: 0.8089 (+0.0003)
PoolQC: 0.8073 (+-0.0013)
Fence: 0.8088 (+0.0002)
MiscFeature: 0.8084 (+-0.0002)
MoSold: 0.8082 (+-0.0004)
YrSold: 0.8084 (+-0.0001)
SaleType: 0.8087 (+0.0001)
SaleCondition: 0.8125 (+0.0039)




In [9]:
# sort improvements
sorted_cats = sorted(categorical_improvements.items(), key=lambda x: x[1], reverse=True)
print(f"\nBest categorical features to add to sfs-reduced numeric features:")
for cat, improvement in sorted_cats[:25]:
    print(f"{cat}: +{improvement:.4f}")


Best categorical features to add to sfs-reduced numeric features:
OverallQual: +0.0694
Neighborhood: +0.0386
ExterQual: +0.0356
KitchenQual: +0.0331
BsmtQual: +0.0329
BsmtExposure: +0.0137
BsmtFinType1: +0.0109
FireplaceQu: +0.0077
MSSubClass: +0.0073
Foundation: +0.0072
Functional: +0.0071
BsmtFinType2: +0.0070
OverallCond: +0.0061
LandContour: +0.0058
Exterior1st: +0.0056
HeatingQC: +0.0045
BsmtCond: +0.0042
GarageFinish: +0.0041
Exterior2nd: +0.0041
SaleCondition: +0.0039
RoofMatl: +0.0031
GarageType: +0.0022
GarageQual: +0.0019
BldgType: +0.0019
GarageCond: +0.0014


In [10]:
# define list of most predictive categorical features
best_cat_list = [
    'OverallQual',
    'Neighborhood',
    'ExterQual',
    'KitchenQual',
    'BsmtQual',
    'BsmtExposure',
    'BsmtFinType1',
    'FireplaceQu',
    'MSSubClass',
    'Foundation',
    'Functional',
    'BsmtFinType2',
    'OverallCond',
    'LandContour',
    'Exterior1st',
    'HeatingQC',
    'BsmtCond',
    # 'GarageFinish',
    # 'Exterior2nd',
     'SaleCondition',
    # 'RoofMatl',
    # 'GarageType',
    # 'GarageQual',
    # 'BldgType',
    # 'GarageCond'
]

In [11]:
best_cat_df = ames[best_cat_list]
best_cat_df.head(2)

Unnamed: 0,OverallQual,Neighborhood,ExterQual,KitchenQual,BsmtQual,BsmtExposure,BsmtFinType1,FireplaceQu,MSSubClass,Foundation,Functional,BsmtFinType2,OverallCond,LandContour,Exterior1st,HeatingQC,BsmtCond,SaleCondition
0,6,SWISU,TA,TA,TA,No,Rec,Gd,30,CBlock,Typ,Unf,6,Lvl,Wd Sdng,TA,TA,Normal
1,5,Edwards,Gd,Gd,Gd,Mn,GLQ,,120,CBlock,Typ,ALQ,5,Lvl,HdBoard,TA,TA,Normal


In [12]:
# create dataframe with sfs-reduced numerical features and these best categorical features
X_reduced = sfs_numerical_features.join(best_cat_df)
X_reduced.head(2)

Unnamed: 0,GrLivArea,YearBuilt,YearRemodAdd,BsmtUnfSF,TotalBsmtSF,KitchenAbvGr,Fireplaces,GarageArea,OverallQual,Neighborhood,...,MSSubClass,Foundation,Functional,BsmtFinType2,OverallCond,LandContour,Exterior1st,HeatingQC,BsmtCond,SaleCondition
0,856,1939,1950,618.0,856.0,1,1,399.0,6,SWISU,...,30,CBlock,Typ,Unf,6,Lvl,Wd Sdng,TA,TA,Normal
1,1049,1984,1984,104.0,1049.0,1,0,266.0,5,Edwards,...,120,CBlock,Typ,ALQ,5,Lvl,HdBoard,TA,TA,Normal


In [13]:
# define target
y = ames['SalePrice']

In [14]:
# define preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', 'passthrough', sfs_numerical_features_list),
        ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), best_cat_list)
])

In [15]:
# fit the model
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])

In [16]:
# create storage
results = {}

# fit and evaluate the model on all data and score
model.fit(X_reduced, np.log10(y))
cv_scores = cross_val_score(model, X_reduced, np.log10(y))
mean_cv_score = float(round(cv_scores.mean(), 4))

# evaluate on train-test for overfitting analysis
X_train, X_test, y_train, y_test = train_test_split(X_reduced, np.log10(y), test_size=0.2, random_state = 0)

model.fit(X_train, y_train)
train_score = (cross_val_score(model, X_train, y_train)).mean()
test_score = (cross_val_score(model, X_test, y_test)).mean()
diff = train_score - test_score

results = {
    'mean-cv on X and y': round(mean_cv_score, 4),
    'train score': round(train_score, 4),
    'test score': round(test_score, 4),
    'train-test gap': round(diff, 4),
    'scenario': f'log-linear 8/{len(best_cat_df.columns)} model:',
    'algorithm': 'MLR'
}

print(f'cv scores', cv_scores)



cv scores [0.88020979 0.91697339 0.9250048  0.926936   0.92584238]




In [17]:
results

{'mean-cv on X and y': 0.915,
 'train score': 0.9126,
 'test score': 0.8293,
 'train-test gap': 0.0834,
 'scenario': 'log-linear 8/18 model:',
 'algorithm': 'MLR'}

**Notes:** mean cv scores for sfs selected features with tol=0.005
- 8/10
- 8/15: 0.9123
- 8/17: 0.913
- 8/18: 0.913 (0.915 with SaleCondition instead of BsmtCond)
- 8/19: 0.9113
- 8/20: 0.9134
- 8/21: 0.913
- 8/22: 0.9128
- 8/25: 0.913

mean cv scores for sfs selected features with tol=None
- 15/7  : 0.901
- 15/10 : 0.9129
- 15/12 : 0.9153
- 15/15 : 0.9156
- 15/20 : 0.9153
- 15/25 : 0.9148

In [19]:
results_df = pd.DataFrame([results])

In [20]:
# double check this, if changed ratio of numerical to categorical features
results_df.to_csv('log_linear_8_18.csv')

# Resources
[Return To Top](#Contents)

**Dean De Cock paper and original data:**

- [Ames, Iowa: Alternative to the Boston Housing Data as an
End of Semester Regression Project](https://jse.amstat.org/v19n3/decock.pdf)

- [DataDocumentation.txt](https://jse.amstat.org/v19n3/decock/DataDocumentation.txt)

- [Ames Data Dictionary on Github](https://github.com/Padre-Media/dataset/blob/main/Ames%20Data%20Dictionary.txt)