# **MLR log-linear 15/ model:**

log-linear model with 15 numeric feautures reduced by SequentialFeatureSelector with **tol=None**, combined with the top categorical features scored against them.  

In [2]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector

# Import raw data, if needed for comparison.
ames_raw = pd.read_csv('Ames_Housing_Price_Data.csv', index_col=0).reset_index()
ames_raw.drop(columns=['index'], inplace=True)
ames_raw.head(2)

# Import cleaned data for modeling
ames = pd.read_pickle('ames_clean.pkl')

In [3]:
# define numerical features list as determined in Ames_MLR_LogLinear_sfs.ipynb
# with sfs features selected by sfs with tol=None
sfs_numerical_features_list = ['GrLivArea', 'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd',
       'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', 'LowQualFinSF',
       'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces', 'GarageCars',
       'GarageArea', 'ScreenPorch']

# define numerical features
sfs_numerical_features = ames[sfs_numerical_features_list]

# print number of numerical features and head of df
print(f'number of sfs-reduced numerical features (without target): {len(sfs_numerical_features_list)}')
sfs_numerical_features.head(2)

number of sfs-reduced numerical features (without target): 15


Unnamed: 0,GrLivArea,LotFrontage,LotArea,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,LowQualFinSF,BedroomAbvGr,KitchenAbvGr,Fireplaces,GarageCars,GarageArea,ScreenPorch
0,856,68.0,7890,1939,1950,238.0,618.0,856.0,0,2,1,1,2.0,399.0,166
1,1049,42.0,4235,1984,1984,552.0,104.0,1049.0,0,2,1,0,1.0,266.0,0


In [4]:
# define categorical features
categorical_features = ames.select_dtypes(include=['object'])

# define categorical features list
categorical_features_list = categorical_features.columns

# print number of categorical features and head of df
print(f'number of categorical features: {len(categorical_features_list)}')
categorical_features.head(2)

number of categorical features: 48


Unnamed: 0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,MoSold,YrSold,SaleType,SaleCondition
0,30,RL,Pave,,Reg,Lvl,AllPub,Corner,Gtl,SWISU,...,TA,TA,Y,,,,3,2010,WD,Normal
1,120,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Edwards,...,TA,TA,Y,,,,2,2009,WD,Normal


### Score categorical features individually

Score the sfs-reduced numeric feature set with each categorical feature, individually, to see which ones are the most predictive

In [6]:
# define features for modeling
X = sfs_numerical_features
y = ames['SalePrice']
X.head(2)

Unnamed: 0,GrLivArea,LotFrontage,LotArea,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,LowQualFinSF,BedroomAbvGr,KitchenAbvGr,Fireplaces,GarageCars,GarageArea,ScreenPorch
0,856,68.0,7890,1939,1950,238.0,618.0,856.0,0,2,1,1,2.0,399.0,166
1,1049,42.0,4235,1984,1984,552.0,104.0,1049.0,0,2,1,0,1.0,266.0,0


In [7]:
# Baseline score for sfs-reduced numeric features, no categorical features
baseline_score = cross_val_score(LinearRegression(), X, np.log10(y), cv=5).mean()
baseline_score

0.8475673183327672

In [8]:
categorical_improvements = {}
for cat_feature in categorical_features_list:
    # Test numeric feature set + this categorical
    test_features = sfs_numerical_features_list + [cat_feature]
    
    # preprocess this combination
    preprocessor = ColumnTransformer([
        ('num', 'passthrough', sfs_numerical_features_list),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), [cat_feature])
    ])
    
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])
    
    score = cross_val_score(model, ames[test_features], y, cv=5).mean()
    improvement = score - baseline_score
    
    categorical_improvements[cat_feature] = improvement
    print(f"{cat_feature}: {score:.4f} (+{improvement:.4f})")

MSSubClass: 0.8265 (+-0.0211)
MSZoning: 0.8224 (+-0.0252)
Street: 0.8226 (+-0.0250)
Alley: 0.8226 (+-0.0249)
LotShape: 0.8236 (+-0.0239)
LandContour: 0.8265 (+-0.0210)
Utilities: 0.8227 (+-0.0249)
LotConfig: 0.8232 (+-0.0244)




LandSlope: 0.8226 (+-0.0250)
Neighborhood: 0.8545 (+0.0069)
Condition1: 0.8230 (+-0.0245)
Condition2: 0.8175 (+-0.0300)
BldgType: 0.8243 (+-0.0232)
HouseStyle: 0.8213 (+-0.0263)
OverallQual: 0.8850 (+0.0374)
OverallCond: 0.8299 (+-0.0177)
RoofStyle: 0.8241 (+-0.0235)
RoofMatl: 0.8249 (+-0.0227)
Exterior1st: 0.8267 (+-0.0209)
Exterior2nd: 0.8251 (+-0.0224)




MasVnrType: 0.8238 (+-0.0238)
ExterQual: 0.8532 (+0.0056)
ExterCond: 0.8226 (+-0.0250)
Foundation: 0.8275 (+-0.0201)
BsmtQual: 0.8487 (+0.0011)
BsmtCond: 0.8245 (+-0.0231)
BsmtExposure: 0.8310 (+-0.0166)
BsmtFinType1: 0.8282 (+-0.0193)
BsmtFinType2: 0.8258 (+-0.0218)
Heating: 0.8237 (+-0.0238)
HeatingQC: 0.8264 (+-0.0212)




CentralAir: 0.8229 (+-0.0247)
Electrical: 0.8229 (+-0.0246)
KitchenQual: 0.8509 (+0.0033)
Functional: 0.8305 (+-0.0171)
FireplaceQu: 0.8286 (+-0.0189)
GarageType: 0.8259 (+-0.0217)
GarageFinish: 0.8261 (+-0.0214)
GarageQual: 0.8244 (+-0.0232)
GarageCond: 0.8243 (+-0.0232)
PavedDrive: 0.8223 (+-0.0252)
PoolQC: 0.8200 (+-0.0275)




Fence: 0.8226 (+-0.0250)
MiscFeature: 0.8228 (+-0.0248)
MoSold: 0.8225 (+-0.0251)
YrSold: 0.8225 (+-0.0251)
SaleType: 0.8235 (+-0.0241)
SaleCondition: 0.8253 (+-0.0223)




In [9]:
# sort improvements
sorted_cats = sorted(categorical_improvements.items(), key=lambda x: x[1], reverse=True)
print(f"\nBest categorical features to add to sfs-reduced numeric features:")
for cat, improvement in sorted_cats[:15]:
    print(f"{cat}: +{improvement:.4f}")


Best categorical features to add to sfs-reduced numeric features:
OverallQual: +0.0374
Neighborhood: +0.0069
ExterQual: +0.0056
KitchenQual: +0.0033
BsmtQual: +0.0011
BsmtExposure: +-0.0166
Functional: +-0.0171
OverallCond: +-0.0177
FireplaceQu: +-0.0189
BsmtFinType1: +-0.0193
Foundation: +-0.0201
Exterior1st: +-0.0209
LandContour: +-0.0210
MSSubClass: +-0.0211
HeatingQC: +-0.0212


In [10]:
# define list of most predictive categorical features
best_cat_list = [
    'OverallQual',
    'Neighborhood',
    'ExterQual',
    'KitchenQual',
    'BsmtQual',
    'BsmtExposure',
    'Functional',
    'OverallCond',
    'FireplaceQu',
    'BsmtFinType1',
    'Foundation',
    'Exterior1st',
    'LandContour',
    'MSSubClass',
    'HeatingQC'
]

In [11]:
best_cat_df = ames[best_cat_list]
best_cat_df.head(2)

Unnamed: 0,OverallQual,Neighborhood,ExterQual,KitchenQual,BsmtQual,BsmtExposure,Functional,OverallCond,FireplaceQu,BsmtFinType1,Foundation,Exterior1st,LandContour,MSSubClass,HeatingQC
0,6,SWISU,TA,TA,TA,No,Typ,6,Gd,Rec,CBlock,Wd Sdng,Lvl,30,TA
1,5,Edwards,Gd,Gd,Gd,Mn,Typ,5,,GLQ,CBlock,HdBoard,Lvl,120,TA


In [12]:
# create dataframe with sfs-reduced numerical features and these best categorical features
X_reduced = sfs_numerical_features.join(best_cat_df)
X_reduced.head(2)

Unnamed: 0,GrLivArea,LotFrontage,LotArea,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,LowQualFinSF,BedroomAbvGr,...,BsmtExposure,Functional,OverallCond,FireplaceQu,BsmtFinType1,Foundation,Exterior1st,LandContour,MSSubClass,HeatingQC
0,856,68.0,7890,1939,1950,238.0,618.0,856.0,0,2,...,No,Typ,6,Gd,Rec,CBlock,Wd Sdng,Lvl,30,TA
1,1049,42.0,4235,1984,1984,552.0,104.0,1049.0,0,2,...,Mn,Typ,5,,GLQ,CBlock,HdBoard,Lvl,120,TA


In [13]:
# define target
y = ames['SalePrice']

In [14]:
# define preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', 'passthrough', sfs_numerical_features_list),
        ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), best_cat_list)
])

In [15]:
# fit the model
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])

In [16]:
# create storage
results = {}

# fit and evaluate the model on all data and score
model.fit(X_reduced, np.log10(y))
cv_scores = cross_val_score(model, X_reduced, np.log10(y))
mean_cv_score = float(round(cv_scores.mean(), 4))

# evaluate on train-test for overfitting analysis
X_train, X_test, y_train, y_test = train_test_split(X_reduced, np.log10(y), test_size=0.2, random_state = 0)

model.fit(X_train, y_train)
train_score = (cross_val_score(model, X_train, y_train)).mean()
test_score = (cross_val_score(model, X_test, y_test)).mean()
diff = train_score - test_score

results = {
    'mean-cv on X and y': round(mean_cv_score, 4),
    'train score': round(train_score, 4),
    'test score': round(test_score, 4),
    'train-test gap': round(diff, 4),
    'scenario': f'log-linear 15/{len(best_cat_df.columns)} model:',
    'algorithm': 'MLR'
}

print(f'cv scores', cv_scores)



cv scores [0.87797229 0.91547341 0.92708971 0.93113338 0.92641925]




In [17]:
results

{'mean-cv on X and y': 0.9156,
 'train score': 0.9133,
 'test score': 0.8069,
 'train-test gap': 0.1064,
 'scenario': 'log-linear 15/15 model:',
 'algorithm': 'MLR'}

mean cv scores for sfs selected features with tol=None

- 15/7  : 0.901
- 15/10 : 0.9129
- 15/12 : 0.9153
- 15/15 : 0.9156
- 15/20 : 0.9153
- 15/25 : 0.9148

In [20]:
results_df = pd.DataFrame([results])

In [21]:
# double check this, if changed ratio of numerical to categorical features
results_df.to_csv('log_linear_15_15.csv')

# Resources
[Return To Top](#Contents)

**Dean De Cock paper and original data:**

- [Ames, Iowa: Alternative to the Boston Housing Data as an
End of Semester Regression Project](https://jse.amstat.org/v19n3/decock.pdf)

- [DataDocumentation.txt](https://jse.amstat.org/v19n3/decock/DataDocumentation.txt)

- [Ames Data Dictionary on Github](https://github.com/Padre-Media/dataset/blob/main/Ames%20Data%20Dictionary.txt)