# **MLR linear-linear 15/cat model:**

linear-linear model with 15 numeric feautures reduced by SequentialFeatureSelector with **tol=None**, combined with the top categorical features scored against them.  

In [2]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector

# Import raw data, if needed for comparison.
ames_raw = pd.read_csv('Ames_Housing_Price_Data.csv', index_col=0).reset_index()
ames_raw.drop(columns=['index'], inplace=True)
ames_raw.head(2)

# Import cleaned data for modeling
ames = pd.read_pickle('ames_clean.pkl')

In [3]:
# define numerical features list as determined in Ames_MLR_LinearLinear_sfs.ipynb
# features selected by sfs with tol=None
sfs_numerical_features_list = ['GrLivArea', 'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd',
       'MasVnrArea', 'BsmtFinSF1', 'TotalBsmtSF', 'BedroomAbvGr',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageArea',
       'WoodDeckSF', 'ScreenPorch']

# define numerical features
sfs_numerical_features = ames[sfs_numerical_features_list]

# print number of numerical features and head of df
print(f'number of sfs-reduced numerical features (without target): {len(sfs_numerical_features_list)}')
sfs_numerical_features.head(2)

number of sfs-reduced numerical features (without target): 15


Unnamed: 0,GrLivArea,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,TotalBsmtSF,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageArea,WoodDeckSF,ScreenPorch
0,856,68.0,7890,1939,1950,0.0,238.0,856.0,2,1,4,1,399.0,0,166
1,1049,42.0,4235,1984,1984,149.0,552.0,1049.0,2,1,5,0,266.0,0,0


In [4]:
# define categorical features
categorical_features = ames.select_dtypes(include=['object'])

# define categorical features list
categorical_features_list = categorical_features.columns

# print number of categorical features and head of df
print(f'number of categorical features: {len(categorical_features_list)}')
categorical_features.head(2)

number of categorical features: 48


Unnamed: 0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,MoSold,YrSold,SaleType,SaleCondition
0,30,RL,Pave,,Reg,Lvl,AllPub,Corner,Gtl,SWISU,...,TA,TA,Y,,,,3,2010,WD,Normal
1,120,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Edwards,...,TA,TA,Y,,,,2,2009,WD,Normal


### Score categorical features individually

Score the sfs-reduced numeric feature set with each categorical feature, individually, to see which ones are the most predictive

In [6]:
# define features for modeling
X = sfs_numerical_features
y = ames['SalePrice']
X.head(2)

Unnamed: 0,GrLivArea,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,TotalBsmtSF,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageArea,WoodDeckSF,ScreenPorch
0,856,68.0,7890,1939,1950,0.0,238.0,856.0,2,1,4,1,399.0,0,166
1,1049,42.0,4235,1984,1984,149.0,552.0,1049.0,2,1,5,0,266.0,0,0


In [7]:
# Baseline score for sfs-reduced numeric features, no categorical features
baseline_score = cross_val_score(LinearRegression(), X, y, cv=5).mean()
baseline_score

0.829458253090675

In [8]:
categorical_improvements = {}
for cat_feature in categorical_features_list:
    # Test numeric feature set + this categorical
    test_features = sfs_numerical_features_list + [cat_feature]
    
    # preprocess this combination
    preprocessor = ColumnTransformer([
        ('num', 'passthrough', sfs_numerical_features_list),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), [cat_feature])
    ])
    
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])
    
    score = cross_val_score(model, ames[test_features], y, cv=5).mean()
    improvement = score - baseline_score
    
    categorical_improvements[cat_feature] = improvement
    print(f"{cat_feature}: {score:.4f} (+{improvement:.4f})")

MSSubClass: 0.8349 (+0.0054)
MSZoning: 0.8293 (+-0.0001)




Street: 0.8294 (+-0.0001)
Alley: 0.8294 (+-0.0001)
LotShape: 0.8303 (+0.0008)
LandContour: 0.8333 (+0.0039)
Utilities: 0.8295 (+-0.0000)
LotConfig: 0.8301 (+0.0007)
LandSlope: 0.8293 (+-0.0001)
Neighborhood: 0.8573 (+0.0279)
Condition1: 0.8296 (+0.0002)




Condition2: 0.8247 (+-0.0048)
BldgType: 0.8318 (+0.0023)
HouseStyle: 0.8285 (+-0.0010)
OverallQual: 0.8847 (+0.0553)
OverallCond: 0.8369 (+0.0074)
RoofStyle: 0.8296 (+0.0001)
RoofMatl: 0.8314 (+0.0019)
Exterior1st: 0.8344 (+0.0050)
Exterior2nd: 0.8326 (+0.0031)




MasVnrType: 0.8333 (+0.0038)
ExterQual: 0.8559 (+0.0264)
ExterCond: 0.8294 (+-0.0001)
Foundation: 0.8342 (+0.0048)
BsmtQual: 0.8519 (+0.0225)
BsmtCond: 0.8311 (+0.0017)
BsmtExposure: 0.8370 (+0.0075)
BsmtFinType1: 0.8340 (+0.0045)
BsmtFinType2: 0.8321 (+0.0026)
Heating: 0.8301 (+0.0006)
HeatingQC: 0.8330 (+0.0035)




CentralAir: 0.8296 (+0.0001)
Electrical: 0.8297 (+0.0002)
KitchenQual: 0.8545 (+0.0250)
Functional: 0.8368 (+0.0073)
FireplaceQu: 0.8345 (+0.0051)
GarageType: 0.8315 (+0.0021)
GarageFinish: 0.8324 (+0.0030)
GarageQual: 0.8304 (+0.0010)
GarageCond: 0.8302 (+0.0007)
PavedDrive: 0.8292 (+-0.0003)
PoolQC: 0.8274 (+-0.0021)
Fence: 0.8294 (+-0.0001)
MiscFeature: 0.8296 (+0.0001)
MoSold: 0.8291 (+-0.0003)
YrSold: 0.8291 (+-0.0003)
SaleType: 0.8298 (+0.0004)
SaleCondition: 0.8318 (+0.0024)




In [9]:
# sort improvements
sorted_cats = sorted(categorical_improvements.items(), key=lambda x: x[1], reverse=True)
print(f"\nBest categorical features to add to sfs-reduced numeric features:")
for cat, improvement in sorted_cats[:30]:
    print(f"{cat}: +{improvement:.4f}")


Best categorical features to add to sfs-reduced numeric features:
OverallQual: +0.0553
Neighborhood: +0.0279
ExterQual: +0.0264
KitchenQual: +0.0250
BsmtQual: +0.0225
BsmtExposure: +0.0075
OverallCond: +0.0074
Functional: +0.0073
MSSubClass: +0.0054
FireplaceQu: +0.0051
Exterior1st: +0.0050
Foundation: +0.0048
BsmtFinType1: +0.0045
LandContour: +0.0039
MasVnrType: +0.0038
HeatingQC: +0.0035
Exterior2nd: +0.0031
GarageFinish: +0.0030
BsmtFinType2: +0.0026
SaleCondition: +0.0024
BldgType: +0.0023
GarageType: +0.0021
RoofMatl: +0.0019
BsmtCond: +0.0017
GarageQual: +0.0010
LotShape: +0.0008
GarageCond: +0.0007
LotConfig: +0.0007
Heating: +0.0006
SaleType: +0.0004


In [10]:
# define list of most predictive categorical features
best_cat_list = [
    'OverallQual',
    'Neighborhood',
    'ExterQual',
    'KitchenQual',
    'BsmtQual',
    'BsmtExposure',
    'OverallCond',
    'Functional',
    'MSSubClass',
    'FireplaceQu',
    'Exterior1st',
    'Foundation',
    # 'BsmtFinType1', 
    # 'LandContour',
    # 'MasVnrType',
    # 'HeatingQC',
    # 'Exterior2nd',
    # 'GarageFinish',
    # 'BsmtFinType2', 
    # 'SaleCondition', 
]

In [11]:
best_cat_df = ames[best_cat_list]
best_cat_df.head(2)

Unnamed: 0,OverallQual,Neighborhood,ExterQual,KitchenQual,BsmtQual,BsmtExposure,OverallCond,Functional,MSSubClass,FireplaceQu,Exterior1st,Foundation
0,6,SWISU,TA,TA,TA,No,6,Typ,30,Gd,Wd Sdng,CBlock
1,5,Edwards,Gd,Gd,Gd,Mn,5,Typ,120,,HdBoard,CBlock


In [12]:
# create dataframe with sfs-reduced numerical features and these best categorical features
X_reduced = sfs_numerical_features.join(best_cat_df)
X_reduced.head(2)

Unnamed: 0,GrLivArea,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,TotalBsmtSF,BedroomAbvGr,KitchenAbvGr,...,ExterQual,KitchenQual,BsmtQual,BsmtExposure,OverallCond,Functional,MSSubClass,FireplaceQu,Exterior1st,Foundation
0,856,68.0,7890,1939,1950,0.0,238.0,856.0,2,1,...,TA,TA,TA,No,6,Typ,30,Gd,Wd Sdng,CBlock
1,1049,42.0,4235,1984,1984,149.0,552.0,1049.0,2,1,...,Gd,Gd,Gd,Mn,5,Typ,120,,HdBoard,CBlock


In [13]:
# define target
y = ames['SalePrice']

In [14]:
# define preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', 'passthrough', sfs_numerical_features_list),
        ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), best_cat_list)
])

In [15]:
# fit the model
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])

In [16]:
# create storage
results = {}

# fit and evaluate the model on all data and score
model.fit(X_reduced, y)
cv_scores = cross_val_score(model, X_reduced, y)
mean_cv_score = float(round(cv_scores.mean(), 4))

# evaluate on train-test for overfitting analysis
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state = 0)

model.fit(X_train, y_train)
train_score = (cross_val_score(model, X_train, y_train)).mean()
test_score = (cross_val_score(model, X_test, y_test)).mean()
diff = train_score - test_score

results = {
    'mean-cv on X and y': round(mean_cv_score, 4),
    'train score': round(train_score, 4),
    'test score': round(test_score, 4),
    'train-test gap': round(diff, 4),
    'scenario': f'linear-linear 15/{len(best_cat_df.columns)} model:',
    'algorithm': 'MLR'
}

print(f'cv scores', cv_scores)



cv scores [0.8694846  0.91694176 0.9311637  0.92986919 0.92299942]


In [17]:
results

{'mean-cv on X and y': 0.9141,
 'train score': 0.8949,
 'test score': 0.915,
 'train-test gap': -0.02,
 'scenario': 'linear-linear 15/12 model:',
 'algorithm': 'MLR'}

for sfs selected features with tol=None
- 15/7:  0.9083
- 15/10: 0.9096
- 15/12: 0.9141
- 15/15: 0.9141
- 15/20: 0.9146
- 15/25: 0.9145
- 15/30: 0.9131

In [19]:
best_cat_list

['OverallQual',
 'Neighborhood',
 'ExterQual',
 'KitchenQual',
 'BsmtQual',
 'BsmtExposure',
 'OverallCond',
 'Functional',
 'MSSubClass',
 'FireplaceQu',
 'Exterior1st',
 'Foundation']

In [20]:
sfs_numerical_features_list

['GrLivArea',
 'LotFrontage',
 'LotArea',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'TotalBsmtSF',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageArea',
 'WoodDeckSF',
 'ScreenPorch']

In [21]:
results_df = pd.DataFrame([results])

In [22]:
# double check this, if changed ratio of numerical to categorical features
results_df.to_csv('linear_linear_15_12.csv')

# Resources
[Return To Top](#Contents)

**Dean De Cock paper and original data:**

- [Ames, Iowa: Alternative to the Boston Housing Data as an
End of Semester Regression Project](https://jse.amstat.org/v19n3/decock.pdf)

- [DataDocumentation.txt](https://jse.amstat.org/v19n3/decock/DataDocumentation.txt)

- [Ames Data Dictionary on Github](https://github.com/Padre-Media/dataset/blob/main/Ames%20Data%20Dictionary.txt)