# **MLR linear-linear 9/cat model:**

linear-linear model with 9 numeric feautures reduced by SequentialFeatureSelector with **tol=0.005**, combined with top categorical features scored against them.  

In [2]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector

# Import raw data, if needed for comparison.
ames_raw = pd.read_csv('Ames_Housing_Price_Data.csv', index_col=0).reset_index()
ames_raw.drop(columns=['index'], inplace=True)
ames_raw.head(2)

# Import cleaned data for modeling
ames = pd.read_pickle('ames_clean.pkl')

In [3]:
# define numerical features list as determined by sfs in Ames_MLR_LinearLinear_sfs.ipynb
# # with tol=None
# sfs_numerical_features_list = ['GrLivArea', 'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd',
#        'MasVnrArea', 'BsmtFinSF1', 'TotalBsmtSF', 'BedroomAbvGr',
#        'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageArea',
#        'WoodDeckSF', 'ScreenPorch']

# with tol=.005
sfs_numerical_features_list = ['GrLivArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'BsmtFinSF1', 'TotalBsmtSF', 'BedroomAbvGr', 'KitchenAbvGr',
       'GarageArea']

# define numerical features
sfs_numerical_features = ames[sfs_numerical_features_list]

# print number of numerical features and head of df
print(f'number of sfs-reduced numerical features (without target): {len(sfs_numerical_features_list)}')
sfs_numerical_features.head(2)

number of sfs-reduced numerical features (without target): 9


Unnamed: 0,GrLivArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,TotalBsmtSF,BedroomAbvGr,KitchenAbvGr,GarageArea
0,856,1939,1950,0.0,238.0,856.0,2,1,399.0
1,1049,1984,1984,149.0,552.0,1049.0,2,1,266.0


In [4]:
# define categorical features
categorical_features = ames.select_dtypes(include=['object'])

# define categorical features list
categorical_features_list = categorical_features.columns

# print number of categorical features and head of df
print(f'number of categorical features: {len(categorical_features_list)}')
categorical_features.head(2)

number of categorical features: 48


Unnamed: 0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,MoSold,YrSold,SaleType,SaleCondition
0,30,RL,Pave,,Reg,Lvl,AllPub,Corner,Gtl,SWISU,...,TA,TA,Y,,,,3,2010,WD,Normal
1,120,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Edwards,...,TA,TA,Y,,,,2,2009,WD,Normal


### Score categorical features individually

Score the sfs-reduced numeric feature set with each categorical feature, individually, to see which ones are the most predictive

In [6]:
# define features for modeling
X = sfs_numerical_features
y = ames['SalePrice']
X.head(2)

Unnamed: 0,GrLivArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,TotalBsmtSF,BedroomAbvGr,KitchenAbvGr,GarageArea
0,856,1939,1950,0.0,238.0,856.0,2,1,399.0
1,1049,1984,1984,149.0,552.0,1049.0,2,1,266.0


In [7]:
# Baseline score for sfs-reduced numeric features, no categorical features
baseline_score = cross_val_score(LinearRegression(), X, y, cv=5).mean()
baseline_score

# with tol=None: 0.829458253090675

0.820708752475823

In [8]:
categorical_improvements = {}
for cat_feature in categorical_features_list:
    # Test numeric feature set + this categorical
    test_features = sfs_numerical_features_list + [cat_feature]
    
    # preprocess this combination
    preprocessor = ColumnTransformer([
        ('num', 'passthrough', sfs_numerical_features_list),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), [cat_feature])
    ])
    
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])
    
    score = cross_val_score(model, ames[test_features], y, cv=5).mean()
    improvement = score - baseline_score
    
    categorical_improvements[cat_feature] = improvement
    print(f"{cat_feature}: {score:.4f} (+{improvement:.4f})")

MSSubClass: 0.8288 (+0.0080)
MSZoning: 0.8222 (+0.0015)
Street: 0.8206 (+-0.0001)
Alley: 0.8209 (+0.0002)
LotShape: 0.8218 (+0.0011)
LandContour: 0.8254 (+0.0047)




Utilities: 0.8207 (+0.0000)
LotConfig: 0.8216 (+0.0009)
LandSlope: 0.8209 (+0.0002)
Neighborhood: 0.8484 (+0.0277)
Condition1: 0.8207 (+-0.0000)
Condition2: 0.8156 (+-0.0051)
BldgType: 0.8264 (+0.0057)
HouseStyle: 0.8198 (+-0.0009)
OverallQual: 0.8752 (+0.0545)
OverallCond: 0.8282 (+0.0075)




RoofStyle: 0.8214 (+0.0007)
RoofMatl: 0.8233 (+0.0026)
Exterior1st: 0.8250 (+0.0043)
Exterior2nd: 0.8228 (+0.0021)
MasVnrType: 0.8240 (+0.0033)
ExterQual: 0.8462 (+0.0255)
ExterCond: 0.8205 (+-0.0002)
Foundation: 0.8247 (+0.0040)




BsmtQual: 0.8446 (+0.0239)
BsmtCond: 0.8238 (+0.0031)
BsmtExposure: 0.8314 (+0.0107)
BsmtFinType1: 0.8252 (+0.0045)
BsmtFinType2: 0.8240 (+0.0033)
Heating: 0.8214 (+0.0007)
HeatingQC: 0.8238 (+0.0031)
CentralAir: 0.8207 (+-0.0000)
Electrical: 0.8209 (+0.0002)
KitchenQual: 0.8459 (+0.0252)
Functional: 0.8273 (+0.0066)
FireplaceQu: 0.8280 (+0.0073)




GarageType: 0.8220 (+0.0012)
GarageFinish: 0.8242 (+0.0035)
GarageQual: 0.8211 (+0.0004)
GarageCond: 0.8212 (+0.0005)
PavedDrive: 0.8202 (+-0.0005)
PoolQC: 0.8189 (+-0.0018)
Fence: 0.8204 (+-0.0003)
MiscFeature: 0.8207 (+-0.0000)
MoSold: 0.8199 (+-0.0008)
YrSold: 0.8203 (+-0.0004)
SaleType: 0.8208 (+0.0001)
SaleCondition: 0.8222 (+0.0015)




In [9]:
# sort improvements
sorted_cats = sorted(categorical_improvements.items(), key=lambda x: x[1], reverse=True)
print(f"\nBest categorical features to add to sfs-reduced numeric features:")
for cat, improvement in sorted_cats[:25]:
    print(f"{cat}: +{improvement:.4f}")


Best categorical features to add to sfs-reduced numeric features:
OverallQual: +0.0545
Neighborhood: +0.0277
ExterQual: +0.0255
KitchenQual: +0.0252
BsmtQual: +0.0239
BsmtExposure: +0.0107
MSSubClass: +0.0080
OverallCond: +0.0075
FireplaceQu: +0.0073
Functional: +0.0066
BldgType: +0.0057
LandContour: +0.0047
BsmtFinType1: +0.0045
Exterior1st: +0.0043
Foundation: +0.0040
GarageFinish: +0.0035
MasVnrType: +0.0033
BsmtFinType2: +0.0033
BsmtCond: +0.0031
HeatingQC: +0.0031
RoofMatl: +0.0026
Exterior2nd: +0.0021
SaleCondition: +0.0015
MSZoning: +0.0015
GarageType: +0.0012


In [10]:
# define list of most predictive categorical features
best_cat_list = [
    'OverallQual',
    'Neighborhood',
    'ExterQual',
    'KitchenQual',
    'BsmtQual',
    'BsmtExposure',
    'MSSubClass',
    'OverallCond',
    'FireplaceQu',
    'Functional',
    'BldgType',
    'LandContour',
    'BsmtFinType1',
    'Exterior1st',
    'Foundation',
    'GarageFinish',
    'MasVnrType',
    'BsmtFinType2',
    'BsmtCond',
    'HeatingQC'
]

In [11]:
best_cat_df = ames[best_cat_list]
best_cat_df.head(2)

Unnamed: 0,OverallQual,Neighborhood,ExterQual,KitchenQual,BsmtQual,BsmtExposure,MSSubClass,OverallCond,FireplaceQu,Functional,BldgType,LandContour,BsmtFinType1,Exterior1st,Foundation,GarageFinish,MasVnrType,BsmtFinType2,BsmtCond,HeatingQC
0,6,SWISU,TA,TA,TA,No,30,6,Gd,Typ,1Fam,Lvl,Rec,Wd Sdng,CBlock,Unf,,Unf,TA,TA
1,5,Edwards,Gd,Gd,Gd,Mn,120,5,,Typ,TwnhsE,Lvl,GLQ,HdBoard,CBlock,Fin,BrkFace,ALQ,TA,TA


In [12]:
# create dataframe with sfs-reduced numerical features and these best categorical features
X_reduced = sfs_numerical_features.join(best_cat_df)
print(len(X_reduced.columns))
X_reduced.head(2)

29


Unnamed: 0,GrLivArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,TotalBsmtSF,BedroomAbvGr,KitchenAbvGr,GarageArea,OverallQual,...,BldgType,LandContour,BsmtFinType1,Exterior1st,Foundation,GarageFinish,MasVnrType,BsmtFinType2,BsmtCond,HeatingQC
0,856,1939,1950,0.0,238.0,856.0,2,1,399.0,6,...,1Fam,Lvl,Rec,Wd Sdng,CBlock,Unf,,Unf,TA,TA
1,1049,1984,1984,149.0,552.0,1049.0,2,1,266.0,5,...,TwnhsE,Lvl,GLQ,HdBoard,CBlock,Fin,BrkFace,ALQ,TA,TA


In [13]:
# define target
y = ames['SalePrice']

In [14]:
# define preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', 'passthrough', sfs_numerical_features_list),
        ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), best_cat_list)
])

In [15]:
# fit the model
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])

In [16]:
# create storage
results = {}

# fit and evaluate the model on all data and score
model.fit(X_reduced, y)
cv_scores = cross_val_score(model, X_reduced, y)
mean_cv_score = float(round(cv_scores.mean(), 4))

# evaluate on train-test for overfitting analysis
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state = 0)

model.fit(X_train, y_train)
train_score = (cross_val_score(model, X_train, y_train)).mean()
test_score = (cross_val_score(model, X_test, y_test)).mean()
diff = train_score - test_score

results = {
    'mean-cv on X and y': round(mean_cv_score, 4),
    'train score': round(train_score, 4),
    'test score': round(test_score, 4),
    'train-test gap': round(diff, 4),
    'scenario': f'linear-linear 9/{len(best_cat_df.columns)} model:',
    'algorithm': 'MLR'
}

print(f'cv scores', cv_scores)



cv scores [0.86793266 0.91306653 0.93129235 0.9268025  0.92272708]




In [17]:
results

{'mean-cv on X and y': 0.9124,
 'train score': 0.8942,
 'test score': 0.902,
 'train-test gap': -0.0078,
 'scenario': 'linear-linear 9/20 model:',
 'algorithm': 'MLR'}

**Notes:**
mean cv scores for sfs features selected with tol=0.005
- 9/6:  0.8952
- 9/10: 0.9099
- 9/12: 0.9098
- 9/15: 0.9109
- 9/20: 0.9124
- 9/22: 0.9121
- 9/25: 0.9113

mean cv scores for sfs selected features with tol=None
see Ames_
- 15/7: 0.9083
- 15/10: 0.9096
- 15/12: 0.9141
- 15/15: 0.9141
- 15/20: 0.9146
- 15/25: 0.9145
- 15/30: 0.9131

In [19]:
results_df = pd.DataFrame([results])

In [20]:
# double check this, if changed ratio of numerical to categorical features
results_df.to_csv('linear_linear_9_20.csv')

# Resources
[Return To Top](#Contents)

**Dean De Cock paper and original data:**

- [Ames, Iowa: Alternative to the Boston Housing Data as an
End of Semester Regression Project](https://jse.amstat.org/v19n3/decock.pdf)

- [DataDocumentation.txt](https://jse.amstat.org/v19n3/decock/DataDocumentation.txt)

- [Ames Data Dictionary on Github](https://github.com/Padre-Media/dataset/blob/main/Ames%20Data%20Dictionary.txt)