**MLR log-linear base model:**

with all numeric and categorical features against log(SalePrice).  

In [2]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression

# Import raw data, if needed for comparison.
ames_raw = pd.read_csv('Ames_Housing_Price_Data.csv', index_col=0).reset_index()
ames_raw.drop(columns=['index'], inplace=True)
ames_raw.head(2)

# Import cleaned data for modeling
ames = pd.read_pickle('ames_clean.pkl')

In [3]:
# define numerical features
numerical_features = ames.select_dtypes(include=['float64', 'int64'])

# drop target from features
numerical_features.drop(columns=['SalePrice'], axis=1, inplace=True)

# define numerical features list
numerical_features_list = numerical_features.columns

# print number of numerical features and head of df
print(f'number of numerical features (without target): {len(numerical_features_list)}')
numerical_features.head(2)

number of numerical features (without target): 31


Unnamed: 0,GrLivArea,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal
0,856,68.0,7890,1939,1950,0.0,238.0,0.0,618.0,856.0,...,1939,2.0,399.0,0,0,0,0,166,0,0
1,1049,42.0,4235,1984,1984,149.0,552.0,393.0,104.0,1049.0,...,1984,1.0,266.0,0,105,0,0,0,0,0


In [4]:
# define categorical features
categorical_features = ames.select_dtypes(include=['object'])

# define categorical features list
categorical_features_list = categorical_features.columns

# print number of categorical features and head of df
print(f'number of categorical features: {len(categorical_features_list)}')
categorical_features.head(2)

number of categorical features: 48


Unnamed: 0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,MoSold,YrSold,SaleType,SaleCondition
0,30,RL,Pave,,Reg,Lvl,AllPub,Corner,Gtl,SWISU,...,TA,TA,Y,,,,3,2010,WD,Normal
1,120,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Edwards,...,TA,TA,Y,,,,2,2009,WD,Normal


In [5]:
# define features for modeling
X = numerical_features.join(categorical_features)
y = ames['SalePrice']
X.head(2)

Unnamed: 0,GrLivArea,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,MoSold,YrSold,SaleType,SaleCondition
0,856,68.0,7890,1939,1950,0.0,238.0,0.0,618.0,856.0,...,TA,TA,Y,,,,3,2010,WD,Normal
1,1049,42.0,4235,1984,1984,149.0,552.0,393.0,104.0,1049.0,...,TA,TA,Y,,,,2,2009,WD,Normal


In [6]:
# Set up preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', 'passthrough', numerical_features_list),
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features_list)
])

In [7]:
# fit the model
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])

In [8]:
# create storage
results = {}

# fit and evaluate the model on all data and score
model.fit(X, np.log(y))
cv_scores = cross_val_score(model, X, np.log10(y))
mean_cv_score = float(round(cv_scores.mean(), 4))

# evaluate on train-test for overfitting analysis
X_train, X_test, y_train, y_test = train_test_split(X, np.log10(y), test_size=0.2, random_state = 0)

model.fit(X_train, y_train)
train_score = (cross_val_score(model, X_train, y_train)).mean()
test_score = (cross_val_score(model, X_test, y_test)).mean()
diff = train_score - test_score

results = {
    'mean-cv on X and y': round(mean_cv_score, 4),
    'train score': round(train_score, 4),
    'test score': round(test_score, 4),
    'train-test gap': round(diff, 4),
    'scenario': 'log-linear base model (79 features)',
    'algorithm': 'MLR'
}

print(f'cv scores', cv_scores)



cv scores [0.87758957 0.91800452 0.9241944  0.93446213 0.92668317]




In [9]:
results

{'mean-cv on X and y': 0.9162,
 'train score': 0.9139,
 'test score': 0.8028,
 'train-test gap': 0.111,
 'scenario': 'log-linear base model (79 features)',
 'algorithm': 'MLR'}

In [10]:
results_df = pd.DataFrame([results])
results_df

Unnamed: 0,mean-cv on X and y,train score,test score,train-test gap,scenario,algorithm
0,0.9162,0.9139,0.8028,0.111,log-linear base model (79 features),MLR


In [11]:
results_df.to_csv('log_linear_base.csv')

In [35]:
cv_scores

array([0.87758957, 0.91800452, 0.9241944 , 0.93446213, 0.92668317])

# Resources
[Return To Top](#Contents)

**Dean De Cock paper and original data:**

- [Ames, Iowa: Alternative to the Boston Housing Data as an
End of Semester Regression Project](https://jse.amstat.org/v19n3/decock.pdf)

- [DataDocumentation.txt](https://jse.amstat.org/v19n3/decock/DataDocumentation.txt)

- [Ames Data Dictionary on Github](https://github.com/Padre-Media/dataset/blob/main/Ames%20Data%20Dictionary.txt)