In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyRegressor
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer, KNNImputer

from sklearn import set_config
set_config(display='diagram')

In [2]:
housing = pd.read_csv('../datasets/train_clean.csv')

log transformation of sale price

In [3]:
housing['price_log'] = np.log(housing['saleprice'])

In [4]:
# remove outliers

housing = housing[housing['1st_flr_sf'] < 3000]
housing = housing[(housing['total_bsmt_sf'] > 0) & (housing['total_bsmt_sf'] < 3000)]
housing = housing[housing['gr_liv_area'] < 4000]
housing = housing[housing['lot_area'] < 40000]
housing = housing[housing['garage_area'] > 0]
housing = housing[housing['2nd_flr_sf'] > 0]

housing = housing[(housing['saleprice'] > 50_000) & (housing['saleprice'] < 500_000)]

columns with continious values to log transform

In [5]:
all_cols = [
    "lot_frontage",
    "lot_area",
    "mas_vnr_area",
    "bsmtfin_sf_1",
    "bsmtfin_sf_2",
    "bsmt_unf_sf",
    "total_bsmt_sf",
    "1st_flr_sf",
    "2nd_flr_sf",
    "low_qual_fin_sf",
    "gr_liv_area",
    "garage_area",
    "wood_deck_sf",
    "open_porch_sf",
    "enclosed_porch",
    "3ssn_porch",
    "screen_porch",
    "pool_area",
]

log transformation function

In [6]:
def log_transform(df, cols):
    for col in cols:
        df[col] = np.log(df[col] + 1)

log_transform(housing, all_cols)

X = Features to be trained on

y = What the model is trying to predict

In [7]:
X = housing[
    [
        "overall_qual",
        "gr_liv_area",
        'exter_qual',
        "kitchen_qual",
        'garage_cars',
        'bsmt_qual',
        "1st_flr_sf",
        'year_remod_add',
        'full_bath',
        'fireplace_qu',
        "total_bsmt_sf",
        "neighborhood"
    ]
]
y = housing["price_log"]

Splitting data into training and testing sets

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

### Model

In [9]:
# column transformer that uses OneHotEncoder to encode nominal features
ct = make_column_transformer(
    (OneHotEncoder(sparse=False, handle_unknown='ignore'), make_column_selector(dtype_include=object)),
    remainder='passthrough',
    verbose_feature_names_out=False
)

# pipeline that transforms columns, fills missing values in data using KNNInputer, generates polynomial features and feature interactions, scales data, and uses a Ridge regression model
pipe = make_pipeline(ct, KNNImputer(), PolynomialFeatures(), StandardScaler(), Ridge())


# parameters used in the grid search
params = {
    'polynomialfeatures__degree': [1, 2, 3],
    'polynomialfeatures__interaction_only': [True, False], 
    'ridge__alpha': [1, 10, 50, 100],
}

# grid searches to find the best model and parameters
gs = GridSearchCV(pipe, params)
gs.fit(X_train, y_train)

gs.best_params_

{'polynomialfeatures__degree': 1,
 'polynomialfeatures__interaction_only': True,
 'ridge__alpha': 100}

#### Scores

In [10]:
preds = gs.predict(X_test)

print('Train R2: ', gs.score(X_train, y_train))
print(' Test R2: ', gs.score(X_test, y_test))
print('    RMSE: ', mean_squared_error(np.exp(y_test), np.exp(preds), squared=False))

Train R2:  0.8852986810693276
 Test R2:  0.8767951002907464
    RMSE:  24125.783935157233


#### Coefficients for features used

In [11]:
coefs = gs.best_estimator_.named_steps['ridge'].coef_
cols = gs.best_estimator_.named_steps['columntransformer'].get_feature_names_out()
pd.DataFrame(zip(cols, coefs)).sort_values(1)

Unnamed: 0,0,1
17,neighborhood_SWISU,-0.030395
7,neighborhood_Gilbert,-0.020042
10,neighborhood_Mitchel,-0.019711
9,neighborhood_MeadowV,-0.01008
13,neighborhood_NWAmes,-0.010005
24,overall_qual,-0.009849
2,neighborhood_BrkSide,-0.009713
1,neighborhood_BrDale,-0.005635
18,neighborhood_Sawyer,-0.005245
12,neighborhood_NPkVill,-0.00179


### Loop to run multiple tests of a model

In [12]:
# scores = []

# for n in range(1,51):

#     X_train, X_test, y_train, y_test = train_test_split(X, y)

#     gs.fit(X_train, y_train)
#     preds = gs.predict(X_test)
#     scores.append(mean_squared_error(np.exp(y_test), np.exp(preds), squared=False))

# scores

In [13]:
# np.mean(scores)

Ridge with log transformation

Mean of 50 tests = 24735.858909685216