Columns for Submission
PID - The property ID
SalePrice - The predicted price of the property

In [2]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error

In [3]:
# Read the training set from disk and preview its first rows.
train_csv_path = './Data/regression_train.csv'
df_house = pd.read_csv(train_csv_path)
df_house.head()

Unnamed: 0,SalePrice,PID,Lot Frontage,Lot Area,Street,Neighborhood,Bldg Type,House Style,Overall Qual,Overall Cond,...,Full Bath,Half Bath,Bedroom AbvGr,TotRms AbvGrd,Gr Liv Area,Functional,Screen Porch,Pool Area,Yr Sold,Sale Type
0,159000,531363010,80.0,9605,Pave,SawyerW,1Fam,1Story,7,6,...,1,1,3,6,1218,Typ,0,0,2009,WD
1,271900,906203120,90.0,14684,Pave,SawyerW,1Fam,1Story,7,7,...,2,0,3,7,2196,Typ,0,0,2009,WD
2,137500,916176030,,14375,Pave,Timber,1Fam,SLvl,6,6,...,1,0,3,7,1344,Typ,233,0,2009,COD
3,248500,528180130,48.0,6472,Pave,NridgHt,TwnhsE,1Story,9,5,...,2,0,2,6,1456,Typ,0,0,2009,WD
4,167000,528290030,61.0,9734,Pave,Gilbert,1Fam,SLvl,7,5,...,2,1,3,7,1374,Typ,0,0,2009,WD


In [63]:
# Numeric branch: fill missing values with the column mean, then standardise.
pipeline_num = Pipeline(steps=[
    ("impute", SimpleImputer(strategy='mean')),
    ("scale", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent level, then
# one-hot encode into a dense array; unseen levels are ignored at transform time.
pipeline_cat = Pipeline(steps=[
    ("impute", SimpleImputer(strategy='most_frequent')),
    ("enc", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column to a branch by dtype: numeric vs. object.
ct = ColumnTransformer(transformers=[
    ("num", pipeline_num, make_column_selector(dtype_include=np.number)),
    ("cat", pipeline_cat, make_column_selector(dtype_include='object')),
])

# Ordinary least squares: nothing to tune, so the grid is empty.
pipeline_lr = Pipeline(steps=[
    ("preprocessor", ct),
    ("lr", LinearRegression()),
])
params_lr = {}

# Lasso (L1 penalty), tuned over the penalty strength.
pipeline_lasso = Pipeline(steps=[
    ("preprocessing", ct),
    ("lasso", Lasso(max_iter=10000)),
])
params_lasso = {'lasso__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Ridge (L2 penalty), tuned over the same penalty strengths.
pipeline_ridge = Pipeline(steps=[
    ("preprocessing", ct),
    ("ridge", Ridge()),
])
params_ridge = {'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Elastic Net, tuned over penalty strength and the L1/L2 mix.
pipeline_net = Pipeline(steps=[
    ("preprocessing", ct),
    ("net", ElasticNet(max_iter=10000)),
])
params_net = {
    'net__alpha': [0.001, 0.01, 0.1, 1],
    'net__l1_ratio': [0.2, 0.5, 0.8],
}

# Random forest on the shared preprocessing. The fixed random_state makes
# repeated runs of the notebook produce the same forest (42 is arbitrary).
pipeline_forest = Pipeline(
    [("preprocessing", ct),
     ("forest", RandomForestRegressor(random_state=42))]
)
params_forest = {
    'forest__n_estimators': [100, 200, 300],
    'forest__max_depth': [None, 10, 20, 30],
    'forest__min_samples_split': [2, 5, 10],
    'forest__min_samples_leaf': [1, 2, 4],
    # 1.0 (use every feature) replaces the former 'auto' option, which meant
    # the same thing for regressors but is rejected by scikit-learn >= 1.3.
    'forest__max_features': [1.0, 'sqrt']
}

# Gradient-boosted trees with a squared-error objective on the shared preprocessing.
pipeline_xgb = Pipeline(steps=[
    ("preprocessor", ct),
    ("xgb", XGBRegressor(objective='reg:squarederror')),
])

# 4 * 5 * 3 * 3 = 180 parameter combinations per grid search.
params_xgb = {
    'xgb__learning_rate': [0.01, 0.1, 0.2, 0.3],
    'xgb__max_depth': [3, 4, 5, 6, 7],
    'xgb__n_estimators': [100, 200, 300],
    'xgb__subsample': [0.7, 0.8, 0.9],
}

# One row per model: (display name, pipeline, parameter grid). Both lookup
# dictionaries are derived from this list so their keys always agree.
# NOTE(review): pipeline_forest / params_forest are not registered here —
# presumably left out on purpose; confirm.
_model_registry = [
    ("Linear Regression", pipeline_lr, params_lr),
    ("Lasso", pipeline_lasso, params_lasso),
    ("Ridge", pipeline_ridge, params_ridge),
    ("Elastic Net", pipeline_net, params_net),
    ("XGBoost", pipeline_xgb, params_xgb),
]
pipelines = {label: model for label, model, _grid in _model_registry}
params = {label: grid for label, _model, grid in _model_registry}

In [27]:
# Show the column labels of the training frame.
df_house.columns

Index(['SalePrice', 'PID', 'Lot Frontage', 'Lot Area', 'Street',
       'Neighborhood', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Roof Style', 'Heating', 'Central Air',
       'Electrical', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'TotRms AbvGrd', 'Gr Liv Area', 'Functional', 'Screen Porch',
       'Pool Area', 'Yr Sold', 'Sale Type'],
      dtype='object')

In [48]:
# Model inputs are every column except the target (SalePrice) and the
# property identifier (PID).
non_feature_cols = ['SalePrice', 'PID']
X = df_house.drop(columns=non_feature_cols)
y = df_house['SalePrice']

# The same split, restricted to rows that have no missing value in any column.
df_house_drop = df_house.dropna()
X_drop = df_house_drop.drop(columns=non_feature_cols)
y_drop = df_house_drop['SalePrice']

In [68]:
# Leave-one-column-out feature sets: "all" keeps every column of X, and every
# other key names the single column that was removed from X for that variant.
# Derived from X.columns (same keys, same order as listing them by hand) so
# the variants cannot drift from the actual schema.
dropped_features = {
    "all": X,
    **{column: X.drop(columns=[column]) for column in X.columns},
}

In [59]:
def best_features(pipeline, params, features, y, cv=5, n_jobs=None):
    """Rank candidate feature sets by grid-searched, cross-validated RMSE.

    For each entry of ``features`` a fresh ``GridSearchCV`` is fitted with
    negative mean squared error as the score; the best score is converted to
    an RMSE via ``sqrt(-best_score_)``.

    Parameters
    ----------
    pipeline : estimator
        Estimator passed to ``GridSearchCV``.
    params : dict
        Parameter grid searched for ``pipeline``.
    features : dict
        Maps a label to the feature matrix evaluated under that label.
    y : array-like
        Target used with every feature matrix.
    cv : int, default 5
        Cross-validation setting forwarded to ``GridSearchCV``.
    n_jobs : int or None, default None
        Parallelism forwarded to ``GridSearchCV``; ``None`` keeps its default.

    Returns
    -------
    list of (label, rmse) tuples
        Sorted with the highest RMSE first. When the labels name a dropped
        column, the columns whose removal hurts the fit most come first.
    """
    scores_dict = {}

    # Distinct loop names avoid shadowing the notebook-level X.
    for label, feature_matrix in features.items():
        grid_search = GridSearchCV(
            pipeline,
            params,
            cv=cv,
            scoring='neg_mean_squared_error',
            n_jobs=n_jobs,
        )
        grid_search.fit(feature_matrix, y)
        scores_dict[label] = np.sqrt(-grid_search.best_score_)

    return sorted(scores_dict.items(), key=lambda item: item[1], reverse=True)

In [64]:
# Rank the leave-one-column-out feature sets for plain linear regression
# (highest cross-validated RMSE first).
# NOTE(review): the RMSE values recorded for this cell are on the order of
# 1e14, which looks like a numerically unstable fit rather than a meaningful
# ranking — presumably collinear one-hot columns; confirm before relying on it.
best_features(pipeline_lr, params_lr, dropped_features, y)

[('Sale Type', 5936658472470982.0),
 ('Neighborhood', 5553204862267517.0),
 ('Bldg Type', 4146481107779573.0),
 ('House Style', 2651704601257575.5),
 ('Bedroom AbvGr', 2432088071269718.5),
 ('Lot Frontage', 1818357499864857.8),
 ('TotRms AbvGrd', 891855195799680.2),
 ('Overall Qual', 530717572846763.8),
 ('Pool Area', 516254884855983.94),
 ('Central Air', 489661907812475.0),
 ('Yr Sold', 473748212267879.8),
 ('Year Built', 441563316557825.8),
 ('Street', 421719540036101.3),
 ('Full Bath', 363299868436811.1),
 ('Functional', 318672488751295.44),
 ('Lot Area', 250221141225709.3),
 ('Electrical', 240047781707271.0),
 ('Gr Liv Area', 235625357685570.62),
 ('Screen Porch', 193944734836373.38),
 ('Heating', 185395841855539.44),
 ('Half Bath', 185011058020581.56),
 ('all', 183493735276526.1),
 ('Roof Style', 173992752383033.94),
 ('Overall Cond', 51127955294396.25)]

In [69]:
# Rank the leave-one-column-out feature sets for Elastic Net, tuned over
# params_net (highest cross-validated RMSE first).
best_features(pipeline_net, params_net, dropped_features, y)

[('Neighborhood', 37052.4875060608),
 ('Gr Liv Area', 36164.407040788275),
 ('Overall Qual', 35037.67267520289),
 ('Bldg Type', 34424.1508360744),
 ('House Style', 33827.104871776886),
 ('Overall Cond', 33688.6697569198),
 ('Year Built', 33527.63746873841),
 ('Screen Porch', 33468.386264482266),
 ('Lot Area', 33435.509744105395),
 ('Roof Style', 33389.70612092185),
 ('Bedroom AbvGr', 33320.18659416892),
 ('Half Bath', 33270.440302288895),
 ('Functional', 33260.99590722168),
 ('Full Bath', 33250.95456220777),
 ('Sale Type', 33223.32992511381),
 ('Heating', 33202.67068293646),
 ('Central Air', 33194.53452769674),
 ('Yr Sold', 33187.89042228563),
 ('all', 33187.691396800874),
 ('Electrical', 33183.221558624624),
 ('Street', 33180.19825674381),
 ('TotRms AbvGrd', 33133.41969344548),
 ('Lot Frontage', 33005.252733102985),
 ('Pool Area', 32825.703123230414)]

In [71]:
# Rank the leave-one-column-out feature sets for XGBoost.
# NOTE(review): no output was recorded for this cell. It is expensive:
# 180 parameter combinations x 5 folds x 24 feature sets = 21,600 fits
# before refits.
best_features(pipeline_xgb, params_xgb, dropped_features, y)

In [51]:
# Tune every registered model on the rows without missing values and print
# its cross-validated RMSE (square root of the best mean squared error).
# Each fitted search is kept in fitted_searches so the best parameters and
# estimator stay available afterwards — including for the models that
# finished before an interrupted run.
fitted_searches = {}
for model_name, pipeline in pipelines.items():
    print(model_name)
    grid_search = GridSearchCV(pipeline, params[model_name], cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_drop, y_drop)
    fitted_searches[model_name] = grid_search

    mean_rmse = np.sqrt(-grid_search.best_score_)
    print(f"Mean RMSE for price: {mean_rmse}\n")

Linear Regression
Mean RMSE for price: 459834473292325.5

Lasso


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


KeyboardInterrupt: 