In [1]:
import pandas as pd

ames_housing = pd.read_csv("../datasets/ames_housing_no_missing.csv")
target_name = "SalePrice"
data = ames_housing.drop(columns=target_name)
target = ames_housing[target_name]

In [2]:
numerical_features = [
    "LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2",
    "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF",
    "GrLivArea", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces",
    "GarageCars", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
    "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal",
]

data_numerical = data[numerical_features]

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate

In [22]:
# Q1

linear_model = make_pipeline(StandardScaler(), LinearRegression())
linear_cv = cross_validate(linear_model, data_numerical, target, return_train_score=True, cv=10) 

tree = DecisionTreeRegressor()
tree_cv = cross_validate(tree, data_numerical, target, return_train_score=True, cv=10) 

In [19]:
print(linear_cv["test_score"])

[0.76129977 0.80617644 0.81188989 0.66592199 0.79964891 0.76868787
 0.75635094 0.71822127 0.31479306 0.78635221]


In [21]:
linear_cv_results = [score for score in linear_cv["test_score"]]
tree_cv_results = [score for score in tree_cv["test_score"]]

for i, j in zip(linear_cv_results, tree_cv_results):
    print(i > j)

True
True
True
True
True
True
True
True
False
True


In [23]:
# Q2
from sklearn.model_selection import GridSearchCV

In [33]:
max_depth_range = list(range(1, 16))
param_grid = {"max_depth": max_depth_range}

tree_grid_cv = GridSearchCV(DecisionTreeRegressor(), param_grid=param_grid, cv=10)

tree_cv_results_grid = cross_validate(tree_grid_cv, data_numerical, target, cv=10, return_estimator=True)

In [36]:
opt_depths = [cv_result.best_params_ for cv_result in tree_cv_results_grid["estimator"]]
print(opt_depths)

[{'max_depth': 7}, {'max_depth': 8}, {'max_depth': 10}, {'max_depth': 6}, {'max_depth': 7}, {'max_depth': 6}, {'max_depth': 6}, {'max_depth': 6}, {'max_depth': 7}, {'max_depth': 6}]


In [41]:
# Q3
opt_scores = [score for score in tree_cv_results_grid["test_score"]]
print(opt_scores)

opt_scores2 = [cv_result.best_score_ for cv_result in tree_cv_results_grid["estimator"]]
print(opt_scores2)

[0.5917337945748162, 0.6894036175770164, 0.7526404306144214, 0.6159378661217656, 0.7666182670842291, 0.7382025906224221, 0.6893723792090307, 0.7605357639335458, 0.526708102142986, 0.736763324014172]
[0.5917337945748162, 0.6894036175770164, 0.7526404306144214, 0.6159378661217656, 0.7666182670842291, 0.7382025906224221, 0.6893723792090307, 0.7605357639335458, 0.526708102142986, 0.736763324014172]
[0.7007137018993956, 0.6913307163011921, 0.7038814216831045, 0.7192595928227965, 0.6849326878531716, 0.6860204289552512, 0.6717485522956247, 0.6846081076806249, 0.694464970025691, 0.6427433884383407]


In [42]:
for i, j in zip(opt_scores2, linear_cv_results):
    print(i > j)

False
False
False
True
False
False
False
False
True
False


In [51]:
# Q4
data_numerical = data.select_dtypes(exclude=["object"])
data_category = data.select_dtypes(include=["object"])

print(data_numerical.shape, data_category.shape)
print(data_category.columns)
print(data_numerical.columns)

(1460, 36) (1460, 43)
Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')
Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       

In [99]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer

preprocessor = make_column_transformer(
    (OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), data_category.columns),
    ("passthrough", data_numerical.columns)
)

tree_pipeline = make_pipeline(preprocessor, DecisionTreeRegressor(max_depth=7))

In [100]:
tree_cv_with_category = cross_validate(tree_pipeline, data, target, 
    return_train_score=True, cv=10, return_estimator=True) 

In [109]:
tree_cv_results = [score for score in tree_cv_with_category["test_score"]]

In [110]:
for i, j in zip(tree_cv_results, linear_cv_results):
    print(i > j)

False
False
True
False
True
True
True
True
True
False
