In [112]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

In [109]:
# Load the housing table from CSV, then separate the regression target
# ("SalePrice") from the predictor columns.
target_name = "SalePrice"
ames_housing = pd.read_csv("ames_housing_no_missing.csv")
target = ames_housing[target_name]
data = ames_housing.drop(columns=target_name)

# Columns used by the numerical-only models further down.
numerical_features = [
    "LotFrontage",
    "LotArea",
    "MasVnrArea",
    "BsmtFinSF1",
    "BsmtFinSF2",
    "BsmtUnfSF",
    "TotalBsmtSF",
    "1stFlrSF",
    "2ndFlrSF",
    "LowQualFinSF",
    "GrLivArea",
    "BedroomAbvGr",
    "KitchenAbvGr",
    "TotRmsAbvGrd",
    "Fireplaces",
    "GarageCars",
    "GarageArea",
    "WoodDeckSF",
    "OpenPorchSF",
    "EnclosedPorch",
    "3SsnPorch",
    "ScreenPorch",
    "PoolArea",
    "MiscVal",
]

# Predictor frame restricted to the numerical columns listed above.
data_numerical = data[numerical_features]

In [116]:
# Compare a standardized linear regression against a decision tree on the
# numerical features, fold by fold, using 10-fold cross-validation.
linear = make_pipeline(StandardScaler(), LinearRegression())
# Seed the tree so the per-fold scores (and every comparison built on them)
# are reproducible across kernel restarts; matches the seeded trees used
# later in this notebook.
tree = DecisionTreeRegressor(random_state=0)
cv_linear = cross_validate(linear, data_numerical, target, cv=10)
cv_tree = cross_validate(tree, data_numerical, target, cv=10)
# One test score per fold; no `scoring` is passed, so each estimator's
# default scorer is used.
score_linear = cv_linear['test_score']
score_tree = cv_tree['test_score']
# Number of folds on which the linear model scores strictly higher.
linear_wins = (score_linear > score_tree).sum()
print("Linear regression is better than decision tree for "
      f"{linear_wins} CV iterations out of 10 folds.")

Linear regression is better than decision tree for 9 CV iterations out of 10 folds.


In [119]:
# Nested cross-validation: the inner 10-fold grid search picks `max_depth`
# (1 through 15) and the outer 10-fold loop scores the tuned tree.
max_depth = list(range(1, 16))
param_grid = {'max_depth': max_depth}
search = GridSearchCV(tree, param_grid=param_grid, cv=10)
cv_results_tree_optimal_depth = cross_validate(
    search,
    data_numerical,
    target,
    cv=10,
    return_estimator=True,  # keep each fitted search so best_params_ can be read
    n_jobs=2,
)
# Depth selected by the inner search on each outer fold.
best = []
for fitted_search in cv_results_tree_optimal_depth['estimator']:
    best.append(fitted_search.best_params_)
best

[{'max_depth': 7},
 {'max_depth': 7},
 {'max_depth': 6},
 {'max_depth': 7},
 {'max_depth': 6},
 {'max_depth': 8},
 {'max_depth': 6},
 {'max_depth': 7},
 {'max_depth': 8},
 {'max_depth': 8}]

In [121]:
# Fold-by-fold comparison of the depth-tuned tree (nested CV) against the
# linear regression baseline.
tree_opt = cv_results_tree_optimal_depth['test_score']
# `score_linear` holds the per-fold linear-regression test scores computed in
# the model-comparison cell. The name `scores_lr` is not defined anywhere in
# this notebook and raises NameError on a fresh kernel.
for i, j in zip(tree_opt, score_linear):
    print(i > j)
print(
    "A tree with an optimized depth is better than linear regression for "
    f"{sum(tree_opt > score_linear)} CV "
    "iterations out of 10 folds."
)

False
False
False
False
False
False
False
True
True
False
A tree with an optimized depth is better than linear regression for 2 CV iterations out of 10 folds.


In [80]:
# Mean test score of the depth-tuned tree under nested cross-validation.
# No `scoring` is passed, so this is the regressor's default score rather
# than a classification accuracy.
# `param_grid` is the max_depth grid defined with the nested-CV setup; the
# name `params` is not defined anywhere in this notebook.
search = GridSearchCV(tree, param_grid, cv=10)
cv_results_tree_optimal_depth = cross_validate(
    search, data_numerical, target, cv=10, return_estimator=True, n_jobs=-1,
)
cv_results_tree_optimal_depth["test_score"].mean()

0.6909853446444225

In [122]:
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler

# Split the predictor columns by dtype: object columns are treated as
# categorical, everything else as numerical.
categorical_selector = selector(dtype_include=object)
numerical_selector = selector(dtype_exclude=object)

# Apply the selectors to `data` to get the two column-name lists.
cat_columns = categorical_selector(data)
num_columns = numerical_selector(data)

In [124]:
# Preprocessing: integer-encode the categorical columns (categories unseen at
# fit time are mapped to -1) and standardize the numerical columns.
cat_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
num_preprocessor = StandardScaler()

# Step names match the transformers actually used: the categorical step is an
# OrdinalEncoder, not a one-hot encoder.
preprocessor = ColumnTransformer([('ordinal-encoder', cat_preprocessor, cat_columns),
                                  ('standard-scaler', num_preprocessor, num_columns)])

# Creating the model: preprocessing followed by a seeded, depth-7 tree.
from sklearn import set_config
# Uncomment to render pipelines as an HTML diagram:
#set_config(display='diagram')
model = make_pipeline(preprocessor, DecisionTreeRegressor(max_depth=7, random_state=0))

# 10-fold cross-validation on all features. No `scoring` is passed, so the
# reported value is the regressor's default score (R2), not an accuracy.
cv = cross_validate(model, data, target, cv=10, return_estimator=True, n_jobs=2)
scores = cv['test_score']
print(f'Mean cross-validation R2 score: {scores.mean():.3f} +/- {scores.std():.3f}')

Mean cross validation accuracy: 0.767 +/- 0.067


In [125]:
# Count the folds on which the tree trained on all (numerical + categorical)
# features beats the depth-tuned tree trained on numerical features only.
# The message names the two models actually compared; no linear regression
# scores are involved in this comparison.
print(
    "A tree model using both numerical and categorical features is better than a "
    "tree with optimal depth using only numerical features for "
    f"{sum(cv['test_score'] > cv_results_tree_optimal_depth['test_score'])} CV "
    "iterations out of 10 folds."
)

A tree with an optimized depth is better than linear regression for 10 CV iterations out of 10 folds.


In [105]:
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder

# Integer-encode categorical columns; categories unseen at fit time map to -1.
categorical_processor = OrdinalEncoder(
    handle_unknown="use_encoded_value", unknown_value=-1
)

# Object-dtype columns go through the encoder; all other columns are passed
# through unchanged.
preprocessor = make_column_transformer(
    (categorical_processor, selector(dtype_include=object)),
    ("passthrough", selector(dtype_exclude=object))
)
# NOTE(review): this rebinds `tree` from a bare DecisionTreeRegressor to a full
# pipeline. Re-running an earlier cell that grid-searches `tree` with a
# 'max_depth' grid would then fail; consider a distinct name.
tree = make_pipeline(preprocessor, DecisionTreeRegressor(max_depth=7, random_state=0))

cv_results = cross_validate(
    tree, data, target, cv=10, return_estimator=True, n_jobs=2
)
# Print the mean score explicitly: a bare expression in the middle of a cell
# is evaluated and silently discarded.
print(f"Mean cross-validation score: {cv_results['test_score'].mean():.3f}")
print(
    "A tree model using both numerical and categorical features is better than a "
    "tree with optimal depth using only numerical features for "
    f"{sum(cv_results['test_score'] > cv_results_tree_optimal_depth['test_score'])} CV "
    "iterations out of 10 folds."
)

A tree model using both numerical and categorical features is better than a tree with optimal depth using only numerical features for 8 CV iterations out of 10 folds.
