In [1]:
import numpy as np
import math
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Preprocessing
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

# XGBoost Model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import XGBRegressor
import optuna

from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# For reproducibility
# SEED is the single seed value for this notebook: it is handed to
# seed_everything() here and reused further down as `random_state` for
# train_test_split and for every XGBRegressor that is constructed.
# NOTE(review): seed_everything comes from teachopencadd.utils; presumably it
# seeds the global Python/NumPy RNGs -- confirm against that package.
from teachopencadd.utils import seed_everything
SEED = 42
seed_everything(SEED)

In [2]:
# Relative path of the training CSV consumed by the rest of the notebook.
train_file_path_UMAP = '../Data/Data/train_filter_UMAP.csv'

# Load the CSV into a DataFrame; the 'SalePrice' column is used as the
# regression target further down.
train_df_UMAP = pd.read_csv(filepath_or_buffer=train_file_path_UMAP)

# Pipelines

In [3]:
# Categorical columns routed to the ordinal-encoding pipeline.
ode_cols = [
    'LotShape', 'LandContour', 'Utilities', 'LandSlope',
    'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'FireplaceQu',
    'GarageFinish', 'GarageQual', 'GarageCond',
    'PavedDrive', 'PoolQC',
]

# Categorical columns routed to the one-hot-encoding pipeline.
ohe_cols = [
    'MSZoning', 'Street', 'LotConfig', 'Neighborhood',
    'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'Foundation', 'Heating', 'Electrical', 'GarageType',
    'SaleType', 'SaleCondition',
]

# Numeric feature columns: every float64/int64 column except the target.
num_cols = (
    train_df_UMAP
    .select_dtypes(include=['float64', 'int64'])
    .columns
    .drop("SalePrice")
)

In [4]:
# Preprocessing is assembled from three small sklearn Pipelines (one per
# column family) that a ColumnTransformer applies side by side. Every Pipeline
# is a list of (step_name, transformer) tuples executed in order.

# Numeric columns:
#   'impute' -> SimpleImputer(strategy='mean'): missing values are replaced by
#               the mean of the corresponding feature.
#   'scaler' -> StandardScaler: removes the mean and scales to unit variance.
num_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Ordinal-encoded categorical columns:
#   'impute' -> SimpleImputer(strategy='most_frequent'): missing values are
#               replaced by the most frequent value of the feature.
#   'ode'    -> OrdinalEncoder: each category becomes an integer code; a
#               category not seen during fit is encoded as -1
#               (handle_unknown='use_encoded_value', unknown_value=-1).
# NOTE(review): no explicit `categories` order is passed, so the integer codes
# follow the encoder's default ordering rather than a domain ranking -- confirm
# this is intended.
ode_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ode', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# One-hot-encoded categorical columns:
#   'impute' -> SimpleImputer(strategy='most_frequent'), as above.
#   'ohe'    -> OneHotEncoder: one indicator column per category;
#               handle_unknown='ignore' ignores categories not seen during
#               fit, and sparse_output=False returns a dense array instead of
#               a sparse matrix.
ohe_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column family to its pipeline. Entries are
# (transformer_name, pipeline, columns).
#   remainder='passthrough' -> columns not listed are forwarded unchanged.
#   n_jobs=-1               -> use all available processors.
col_trans = ColumnTransformer(
    [
        ('num_p', num_pipeline, num_cols),
        ('ode_p', ode_pipeline, ode_cols),
        ('ohe_p', ohe_pipeline, ohe_cols),
    ],
    remainder='passthrough',
    n_jobs=-1,
)

# Top-level pipeline; preprocessing is currently its only step.
pipeline = Pipeline([
    ('preprocessing', col_trans),
])

In [5]:
# Separate the features from the regression target.
X = train_df_UMAP.drop('SalePrice', axis=1)
Y = train_df_UMAP['SalePrice']

# Train test split -- done on the RAW features, BEFORE any fitting.
# Fitting the preprocessing pipeline on all rows first would let the imputer
# statistics, scaler mean/variance and encoder categories be learned from the
# held-out rows (data leakage), which makes the test metrics optimistic.
# The same test_size / random_state are used as before, so the same rows end
# up in each partition.
X_train_raw_UMAP, X_test_raw_UMAP, y_train_UMAP, y_test_UMAP = train_test_split(
    X, Y, test_size=0.2, random_state=SEED
)

# Fit the preprocessing on the training rows only, then apply the already
# fitted transformation to the test rows.
x_train_UMAP = pipeline.fit_transform(X_train_raw_UMAP)
x_test_UMAP = pipeline.transform(X_test_raw_UMAP)

# Kept for any later cell that expects the fully transformed feature matrix;
# it is produced with the train-fitted pipeline (transform only, no refit).
X_preprocessed = pipeline.transform(X)

# Optuna: Define Objective function

In [6]:
def objective_UMAP(trial):
    """Optuna objective: cross-validated score of an XGBRegressor.

    Samples one hyperparameter configuration from ``trial``, builds an
    ``XGBRegressor`` with it and evaluates it with 5-fold cross-validation on
    ``x_train_UMAP`` / ``y_train_UMAP``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean ``neg_mean_squared_error`` over the 5 folds. This is the NEGATED
        MSE, so larger (closer to zero) is better.
        NOTE(review): the study that consumes this objective is not visible
        here; it must be created with ``direction='maximize'``, otherwise it
        will search for the worst configuration.
    """
    # suggest_loguniform / suggest_discrete_uniform are deprecated since
    # Optuna 3.0; suggest_float(log=True) and suggest_float(step=...) are the
    # supported equivalents with the same ranges and parameter names.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 750),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 5e-2, 0.5, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 5),
        'gamma': trial.suggest_float('gamma', 0, 0.5, step=0.1),
        'subsample': trial.suggest_float('subsample', 0.5, 1, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1, step=0.1),
    }

    model = XGBRegressor(**params, random_state=SEED)

    # No model.fit() beforehand: cross_val_score clones the estimator and fits
    # a fresh copy on every fold, so a prior full fit is only wasted compute.
    score = cross_val_score(
        model,
        x_train_UMAP,
        y_train_UMAP,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    ).mean()
    return score



# XGB Regressor

{'n_estimators': 100,
 'max_depth': 10,
 'learning_rate': 0.05503466451418128,
 'min_child_weight': 5,
 'gamma': 0.4,
 'subsample': 0.5,
 'colsample_bytree': 0.6}

In [7]:
# Hyperparameters hard-coded for the final model (the same values as the
# dictionary listed just above this cell).
best_params_UMAP = {
    'n_estimators': 100,
    'max_depth': 10,
    'learning_rate': 0.05503466451418128,
    'min_child_weight': 5,
    'gamma': 0.4,
    'subsample': 0.5,
    'colsample_bytree': 0.6,
}

# Individual aliases for each hyperparameter value.
best_n_estimators_UMAP = best_params_UMAP['n_estimators']
best_max_depth_UMAP = best_params_UMAP['max_depth']
best_learning_rate_UMAP = best_params_UMAP['learning_rate']
best_min_child_weight_UMAP = best_params_UMAP['min_child_weight']
best_gamma_UMAP = best_params_UMAP['gamma']
best_subsample_UMAP = best_params_UMAP['subsample']
best_colsample_bytree_UMAP = best_params_UMAP['colsample_bytree']

# The dictionary keys are exactly the XGBRegressor keyword names, so unpacking
# it passes the same seven arguments as spelling them out one by one.
best_XGB_UMAP = XGBRegressor(**best_params_UMAP, random_state=SEED)

# Fit on the training partition; as the cell's last expression, the fitted
# estimator is also displayed.
best_XGB_UMAP.fit(x_train_UMAP, y_train_UMAP)