In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from df_after_transform import df_after_transform
from sklearn import set_config
from sklearn.calibration import CalibrationDisplay
from sklearn.compose import (
    ColumnTransformer,
    make_column_selector,
    make_column_transformer,
)
from sklearn.decomposition import PCA
from sklearn.ensemble import HistGradientBoostingClassifier,HistGradientBoostingRegressor
from sklearn.feature_selection import (
    RFECV,
    SelectFromModel,
    SelectKBest,
    SequentialFeatureSelector,
    f_classif,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import (Lasso, 
                                 LassoCV, 
                                 LogisticRegression,
                                 Ridge,
                                 LinearRegression
)

from sklearn.metrics import (
    ConfusionMatrixDisplay,
    DetCurveDisplay,
    PrecisionRecallDisplay,
    RocCurveDisplay,
    classification_report,
    make_scorer,
)
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    PolynomialFeatures,
    StandardScaler,
)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVC

set_config(display="diagram")  # display='text' is the default

pd.set_option(
    "display.max_colwidth", 1000, "display.max_rows", 50, "display.max_columns", None
)

import warnings

warnings.filterwarnings("ignore")

In [2]:
housing_train = pd.read_csv('input_data2/housing_train.csv')

y_train = np.log(housing_train['v_SalePrice'])
X_train = housing_train.drop(columns = 'v_SalePrice')

In [12]:
# numerical and categorical pipelines
numer_pipe = Pipeline(
    [
        ("imputer",SimpleImputer(strategy='median')),
        ("scaler",StandardScaler()),
        ("polynomial",PolynomialFeatures(degree=1))
    ])

cat_pipe = Pipeline(
    [
        ("encoder",OneHotEncoder(handle_unknown='ignore'))
    ])

# numerical and categorical variables to use
numer_variables = X_train.select_dtypes(include='number').columns
numer_variables = [var for var in numer_variables]

cat_variables = ['v_Lot_Config','v_Neighborhood','v_MS_Zoning','v_Land_Contour','v_Bldg_Type','v_House_Style',
                 'v_Exter_Qual','v_Heating_QC','v_Central_Air','v_Kitchen_Qual','v_Paved_Drive','v_Sale_Condition']

# preprocessing pipeline
preproc_pipe = ColumnTransformer(
    [
        ("numer",numer_pipe,numer_variables),
        ("cat",cat_pipe,cat_variables)
    ],
        remainder='drop')

# preprocessing dataframe
preproc_df = df_after_transform(preproc_pipe,X_train)
print(f"There are {preproc_df.shape[1]} columns in the dataframe")
preproc_df

There are 118 columns in the dataframe


Unnamed: 0,1,v_MS_SubClass,v_Lot_Frontage,v_Lot_Area,v_Overall_Qual,v_Overall_Cond,v_Year_Built,v_Year_Remod/Add,v_Mas_Vnr_Area,v_BsmtFin_SF_1,v_BsmtFin_SF_2,v_Bsmt_Unf_SF,v_Total_Bsmt_SF,v_1st_Flr_SF,v_2nd_Flr_SF,v_Low_Qual_Fin_SF,v_Gr_Liv_Area,v_Bsmt_Full_Bath,v_Bsmt_Half_Bath,v_Full_Bath,v_Half_Bath,v_Bedroom_AbvGr,v_Kitchen_AbvGr,v_TotRms_AbvGrd,v_Fireplaces,v_Garage_Yr_Blt,v_Garage_Cars,v_Garage_Area,v_Wood_Deck_SF,v_Open_Porch_SF,v_Enclosed_Porch,v_3Ssn_Porch,v_Screen_Porch,v_Pool_Area,v_Misc_Val,v_Mo_Sold,v_Yr_Sold,v_Lot_Config_Corner,v_Lot_Config_CulDSac,v_Lot_Config_FR2,v_Lot_Config_FR3,v_Lot_Config_Inside,v_Neighborhood_Blmngtn,v_Neighborhood_Blueste,v_Neighborhood_BrDale,v_Neighborhood_BrkSide,v_Neighborhood_ClearCr,v_Neighborhood_CollgCr,v_Neighborhood_Crawfor,v_Neighborhood_Edwards,v_Neighborhood_Gilbert,v_Neighborhood_Greens,v_Neighborhood_GrnHill,v_Neighborhood_IDOTRR,v_Neighborhood_Landmrk,v_Neighborhood_MeadowV,v_Neighborhood_Mitchel,v_Neighborhood_NAmes,v_Neighborhood_NPkVill,v_Neighborhood_NWAmes,v_Neighborhood_NoRidge,v_Neighborhood_NridgHt,v_Neighborhood_OldTown,v_Neighborhood_SWISU,v_Neighborhood_Sawyer,v_Neighborhood_SawyerW,v_Neighborhood_Somerst,v_Neighborhood_StoneBr,v_Neighborhood_Timber,v_Neighborhood_Veenker,v_MS_Zoning_A (agr),v_MS_Zoning_C (all),v_MS_Zoning_FV,v_MS_Zoning_I (all),v_MS_Zoning_RH,v_MS_Zoning_RL,v_MS_Zoning_RM,v_Land_Contour_Bnk,v_Land_Contour_HLS,v_Land_Contour_Low,v_Land_Contour_Lvl,v_Bldg_Type_1Fam,v_Bldg_Type_2fmCon,v_Bldg_Type_Duplex,v_Bldg_Type_Twnhs,v_Bldg_Type_TwnhsE,v_House_Style_1.5Fin,v_House_Style_1.5Unf,v_House_Style_1Story,v_House_Style_2.5Fin,v_House_Style_2.5Unf,v_House_Style_2Story,v_House_Style_SFoyer,v_House_Style_SLvl,v_Exter_Qual_Ex,v_Exter_Qual_Fa,v_Exter_Qual_Gd,v_Exter_Qual_TA,v_Heating_QC_Ex,v_Heating_QC_Fa,v_Heating_QC_Gd,v_Heating_QC_Po,v_Heating_QC_TA,v_Central_Air_N,v_Central_Air_Y,v_Kitchen_Qual_Ex,v_Kitchen_Qual_Fa,v_Kitchen_Qual_Gd,v_Kitchen_Qual_TA,v_Paved_Drive_N,v_Paved_Drive_P,v_Paved_Drive_Y,v_Sale_Condition_Abnorml,v_Sale_Condition_AdjLand,v_Sale_Condition_Alloca,v_Sale_Condition_Family,v_Sale_Condition_Normal,v_Sale_Condition_Partial
0,1.0,-0.887124,1.730884,0.460549,1.346426,-0.522693,1.181307,1.148564,1.801615,2.104651,-0.290447,-0.585803,1.497042,1.383238,-0.785375,-0.099745,0.388256,1.136066,-0.254966,0.784149,-0.758515,-1.046560,-0.194053,-0.294952,0.630147,1.148173,1.612394,1.812309,-0.728086,0.751910,-0.351787,-0.100391,-0.286425,-0.07752,-0.085328,-1.978953,1.249543,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,-0.887124,1.320007,0.308958,1.346426,-0.522693,1.181307,1.100560,1.074734,1.256717,-0.290447,0.005941,1.205097,1.060692,-0.785375,-0.099745,0.144275,1.136066,-0.254966,0.784149,-0.758515,0.161871,-0.194053,0.339047,-0.927961,1.148173,1.612394,1.992051,0.917742,0.652306,-0.351787,-0.100391,-0.286425,-0.07752,-0.085328,-1.978953,1.249543,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,-0.887124,2.050454,0.577020,2.773740,-0.522693,1.181307,1.148564,3.862921,2.620404,-0.290447,-0.285379,2.336384,2.325676,-0.785375,-0.099745,1.101139,1.136066,-0.254966,0.784149,-0.758515,-1.046560,-0.194053,0.339047,0.630147,1.148173,1.612394,3.443813,0.752371,-0.059153,-0.351787,-0.100391,-0.286425,-0.07752,-0.085328,-0.157122,1.249543,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,-0.887124,2.598290,0.360553,-0.794546,-0.522693,-0.672866,-1.587616,-0.563462,-0.954904,-0.290447,-1.291344,-2.403163,0.163612,-0.785375,-0.099745,-0.534299,-0.805047,-0.254966,-1.025641,1.247314,-1.046560,-0.194053,0.339047,0.630147,-1.091228,0.301817,-0.335376,0.642125,-0.357965,1.487787,-0.100391,3.732367,-0.07752,-0.085328,-0.157122,0.001928,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,-0.887124,1.228701,0.275754,1.346426,-0.522693,1.214417,1.148564,1.747370,1.964785,-0.290447,0.065116,2.003384,1.942653,-0.785375,-0.099745,0.811411,1.136066,-0.254966,0.784149,-0.758515,-1.046560,-0.194053,0.339047,0.630147,1.188162,1.612394,1.388303,-0.728086,0.239660,-0.351787,13.464342,-0.286425,-0.07752,-0.085328,1.664709,1.249543,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1936,1.0,3.072354,0.452602,0.360808,-0.794546,0.397114,0.022449,-0.579550,0.217664,1.147447,-0.290447,-0.856639,0.226624,0.080456,-0.785375,-0.099745,-0.597200,1.136066,-0.254966,0.784149,-0.758515,-1.046560,-0.194053,-0.294952,-0.927961,-0.251453,0.301817,0.130109,0.374382,1.477598,-0.351787,-0.100391,-0.286425,-0.07752,-0.085328,0.207244,-1.245686,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1937,1.0,3.072354,-0.049581,-0.409024,-0.794546,2.236728,-1.831724,0.524523,-0.563462,-0.954904,-0.290447,0.269950,-0.838519,-0.536916,1.472012,-0.099745,0.803787,-0.805047,-0.254966,0.784149,-0.758515,2.578735,4.761963,2.241044,-0.927961,0.068462,-2.319335,-2.178883,-0.728086,-0.699465,3.143403,-0.100391,-0.286425,-0.07752,-0.085328,0.207244,-1.245686,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1938,1.0,3.072354,-0.414804,0.065800,-0.794546,-1.442500,-2.361487,-0.675556,-0.563462,0.496199,-0.290447,-0.631322,-0.227259,1.524351,0.707264,-0.099745,1.730154,-0.805047,-0.254966,0.784149,-0.758515,0.161871,4.761963,2.241044,0.630147,-0.331431,0.301817,0.217676,0.390132,-0.699465,-0.351787,-0.100391,-0.286425,-0.07752,-0.085328,-0.521489,-1.245686,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1939,1.0,3.072354,-0.643069,-0.587178,-0.794546,0.397114,-1.964165,0.764538,-0.563462,-0.495971,-0.290447,0.005941,-0.624122,-0.567155,1.011320,-0.099745,0.399693,1.136066,-0.254966,0.784149,-0.758515,3.787166,-0.194053,1.607045,-0.927961,0.068462,-2.319335,-2.178883,-0.728086,1.918702,-0.351787,-0.100391,-0.286425,-0.07752,-0.085328,-1.250221,1.249543,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [13]:
# pipeline
pipe3 = Pipeline(
    [
        ('preproc',preproc_pipe),
        ('feature_select',SelectKBest(k=50)),
        ('clf',HistGradientBoostingRegressor(max_iter=100,max_bins=100, min_samples_leaf=10,max_leaf_nodes=20,learning_rate=0.081579))
    ])
pipe3

In [14]:
pipe3.get_params()

{'memory': None,
 'steps': [('preproc',
   ColumnTransformer(transformers=[('numer',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(strategy='median')),
                                                    ('scaler', StandardScaler()),
                                                    ('polynomial',
                                                     PolynomialFeatures(degree=1))]),
                                    ['v_MS_SubClass', 'v_Lot_Frontage',
                                     'v_Lot_Area', 'v_Overall_Qual',
                                     'v_Overall_Cond', 'v_Year_Built',
                                     'v_Year_Remod/Add', 'v_Mas_Vnr_Area',
                                     'v_BsmtFin_SF_1', 'v_BsmtFin_SF_2',
                                     'v_Bsmt_...
                                     'v_Garage_Yr_Blt', 'v_Garage_Cars',
                                     'v_Garage

In [20]:
param_grid3 = [
    {'feature_select__k': [100,117]}]
grid_search3 = GridSearchCV(estimator = pipe3, 
                           param_grid = param_grid3,
                           cv = 5, 
                           scoring='r2'
                           )

In [21]:
results3 = grid_search3.fit(X_train, y_train)
pd.DataFrame(results3.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_feature_select__k,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,4.145578,0.326793,0.031756,0.000942,100,{'feature_select__k': 100},0.893739,0.879664,0.899199,0.871873,0.842838,0.877463,0.019861,1
1,3.07476,1.526707,0.026911,0.014309,117,{'feature_select__k': 117},0.89288,0.885878,0.90277,0.871708,,,,2


In [7]:
scores = cross_validate(pipe3,X_train,y_train,scoring='r2',cv=5)
scores['test_score'].mean()

0.850517812712021

In [None]:
grid_search3.best_score_