In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
pd.set_option('display.max_columns', None)

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector

from category_encoders.ordinal import OrdinalEncoder

from sklearn import set_config
set_config(transform_output='pandas')

In [2]:
houses = pd.read_csv('train2.csv')
houses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1455 entries, 0 to 1454
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1455 non-null   int64  
 1   MSSubClass     1455 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1198 non-null   float64
 4   LotArea        1455 non-null   int64  
 5   Street         1455 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1455 non-null   object 
 8   LandContour    1455 non-null   object 
 9   Utilities      1455 non-null   object 
 10  LotConfig      1455 non-null   object 
 11  LandSlope      1455 non-null   object 
 12  Neighborhood   1455 non-null   object 
 13  Condition1     1455 non-null   object 
 14  Condition2     1455 non-null   object 
 15  BldgType       1455 non-null   object 
 16  HouseStyle     1455 non-null   object 
 17  OverallQual    1455 non-null   int64  
 18  OverallC

In [3]:
houses['GarageYrBlt'] = houses['GarageYrBlt'].astype(str)
houses['MSSubClass'] = houses['MSSubClass'].astype(str)

In [4]:
houses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1455 entries, 0 to 1454
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1455 non-null   int64  
 1   MSSubClass     1455 non-null   object 
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1198 non-null   float64
 4   LotArea        1455 non-null   int64  
 5   Street         1455 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1455 non-null   object 
 8   LandContour    1455 non-null   object 
 9   Utilities      1455 non-null   object 
 10  LotConfig      1455 non-null   object 
 11  LandSlope      1455 non-null   object 
 12  Neighborhood   1455 non-null   object 
 13  Condition1     1455 non-null   object 
 14  Condition2     1455 non-null   object 
 15  BldgType       1455 non-null   object 
 16  HouseStyle     1455 non-null   object 
 17  OverallQual    1455 non-null   int64  
 18  OverallC

In [5]:
X = houses.drop(columns='SalePrice')
y = houses['SalePrice'].copy()

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Train set size: {X_train.shape}")
print(f"Validation set size: {X_val.shape}")

Train set size: (1164, 80)
Validation set size: (291, 80)


In [6]:
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
freq_imputer = SimpleImputer(strategy='most_frequent', add_indicator=True)
median_imputer = SimpleImputer(strategy='median')
na_imputer = SimpleImputer(strategy='constant', fill_value='NA')
zero_imputer = SimpleImputer(strategy='constant', fill_value= 0)
scaler = StandardScaler()


In [7]:
# Ordinal Map

dict_ex_to_po = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
dict_func = {'Typ':8,	'Min1':7,	'Min2':6,	'Mod':5, 'Maj1':4, 'Maj2':3, 'Sev':2,	'Sal':1}

ordinal_map = [
  {'col': 'ExterQual','mapping':dict_ex_to_po},
  {'col': 'ExterCond','mapping':dict_ex_to_po},
  {'col': 'HeatingQC','mapping':dict_ex_to_po},
  {'col': 'KitchenQual','mapping':dict_ex_to_po},
  {'col': 'Functional','mapping':dict_func}
]

ord = OrdinalEncoder(mapping = ordinal_map)

In [8]:
na_cols = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType',
               'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MasVnrType', 'MiscFeature',
               'MSSubClass','MSZoning','Street','LotShape','LandContour','Utilities','LotConfig','LandSlope','Neighborhood','Condition1',
               'Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','Foundation','Heating','CentralAir',
               'PavedDrive','SaleType','SaleCondition'
               ]

mode_cols = ['Electrical']

zero_cols = ['MasVnrArea']

median_cols = ['LotFrontage']

ohe_cols = na_cols + mode_cols

ord_cols = ['ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual', 'Functional']

num_remove = ['Id', 'MSSubClass', 'SalePrice', 'GarageYrBlt']

nums_cols = list(houses.select_dtypes('number').columns.difference(num_remove))

scale_cols = ord_cols + nums_cols

In [9]:
ohe_df = houses[['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType',
               'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MasVnrType', 'MiscFeature',
               'MSSubClass','MSZoning','Street','LotShape','LandContour','Utilities','LotConfig','LandSlope','Neighborhood','Condition1',
               'Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','Foundation','Heating','CentralAir',
               'PavedDrive','SaleType','SaleCondition'
               ]
]

In [10]:
ohe_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1455 entries, 0 to 1454
Data columns (total 39 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Alley          91 non-null     object
 1   BsmtQual       1418 non-null   object
 2   BsmtCond       1418 non-null   object
 3   BsmtExposure   1418 non-null   object
 4   BsmtFinType1   1418 non-null   object
 5   BsmtFinType2   1418 non-null   object
 6   FireplaceQu    767 non-null    object
 7   GarageType     1374 non-null   object
 8   GarageYrBlt    1455 non-null   object
 9   GarageFinish   1374 non-null   object
 10  GarageQual     1374 non-null   object
 11  GarageCond     1374 non-null   object
 12  PoolQC         7 non-null      object
 13  Fence          281 non-null    object
 14  MasVnrType     588 non-null    object
 15  MiscFeature    53 non-null     object
 16  MSSubClass     1455 non-null   object
 17  MSZoning       1455 non-null   object
 18  Street         1455 non-null

In [11]:
len(houses.select_dtypes('number').columns)

36

In [12]:
len(nums_cols)


34

In [13]:
len(scale_cols)

39

In [14]:
len(na_cols)

39

In [15]:
na_pipe = Pipeline ([
  ('NA Imputer', na_imputer),
  ('OHE', ohe)
])

mode_pipe = Pipeline([
  ('Freq Imputer', freq_imputer),
  ('OHE', ohe)
])

# ord_pipe = Pipeline([
#   ('Ordinal Encode', ord)
#   ('Standard Scaler', StandardScaler())
# ])

na_tuple = ('NA Fill', na_pipe, na_cols)

mode_tuple = ('Mode Fill', mode_pipe, mode_cols)

ord_tuple = ('Ord Encode', ord, ord_cols)

In [16]:
# Convert garage year to string for NA imputation

def garage_yr_convert(df):
  df= df.copy()
  df['GarageYrBlt'] = df['GarageYrBlt'].astype(str)
  return df

convert_garage = FunctionTransformer(garage_yr_convert)

def ms_class_convert(df):
  df= df.copy()
  df['MSSubClass'] = df['MSSubClass'].astype(str)
  return df

convert_ms_sub = FunctionTransformer(ms_class_convert)

In [17]:
# Function to add calculated columns

def calc_cols(df):
  df = df.copy()
  df['BedBath'] = (df['Full Bath'] + df['HalfBath']) / df['BedroomAbvGr']
  df['AreaRatio'] = df['GrLivArea'] / df['LotArea']
  return df

add_cols = FunctionTransformer(calc_cols)

In [18]:
preprocessor = ColumnTransformer([
  # ('convert_garage', convert_garage, ['GarageYrBlt']),
  # ('convert_ms_sub', convert_ms_sub, ['MSSubClass']),
  ('median_impute', median_imputer, median_cols),
  ('zero_imputer', zero_imputer, zero_cols),
  na_tuple, mode_tuple, ord_tuple,
  ('standard_scaler', StandardScaler(), scale_cols)
],remainder='drop', verbose_feature_names_out=False)

In [19]:
main_pipe = Pipeline([
  ('preprocessor', preprocessor),
  ('add_cols', add_cols)
])

In [20]:
ord.fit(houses)

In [21]:
# processed = ord.transform(houses)

In [22]:
# processed['ExterQual'].value_counts()

In [23]:
# ord.fit(X_train)

In [24]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1164 entries, 720 to 1126
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1164 non-null   int64  
 1   MSSubClass     1164 non-null   object 
 2   MSZoning       1164 non-null   object 
 3   LotFrontage    948 non-null    float64
 4   LotArea        1164 non-null   int64  
 5   Street         1164 non-null   object 
 6   Alley          74 non-null     object 
 7   LotShape       1164 non-null   object 
 8   LandContour    1164 non-null   object 
 9   Utilities      1164 non-null   object 
 10  LotConfig      1164 non-null   object 
 11  LandSlope      1164 non-null   object 
 12  Neighborhood   1164 non-null   object 
 13  Condition1     1164 non-null   object 
 14  Condition2     1164 non-null   object 
 15  BldgType       1164 non-null   object 
 16  HouseStyle     1164 non-null   object 
 17  OverallQual    1164 non-null   int64  
 18  OverallCond

In [25]:
main_pipe.fit(X_train)

ValueError: X does not contain the columns listed in cols