In [148]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
import joblib
# Show every column when a wide DataFrame is displayed (None = no limit).
# `set_option` is part of the public top-level pandas namespace; reaching it
# through `pd.pandas` only works as a side effect of how the package imports
# itself, so call it directly.
pd.set_option('display.max_columns', None)

In [149]:
# Load the training data and report its dimensions before previewing it.
df = pd.read_csv('data/train.csv')
print('shape:', df.shape)
print('')
# The last expression is rendered as the cell output (first rows of the frame).
df.head()

shape: (1460, 81)



Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


# Separate dataset into train and test

Always perform train-test split before applying scalers and encoders.

Here's why:

Data Leakage: If you apply these transformations before splitting, you're essentially using information from the test data to transform the training data. This leads to:

Overoptimistic performance: Your model will appear to perform better than it actually would on unseen data.

Poor generalization: The model will not be able to accurately predict on new, unseen data.

Real-world scenario: In a real-world setting, you won't have access to the test data during the model training and feature engineering phase.

Our feature engineering techniques will learn:

- mean
- mode
- exponents for the yeo-johnson
- category frequency
- and category to number mappings

from the train set.

**Separating the data into train and test involves randomness, therefore, we need to set the seed.**

In [150]:
# Hold out 10% of the rows for evaluation; the fixed random_state makes the
# split reproducible. The 'Id' column and the target 'SalePrice' are removed
# from the feature matrix, and 'SalePrice' alone is passed as the target.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Id', 'SalePrice']),
    df['SalePrice'],
    random_state=0,
    test_size=0.1,
)

X_train.shape, X_test.shape

((1314, 79), (146, 79))

# Feature Engineering Goals

In the following cells, we will engineer the variables of the House Price Dataset so that we tackle:

1. Missing values
2. Temporal variables
3. Non-Gaussian distributed variables
4. Categorical variables: remove rare labels
5. Categorical variables: convert strings to numbers
6. Put the variables on a similar scale

<h2> 1. Target </h2>

In [151]:
# Work with the natural log of the target; both splits get the same transform.
y_train, y_test = np.log(y_train), np.log(y_test)

------------------

<h2> 2. Missing Values </h2>



<h3> Categorical variables: </h3>
    



We will replace missing values with the string "missing" for variables with a high number of missing entries.  

Alternatively, for variables with fewer missing values, we will use the most frequent category (mode) to maintain consistency.  

A third option is to create a "missing" category only if missing values carry predictive information, otherwise, we might drop the variable or use advanced techniques like KNN or regression imputation.  


In [152]:
# Every column stored with object dtype is categorical. MSSubClass is stored
# as a number in the raw file but is appended to the list and treated as
# categorical as well.
categorical_vars = list(df.columns[(df.dtypes == 'O').to_numpy()]) + ['MSSubClass']

# Cast the categorical columns to object in both splits so they are handled
# uniformly by the steps below.
X_train[categorical_vars] = X_train[categorical_vars].astype('O')
X_test[categorical_vars] = X_test[categorical_vars].astype('O')

len(categorical_vars)

44

In [153]:
# make a list of the categorical variables that contain missing values
# Categorical variables that have at least one missing value in the train set.
categorical_vars_with_na = [
    var for var in categorical_vars
    if X_train[var].isnull().any()
]

# Fraction of missing values per variable, largest first.
X_train[categorical_vars_with_na].isnull().mean().sort_values(ascending=False)

PoolQC          0.995434
MiscFeature     0.961187
Alley           0.938356
Fence           0.814307
FireplaceQu     0.472603
GarageType      0.056317
GarageFinish    0.056317
GarageQual      0.056317
GarageCond      0.056317
BsmtExposure    0.025114
BsmtFinType2    0.025114
BsmtQual        0.024353
BsmtCond        0.024353
BsmtFinType1    0.024353
MasVnrType      0.004566
Electrical      0.000761
dtype: float64

In [154]:
# variables to impute with the string "missing'
# variables to impute with the string "Missing": more than 10% of the train
# values are absent
with_string_missing = [
    var for var in categorical_vars_with_na if X_train[var].isnull().mean() > 0.1]

# variables to impute with the most frequent category: at most 10% absent.
# `<=` (rather than `<`) guarantees that a variable with exactly 10% missing
# values lands in one of the two lists instead of silently staying un-imputed.
with_frequent_category = [
    var for var in categorical_vars_with_na if X_train[var].isnull().mean() <= 0.1]

In [155]:
# High-missingness variables: absence becomes its own category.
X_train[with_string_missing] = X_train[with_string_missing].fillna('Missing')
X_test[with_string_missing] = X_test[with_string_missing].fillna('Missing')


# Low-missingness variables: fill with the most frequent category, learned
# from the train set only and then applied to both splits.
for var in with_frequent_category:

    mode = X_train[var].mode()[0]

    print(var,':', mode)

    # Assign the result back instead of calling fillna(..., inplace=True) on
    # the selected column: the in-place form acts on a temporary object, is
    # deprecated, and leaves the frame unchanged under pandas Copy-on-Write.
    X_train[var] = X_train[var].fillna(mode)
    X_test[var] = X_test[var].fillna(mode)

MasVnrType : None
BsmtQual : TA
BsmtCond : TA
BsmtExposure : No
BsmtFinType1 : Unf
BsmtFinType2 : Unf
Electrical : SBrkr
GarageType : Attchd
GarageFinish : Unf
GarageQual : TA
GarageCond : TA


<h2> Numerical variables: </h2>

To engineer missing values in numerical variables, we will:  

Primary Strategy (Chosen Approach):
- Add a **binary missing indicator variable** (`var_na`) to capture whether a value was missing.  
- Replace the missing values in the original variable with the **mean** of the column.  

<h3> Alternative Strategies: </h3>
    
 Median Imputation:  
- Useful when the data has outliers, as the **median** is less sensitive to extreme values.  

 K-Nearest Neighbors (KNN) Imputation: 
- Predicts missing values based on the nearest neighbors of each sample.  
- More complex but can preserve local relationships in the data.  

 Regression Imputation: 
- Uses a regression model to predict missing values based on other features.  
- Can be useful when missingness is related to other variables.  

<h3> Choosing the Right Method: </h3>
    
- If missingness is random and low, simple imputation (mean/median) works well.  
- If missingness is not random more advanced methods like KNN or regression might be needed.  
- If data has a pattern of missing values adding a **binary missing indicator** helps capture that information.  


In [156]:
# Every feature that is neither categorical nor the target is numerical.
numerical_vars = [
    var for var in X_train.columns
    if not (var in categorical_vars or var == 'SalePrice')
]

len(numerical_vars)

35

In [157]:
# Numerical variables that have at least one missing value in the train set.
numerical_vars_with_na = [
    var for var in numerical_vars
    if X_train[var].isnull().any()
]

# Fraction of missing values per variable.
X_train[numerical_vars_with_na].isnull().mean()

LotFrontage    0.177321
MasVnrArea     0.004566
GarageYrBlt    0.056317
dtype: float64

In [158]:
for var in numerical_vars_with_na:

    # the replacement value is learned from the train set only
    mean_val = X_train[var].mean()

    print(var,':', mean_val)

    # add binary missing indicator (in train and test); this must happen
    # before the fill below, while the NaNs are still present
    X_train[var + '_na'] = np.where(X_train[var].isnull(), 1, 0)
    X_test[var + '_na'] = np.where(X_test[var].isnull(), 1, 0)

    # Assign the result back instead of calling fillna(..., inplace=True) on
    # the selected column: the in-place form acts on a temporary object, is
    # deprecated, and leaves the frame unchanged under pandas Copy-on-Write.
    X_train[var] = X_train[var].fillna(mean_val)
    X_test[var] = X_test[var].fillna(mean_val)

LotFrontage : 69.87974098057354
MasVnrArea : 103.7974006116208
GarageYrBlt : 1978.2959677419356


In [159]:
# NOTE(review): disabled KNN-imputation experiment, kept for reference only.
# As written it would assign the imputed values to the `<var>_na` column,
# overwriting the binary missing indicator created in the previous cell,
# and it only touches X_train (X_test is never transformed).
# knn_imputer = KNNImputer(n_neighbors=5)
# for var in numerical_vars_with_na:
    
#     X_train[var + '_na'] = knn_imputer.fit_transform(X_train[[var]])

------------------

<h2> 3. Handle Temporal Variables </h2>

We have 4 year variables in the dataset:

- YearBuilt: year in which the house was built
- YearRemodAdd: year in which the house was remodeled
- GarageYrBlt: year in which a garage was built
- YrSold: year in which the house was sold

We generally don't use date variables in their raw format. Instead, we extract information from them. 

For example, we can capture the difference in years between the year the house was built and the year the house was sold.

In [160]:
def elapsed_years(df, var):
    """Replace the year column `var` with the years elapsed until the sale.

    The column is overwritten with ``df['YrSold'] - df[var]``. The frame is
    modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'YrSold' column and the column named by `var`.
    var : str
        Name of the year column to convert.

    Returns
    -------
    pandas.DataFrame
        The input frame `df` (not a copy).
    """
    year_sold = df['YrSold']
    df[var] = year_sold - df[var]
    return df

In [161]:
# Convert each year column into "years before the sale" in both splits.
# YrSold itself is still needed here, so it is dropped only afterwards.
for var in ('YearBuilt', 'YearRemodAdd', 'GarageYrBlt'):
    X_train = elapsed_years(df=X_train, var=var)
    X_test = elapsed_years(df=X_test, var=var)

In [162]:
# now we drop YrSold
X_train.drop(['YrSold'], axis=1, inplace=True)
X_test.drop(['YrSold'], axis=1, inplace=True)

-------------

## Numerical variable transformation

### Logarithmic transformation

In the previous notebook, we observed that the numerical variables are not normally distributed.

### Yeo-Johnson transformation

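The Yeo-Johnson transformation was considered (the code is kept below, commented out, for reference). The transformation that is actually applied is `log1p`, i.e. log(x + 1), on LotArea, LotFrontage, 1stFlrSF, GrLivArea and TotalBsmtSF.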

In [163]:
# yj_transform_vars = ["LotArea", "LotFrontage", "1stFlrSF", "GrLivArea", "TotalBsmtSF"]

# for var in yj_transform_vars:
#     # the yeo-johnson transformation learns the best exponent to transform the variable it needs to learn it from the train set: 
#     X_train[var], param = stats.yeojohnson(X_train[var])

#     # and then apply the transformation to the test set with the same parameter
#     X_test[var] = stats.yeojohnson(X_test[var], lmbda=param)


log_transform_vars = ["LotArea", "LotFrontage", "1stFlrSF", "GrLivArea", "TotalBsmtSF"]

# log1p computes log(x + 1), so zero values stay finite. The transform has no
# learned parameters, so it is applied to each split independently, all
# selected columns at once.
X_train[log_transform_vars] = np.log1p(X_train[log_transform_vars])
X_test[log_transform_vars] = np.log1p(X_test[log_transform_vars])


In [164]:
# Inspect the log-transformed columns of the test split.
X_test.loc[:, log_transform_vars]

Unnamed: 0,LotArea,LotFrontage,1stFlrSF,GrLivArea,TotalBsmtSF
529,10.394182,4.260985,7.830426,7.830426,7.618742
491,9.158099,4.382027,6.865891,7.364547,6.693324
459,8.855949,4.260985,6.887553,7.093405,6.565265
279,9.210940,4.430817,7.053586,7.612337,7.057037
655,7.427144,3.091042,6.265301,6.996681,6.265301
...,...,...,...,...,...
1452,8.209580,3.583519,6.978214,6.978214,6.306275
113,9.952325,4.260985,7.723120,7.723120,7.501082
1282,9.082621,4.127134,6.947937,6.947937,6.947937
1163,9.465060,4.110874,7.138073,7.138073,7.089243


--------------

<h2> Binarize skewed variables </h2>

A few variables are very skewed; we will transform those into binary variables.

<h3> Handling Highly Skewed Features </h3>  

- **Transformation:** Apply log, square root, or Yeo-Johnson transformation to reduce skewness and normalize distributions.  
- **Binarization:** Convert the feature into binary (e.g., 0 vs 1) if it has a strong separation between low and high values.  
- **Discretization:** Group the feature into bins (e.g., quartiles) to make it more interpretable and stable.  
- **Dropping:** If the feature is highly skewed and not informative, removing it might be a valid choice.  
- **Feature Engineering:** Consider creating a new feature that captures meaningful information from the original.  



In [165]:
skewed_variables = [
    'BsmtFinSF2', 'LowQualFinSF', 'EnclosedPorch',
    '3SsnPorch', 'ScreenPorch', 'MiscVal',
]

# Collapse each variable to a flag: 0 where the value is exactly zero,
# 1 for anything else.
for var in skewed_variables:
    X_train[var] = (X_train[var] != 0).astype(int)
    X_test[var] = (X_test[var] != 0).astype(int)

------------------

<h2> Categorical Mappings - Encodings </h2>


- **Many categories?** → Use **Ordinal Encoding** (or **Target Encoding** for supervised problems).  
- **Few categories?** → Use **One-Hot/Dummy Encoding**.  
- **Does the variable have a natural order?** → Use **Ordinal Encoding**.  
- **Using linear models?** → Be careful with **Ordinal Encoding**, as it may introduce false relationships.  
- **Need to keep dimensions low?** → Use **Frequency Encoding**.  
- **Want to leverage target variable?** → Use **Target Encoding** (⚠️ with caution to avoid data leakage).  


In [166]:
# re-map strings to numbers, which determine quality

# Quality labels mapped to numbers; a missing/absent feature maps to 0.
qual_mappings = {
    'Missing': 0, 'NA': 0,
    'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5,
}

qual_vars = [
    'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
    'HeatingQC', 'KitchenQual', 'FireplaceQu',
    'GarageQual', 'GarageCond',
]

# Map every quality column through the same dictionary, column by column.
X_train[qual_vars] = X_train[qual_vars].apply(lambda col: col.map(qual_mappings))
X_test[qual_vars] = X_test[qual_vars].apply(lambda col: col.map(qual_mappings))

In [167]:
# Basement exposure labels mapped to numbers. 'Missing' and 'NA' are included
# (mapped to 0) for consistency with the other mapping dictionaries in this
# notebook: Series.map turns any label absent from the dictionary into NaN,
# so without these keys such labels would silently become missing values.
exposure_mappings = {'Missing': 0, 'NA': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}

var = 'BsmtExposure'

X_train[var] = X_train[var].map(exposure_mappings)
X_test[var] = X_test[var].map(exposure_mappings)

In [168]:
# Basement finish labels mapped to numbers; a missing/absent basement maps to 0.
finish_mappings = {
    'Missing': 0, 'NA': 0,
    'Unf': 1, 'LwQ': 2, 'Rec': 3,
    'BLQ': 4, 'ALQ': 5, 'GLQ': 6,
}

finish_vars = ['BsmtFinType1', 'BsmtFinType2']

# Both finish columns share the same dictionary.
X_train[finish_vars] = X_train[finish_vars].apply(lambda col: col.map(finish_mappings))
X_test[finish_vars] = X_test[finish_vars].apply(lambda col: col.map(finish_mappings))

In [169]:
# Garage finish labels mapped to numbers; a missing/absent garage maps to 0.
garage_mappings = {
    'Missing': 0, 'NA': 0,
    'Unf': 1, 'RFn': 2, 'Fin': 3,
}

var = 'GarageFinish'

X_train[var], X_test[var] = (
    X_train[var].map(garage_mappings),
    X_test[var].map(garage_mappings),
)

In [170]:
# Fence labels mapped to numbers; a missing/absent fence maps to 0.
fence_mappings = {
    'Missing': 0, 'NA': 0,
    'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4,
}

var = 'Fence'

X_train[var], X_test[var] = (
    X_train[var].map(fence_mappings),
    X_test[var].map(fence_mappings),
)

<h3 >Removing Rare Labels </h3>

For the remaining categorical variables, we will group those categories that are present in less than 1% of the observations. That is, all values of categorical variables that are shared by less than 1% of houses will be replaced by the string "Rare".


In [171]:
# capture all quality variables

# Collect every variable that was re-mapped with a hand-written dictionary.
qual_vars = [*qual_vars, *finish_vars, 'BsmtExposure', 'GarageFinish', 'Fence']

# The remaining categorical variables (those not re-mapped above) still hold
# their original labels and are handled by the following cells.
categorical_others = list(
    filter(lambda name: name not in qual_vars, categorical_vars)
)

len(categorical_others)

30

In [172]:
def find_frequent_labels(df, var, rare_perc):
    """Return the labels of `var` shared by more than `rare_perc` of the rows.

    Each label's share is its number of occurrences divided by the total
    number of rows in `df`; a label is kept only when that share is strictly
    greater than `rare_perc`. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column `var`.
    var : str
        Name of the categorical column to inspect.
    rare_perc : float
        Threshold, expressed as a fraction of the rows (e.g. 0.01 for 1%).

    Returns
    -------
    pandas.Index
        The frequent labels, named after `var`.
    """
    total_rows = len(df)
    label_share = df.groupby(var).size() / total_rows
    return label_share.loc[label_share > rare_perc].index


for var in categorical_others:

    # the frequent categories are learned from the train set only
    frequent_ls = find_frequent_labels(X_train, var, 0.01)

    print(var, frequent_ls)
    print()

    # keep frequent categories as they are; everything else (including labels
    # that only appear in the test set) becomes the string "Rare"
    X_train[var] = X_train[var].where(X_train[var].isin(frequent_ls), 'Rare')
    X_test[var] = X_test[var].where(X_test[var].isin(frequent_ls), 'Rare')

MSZoning Index(['FV', 'RH', 'RL', 'RM'], dtype='object', name='MSZoning')

Street Index(['Pave'], dtype='object', name='Street')

Alley Index(['Grvl', 'Missing', 'Pave'], dtype='object', name='Alley')

LotShape Index(['IR1', 'IR2', 'Reg'], dtype='object', name='LotShape')

LandContour Index(['Bnk', 'HLS', 'Low', 'Lvl'], dtype='object', name='LandContour')

Utilities Index(['AllPub'], dtype='object', name='Utilities')

LotConfig Index(['Corner', 'CulDSac', 'FR2', 'Inside'], dtype='object', name='LotConfig')

LandSlope Index(['Gtl', 'Mod'], dtype='object', name='LandSlope')

Neighborhood Index(['Blmngtn', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor',
       'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel', 'NAmes', 'NWAmes',
       'NoRidge', 'NridgHt', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW',
       'Somerst', 'StoneBr', 'Timber'],
      dtype='object', name='Neighborhood')

Condition1 Index(['Artery', 'Feedr', 'Norm', 'PosN', 'RRAn'], dtype='object', name='Condition1')

Con

<h2> Ordinal Encoding </h2>

In [173]:
# this function will assign discrete values to the strings of the variables,
# so that the smaller value corresponds to the category that shows the smaller
# mean house sale price

def replace_categories(train, test, y_train, var, target):
    """Ordinal-encode `var` in `train` and `test`, ordered by mean target.

    The categories of `var` are ranked by the mean of `target` observed in the
    training data, from lowest (encoded as 0) to highest, and the resulting
    mapping is applied to both frames in place. The mapping is printed.

    Parameters
    ----------
    train : pandas.DataFrame
        Training features; column `var` is overwritten with integer codes.
    test : pandas.DataFrame
        Test features; column `var` is overwritten using the mapping learned
        from `train`. Labels not seen in `train` become NaN.
    y_train : pandas.Series or pandas.DataFrame
        Training target, index-aligned with `train`, whose name (or column)
        is `target`.
    var : str
        Name of the categorical column to encode.
    target : str
        Name of the target column in `y_train`.

    Returns
    -------
    None
    """
    # Pair the variable with the target using the `train` ARGUMENT. The
    # previous version read the notebook-level global `X_train` here, so the
    # function ignored the frame it was given and failed outside the notebook.
    tmp = pd.concat([train[var], y_train], axis=1)

    # order the categories in a variable from that with the lowest
    # mean target, to that with the highest
    ordered_labels = tmp.groupby(var)[target].mean().sort_values().index

    # create a dictionary of ordered categories to integer values
    ordinal_label = {label: rank for rank, label in enumerate(ordered_labels)}

    print(var, ordinal_label)
    print()

    # use the dictionary to replace the categorical strings by integers
    train[var] = train[var].map(ordinal_label)
    test[var] = test[var].map(ordinal_label)

In [174]:
# Encode every remaining categorical variable, ordering its labels by the
# mean of the training target.
for var in categorical_others:
    replace_categories(
        train=X_train,
        test=X_test,
        y_train=y_train,
        var=var,
        target='SalePrice',
    )

MSZoning {'Rare': 0, 'RM': 1, 'RH': 2, 'RL': 3, 'FV': 4}

Street {'Rare': 0, 'Pave': 1}

Alley {'Grvl': 0, 'Pave': 1, 'Missing': 2}

LotShape {'Reg': 0, 'IR1': 1, 'Rare': 2, 'IR2': 3}

LandContour {'Bnk': 0, 'Lvl': 1, 'Low': 2, 'HLS': 3}

Utilities {'Rare': 0, 'AllPub': 1}

LotConfig {'Inside': 0, 'FR2': 1, 'Corner': 2, 'Rare': 3, 'CulDSac': 4}

LandSlope {'Gtl': 0, 'Mod': 1, 'Rare': 2}

Neighborhood {'IDOTRR': 0, 'MeadowV': 1, 'BrDale': 2, 'Edwards': 3, 'BrkSide': 4, 'OldTown': 5, 'Sawyer': 6, 'SWISU': 7, 'NAmes': 8, 'Mitchel': 9, 'SawyerW': 10, 'Rare': 11, 'NWAmes': 12, 'Gilbert': 13, 'Blmngtn': 14, 'CollgCr': 15, 'Crawfor': 16, 'ClearCr': 17, 'Somerst': 18, 'Timber': 19, 'StoneBr': 20, 'NridgHt': 21, 'NoRidge': 22}

Condition1 {'Artery': 0, 'Feedr': 1, 'Norm': 2, 'RRAn': 3, 'Rare': 4, 'PosN': 5}

Condition2 {'Rare': 0, 'Norm': 1}

BldgType {'2fmCon': 0, 'Duplex': 1, 'Twnhs': 2, '1Fam': 3, 'TwnhsE': 4}

HouseStyle {'SFoyer': 0, '1.5Fin': 1, 'Rare': 2, '1Story': 3, 'SLvl': 4, '2Story'

In [175]:
# Display the training features after imputation, transformation and encoding.
X_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,SaleType,SaleCondition,LotFrontage_na,MasVnrArea_na,GarageYrBlt_na
930,9,3,4.304065,9.096724,1,2,1,3,1,0,0,19,2,1,3,3,8,5,2,2,0,0,10,10,1,0.0,4,3,4,4,3,3,6,16,1,0,1450,7.290975,2,5,1,3,7.290975,0,0,7.290975,0,0,2,0,3,1,4,7,4,0,0,3,2.0,3,3,610,3,3,2,100,18,0,0,0,0,0,0,2,0,7,2,3,0,0,0
656,9,3,4.290459,9.211140,1,2,1,1,1,0,0,8,2,1,3,3,5,7,49,2,0,0,6,6,2,54.0,4,3,2,3,3,1,5,806,1,0,247,6.960348,2,5,1,3,6.960348,0,0,6.960348,1,0,1,1,3,1,4,5,4,0,0,3,49.0,2,1,312,3,3,2,0,0,0,0,0,0,0,3,2,0,8,2,3,0,0,0
45,11,3,4.127134,8.943637,1,2,0,1,1,0,0,21,2,1,4,3,9,5,5,5,2,0,3,2,2,412.0,5,3,4,5,3,1,6,456,1,0,1296,7.469084,2,5,1,3,7.469084,0,0,7.469084,1,0,2,0,2,1,5,6,4,1,4,3,5.0,2,2,576,3,3,2,196,82,0,0,0,0,0,0,2,0,2,2,3,0,0,0
1348,9,3,4.260985,9.692581,1,2,2,2,1,0,0,10,2,1,3,3,7,5,9,9,0,0,10,10,1,0.0,4,3,4,4,3,4,6,1443,1,0,39,7.301822,2,5,1,3,7.309881,0,0,7.309881,1,0,2,0,3,1,4,5,4,1,2,3,9.0,2,2,514,3,3,2,402,25,0,0,0,0,0,0,2,0,8,2,3,1,0,0
55,9,3,4.615121,9.227787,1,2,1,1,1,0,0,8,2,1,3,3,6,5,44,44,0,0,6,7,2,272.0,3,3,2,3,3,1,4,490,1,0,935,7.262629,2,4,1,3,7.262629,0,0,7.262629,0,0,2,0,3,1,3,7,4,1,4,3,44.0,2,2,576,3,3,2,0,0,0,1,0,0,0,0,2,0,7,2,3,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,12,3,4.418841,9.151757,1,2,0,1,1,0,0,22,2,1,3,5,8,5,10,10,0,0,10,10,2,673.0,4,3,4,4,3,2,6,1163,1,0,89,7.133296,2,5,1,3,7.145984,1097,0,7.768956,1,0,2,1,3,1,4,8,4,1,4,3,10.0,2,3,856,3,3,2,0,128,0,0,1,0,0,0,2,0,7,2,3,0,0,0
835,9,3,4.110874,9.169623,1,2,0,1,1,0,0,6,2,1,3,3,4,7,60,15,0,0,10,6,1,0.0,3,3,2,4,3,1,4,442,1,0,625,6.973543,2,3,1,3,6.973543,0,0,6.973543,0,0,2,0,2,1,4,4,1,0,0,3,14.0,1,2,436,3,3,2,290,0,0,0,0,0,0,0,2,0,2,2,3,0,0,0
1216,3,1,4.234107,9.097284,1,2,0,1,1,0,0,6,4,1,1,1,6,5,32,32,0,0,10,10,1,0.0,3,3,0,3,3,1,1,0,1,0,0,0.000000,2,3,1,3,7.184629,584,0,7.551187,0,0,2,0,4,2,3,8,4,0,0,3,32.0,1,2,539,3,3,2,0,0,0,0,0,0,0,0,2,0,4,2,3,0,0,0
559,11,3,4.260985,8.069968,1,2,0,1,1,0,0,14,2,1,4,3,7,5,3,2,0,0,10,10,2,18.0,4,3,4,4,3,4,1,0,1,0,1374,7.226209,2,5,1,3,7.351158,0,0,7.351158,0,0,2,0,2,1,4,7,4,1,3,3,3.0,3,2,420,3,3,2,143,20,0,0,0,0,0,0,2,0,10,2,3,1,0,0


In [176]:
path = 'preprocessed_data/'

# Persist the unscaled, fully encoded splits. index=False leaves the row
# labels out of the files.
for dataset, filename in [
    (X_train, 'xtrain_preprocessed.csv'),
    (X_test, 'xtest_preprocessed.csv'),
    (y_train, 'ytrain_preprocessed.csv'),
    (y_test, 'ytest_preprocessed.csv'),
]:
    dataset.to_csv(path + filename, index=False)

--------------------

<h2> Feature Scaling </h2>  

1. Min-Max Scaling (Normalization)  
Scales features to a fixed range, usually [0,1]. Suitable for neural networks and distance-based models.  


2. Standardization (Z-score Normalization)  
Centers data around zero with unit variance. Works well with linear models and PCA.  


In [177]:
# Scale every feature to the [0, 1] range. (A StandardScaler instance used to
# be created here and was immediately overwritten; that dead assignment has
# been removed.)
scaler = MinMaxScaler(feature_range=(0,1))

# Fit the scaler to the train set only, so no test information leaks in
scaler.fit(X_train)

# The transformed values are wrapped back into DataFrames with the original
# column names. No index is passed, so both frames get a fresh 0..n-1 index.
X_train = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns
)

X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_train.columns
)

In [178]:
# Preview the scaled training features.
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,SaleType,SaleCondition,LotFrontage_na,MasVnrArea_na,GarageYrBlt_na
0,0.75,0.75,0.456307,0.376975,1.0,1.0,0.333333,1.0,1.0,0.0,0.0,0.863636,0.4,1.0,0.75,0.6,0.777778,0.5,0.014706,0.04918,0.0,0.0,1.0,1.0,0.333333,0.0,0.666667,0.5,1.0,0.666667,0.666667,0.666667,1.0,0.002835,0.0,0.0,0.673479,0.836328,1.0,1.0,1.0,1.0,0.559475,0.0,0.0,0.522954,0.0,0.0,0.666667,0.0,0.375,0.333333,0.666667,0.416667,1.0,0.0,0.0,0.75,0.018692,1.0,0.75,0.430183,0.5,0.5,1.0,0.116686,0.032907,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.545455,0.666667,0.75,0.0,0.0,0.0
1,0.75,0.75,0.451188,0.399372,1.0,1.0,0.333333,0.333333,1.0,0.0,0.0,0.363636,0.4,1.0,0.75,0.6,0.444444,0.75,0.360294,0.04918,0.0,0.0,0.6,0.6,0.666667,0.03375,0.666667,0.5,0.5,0.333333,0.666667,0.0,0.8,0.142807,0.0,0.0,0.114724,0.798402,1.0,1.0,1.0,1.0,0.434223,0.0,0.0,0.405878,0.333333,0.0,0.333333,0.5,0.375,0.333333,0.666667,0.25,1.0,0.0,0.0,0.75,0.457944,0.5,0.25,0.220028,0.5,0.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75,1.0,0.0,0.636364,0.666667,0.75,0.0,0.0,0.0
2,0.916667,0.75,0.38975,0.347009,1.0,1.0,0.0,0.333333,1.0,0.0,0.0,0.954545,0.4,1.0,1.0,0.6,0.888889,0.5,0.036765,0.098361,1.0,0.0,0.3,0.2,0.666667,0.2575,1.0,0.5,1.0,1.0,0.666667,0.0,1.0,0.080794,0.0,0.0,0.601951,0.856758,1.0,1.0,1.0,1.0,0.626948,0.0,0.0,0.586023,0.333333,0.0,0.666667,0.0,0.25,0.333333,1.0,0.333333,1.0,0.333333,0.8,0.75,0.046729,0.5,0.5,0.406206,0.5,0.5,1.0,0.228705,0.149909,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.090909,0.666667,0.75,0.0,0.0,0.0
3,0.75,0.75,0.440101,0.493613,1.0,1.0,0.666667,0.666667,1.0,0.0,0.0,0.454545,0.4,1.0,0.75,0.6,0.666667,0.5,0.066176,0.163934,0.0,0.0,1.0,1.0,0.333333,0.0,0.666667,0.5,1.0,0.666667,0.666667,1.0,1.0,0.25567,0.0,0.0,0.018114,0.837572,1.0,1.0,1.0,1.0,0.566637,0.0,0.0,0.529649,0.333333,0.0,0.666667,0.0,0.375,0.333333,0.666667,0.25,1.0,0.333333,0.4,0.75,0.084112,0.5,0.5,0.362482,0.5,0.5,1.0,0.469078,0.045704,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.636364,0.666667,0.75,1.0,0.0,0.0
4,0.75,0.75,0.573317,0.402631,1.0,1.0,0.333333,0.333333,1.0,0.0,0.0,0.363636,0.4,1.0,0.75,0.6,0.555556,0.5,0.323529,0.737705,0.0,0.0,0.6,0.7,0.666667,0.17,0.333333,0.5,0.5,0.333333,0.666667,0.0,0.6,0.086818,0.0,0.0,0.434278,0.833076,1.0,0.75,1.0,1.0,0.548737,0.0,0.0,0.512917,0.0,0.0,0.666667,0.0,0.375,0.333333,0.333333,0.416667,1.0,0.333333,0.8,0.75,0.411215,0.5,0.5,0.406206,0.5,0.5,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.545455,0.666667,0.75,0.0,0.0,0.0


<h2> Saving the preprocessed datasets and the scaler </h2>

In [179]:
path = 'preprocessed_data/'

# Persist the scaled feature splits together with the targets. index=False
# leaves the row labels out of the files.
for dataset, filename in [
    (X_train, 'xtrain_scaled.csv'),
    (X_test, 'xtest_scaled.csv'),
    (y_train, 'ytrain_scaled.csv'),
    (y_test, 'ytest_scaled.csv'),
]:
    dataset.to_csv(path + filename, index=False)

# Save the fitted scaler as well so the same scaling can be re-applied later;
# the returned list of written file names is the cell output.
joblib.dump(scaler, path + 'minmax_scaler.joblib')

['preprocessed_data/minmax_scaler.joblib']