# Import Libraries and Config Project

In [2]:
import pandas as pd
import numpy as np
import warnings

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Binarizer

from feature_engine.imputation import (
    MeanMedianImputer,
    AddMissingIndicator,
    CategoricalImputer,
)

from feature_engine.encoding import RareLabelEncoder, OrdinalEncoder

from feature_engine.transformation import LogTransformer, YeoJohnsonTransformer

from feature_engine.wrappers import SklearnTransformerWrapper

import preprocessors as pp

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
pd.pandas.set_option("display.max_columns", None)
warnings.simplefilter(action="ignore", category=FutureWarning)

# Load Dataset

In [4]:
data_df = pd.read_csv("dataset/train.csv", index_col=False)
data_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,5,1999,2000,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,Unf,0,Unf,0,953,953,GasA,Ex,Y,SBrkr,953,694,0,1647,0,0,2,1,3,1,TA,7,Typ,1,TA,Attchd,1999.0,RFn,2,460,TA,TA,Y,0,40,0,0,0,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,1Fam,1Story,6,6,1978,1988,Gable,CompShg,Plywood,Plywood,Stone,119.0,TA,TA,CBlock,Gd,TA,No,ALQ,790,Rec,163,589,1542,GasA,TA,Y,SBrkr,2073,0,0,2073,1,0,2,0,3,1,TA,7,Min1,2,TA,Attchd,1978.0,Unf,2,500,TA,TA,Y,349,0,0,0,0,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,9,1941,2006,Gable,CompShg,CemntBd,CmentBd,,0.0,Ex,Gd,Stone,TA,Gd,No,GLQ,275,Unf,0,877,1152,GasA,Ex,Y,SBrkr,1188,1152,0,2340,0,0,2,0,4,1,Gd,9,Typ,2,Gd,Attchd,1941.0,RFn,1,252,TA,TA,Y,0,60,0,0,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,5,6,1950,1996,Hip,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,TA,TA,Mn,GLQ,49,Rec,1029,0,1078,GasA,Gd,Y,FuseA,1078,0,0,1078,1,0,1,0,2,1,Gd,5,Typ,0,,Attchd,1950.0,Unf,1,240,TA,TA,Y,366,0,112,0,0,0,,,,0,4,2010,WD,Normal,142125


In [5]:
x_train, x_test, y_train, y_test = train_test_split(
    data_df.drop(columns=["SalePrice"]),
    data_df["SalePrice"],
    test_size=0.1,
    random_state=0,
    shuffle=True,
)

# Clean Data

In [6]:
data_df.drop(["Id"], axis=1, inplace=True)
data_df

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,5,1999,2000,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,Unf,0,Unf,0,953,953,GasA,Ex,Y,SBrkr,953,694,0,1647,0,0,2,1,3,1,TA,7,Typ,1,TA,Attchd,1999.0,RFn,2,460,TA,TA,Y,0,40,0,0,0,0,,,,0,8,2007,WD,Normal,175000
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,1Fam,1Story,6,6,1978,1988,Gable,CompShg,Plywood,Plywood,Stone,119.0,TA,TA,CBlock,Gd,TA,No,ALQ,790,Rec,163,589,1542,GasA,TA,Y,SBrkr,2073,0,0,2073,1,0,2,0,3,1,TA,7,Min1,2,TA,Attchd,1978.0,Unf,2,500,TA,TA,Y,349,0,0,0,0,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,9,1941,2006,Gable,CompShg,CemntBd,CmentBd,,0.0,Ex,Gd,Stone,TA,Gd,No,GLQ,275,Unf,0,877,1152,GasA,Ex,Y,SBrkr,1188,1152,0,2340,0,0,2,0,4,1,Gd,9,Typ,2,Gd,Attchd,1941.0,RFn,1,252,TA,TA,Y,0,60,0,0,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,5,6,1950,1996,Hip,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,TA,TA,Mn,GLQ,49,Rec,1029,0,1078,GasA,Gd,Y,FuseA,1078,0,0,1078,1,0,1,0,2,1,Gd,5,Typ,0,,Attchd,1950.0,Unf,1,240,TA,TA,Y,366,0,112,0,0,0,,,,0,4,2010,WD,Normal,142125


# Feature Engineering

### Target Variable

In [7]:
y_train = np.log(y_train)
y_test = np.log(y_test)

### Categorical Variables

In [8]:
x_train["MSSubClass"] = x_train["MSSubClass"].astype("O")
x_test["MSSubClass"] = x_test["MSSubClass"].astype("O")

In [9]:
# categorical variable with NA in train, missing values in this group will be replaced by the most frequent value
CAT_VARS_NA_FREQUENT = ["BsmtQual", "BsmtExposure", "BsmtFinType1", "GarageFinish"]


# categorical variable with NA in train, missing values in this group will be replaced by a missingValue label
CAT_VARS_NA_MISSING = [
    "FireplaceQu",
]

# categorical variables that their values are quality. we map these variables.
CAT_QUAL_VARS = ["ExterQual", "BsmtQual", "HeatingQC", "KitchenQual", "FireplaceQu"]

CAT_VARS_EXPOSURE = ["BsmtExposure"]

CAT_VARS_FINISH = ["BsmtFinType1"]

CAT_VARS_GARAGE = ["GarageFinish"]

CAT_VARS_OTHER = [
    "MSSubClass",
    "MSZoning",
    "LotShape",
    "LandContour",
    "LotConfig",
    "Neighborhood",
    "RoofStyle",
    "Exterior1st",
    "Foundation",
    "CentralAir",
    "Functional",
    "PavedDrive",
    "SaleCondition",
]

In [10]:
QUAL_MAPPINGS = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5, "Missing": 0, "NA": 0}

EXPOSURE_MAPPINGS = {"No": 1, "Mn": 2, "Av": 3, "Gd": 4}

FINISH_MAPPINGS = {
    "Missing": 0,
    "NA": 0,
    "Unf": 1,
    "LwQ": 2,
    "Rec": 3,
    "BLQ": 4,
    "ALQ": 5,
    "GLQ": 6,
}

GARAGE_MAPPINGS = {"Missing": 0, "NA": 0, "Unf": 1, "RFn": 2, "Fin": 3}

FENCE_MAPPINGS = {"Missing": 0, "NA": 0, "MnWw": 1, "GdWo": 2, "MnPrv": 3, "GdPrv": 4}

### Numercial Variables

In [11]:
# numerical variable with missing values (NA) in train
NUM_VARS_NA = ["LotFrontage"]

# temporal vars
NUML_VARS_TEMPORA = ["YearRemodAdd"]
REF_VAR = "YrSold"

# numerical variables that we want to transform by log transformation
NUM_VARS_LOG = ["LotFrontage", "1stFlrSF", "GrLivArea"]

# numerical variables that we want to transform by binary transformation
NUM_VARS_BINARY = ["ScreenPorch"]

In [12]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso


prediction_pipeline = Pipeline(
    [
        (
            "cat_missing_imputation",
            CategoricalImputer(
                imputation_method="missing", variables=CAT_VARS_NA_MISSING
            ),
        ),
        (
            "cat_frequent_imputation",
            CategoricalImputer(
                imputation_method="frequent", variables=CAT_VARS_NA_FREQUENT
            ),
        ),
        ("num_missing_indicator", AddMissingIndicator(variables=NUM_VARS_NA)),
        (
            "num_mean_imputation",
            MeanMedianImputer(imputation_method="mean", variables=NUM_VARS_NA),
        ),
        ("num_log", LogTransformer(variables=NUM_VARS_LOG)),
        (
            "num_binerizer",
            SklearnTransformerWrapper(
                transformer=Binarizer(threshold=0.1), variables=NUM_VARS_BINARY
            ),
        ),
        ("mapper_qual", pp.Mapper(variables=CAT_QUAL_VARS, mappings=QUAL_MAPPINGS)),
        (
            "mapper_exposure",
            pp.Mapper(variables=CAT_VARS_EXPOSURE, mappings=EXPOSURE_MAPPINGS),
        ),
        (
            "mapper_finish",
            pp.Mapper(variables=CAT_VARS_FINISH, mappings=FINISH_MAPPINGS),
        ),
        (
            "mapper_garage",
            pp.Mapper(variables=CAT_VARS_GARAGE, mappings=GARAGE_MAPPINGS),
        ),
        (
            "rare_label_encoder",
            RareLabelEncoder(tol=0.1, n_categories=1, variables=CAT_VARS_OTHER),
        ),
        (
            "categorical_encoder",
            OrdinalEncoder(encoding_method="ordered", variables=CAT_VARS_OTHER),
        ),
        ("scalar", MinMaxScaler()),
        ("Lasso", Lasso(alpha=0.001, random_state=0)),
    ]
)

# Feature Selection

In [13]:
# the selected variables based on the 3rd stage (feature selection)
FEATURES = [
    "MSSubClass",
    "MSZoning",
    "LotFrontage",
    "LotShape",
    "LandContour",
    "LotConfig",
    "Neighborhood",
    "OverallQual",
    "OverallCond",
    "YearRemodAdd",
    "RoofStyle",
    "Exterior1st",
    "ExterQual",
    "Foundation",
    "BsmtQual",
    "BsmtExposure",
    "BsmtFinType1",
    "HeatingQC",
    "CentralAir",
    "1stFlrSF",
    "2ndFlrSF",
    "GrLivArea",
    "BsmtFullBath",
    "HalfBath",
    "KitchenQual",
    "TotRmsAbvGrd",
    "Functional",
    "Fireplaces",
    "FireplaceQu",
    "GarageFinish",
    "GarageCars",
    "GarageArea",
    "PavedDrive",
    "WoodDeckSF",
    "ScreenPorch",
    "SaleCondition",
    "YrSold",
]

# Fit Model

In [14]:
x_train = x_train[FEATURES]
x_test = x_test[FEATURES]


prediction_pipeline.fit(x_train, y_train)

In [15]:
train_pred = prediction_pipeline.predict(x_train)

train_msr = mean_squared_error(np.exp(y_train), np.exp(train_pred))
train_rmsr = mean_squared_error(np.exp(y_train), np.exp(train_pred), squared=False)
train_r2 = r2_score(np.exp(y_train), np.exp(train_pred))

print("Train msr:", train_msr)
print("Train rmsr:", train_rmsr)
print("Train r2:", train_r2)
print("Train Average house price: ", int(np.exp(y_train).median()))

Train msr: 972318396.1276922
Train rmsr: 31182.02039842339
Train r2: 0.8442753720235135
Train Average house price:  163000


# Evaluate Model

In [16]:
test_pred = prediction_pipeline.predict(x_test)

test_msr = mean_squared_error(np.exp(y_test), np.exp(test_pred))
test_rmsr = mean_squared_error(np.exp(y_test), np.exp(test_pred), squared=False)
test_r2 = r2_score(np.exp(y_test), np.exp(test_pred))


print("Test msr:", test_msr)
print("Test rmsr:", test_rmsr)
print("Test r2:", test_r2)
print("Test Average house price: ", int(np.exp(y_test).median()))

Test msr: 976236101.0990734
Test rmsr: 31244.777181139783
Test r2: 0.8579424150529655
Test Average house price:  161200


# Predict New Data

In [17]:
new_data = pd.read_csv("dataset/test.csv", index_col=False)
new_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,,Attchd,1961.0,Unf,1.0,730.0,TA,TA,Y,140,0,0,0,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,,Attchd,1958.0,Unf,1.0,312.0,TA,TA,Y,393,36,0,0,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,1997.0,Fin,2.0,482.0,TA,TA,Y,212,34,0,0,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,1998,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,20.0,TA,TA,PConc,TA,TA,No,GLQ,602.0,Unf,0.0,324.0,926.0,GasA,Ex,Y,SBrkr,926,678,0,1604,0.0,0.0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,1998.0,Fin,2.0,470.0,TA,TA,Y,360,36,0,0,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,1992,1992,Gable,CompShg,HdBoard,HdBoard,,0.0,Gd,TA,PConc,Gd,TA,No,ALQ,263.0,Unf,0.0,1017.0,1280.0,GasA,Ex,Y,SBrkr,1280,0,0,1280,0.0,0.0,2,0,2,1,Gd,5,Typ,0,,Attchd,1992.0,RFn,2.0,506.0,TA,TA,Y,0,82,0,0,144,0,,,,0,1,2010,WD,Normal


### Clean Data

drop Id and columns with null values

In [18]:
new_data.drop("Id", axis=1, inplace=True)
new_data["MSSubClass"] = new_data["MSSubClass"].astype("O")
new_data = new_data[FEATURES]

In [19]:
var_with_na = [
    var
    for var in FEATURES
    if (new_data[var].isnull().sum() > 0)
    and var not in CAT_VARS_NA_FREQUENT + CAT_VARS_NA_MISSING + NUM_VARS_NA
]


print(var_with_na)
new_data[var_with_na].head()

['MSZoning', 'Exterior1st', 'BsmtFullBath', 'KitchenQual', 'Functional', 'GarageCars', 'GarageArea']


Unnamed: 0,MSZoning,Exterior1st,BsmtFullBath,KitchenQual,Functional,GarageCars,GarageArea
0,RH,VinylSd,0.0,TA,Typ,1.0,730.0
1,RL,Wd Sdng,0.0,Gd,Typ,1.0,312.0
2,RL,VinylSd,0.0,TA,Typ,2.0,482.0
3,RL,VinylSd,0.0,Gd,Typ,2.0,470.0
4,RL,HdBoard,0.0,Gd,Typ,2.0,506.0


In [20]:
new_data.dropna(subset=var_with_na, inplace=True)

In [31]:
new_preds = prediction_pipeline.predict(new_data[FEATURES])
np.power(new_preds, 2)

array([134.35962882, 142.31210862, 145.91989195, ..., 143.81860028,
       136.64876215, 152.31637196])