In [7]:
from sklearn.linear_model import Ridge
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from typing import Optional, List

continuous_columns = ['Lot_Frontage', 'Lot_Area', 'Year_Built', 'Year_Remod_Add', 'Mas_Vnr_Area', 'BsmtFin_SF_1', 'BsmtFin_SF_2', 'Bsmt_Unf_SF', 'Total_Bsmt_SF', 'First_Flr_SF', 'Second_Flr_SF', 'Low_Qual_Fin_SF', 'Gr_Liv_Area', 'Bsmt_Full_Bath', 'Bsmt_Half_Bath', 'Full_Bath', 'Half_Bath', 'Bedroom_AbvGr', 'Kitchen_AbvGr', 'TotRms_AbvGrd', 'Fireplaces', 'Garage_Cars', 'Garage_Area', 'Wood_Deck_SF', 'Open_Porch_SF', 'Enclosed_Porch', 'Three_season_porch', 'Screen_Porch', 'Pool_Area', 'Misc_Val', 'Mo_Sold', 'Year_Sold', 'Longitude', 'Latitude']
class BaseDataPreprocessor(TransformerMixin):
    def __init__(self, needed_columns: Optional[List[str]]=None):
        self.needed_columns = needed_columns
        self.scaler = StandardScaler()

    def fit(self, data, *args):
        if self.needed_columns != None:
            needed_data = data[self.needed_columns]
        else:
            needed_data = data
        self.scaler.fit(needed_data)
        return self
    def transform(self, data: pd.DataFrame) -> np.array:
        if self.needed_columns != None:
            needed_data = data[self.needed_columns]
        else:
            needed_data = data
        transformed_data = self.scaler.transform(needed_data)
        return transformed_data

interesting_columns = ["Overall_Qual", "Garage_Qual", "Sale_Condition", "MS_Zoning"]

class OneHotPreprocessor(BaseDataPreprocessor):
    def __init__(self, **kwargs):
        super().__init__()
        self.continuous_columns = continuous_columns
        self.encoder = OneHotEncoder(handle_unknown='ignore', **kwargs)
    
    def fit(self, X, y=None):
        cat_features = X[interesting_columns]
        self.encoder.fit(cat_features)
        super().fit(X[self.continuous_columns])
        return self
    
    def transform(self, X):
        cat_features = X[interesting_columns]
        cat_features_encoded = self.encoder.transform(cat_features).toarray()
        continuous_features = super().transform(X[self.continuous_columns])
        return np.concatenate([continuous_features, cat_features_encoded], axis=1)

def make_ultimate_pipeline():
    ultimate_pipeline = Pipeline([('Preprocessing', OneHotPreprocessor()), ('regression', Ridge())])
    return ultimate_pipeline

In [3]:
data = pd.read_csv('./data.csv')

data.sample(20)

Unnamed: 0,MS_SubClass,MS_Zoning,Lot_Frontage,Lot_Area,Street,Alley,Lot_Shape,Land_Contour,Utilities,Lot_Config,...,Fence,Misc_Feature,Misc_Val,Mo_Sold,Year_Sold,Sale_Type,Sale_Condition,Sale_Price,Longitude,Latitude
2325,Two_Story_1946_and_Newer,Residential_Low_Density,63,9084,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Inside,...,No_Fence,,0,6,2006,WD,Normal,176500,-93.639293,42.058201
600,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,65,8125,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,No_Fence,,0,9,2009,WD,Normal,118000,-93.622465,42.042241
582,Two_Story_1946_and_Newer,Residential_Low_Density,0,10382,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,No_Fence,Shed,350,11,2009,WD,Normal,200000,-93.631971,42.048761
1798,Two_Story_1946_and_Newer,Floating_Village_Residential,75,9000,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,No_Fence,,0,8,2007,New,Partial,303477,-93.647933,42.050301
2128,Two_Story_1946_and_Newer,Residential_Low_Density,47,10820,Pave,No_Alley_Access,Moderately_Irregular,Lvl,AllPub,CulDSac,...,No_Fence,,0,3,2007,WD,Normal,235500,-93.688952,42.017907
2003,One_Story_1946_and_Newer_All_Styles,Residential_Medium_Density,52,8626,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,No_Fence,,0,5,2007,WD,Normal,104500,-93.604176,42.02726
403,Two_Story_PUD_1946_and_Newer,Residential_Medium_Density,24,2368,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,No_Fence,,0,5,2009,WD,Normal,125000,-93.628119,42.052338
234,One_and_Half_Story_Finished_All_Ages,Residential_Low_Density,124,18600,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,No_Fence,Shed,450,6,2010,WD,Normal,124000,-93.671178,42.023079
619,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,80,9600,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,No_Fence,,0,6,2009,WD,Normal,128900,-93.624612,42.037452
83,Duplex_All_Styles_and_Ages,Residential_Medium_Density,68,8930,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,No_Fence,,0,4,2010,WD,Normal,112000,-93.677205,42.036535


In [5]:
from sklearn.model_selection import train_test_split
target_column = "Sale_Price"
seed = 24
np.random.seed(seed)

test_size = 0.2
data_train, data_test, Y_train, Y_test = train_test_split(
    data[data.columns.drop("Sale_Price")],
    np.array(data["Sale_Price"]),
    test_size=test_size,
    random_state=seed)

print(f"Train : {data_train.shape} {Y_train.shape}")
print(f"Test : {data_test.shape} {Y_test.shape}")

Train : (2344, 80) (2344,)
Test : (586, 80) (586,)


In [8]:
pipe = make_ultimate_pipeline()
pipe.fit(data_train, Y_train)
prediction = pipe.predict(data_test)
print(Y_test.shape, prediction.shape)
print("MAE : ", mean_absolute_error(Y_test, prediction))
print("Mean log : ", root_mean_squared_logarithmic_error(Y_test, prediction))

(586,) (586,)


NameError: name 'mean_absolute_error' is not defined