In [3]:
from typing import Optional, List

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Columns that OneHotPreprocessor feeds through the StandardScaler.
continuous_columns = [
    'Lot_Frontage',
    'Lot_Area',
    'Year_Built',
    'Year_Remod_Add',
    'Mas_Vnr_Area',
    'BsmtFin_SF_1',
    'BsmtFin_SF_2',
    'Bsmt_Unf_SF',
    'Total_Bsmt_SF',
    'First_Flr_SF',
    'Second_Flr_SF',
    'Low_Qual_Fin_SF',
    'Gr_Liv_Area',
    'Bsmt_Full_Bath',
    'Bsmt_Half_Bath',
    'Full_Bath',
    'Half_Bath',
    'Bedroom_AbvGr',
    'Kitchen_AbvGr',
    'TotRms_AbvGrd',
    'Fireplaces',
    'Garage_Cars',
    'Garage_Area',
    'Wood_Deck_SF',
    'Open_Porch_SF',
    'Enclosed_Porch',
    'Three_season_porch',
    'Screen_Porch',
    'Pool_Area',
    'Misc_Val',
    'Mo_Sold',
    'Year_Sold',
    'Longitude',
    'Latitude',
]
class BaseDataPreprocessor(TransformerMixin, BaseEstimator):
    """Select a subset of columns and standardize them with ``StandardScaler``.

    The class inherits from ``BaseEstimator`` in addition to
    ``TransformerMixin`` so that it exposes ``get_params``/``set_params`` and
    can be cloned by scikit-learn utilities.
    """

    def __init__(self, needed_columns: Optional[List[str]] = None):
        """
        :param needed_columns: columns to keep before scaling. When ``None``,
            every column of the input is scaled.
        """
        self.needed_columns = needed_columns
        self.scaler = StandardScaler()

    def _select_columns(self, data):
        """Return ``data[needed_columns]``, or ``data`` itself when no columns were requested."""
        # Identity check on purpose: `!= None` is evaluated element-wise for
        # array-like selections (np.ndarray, pd.Index) and then fails inside `if`.
        if self.needed_columns is None:
            return data
        return data[self.needed_columns]

    def fit(self, data, *args):
        """Fit the scaler on the selected columns.

        :param data: input frame to learn the scaling statistics from.
        :param args: ignored; accepted so that a target passed positionally
            (as ``fit_transform`` does) is tolerated.
        :return: ``self``.
        """
        self.scaler.fit(self._select_columns(data))
        return self

    def transform(self, data: pd.DataFrame) -> np.ndarray:
        """Scale the selected columns with the previously fitted scaler.

        :param data: input frame containing at least the selected columns.
        :return: whatever ``self.scaler.transform`` returns for the selection.
        """
        return self.scaler.transform(self._select_columns(data))

# Columns that OneHotPreprocessor passes to the one-hot encoder.
interesting_columns = [
    "Overall_Qual",
    "Garage_Qual",
    "Sale_Condition",
    "MS_Zoning",
]

class OneHotPreprocessor(BaseDataPreprocessor):
    """Scale ``continuous_columns`` and one-hot encode ``interesting_columns``.

    ``transform`` returns a single 2-D array: the scaled continuous features
    first, followed by the one-hot encoded categorical features.
    """

    def __init__(self, **kwargs):
        """
        :param kwargs: forwarded to ``OneHotEncoder``. ``handle_unknown``
            defaults to ``'ignore'`` but can be overridden by the caller.
        """
        super().__init__()
        # Capture both column lists at construction time so that fit and
        # transform are guaranteed to use the same selection.
        self.continuous_columns = continuous_columns
        self.categorical_columns = interesting_columns
        # setdefault rather than a hard-coded keyword: passing handle_unknown
        # together with a fixed handle_unknown='ignore' raises a duplicate
        # keyword TypeError.
        kwargs.setdefault('handle_unknown', 'ignore')
        self.encoder = OneHotEncoder(**kwargs)

    def fit(self, X, y=None):
        """Fit the encoder on the categorical columns and the scaler on the continuous ones.

        :param X: input frame containing both column groups.
        :param y: ignored; present for pipeline compatibility.
        :return: ``self``.
        """
        self.encoder.fit(X[self.categorical_columns])
        super().fit(X[self.continuous_columns])
        return self

    def transform(self, X):
        """Build the combined feature matrix for ``X``.

        :param X: input frame containing both column groups.
        :return: array with scaled continuous columns followed by one-hot columns.
        """
        encoded = self.encoder.transform(X[self.categorical_columns])
        # The encoder output is sparse by default but dense when the caller
        # asked for it through kwargs; only densify when there is something
        # to densify.
        if hasattr(encoded, 'toarray'):
            encoded = encoded.toarray()
        scaled = super().transform(X[self.continuous_columns])
        return np.concatenate([scaled, encoded], axis=1)

def make_ultimate_pipeline():
    """Build an unfitted pipeline: ``OneHotPreprocessor`` followed by ``Ridge``.

    :return: a ``Pipeline`` with the steps ``'Preprocessing'`` and ``'regression'``.
    """
    steps = [
        ('Preprocessing', OneHotPreprocessor()),
        ('regression', Ridge()),
    ]
    return Pipeline(steps)

In [4]:
# Location of the dataset, relative to the notebook's working directory.
DATA_PATH = './data.csv'
data = pd.read_csv(DATA_PATH)

# Preview up to 20 rows. The fixed seed keeps the displayed rows identical
# across re-runs, and the min() guard avoids a ValueError on small frames.
data.sample(min(20, len(data)), random_state=42)

FileNotFoundError: [Errno 2] No such file or directory: './data.csv'