In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression

In [2]:
df = pd.read_csv("dataset/dataset_01_train")
df = df.drop(["Unnamed: 0", "car_ID"], axis=1)
df.head()

Unnamed: 0,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,toyota corolla tercel,gas,std,two,hatchback,rwd,front,94.5,168.7,...,98,mpfi,3.24,3.08,9.4,112,6600,26,29,9538.0
1,1,porsche cayenne,gas,std,two,hatchback,rwd,front,98.4,175.7,...,203,mpfi,3.94,3.11,10.0,288,5750,17,28,31400.5
2,0,jaguar xf,gas,std,four,sedan,rwd,front,113.0,199.6,...,258,mpfi,3.63,4.17,8.1,176,4750,15,19,35550.0
3,1,honda civic (auto),gas,std,two,sedan,fwd,front,96.5,169.1,...,110,2bbl,3.15,3.58,9.1,100,5500,25,31,10345.0
4,-1,volvo 264gl,gas,turbo,four,sedan,rwd,front,109.1,188.8,...,141,mpfi,3.78,3.15,9.5,114,5400,19,25,22625.0


In [3]:
X = df.drop("price", axis=1)
y = df["price"]

In [4]:
numeric_columns = X.select_dtypes(include="number").columns
categoric_features = X.select_dtypes(exclude="number")

In [5]:
main_condition = (categoric_features.nunique() > 10)

ordinal_columns = categoric_features[main_condition[main_condition].index].columns
ohe_columns = categoric_features[main_condition[~main_condition].index].columns

In [6]:
numeric_transform = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]
)

ordinal_transform = Pipeline(
    steps=[
        ("ordinal_encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)),
    ]
)

ohe_transform = Pipeline(
    steps=[
        ("ohe_encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_transform, numeric_columns),
        ("cat_ord", ordinal_transform, ordinal_columns),
        ("cat_ohe", ohe_transform, ohe_columns),
    ]
)

In [7]:
model = Pipeline(
    steps=[("preprocessor", preprocessor)]
)

In [65]:
model.fit(X)
model.transform(X)

array([[ 0.5       , -0.68206794, -0.50062918, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.5       , -0.16150728, -0.05647148, ...,  0.        ,
         0.        ,  1.        ],
       [-0.75      ,  1.78725825,  1.4600098 , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.5       , -0.68206794, -0.71636292, ...,  0.        ,
         1.        ,  0.        ],
       [-0.75      , -0.36172292, -0.18971879, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.5       , -0.05472561,  0.1211916 , ...,  0.        ,
         0.        ,  1.        ]])

In [8]:
X_test = pd.read_csv("dataset/dataset_01_inferences_X")
X_test = X_test.drop(["Unnamed: 0", "car_ID"], axis=1)

In [9]:
model.transform(X_test)

NotFittedError: This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [42]:
model.transform(X_test)

array([[ 9.29570918e-01, -8.60063957e-01, -1.38855092e+00,
        -7.14353936e-01, -1.21495103e+00, -1.19109656e+00,
        -8.39362306e-01, -1.37537071e+00, -8.06192804e-02,
        -1.83178272e-01, -9.23334321e-01,  7.67494478e-01,
         9.26579207e-01,  1.09453926e+00,             nan,
         0.00000000e+00,  1.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+0