In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.datasets import fetch_california_housing
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import PowerTransformer

In [32]:
# Fetch the California housing data as a (DataFrame, Series) pair.
X, y = fetch_california_housing(return_X_y=True, as_frame=True)
# Discard the two trailing columns; only the first six features are kept.
X = X[X.columns[:-2]]
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467


In [3]:
# Baseline: linear regression on the untransformed features, scored on the
# same rows it was fitted on (in-sample R^2).
model = LinearRegression().fit(X, y)
y_pred = model.predict(X)
r2_score(y_true=y, y_pred=y_pred)

0.5396977345638283

### Power transformer (Yeo-Johnson by default; Box-Cox requires `method="box-cox"`)

In [5]:
# Library defaults written out: Yeo-Johnson power transform, followed by
# standardisation of the transformed output.
power_transformer = PowerTransformer(method="yeo-johnson", standardize=True)

In [6]:
# Fit the transformer on the feature frame X (the whole dataset; no
# train/test split has been made at this point).
power_transformer.fit(X)

In [7]:
# Inspect the fitted lambdas: six values, matching the six feature columns.
power_transformer.lambdas_

array([-0.19850989,  0.80814809, -0.5536698 , -4.39408222,  0.23352364,
       -0.90134563])

In [8]:
# Apply the fitted transform to X; X_tr feeds the regression fitted next.
X_tr = power_transformer.transform(X)

In [9]:
# Refit the linear model on the power-transformed features and report the
# in-sample R^2 (prediction uses the same rows as fitting).
model = LinearRegression().fit(X_tr, y)
y_pred = model.predict(X_tr)
r2_score(y_true=y, y_pred=y_pred)

0.5682548331196868

In [10]:
# NOTE(review): bare attribute access -- this only displays the bound-method
# repr; nothing is inverted because the method is never called.
power_transformer.inverse_transform

<bound method PowerTransformer.inverse_transform of PowerTransformer()>

In [11]:
def test_transformation(transformation, X_train, X_test, y_train, y_test):
    model = TransformedTargetRegressor(LinearRegression(),
                                       func=transformation.get("func"),
                                       inverse_func=transformation.get("inverse_func"),
                                       transformer=transformation.get("transformer")
                                      )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    result_dict =  {
        'transformer': transformation.get("transformer"),
        'transform_func': None if transformation.get("func") is None else transformation.get("func").__name__,
        'inverse_func': None if transformation.get("inverse_func") is None else transformation.get("inverse_func").__name__,
        'r2': r2
    }
    return {key: value for key, value in result_dict.items() if value is not None}

In [12]:
def test_transformations(transformations: list[dict], X_train, X_test, y_train, y_test):
    return pd.DataFrame([test_transformation(transformation, X_train, X_test, y_train, y_test) for transformation in transformations])

In [13]:
# Candidate target transformations. Each dict holds the keyword arguments
# that test_transformation forwards to TransformedTargetRegressor:
# an untransformed baseline, log/exp, sqrt/square, and a PowerTransformer.
target_transformations = [
    {'func': None, 'inverse_func': None},
    {'func': np.log, 'inverse_func': np.exp},
    {'func': np.sqrt, 'inverse_func': np.square},
    {'transformer': PowerTransformer()},
]

# Fix the seed so the split -- and therefore the R^2 table below -- is the
# same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
test_transformations(target_transformations, X_train, X_test, y_train, y_test)

Unnamed: 0,r2,transform_func,inverse_func,transformer
0,0.507548,,,
1,-0.369162,log,exp,
2,0.463253,sqrt,square,
3,-0.882632,,,PowerTransformer()
