In [1]:
import numpy as np
import pandas as pd
import warnings 
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_squared_error

from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.linear_model import LinearRegression

### Pipeline Experiment

In [13]:
# Build a small toy dataset, one list per column.
# The target follows y = X1 + 2*sqrt(X2) exactly.
df = pd.DataFrame({
    'X1': [1, 4, 1, 2, 3, 2, 4, 5],
    'X2': [16, 36, 16, 9, 36, 49, 25, 36],
    'y': [9, 16, 9, 8, 15, 16, 14, 17],
})

df.head()

Unnamed: 0,X1,X2,y
0,1,16,9
1,4,36,16
2,1,16,9
3,2,9,8
4,3,36,15


In [14]:
# The first six rows are used for training, the remaining rows for testing.
train, test = df.iloc[:6], df.iloc[6:]

# Separate the feature columns from the target column `y`.
train_X, train_y = train.drop(columns='y'), train['y']
test_X, test_y = test.drop(columns='y'), test['y']

In [15]:
# Baseline: can a plain linear regression on the raw features predict y?
m1 = LinearRegression()
fit1 = m1.fit(train_X, train_y)
preds = fit1.predict(test_X)

print(preds)
rmse = np.sqrt(mean_squared_error(test_y, preds))
print(f"RMSE : {rmse}")

[13.72113586 16.93334467]
RMSE : 0.20274138822160603


In [16]:
# Apply the known feature transform 2*sqrt(X2) by hand.
# assign() returns new frames, so train_X / test_X keep their raw values and
# re-running this cell cannot apply the square root a second time.
train_X_sqrt = train_X.assign(X2=2 * np.sqrt(train_X['X2']))
test_X_sqrt = test_X.assign(X2=2 * np.sqrt(test_X['X2']))
print(test_X_sqrt)

# Building a model on the transformed features
m2 = LinearRegression()
fit2 = m2.fit(train_X_sqrt, train_y)
preds = fit2.predict(test_X_sqrt)

# Evaluate
print(f"{preds}")
print(f"RMSE : {np.sqrt(mean_squared_error(test_y, preds))}")

   X1    X2
6   4  10.0
7   5  12.0
[14. 17.]
RMSE : 1.2560739669470201e-15


In [17]:
# Rebuild the train/test split from the raw frame so that the
# transformer-based pipelines start from untransformed features.
train = df.iloc[:6]
test = df.iloc[6:]

# Features are every column except `y`; `y` is the target.
train_X, test_X = train.drop(columns='y'), test.drop(columns='y')
train_y, test_y = train['y'], test['y']

In [18]:
# A pipeline with a single step: only the linear model, no preprocessing.
pipeline = Pipeline(steps=[
    ('linearModel', LinearRegression()),
])
pipeline.fit(train_X, train_y)
preds = pipeline.predict(test_X)

# Evaluate
print(preds)
rmse = np.sqrt(mean_squared_error(test_y, preds))
print(f"RMSE : {rmse}")

[13.72113586 16.93334467]
RMSE : 0.20274138822160603


In [19]:
# When we are writing a transformer, it should have 3 methods:
# __init__, fit and transform

class ExperimentalTransformer(BaseEstimator, TransformerMixin):
  """Toy transformer that replaces column 'X2' with 2 * sqrt(X2).

  Every method prints a message when it runs, so the order in which a
  Pipeline calls them can be read from the cell output.
  """

  def __init__(self):
    print("Initialization was done")

  def fit(self, X, y=None):
    """Learn nothing from the data; print a message and return self."""
    print("Parameters Learned")
    return self

  def transform(self, X, y=None):
    """Return a copy of X whose 'X2' column holds 2 * sqrt(X2)."""
    transformed = X.copy()  # work on a copy so the caller's data is untouched
    transformed['X2'] = np.sqrt(transformed['X2']) * 2
    print("Transformation Done")
    return transformed

The drawback of the above transformer is that the column name `X2` is hard-coded, which is not good practice. Let's create a transformer that takes the column name as a parameter instead.

In [21]:
# Constructing ExperimentalTransformer() here is what triggers its __init__.
pipeline2 = Pipeline(steps=[
    ('experimentalTransformer', ExperimentalTransformer()),
    ('linearModel', LinearRegression()),
])

# When fit is called on the pipeline, the transformer's fit and transform
# are both called.
pipeline2.fit(train_X, train_y)
preds2 = pipeline2.predict(test_X)

# Evaluate
print(preds2)
rmse = np.sqrt(mean_squared_error(test_y, preds2))
print(f"RMSE : {rmse}")

Initialization was done
Parameters Learned
Transformation Done
Transformation Done
[14. 17.]
RMSE : 1.2560739669470201e-15


In [22]:
class ExperimentalTransformer2(BaseEstimator, TransformerMixin):
  """Toy transformer that replaces a configurable column with 2 * sqrt(column).

  Parameters
  ----------
  feature_name
      Column label used to index X in `transform`.
  additional_param
      Extra value that is only printed during `fit`.
  """

  def __init__(self, feature_name, additional_param="Default Value"):
    print("Object initialized")
    self.additional_param = additional_param
    self.feature_name = feature_name

  def fit(self, X, y=None):
    """Learn nothing from the data; print the stored parameter and return self."""
    print("Parameters Learned")
    print(f"Additional params: {self.additional_param}")
    return self

  def transform(self, X, y=None):
    """Return a copy of X with the configured column replaced by 2 * sqrt(column)."""
    column = self.feature_name
    transformed = X.copy()  # work on a copy so the caller's data is untouched
    transformed[column] = np.sqrt(transformed[column]) * 2
    print("Transformation Done")
    return transformed

In [24]:
# Same pipeline shape as before, but the column to transform is now an argument.
pipeline3 = Pipeline(steps=[
    ('experimentalTransformer2', ExperimentalTransformer2(feature_name='X2')),
    ('linearModel', LinearRegression()),
])

pipeline3.fit(train_X, train_y)
preds3 = pipeline3.predict(test_X)

# Evaluate
print(preds3)
rmse = np.sqrt(mean_squared_error(test_y, preds3))
print(f"RMSE : {rmse}")

Object initialized
Parameters Learned
Additional params: Default Value
Transformation Done
Transformation Done
[14. 17.]
RMSE : 1.2560739669470201e-15


So far we have transformed the input features. Now let's see how the same is done for the target.

In [25]:
# Build a second toy dataset, one list per column.
# Here the target is the square of the earlier relationship:
# y = (X1 + 2*sqrt(X2))**2
df = pd.DataFrame({
    'X1': [1, 4, 1, 2, 3, 2, 4, 5],
    'X2': [16, 36, 16, 9, 36, 49, 25, 36],
    'y': [81, 256, 81, 64, 225, 256, 196, 289],
})

df.head()

Unnamed: 0,X1,X2,y
0,1,16,81
1,4,36,256
2,1,16,81
3,2,9,64
4,3,36,225


In [27]:
# The first six rows are used for training, the remaining rows for testing.
train, test = df.iloc[:6], df.iloc[6:]

# Separate the feature columns from the target column `y`.
train_X, train_y = train.drop(columns='y'), train['y']
test_X, test_y = test.drop(columns='y'), test['y']

# Feature transform + linear model, with the target left untransformed.
pipeline3 = Pipeline(steps=[
    ('experimentalTransformer2', ExperimentalTransformer2(feature_name='X2')),
    ('linearModel', LinearRegression()),
])

pipeline3.fit(train_X, train_y)
preds3 = pipeline3.predict(test_X)

# Evaluate
print(preds3)
rmse = np.sqrt(mean_squared_error(test_y, preds3))
print(f"RMSE : {rmse}")

# We don't get a good prediction here because the relationship between
# the features and this target is not linear.

Object initialized
Parameters Learned
Additional params: Default Value
Transformation Done
Transformation Done
[207.42690058 280.94152047]
RMSE : 9.887192456534327


In [28]:
# We'll now write a custom target transformer
# This needs two functions: one to transform and another to inverse-transform

def target_transform(target):
  """Return the element-wise square root of `target`.

  The result is a new object; `target` itself is never modified.
  Negative entries have no real square root and come back as NaN.
  """
  # np.sqrt allocates its own result, so no defensive copy is needed and any
  # array-like (ndarray, Series, list, tuple) is accepted.
  return np.sqrt(target)

def inverse_target_transform(target):
  """Return the element-wise square of `target`, undoing a square root.

  The result is a new object; `target` itself is never modified.
  """
  # np.square allocates its own result and, unlike `** 2`, also accepts
  # plain lists and tuples.
  return np.square(target)

# The purpose of this is to make the y label into a linear form
# Then make the prediction
# Then do the inverse to match the actual characteristics 

In [32]:
# The feature pipeline is unchanged: the inputs still get the same transform.
pipeline4 = Pipeline(steps=[
    ('experimentalTransformer2', ExperimentalTransformer2(feature_name='X2')),
    ('linearModel', LinearRegression()),
])

# Wrap the pipeline so the target goes through target_transform before
# fitting and predictions go through inverse_target_transform.
model = TransformedTargetRegressor(
    regressor=pipeline4,
    func=target_transform,
    inverse_func=inverse_target_transform,
)
model.fit(train_X, train_y)
preds4 = model.predict(test_X)

# Evaluate
print(preds4)
rmse = np.sqrt(mean_squared_error(test_y, preds4))
print(f"RMSE : {rmse}")

Object initialized
Object initialized
Parameters Learned
Additional params: Default Value
Transformation Done
Transformation Done
[196. 289.]
RMSE : 4.0194366942304644e-14


Instead of writing separate transform and inverse-transform functions, we can simply create a class.

In [38]:
class CustomTargetTransformer(BaseEstimator, TransformerMixin):
  """Target transformer: square root going in, square coming back out.

  `transform` and `inverse_transform` each print a message so that the calls
  made by the wrapping regressor are visible in the cell output.
  The method names `fit`, `transform` and `inverse_transform` must not be
  changed.
  """

  def fit(self, target, y=None):
    """Nothing is learned; return self.

    `y` is optional and ignored. Accepting it matches the `fit(X, y=None)`
    signature of the feature transformers in this notebook.
    """
    return self

  def transform(self, target):
    """Return the element-wise square root of `target` as a new object."""
    print("Target transformer called")
    # np.sqrt allocates its own result, so the input is left untouched.
    return np.sqrt(target)

  def inverse_transform(self, target):
    """Return the element-wise square of `target` as a new object."""
    print("Inverse func called")
    # np.square allocates its own result, so the input is left untouched.
    return np.square(target)

In [39]:
# Same feature pipeline as before.
pipeline5 = Pipeline(steps=[
    ('experimentalTransformer2', ExperimentalTransformer2(feature_name='X2')),
    ('linearModel', LinearRegression()),
])

# This time the target transform is supplied as a transformer object
# instead of a func / inverse_func pair.
model = TransformedTargetRegressor(
    regressor=pipeline5,
    transformer=CustomTargetTransformer(),
    check_inverse=False,  # Avoid too many calls
)
model.fit(train_X, train_y)
preds5 = model.predict(test_X)

# Evaluate
print(preds5)
rmse = np.sqrt(mean_squared_error(test_y, preds5))
print(f"RMSE : {rmse}")

Object initialized
Target transformer called
Object initialized
Parameters Learned
Additional params: Default Value
Transformation Done
Transformation Done
Inverse func called
[196. 289.]
RMSE : 4.0194366942304644e-14


A few best practices

In [40]:
# Because our classes inherit from BaseEstimator, get_params() and
# set_params() are available; nested parameters are reported with
# `outer__inner` names.
model.get_params(deep=True)

{'check_inverse': False,
 'func': None,
 'inverse_func': None,
 'regressor': Pipeline(memory=None,
          steps=[('experimentalTransformer2',
                  ExperimentalTransformer2(additional_param='Default Value',
                                           feature_name='X2')),
                 ('linearModel',
                  LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                   normalize=False))],
          verbose=False),
 'regressor__experimentalTransformer2': ExperimentalTransformer2(additional_param='Default Value', feature_name='X2'),
 'regressor__experimentalTransformer2__additional_param': 'Default Value',
 'regressor__experimentalTransformer2__feature_name': 'X2',
 'regressor__linearModel': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 'regressor__linearModel__copy_X': True,
 'regressor__linearModel__fit_intercept': True,
 'regressor__linearModel__n_jobs': None,
 'regressor__linearModel__n