In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
from datetime import datetime
import re
from sklearn.preprocessing import FunctionTransformer

#Mixed Variables
class TermTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_transformed = X.copy()
        X_transformed['term'] = X_transformed['term'].str.replace(' months', '', regex=False).astype(float)
        return X_transformed

class IssueDTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_transformed = X.copy()
        X_transformed['issue_d'] = pd.to_datetime(X_transformed['issue_d'], format='%b-%Y')
        X_transformed['issue_d'] = X_transformed['issue_d'].dt.strftime('%b')
        return X_transformed

class EmpLengthTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_transformed = X.copy()
        X_transformed['emp_length'] = X_transformed['emp_length'].replace('10+ years', '10')
        X_transformed['emp_length'] = X_transformed['emp_length'].replace('< 1 year', '0')
        X_transformed['emp_length'] = X_transformed['emp_length'].astype(str).apply(lambda x: float(re.findall(r'\d+', x)[0]) if re.findall(r'\d+', x) else (x if x == 'nan' else None))
        X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
        return X_transformed

class EarliestCrLineTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_transformed = X.copy()
        X_transformed['earliest_cr_line'] = pd.to_datetime(
            X_transformed['earliest_cr_line'], format='%b-%Y', errors='coerce'
        ).dt.year
        return X_transformed

#New Feature Extraction
class NewFeatureGenerator(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # X is expected to have 'earliest_cr_line' (numeric year), 'installment', 'annual_inc', 'loan_amnt'
        current_year = 2015

        cr_history = pd.Series(np.nan, index=X.index)
        if 'earliest_cr_line' in X.columns and pd.api.types.is_numeric_dtype(X['earliest_cr_line']):
            cr_history = current_year - X['earliest_cr_line']

        installment_to_income_ratio = (X['installment'] / X['annual_inc'])
        installment_to_income_ratio = installment_to_income_ratio.replace([np.inf, -np.inf], np.nan).fillna(0)

        loan_to_inc_ratio = (X['loan_amnt'] / X['annual_inc'])
        loan_to_inc_ratio = loan_to_inc_ratio.replace([np.inf, -np.inf], np.nan).fillna(0)

        # Return  DF with only the new features
        return pd.DataFrame({
            'cr_history': cr_history,
            'installment_to_income_ratio': installment_to_income_ratio,
            'loan_to_inc_ratio': loan_to_inc_ratio
        }, index=X.index)

class NewFeatureAddingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.new_feature_generator = NewFeatureGenerator()
        self.column_name_map = {
            'earliest_cr_line': 'earliest_cr_line_pipeline__earliest_cr_line',
            'installment': 'installment_passthrough__installment',
            'annual_inc': 'annual_inc_pipeline__annual_inc',
            'loan_amnt': 'loan_amnt_pipeline__loan_amnt'
        }

    def fit(self, X, y=None):
        X_for_generation_fit = pd.DataFrame(index=X.index)
        for gen_col, pipeline_col in self.column_name_map.items():
            if pipeline_col in X.columns:
                X_for_generation_fit[gen_col] = X[pipeline_col]
            else:
                raise ValueError(f"NewFeatureAddingTransformer.fit: Expected column '{pipeline_col}' not found in input X.")

        self.new_feature_generator.fit(X_for_generation_fit, y)
        return self

    def transform(self, X, y=None):
        X_for_generation_transform = pd.DataFrame(index=X.index)
        for gen_col, pipeline_col in self.column_name_map.items():
            if pipeline_col in X.columns:
                X_for_generation_transform[gen_col] = X[pipeline_col]
            else:
                raise ValueError(f"NewFeatureAddingTransformer.transform: Expected column '{pipeline_col}' not found in input X.")

        new_features_df = self.new_feature_generator.transform(X_for_generation_transform)

        # Concatenate new features with the existing DataFrame X (which contains all processed original columns)
        Xt_with_new_features = pd.concat([X, new_features_df], axis=1)
        return Xt_with_new_features

#Outlier
class OutlierCapper(BaseEstimator, TransformerMixin):
    def __init__(self, capping_thresholds=None):
        if capping_thresholds is None:
            self.capping_thresholds = {
                'annual_inc': 260000.0,
                'dti': 100.0,
                'bc_util': 100.0,
                'revol_util': 100.0
            }
        else:
            self.capping_thresholds = capping_thresholds

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_transformed = X.copy()
        for col, threshold in self.capping_thresholds.items():
            if col in X_transformed.columns:
                #winsorization (capping) for upper outliers
                X_transformed[col] = np.where(X_transformed[col] > threshold, threshold, X_transformed[col])
        return X_transformed

#Binning
def bin_pub_rec(X_df):
    series = X_df.iloc[:, 0]
    bins = [-0.1, 0.9, 2.9, 5.9, np.inf] # Bins for 0, 1-2, 3-5, >5
    labels = [0,1,2,3]
    return pd.cut(series, bins=bins, labels=False, right=True, include_lowest=True).to_frame(name=X_df.columns[0])

def bin_emp_length(X_df):
    series = X_df.iloc[:, 0]
    bins = [-0.1, 3, 9, np.inf]
    labels = [0,1,2]
    return pd.cut(series, bins=bins, labels=False, right=True, include_lowest=True).to_frame(name=X_df.columns[0])

def bin_delinq_2yrs(X_df):
    series = X_df.iloc[:, 0]
    bins = [-0.1, 3, 11, 19, np.inf] # Bins for 0-3, 4-11, 12-19, >19
    labels = [0,1,2,3]
    return pd.cut(series, bins=bins, labels=False, right=True, include_lowest=True).to_frame(name=X_df.columns[0])

def bin_fico_range_low(X_df):
    series = X_df.iloc[:, 0]
    bins = [-1,649, 699,749, np.inf]
    labels = [0,1,2,3]
    return pd.cut(series, bins=bins, labels=False, right=True, include_lowest=True).to_frame(name=X_df.columns[0])

#Binarizer
def binarize_revol_util(X_df):
    series = X_df.iloc[:, 0]
    return (series > 75).astype(int).to_frame(name=X_df.columns[0])

def binarize_bc_util(X_df):
    series = X_df.iloc[:, 0]
    return (series > 80).astype(int).to_frame(name=X_df.columns[0])

#Mathematical transformations
def apply_log1p_df(X_df):
    return pd.DataFrame(np.log1p(X_df.iloc[:,0]), index=X_df.index, columns=X_df.columns)

#Categorical Column transformations
def transform_home_ownership(X_df):
    series = X_df.iloc[:, 0]
    valid_ownership = ['MORTGAGE', 'RENT', 'OWN']
    return series.apply(lambda x: x if x in valid_ownership else 'Other').to_frame(name=X_df.columns[0])

def transform_purpose(X_df):
    series = X_df.iloc[:, 0]
    valid_purpose = ['debt_consolidation', 'credit_card']
    return series.apply(lambda x: x if x in valid_purpose else 'Other').to_frame(name=X_df.columns[0])

#cols to drop
class ColumnDroppingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        cols_to_actually_drop = [col for col in self.columns_to_drop if col in X.columns]
        return X.drop(columns=cols_to_actually_drop)

#1.Read Dataset and Load Preprocessors

In [2]:
import pandas as pd
import numpy as np
from sklearn import set_config
set_config(transform_output='pandas')

In [3]:
df=pd.read_csv('df_regression.csv')
df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,...,purpose,dti,delinq_2yrs,earliest_cr_line,fico_range_low,pub_rec,revol_util,application_type,bc_util,loss_amnt
0,18000.0,60 months,19.48,471.7,E,E2,7 years,RENT,150000.0,Not Verified,...,debt_consolidation,9.39,0.0,Jun-2005,665.0,1.0,40.7,Individual,51.3,16381.1
1,4225.0,36 months,14.85,146.16,C,C5,5 years,RENT,35000.0,Source Verified,...,debt_consolidation,15.22,2.0,Jul-2011,725.0,0.0,24.6,Individual,14.1,3856.63
2,16000.0,36 months,12.88,538.18,C,C2,10+ years,MORTGAGE,65000.0,Not Verified,...,small_business,18.96,0.0,Dec-1985,675.0,1.0,54.3,Individual,21.6,16000.0
3,24250.0,60 months,24.24,701.01,F,F3,4 years,MORTGAGE,75000.0,Not Verified,...,debt_consolidation,20.84,0.0,Apr-2007,660.0,0.0,65.3,Individual,68.9,24250.0
4,25000.0,60 months,13.99,581.58,C,C4,9 years,MORTGAGE,79000.0,Not Verified,...,debt_consolidation,34.53,0.0,Jun-2001,730.0,0.0,18.6,Individual,21.4,25000.0


In [4]:
df.shape

(54156, 21)

In [5]:
X=df.drop(['loss_amnt','application_type','int_rate','grade','sub_grade'],axis=1)
y=df['loss_amnt']

In [6]:
import joblib
linear_preprocessor=joblib.load('reg_linear_based_preprocessor.pkl')
tree_preprocessor=joblib.load('reg_tree_based_preprocessor.pkl')

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [8]:
y_train = y_train.apply(lambda x: np.log1p(np.maximum(0, x)))

In [9]:
df_for_meta_test=pd.DataFrame()

#2.Reference Model Verification

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
ref_lr=LinearRegression()
ref_lr

In [11]:
ref_lr_pipe=Pipeline([
    ('preprocessor',linear_preprocessor),
    ('model',ref_lr)
])

In [12]:
ref_lr_pipe.fit(X_train,y_train)

  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 4) Processing initial_feature_processing, total=   0.4s
[Pipeline]  (step 2 of 4) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 4) Processing column_dropping, total=   0.0s
[Pipeline] ........... (step 4 of 4) Processing scaling, total=   0.0s


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [13]:
y_pred_lr=ref_lr_pipe.predict(X_test)
y_pred_lr=np.expm1(y_pred_lr)

  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


In [14]:
from sklearn.metrics import mean_squared_error,r2_score,root_mean_squared_error,mean_absolute_error
mse = mean_squared_error(y_test, y_pred_lr)
r2 = r2_score(y_test, y_pred_lr)
rmse = root_mean_squared_error(y_test, y_pred_lr)
mae=mean_absolute_error(y_test, y_pred_lr)

In [15]:
print(mse,r2,rmse)
#r2_score near to 1 which is unacceptable too much

3421992.6821371005 0.9447276717816897 1849.8628819826351


In [16]:
rmse/mae #too many large errors

1.7259596685189096

In [17]:
np.mean(y_test)

np.float64(14350.18976955462)

In [18]:
np.var(y_test)

61911498.799564704

#3.Baseline Model Building

##**1.RandomForest**

In [19]:
from sklearn.ensemble import RandomForestRegressor
rf_base=RandomForestRegressor(max_depth=10)
rf_base

In [20]:
rfb_pipe = Pipeline([
    ('preprocessor', tree_preprocessor),
    ('model', rf_base)]
)

In [21]:
rfb_pipe.fit(X_train,y_train)

  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.4s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


In [22]:
y_pred_rfb = rfb_pipe.predict(X_test)
# y_pred_rfb=np.expm1(y_pred_rfb)

  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


In [23]:
from sklearn.metrics import mean_squared_error,r2_score,root_mean_squared_error,mean_absolute_error
mse = mean_squared_error(y_test, y_pred_rfb)
r2 = r2_score(y_test, y_pred_rfb)
rmse = root_mean_squared_error(y_test, y_pred_rfb)
mae=mean_absolute_error(y_test, y_pred_rfb)

In [24]:
print(mse,r2,rmse)
#r2_score near to 1 which is unacceptable too much

267561041.12822232 -3.3216695818402737 16357.29320909246


In [25]:
rmse/mae #too many large errors

1.1405916100714097

In [26]:
df_for_meta_test['RandomForest']=y_pred_rfb

##**2.xgboost**

In [27]:
import xgboost as xgb
xgb_base=xgb.XGBRegressor()
xgb_base

In [28]:
xgb_pipe = Pipeline(
    [('preprocessor', tree_preprocessor),
     ('model', xgb_base)]
)

In [29]:
xgb_pipe.fit(X_train,y_train)

  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.4s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


In [30]:
y_pred_xgb=xgb_pipe.predict(X_test)
# y_pred_xgb=np.expm1(y_pred_xgb)

  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


In [31]:
from sklearn.metrics import mean_squared_error,r2_score,root_mean_squared_error,mean_absolute_error
mse = mean_squared_error(y_test, y_pred_xgb)
r2 = r2_score(y_test, y_pred_xgb)
rmse = root_mean_squared_error(y_test, y_pred_xgb)
mae=mean_absolute_error(y_test, y_pred_xgb)

In [32]:
print(mse,r2,rmse)
#r2_score near to 1 which is unacceptable too much

267561104.50921103 -3.321670605575604 16357.295146484672


In [33]:
rmse/mae #too many large errors

1.1405916170733859

In [34]:
df_for_meta_test['XGBoost']=y_pred_xgb

##**3.LightGBM**

In [35]:
import lightgbm as lgb
lgb_base = lgb.LGBMRegressor()
lgb_base

In [36]:
lgb_pipe = Pipeline(
    [('preprocessor', tree_preprocessor),
     ('model', lgb_base)]
)

In [37]:
lgb_pipe.fit(X_train, y_train)

  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.4s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003679 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1369
[LightGBM] [Info] Number of data points in the train set: 40617, number of used features: 20
[LightGBM] [Info] Start training from score 9.395932


In [38]:
y_pred_lgbm=xgb_pipe.predict(X_test)
# y_pred_lgbm=np.expm1(y_pred_lgbm)

  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


In [39]:
from sklearn.metrics import mean_squared_error,r2_score,root_mean_squared_error,mean_absolute_error
mse = mean_squared_error(y_test, y_pred_lgbm)
r2 = r2_score(y_test, y_pred_lgbm)
rmse = root_mean_squared_error(y_test, y_pred_lgbm)
mae=mean_absolute_error(y_test, y_pred_lgbm)

In [40]:
print(mse,r2,rmse)
#r2_score near to 1 which is unacceptable too much

267561104.50921103 -3.321670605575604 16357.295146484672


In [41]:
rmse/mae #too many large errors

1.1405916170733859

In [42]:
df_for_meta_test['LightGBM']=y_pred_lgbm

##**4.GradientBoosting**

In [43]:
from sklearn.ensemble import GradientBoostingRegressor
gb_base=GradientBoostingRegressor()
gb_base

In [44]:
gb_pipe=Pipeline([
    ('preprocessor',tree_preprocessor),
    ('model',gb_base)
])

In [45]:
gb_pipe.fit(X_train,y_train)

  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.6s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


In [46]:
y_pred_gb=gb_pipe.predict(X_test)
# y_pred_gb=np.expm1(y_pred_gb)

  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


In [47]:
from sklearn.metrics import mean_squared_error,r2_score,root_mean_squared_error,mean_absolute_error
mse = mean_squared_error(y_test, y_pred_gb)
r2 = r2_score(y_test, y_pred_gb)
rmse = root_mean_squared_error(y_test, y_pred_gb)
mae=mean_absolute_error(y_test, y_pred_gb)

In [48]:
print(mse,r2,rmse)
#r2_score near to 1 which is unacceptable too much

267560982.908699 -3.321668641473388 16357.291429472638


In [49]:
rmse/mae #too many large errors

1.140591725221152

In [50]:
df_for_meta_test['GradBoost']=y_pred_gb

#4.OOF Predictions

In [51]:
from sklearn.model_selection import KFold
from sklearn.base import clone
from copy import deepcopy
skf = KFold(n_splits=5, shuffle=True, random_state=42)

models = {
    'RandomForest': rfb_pipe,
    'XGBoost': xgb_pipe,
    'LightGBM': lgb_pipe,
    'GradBoost': gb_pipe
}

oof_preds = {}
oof_labels = {}

for model_name, pipeline in models.items():
    print(f"Generating OOF predictions for {model_name}...")
    model_oof_preds = []
    model_oof_labels = []

    for fold, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
        #Pipeline Copy to prevent data leakage
        pipeline_clone = deepcopy(pipeline)

        pipeline_clone.fit(X_train_fold, y_train_fold)
        y_pred_proba_fold = pipeline_clone.predict(X_val_fold)

        model_oof_preds.append(y_pred_proba_fold)
        model_oof_labels.append(y_val_fold)

    oof_preds[model_name] = np.concatenate(model_oof_preds)
    oof_labels[model_name] = np.concatenate(model_oof_labels)

print("OOF prediction generation complete.")

Generating OOF predictions for RandomForest...


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.5s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.3s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.3s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.3s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.6s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


Generating OOF predictions for XGBoost...


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.3s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.3s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.3s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.3s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.3s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


Generating OOF predictions for LightGBM...


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.3s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003078 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1367
[LightGBM] [Info] Number of data points in the train set: 32493, number of used features: 20
[LightGBM] [Info] Start training from score 9.395275


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.3s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002751 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1369
[LightGBM] [Info] Number of data points in the train set: 32493, number of used features: 20
[LightGBM] [Info] Start training from score 9.396802


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.3s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002838 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1366
[LightGBM] [Info] Number of data points in the train set: 32494, number of used features: 20
[LightGBM] [Info] Start training from score 9.391767


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.3s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002690 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1368
[LightGBM] [Info] Number of data points in the train set: 32494, number of used features: 20
[LightGBM] [Info] Start training from score 9.399088


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.3s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002776 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1367
[LightGBM] [Info] Number of data points in the train set: 32494, number of used features: 20
[LightGBM] [Info] Start training from score 9.396729


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


Generating OOF predictions for GradBoost...


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.6s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.3s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.3s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.3s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.3s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s
OOF prediction generation complete.


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


In [52]:
oof_preds_df = pd.DataFrame(oof_preds)
oof_preds_df['target']=oof_labels['RandomForest']
oof_preds_df.head()

Unnamed: 0,RandomForest,XGBoost,LightGBM,GradBoost,target
0,9.031111,9.050202,8.990135,9.032191,8.991313
1,9.528915,9.526147,9.522388,9.506407,9.52811
2,9.822699,9.84937,9.829571,9.819034,9.903538
3,8.452602,8.47493,8.44124,8.459613,8.517393
4,8.426689,8.349952,8.428644,8.428089,8.458218


#5.Meta Model

In [53]:
from sklearn.linear_model import Ridge
meta_model=Ridge()
meta_model

In [54]:
meta_model.fit(oof_preds_df.drop('target',axis=1),oof_preds_df['target'])

In [55]:
oof_preds_df

Unnamed: 0,RandomForest,XGBoost,LightGBM,GradBoost,target
0,9.031111,9.050202,8.990135,9.032191,8.991313
1,9.528915,9.526147,9.522388,9.506407,9.528110
2,9.822699,9.849370,9.829571,9.819034,9.903538
3,8.452602,8.474930,8.441240,8.459613,8.517393
4,8.426689,8.349952,8.428644,8.428089,8.458218
...,...,...,...,...,...
40612,9.402371,9.410841,9.413379,9.408774,9.488048
40613,10.035600,10.019372,10.051626,10.036040,10.092702
40614,10.001240,9.967278,9.988316,9.976732,10.079581
40615,9.197773,9.177827,9.187596,9.205031,9.207470


In [56]:
y_pred_meta=meta_model.predict(df_for_meta_test)
y_pred_meta=np.expm1(y_pred_meta)

In [57]:
from sklearn.metrics import mean_squared_error,r2_score,root_mean_squared_error,mean_absolute_error,median_absolute_error,mean_absolute_percentage_error
mse = mean_squared_error(y_test, y_pred_meta)
r2 = r2_score(y_test, y_pred_meta)
rmse = root_mean_squared_error(y_test, y_pred_meta)
mae=mean_absolute_error(y_test, y_pred_meta)
medae=median_absolute_error(y_test,y_pred_meta)
mape=mean_absolute_percentage_error(y_test,y_pred_meta)
ratio=rmse/mae

In [58]:
print(f"MSE: {mse}")
print(f"R2: {r2}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"MedAE: {medae}")
print(f"MAPE: {mape}")
print(f"Ratio: {ratio}")

MSE: 3707716.6228144593
R2: 0.9401126334411971
RMSE: 1925.5432020119567
MAE: 1113.116796226611
MedAE: 635.1368945785653
MAPE: 0.11343679815939846
Ratio: 1.7298662714814967


In [59]:
print(f"mean of y_test: {np.mean(y_test)}")
print(f"mean of y_pred_meta: {np.mean(y_pred_meta)}")

mean of y_test: 14350.18976955462
mean of y_pred_meta: 14194.906062855895


#6.Inference Pipeline

In [60]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [61]:
class RegPredictProbaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        pred = self.estimator.predict(X)
        if isinstance(pred, (pd.Series, pd.DataFrame)):
            pred = pred.values
        return np.asarray(pred).reshape(-1, 1)

In [62]:
from sklearn.pipeline import FeatureUnion
rf_transformer = RegPredictProbaTransformer(estimator=rfb_pipe)
xgb_transformer = RegPredictProbaTransformer(estimator=xgb_pipe)
lgb_transformer = RegPredictProbaTransformer(estimator=lgb_pipe)
gb_transformer = RegPredictProbaTransformer(estimator=gb_pipe)
feature_union = FeatureUnion([
    ('rf_proba', rf_transformer),
    ('xgb_proba', xgb_transformer),
    ('lgb_proba', lgb_transformer),
    ('gb_proba', gb_transformer)
])
final_inference_pipeline = Pipeline([
    ('base_model_predictions', feature_union),
    ('meta_model', meta_model)
])

In [63]:
y_pred_final=final_inference_pipeline.predict(X_test)
y_pred_final=np.expm1(y_pred_final)

  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


In [64]:
from sklearn.metrics import mean_squared_error,r2_score,root_mean_squared_error,mean_absolute_error,median_absolute_error,mean_absolute_percentage_error
mse = mean_squared_error(y_test, y_pred_final)
r2 = r2_score(y_test, y_pred_final)
rmse = root_mean_squared_error(y_test, y_pred_final)
mae=mean_absolute_error(y_test, y_pred_final)
medae=median_absolute_error(y_test,y_pred_final)
mape=mean_absolute_percentage_error(y_test,y_pred_final)
ratio=rmse/mae

In [65]:
print(f"MSE: {mse}")
print(f"R2: {r2}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"MedAE: {medae}")
print(f"MAPE: {mape}")
print(f"Ratio: {ratio}")

MSE: 3523366.3398608062
R2: 0.9430902755033037
RMSE: 1877.0632221267365
MAE: 1090.908599431991
MedAE: 630.6494178340718
MAPE: 0.1132745932692966
Ratio: 1.7206420621343317


#7.Saving the pipeline

In [66]:
import joblib
joblib.dump(final_inference_pipeline, 'reg_final_inference_pipeline.pkl')

['reg_final_inference_pipeline.pkl']