# ML Pipeline

## Import Packages

In [1]:
import pandas as pd
import joblib
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

## Define Helper Functions for Custom Steps

In [2]:
# Box-Cox transform Annual Income
def boxcox_transform(X, l = 0.5):
    X = X.copy()
    X[:, 1] = stats.boxcox(X[:, 1], lmbda=l)
    return X

## Define Column Groups (same as your manual code)

In [3]:
# Numeric columns
num_cols = [
    "Age", "Annual Income", "Number of Dependents",
    "Health Score", "Previous Claims", "Vehicle Age",
    "Insurance Duration", "Credit Score"
]

# Label encoded categorical columns
enc_cols = ['Gender','Marital Status','Education Level','Occupation','Location','Policy Type',
            'Smoking Status','Exercise Frequency','Property Type']

## Create Pipelines for Each Column Type

In [4]:
# Numeric pipeline → median impute + scale
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('boxcox', FunctionTransformer(boxcox_transform, validate=False))
])

# Label pipeline → mode impute + basic encoding (OrdinalEncoder behaves like LabelEncoder)
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder())
])

## Combine All Preprocessing with ColumnTransformer

In [5]:
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, num_cols),
    ('cat', cat_pipeline, enc_cols)
])

In [6]:
preprocessor

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function box...002676BED6290>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,


## Preprossing Pipeline

In [7]:
preprossing_pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('scaler', StandardScaler())
])

In [8]:
preprossing_pipeline

0,1,2
,steps,"[('preprocessing', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function box...002676BED6290>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True


## Upload Data into Pipeline

In [9]:
train_data = pd.read_csv(r"N:\GUVI\Smart_Premium\Dataset\train.csv", index_col="id")

# Drop unnecessary columns
train_data.drop(["Customer Feedback","Policy Start Date"], axis=1,inplace=True)

x= train_data.drop("Premium Amount", axis=1)
y = train_data["Premium Amount"]

In [10]:
# Transform features
x_prepared = preprossing_pipeline.fit_transform(x)
x_prepared

array([[-1.64830057e+00, -6.96263059e-01, -7.46861703e-01, ...,
        -1.00312655e+00,  1.32826067e+00,  1.22411616e+00],
       [-1.59542341e-01,  2.24613083e-01,  7.33500203e-01, ...,
         9.96883191e-01, -4.59846873e-01,  1.22411616e+00],
       [-1.35054892e+00,  1.16618955e-02,  7.33500203e-01, ...,
         9.96883191e-01,  1.32826067e+00,  1.22411616e+00],
       ...,
       [-1.64830057e+00,  8.14505055e-01, -1.48704266e+00, ...,
        -1.00312655e+00, -4.59846873e-01, -3.78572954e-04],
       [ 1.03146424e+00, -5.20026600e-02, -7.46861703e-01, ...,
        -1.00312655e+00, -1.35390065e+00, -1.22487330e+00],
       [-1.49942475e+00, -5.20026600e-02, -1.48704266e+00, ...,
         9.96883191e-01, -4.59846873e-01,  1.22411616e+00]],
      shape=(1200000, 17))

In [11]:
# Save the fitted pipeline
joblib.dump(preprossing_pipeline, 'preprocessing_pipeline.pkl')

['preprocessing_pipeline.pkl']

## Split Data for Train and Test

In [12]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_prepared,y, test_size=0.2)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((960000, 17), (240000, 17), (960000,), (240000,))

# Model Deployment with MLFlow

In [13]:
import mlflow
import dagshub

In [14]:
dagshub.init(repo_owner='nithis127', repo_name='Smart_Premium', mlflow=True)



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=12aa2a66-244e-4890-8ea3-763091fd031e&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=e5628e6b0c3ddbb157be9c3dbda17c2ae17b4c6eeef8e0de9b033d88bfb52ca9




In [15]:
mlflow.set_tracking_uri("https://dagshub.com/nithis127/Smart_Premium.mlflow")

## First Experiment

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.metrics import mean_absolute_error,root_mean_squared_error, r2_score

In [22]:
models = [
    (
        "Linear Regression",
        {"fit_intercept" : True, "positive" : False},
        LinearRegression(),
        (x_train, y_train),
        (x_test, y_test)
    ),
    (
        "Decision Tree Regressor",
        {"min_samples_split" : 0.01, "max_depth" : 15, "random_state" : 42},
        DecisionTreeRegressor(),
        (x_train, y_train),
        (x_test, y_test)
    ),
    (
        "Random Forest Regressor",
        {"n_estimators" : 30, "max_depth" : 12, "random_state" : 42},
        RandomForestRegressor(),
        (x_train, y_train),
        (x_test, y_test)
    ),
    (
        "XGB Regressor",
        {"n_estimators" : 30, "max_depth" : 5, "random_state" : 42},
        XGBRegressor(),
        (x_train, y_train),
        (x_test, y_test)
    )
]

In [23]:
reports = []

for model_name, params, model, train_set, test_set in models:
    x_train = train_set[0]
    y_train = train_set[1]
    x_test = test_set[0]
    y_test = test_set[1]

    # apply hyperparameters and train the model
    model.set_params(**params)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # calculate metrics
    mae = mean_absolute_error(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # store the results
    reports.append((model_name, rmse, mae, r2))

In [24]:
reports

[('Linear Regression',
  865.243529278858,
  668.8954476263739,
  0.0028408775694358512),
 ('Decision Tree Regressor',
  849.8307633576054,
  645.946026544201,
  0.03804968105311124),
 ('Random Forest Regressor',
  848.9637258104648,
  644.7979919311817,
  0.040011534198063914),
 ('XGB Regressor', 850.2042021420918, 650.2329519114177, 0.03720408096198158)]

In [25]:
mlflow.set_experiment("Expeiriment_1")

for i, element in enumerate(models):
    model_name = element[0]
    params = element[1]
    model = element[2]
    report = reports[i]

    with mlflow.start_run(run_name = model_name):
        mlflow.log_params(params)
        mlflow.log_metrics({"RMSE" : report[1],
                            "MAE" : report[2],
                            "R2" : report[3]
                            })
        
        if "XGB Regressor" in model_name:
            mlflow.xgboost.log_model(model, "xgboost")
        else:
            mlflow.sklearn.log_model(model, "sklearn")

2025/11/18 20:26:51 INFO mlflow.tracking.fluent: Experiment with name 'Expeiriment_1' does not exist. Creating a new experiment.


🏃 View run Linear Regression at: https://dagshub.com/nithis127/Smart_Premium.mlflow/#/experiments/0/runs/891185e8c4904476ad25cbb1fbe828a4
🧪 View experiment at: https://dagshub.com/nithis127/Smart_Premium.mlflow/#/experiments/0




🏃 View run Decision Tree Regressor at: https://dagshub.com/nithis127/Smart_Premium.mlflow/#/experiments/0/runs/dd17f0c8a79347ff932fa4fa25d8fd28
🧪 View experiment at: https://dagshub.com/nithis127/Smart_Premium.mlflow/#/experiments/0




🏃 View run Random Forest Regressor at: https://dagshub.com/nithis127/Smart_Premium.mlflow/#/experiments/0/runs/01197c9225c642408122fb337f47e6ed
🧪 View experiment at: https://dagshub.com/nithis127/Smart_Premium.mlflow/#/experiments/0


  self.get_booster().save_model(fname)


🏃 View run XGB Regressor at: https://dagshub.com/nithis127/Smart_Premium.mlflow/#/experiments/0/runs/b2a42196f46943d3a6693bde23138e9c
🧪 View experiment at: https://dagshub.com/nithis127/Smart_Premium.mlflow/#/experiments/0


## Model Registration

In [26]:
model_name = "sp_rfr"
run_id = input("Enter RunID: ")
model_uri = f"runs:/{run_id}/sklearn"

with mlflow.start_run(run_id=run_id):
    mlflow.register_model(model_uri = model_uri, name = model_name)

model_uri

Successfully registered model 'sp_rfr'.
2025/11/18 20:30:59 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: sp_rfr, version 1
Created version '1' of model 'sp_rfr'.


🏃 View run Random Forest Regressor at: https://dagshub.com/nithis127/Smart_Premium.mlflow/#/experiments/0/runs/01197c9225c642408122fb337f47e6ed
🧪 View experiment at: https://dagshub.com/nithis127/Smart_Premium.mlflow/#/experiments/0


'runs:/01197c9225c642408122fb337f47e6ed/sklearn'

## Transition the Model to the Production

In [27]:
client = mlflow.MlflowClient()

model_name = "sp_rfr"
version_to_promote = 1

# Transition Version 1 to the "Production" stage
client.transition_model_version_stage(
    name=model_name,
    version=version_to_promote,
    stage="Production",
    archive_existing_versions=True  # this will archive other versions in Production stage, if any
)

print(f"Version {version_to_promote} of model '{model_name}' is now in Production stage.")

  client.transition_model_version_stage(


Version 1 of model 'sp_rfr' is now in Production stage.


## Save Production Model

In [29]:
# Load the underlying model
production_model = mlflow.sklearn.load_model('models:/sp_rfr/Production')

# Save as joblib pickle
joblib.dump(production_model, 'smart_premium_model.pkl')

print("✅ production Model saved as smart_premium_model.pkl")

✅ production Model saved as smart_premium_model.pkl
