In [93]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer, mean_squared_error, r2_score, root_mean_squared_log_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np
import logging
from lightgbm import LGBMRegressor

# Configure logging
logging.basicConfig(level=logging.INFO)

In [42]:
train_data = pd.read_csv(filepath_or_buffer='data/train.csv', nrows=1000)
test_data = pd.read_csv(filepath_or_buffer='data/test.csv', nrows=1000)

In [53]:
def clean_feature_names(df):
    df = df.rename(columns=lambda x: x.lower().replace(' ', '_'))
    return df

In [58]:
predictors = [
    'age', 
    'gender', 
    'annual_income', 
    'marital_status',
    'number_of_dependents', 
    'education_level', 
    'occupation', 
    'health_score',
    'location', 
    'policy_type', 
    'previous_claims', 
    'vehicle_age',
    'credit_score', 
    'insurance_duration', 
    'policy_start_date',
    'customer_feedback', 
    'smoking_status', 
    'exercise_frequency',
    'property_type'
]

target = 'premium_amount'

In [60]:
X = train_data[predictors]
y = train_data[target]

y_log = np.log1p(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.20, random_state=42)

In [82]:
def estimator_pipeline(estimator=estimator , numeric_features=None, ordinal_features=None, ordinal_categories=None, category_features=None):
    
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())   
    ])

    ordinal_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OrdinalEncoder(categories=[ordinal_categories['exercise_frequency']], handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    category_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('numeric_features', numeric_transformer, numeric_features),
            ('ordinal_features', ordinal_transformer, ordinal_features),
            ('category_features', category_transformer, category_features)
        ])

    full_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', estimator)
    ])
    
    return full_pipeline

In [83]:
# Features
numeric_features = [
    'age', 
    'annual_income', 
    'number_of_dependents', 
    'health_score',
    'previous_claims', 
    'vehicle_age', 
    'credit_score', 
    'insurance_duration'
]

category_features = [
    'marital_status',
    'occupation',
    'location',
    'property_type',
    'education_level',
    'policy_type',
    'customer_feedback',
    'gender',
    'smoking_status'
]

ordinal_features = ['exercise_frequency']

ordinal_categories = {
    'exercise_frequency': ['Rarely', 'Monthly', 'Weekly', 'Daily']
}

date_features = ['policy_start_date']

In [89]:
# Estimator
estimator = GradientBoostingRegressor(loss='squared_error', random_state=0)

In [None]:
GradientBoostingRegressor_pipeline = estimator_pipeline(
    estimator=estimator, 
    numeric_features=numeric_features, 
    ordinal_features=ordinal_features, 
    ordinal_categories=ordinal_categories, 
    category_features=category_features
    )

GradientBoostingRegressor_pipeline = GradientBoostingRegressor_pipeline.fit(X_train, y_train)


Scores: [-749.36527509 -655.84647474 -634.1363926  -667.7209729  -710.86681771]
Avg Score: -683.5871866076837


In [94]:
# Create scorer
scorer = make_scorer(lambda y, y_pred: root_mean_squared_log_error(y, y_pred), greater_is_better=False)

In [95]:
scores = cross_val_score(GradientBoostingRegressor_pipeline, X, y, scoring=scorer, cv=5)
print(f'Scores: {scores}')
print(f'Avg Score: {scores.mean()}')

Scores: [-1.28069587 -1.14623133 -1.25134406 -1.24300994 -1.11204105]
Avg Score: -1.2066644504782917


In [90]:
GradientBoostingRegressor_pipeline.score(X_test, y_test)

-0.11155031009792515

In [None]:
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import numpy as np

# Create scorer using sklearn's metric
rmsle_scorer = make_scorer(lambda y, y_pred: np.sqrt(mean_squared_log_error(y, y_pred)), 
                          greater_is_better=False)

param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'subsample': [0.8, 0.9, 1.0]
}

gb_model = GradientBoostingRegressor(random_state=42)

grid_search = GridSearchCV(
    estimator=gb_model,
    param_grid=param_grid,
    scoring=rmsle_scorer,
    cv=5,
    n_jobs=-1,
    verbose=1
)