In [162]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer, root_mean_squared_log_error
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np
import pandas as pd
import logging


# Configure logging
logging.basicConfig(level=logging.INFO)

In [132]:
train_data = pd.read_csv(filepath_or_buffer='data/train.csv', nrows=1000)

In [133]:
def clean_feature_names(df):
    df.rename(columns=lambda x: x.lower().replace(' ', '_'), inplace=True)
    return df

In [134]:
clean_feature_names(train_data)

Unnamed: 0,id,age,gender,annual_income,marital_status,number_of_dependents,education_level,occupation,health_score,location,...,previous_claims,vehicle_age,credit_score,insurance_duration,policy_start_date,customer_feedback,smoking_status,exercise_frequency,property_type,premium_amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,...,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,...,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,...,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,...,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,...,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,42.0,Male,45638.0,Single,2.0,PhD,Employed,50.865307,Urban,...,,12.0,776.0,1.0,2019-12-11 15:21:39.085129,Good,No,Rarely,Apartment,1806.0
996,996,64.0,Female,14094.0,Married,1.0,High School,Employed,29.351739,Urban,...,2.0,16.0,818.0,4.0,2021-10-02 15:21:39.190215,Average,Yes,Rarely,House,1116.0
997,997,,Female,80198.0,Divorced,1.0,Bachelor's,Self-Employed,29.359427,Suburban,...,2.0,4.0,775.0,4.0,2021-11-09 15:21:39.174719,Good,Yes,Rarely,Apartment,3381.0
998,998,25.0,Male,16981.0,Divorced,1.0,High School,Self-Employed,32.896941,Suburban,...,0.0,5.0,673.0,7.0,2020-05-03 15:21:39.097737,Poor,Yes,Rarely,Condo,718.0


In [135]:
predictors = [
    'age', 
    'gender', 
    'annual_income', 
    'marital_status',
    'number_of_dependents', 
    'education_level', 
    'occupation', 
    'health_score',
    'location', 
    'policy_type', 
    'previous_claims', 
    'vehicle_age',
    'credit_score', 
    'insurance_duration', 
    'policy_start_date',
    'customer_feedback', 
    'smoking_status', 
    'exercise_frequency',
    'property_type'
]

target = 'premium_amount'

In [136]:
X = train_data[predictors]
y = train_data[target]

y_log = np.log1p(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.20, random_state=42)

In [137]:
def estimator_pipeline(estimator=estimator , numeric_features=None, ordinal_features=None, ordinal_categories=None, category_features=None):
    
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())   
    ])

    ordinal_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OrdinalEncoder(categories=[ordinal_categories['exercise_frequency']], handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    category_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('numeric_features', numeric_transformer, numeric_features),
            ('ordinal_features', ordinal_transformer, ordinal_features),
            ('category_features', category_transformer, category_features)
        ])

    full_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('estimator', estimator)
    ])
    
    return full_pipeline

In [138]:
# Features
numeric_features = [
    'age', 
    'annual_income', 
    'number_of_dependents', 
    'health_score',
    'previous_claims', 
    'vehicle_age', 
    'credit_score', 
    'insurance_duration'
]

category_features = [
    'marital_status',
    'occupation',
    'location',
    'property_type',
    'education_level',
    'policy_type',
    'customer_feedback',
    'gender',
    'smoking_status'
]

ordinal_features = ['exercise_frequency']

ordinal_categories = {
    'exercise_frequency': ['Rarely', 'Monthly', 'Weekly', 'Daily']
}

date_features = ['policy_start_date']

In [139]:
# Estimator
estimator = GradientBoostingRegressor()

In [140]:
GradientBoostingRegressor_pipeline = estimator_pipeline(
    estimator=estimator, 
    numeric_features=numeric_features, 
    ordinal_features=ordinal_features, 
    ordinal_categories=ordinal_categories, 
    category_features=category_features
    )

GradientBoostingRegressor_pipeline = GradientBoostingRegressor_pipeline.fit(X_train, y_train)


In [141]:
# Create scorer
scorer = make_scorer(lambda y, y_pred: root_mean_squared_log_error(y, np.expm1(y_pred)), greater_is_better=False)

In [142]:
scores = cross_val_score(GradientBoostingRegressor_pipeline, X, y, scoring=scorer, cv=5)
print(f'Scores: {scores}')
print(f'Avg Score: {scores.mean()}')

  scorer = make_scorer(lambda y, y_pred: root_mean_squared_log_error(y, np.expm1(y_pred)), greater_is_better=False)
Traceback (most recent call last):
  File "/home/vscode/.local/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 140, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/home/vscode/.local/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 388, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_1004/1396724387.py", line 2, in <lambda>
    scorer = make_scorer(lambda y, y_pred: root_mean_squared_log_error(y, np.expm1(y_pred)), greater_is_better=False)
                                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vscode/.local/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 216, in wrapper
    return func(*args, **kwargs)
         

Scores: [nan nan nan nan nan]
Avg Score: nan


  scorer = make_scorer(lambda y, y_pred: root_mean_squared_log_error(y, np.expm1(y_pred)), greater_is_better=False)
Traceback (most recent call last):
  File "/home/vscode/.local/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 140, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/home/vscode/.local/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 388, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_1004/1396724387.py", line 2, in <lambda>
    scorer = make_scorer(lambda y, y_pred: root_mean_squared_log_error(y, np.expm1(y_pred)), greater_is_better=False)
                                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vscode/.local/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 216, in wrapper
    return func(*args, **kwargs)
         

In [147]:
param_grid = {
    # 'estimator__n_estimators': [100, 200, 300],
    'estimator__learning_rate': [0.05, 0.1],
    'estimator__max_depth': [3, 4, 5],
    'estimator__min_samples_split': [2, 5, 10],
    # 'estimator__subsample': [0.8, 0.9, 1.0]
}

grid_search = GridSearchCV(
    estimator=GradientBoostingRegressor_pipeline,
    param_grid=param_grid,
    scoring=scorer,
    cv=2,
    n_jobs=-1,
    verbose=1
)

In [148]:
# Fit the grid search
grid_search.fit(X_train, y_train)

# Get best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best score:", abs(grid_search.best_score_))

# Use best model
best_model = grid_search.best_estimator_

Fitting 2 folds for each of 18 candidates, totalling 36 fits
Best parameters: {'estimator__learning_rate': 0.1, 'estimator__max_depth': 5, 'estimator__min_samples_split': 5}
Best score: 4.550737211783817


In [149]:
test_data = pd.read_csv(filepath_or_buffer='data/test.csv', nrows=1000)

In [151]:
clean_feature_names(test_data)

Unnamed: 0,id,age,gender,annual_income,marital_status,number_of_dependents,education_level,occupation,health_score,location,policy_type,previous_claims,vehicle_age,credit_score,insurance_duration,policy_start_date,customer_feedback,smoking_status,exercise_frequency,property_type
0,1200000,28.0,Female,2310.0,,4.0,Bachelor's,Self-Employed,7.657981,Rural,Basic,,19.0,,1.0,2023-06-04 15:21:39.245086,Poor,Yes,Weekly,House
1,1200001,31.0,Female,126031.0,Married,2.0,Master's,Self-Employed,13.381379,Suburban,Premium,,14.0,372.0,8.0,2024-04-22 15:21:39.224915,Good,Yes,Rarely,Apartment
2,1200002,47.0,Female,17092.0,Divorced,0.0,PhD,Unemployed,24.354527,Urban,Comprehensive,,16.0,819.0,9.0,2023-04-05 15:21:39.134960,Average,Yes,Monthly,Condo
3,1200003,28.0,Female,30424.0,Divorced,3.0,PhD,Self-Employed,5.136225,Suburban,Comprehensive,1.0,3.0,770.0,5.0,2023-10-25 15:21:39.134960,Poor,Yes,Daily,House
4,1200004,24.0,Male,10863.0,Divorced,2.0,High School,Unemployed,11.844155,Suburban,Premium,,14.0,755.0,7.0,2021-11-26 15:21:39.259788,Average,No,Weekly,House
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1200995,36.0,Male,53985.0,Divorced,2.0,PhD,Self-Employed,12.243040,Rural,Comprehensive,,9.0,329.0,2.0,2022-05-13 15:21:39.237118,Average,No,Weekly,Condo
996,1200996,28.0,Male,15456.0,Single,2.0,High School,,25.753524,Urban,Premium,0.0,18.0,785.0,2.0,2020-04-14 15:21:39.084065,Good,Yes,Monthly,Condo
997,1200997,52.0,Female,17992.0,Married,3.0,Master's,Unemployed,14.638429,Rural,Comprehensive,0.0,6.0,718.0,9.0,2022-02-28 15:21:39.185215,Good,Yes,Rarely,House
998,1200998,22.0,Male,4034.0,Divorced,0.0,Bachelor's,Employed,39.518849,Suburban,Comprehensive,3.0,1.0,697.0,7.0,2023-09-16 15:21:39.118178,Good,Yes,Rarely,House


In [156]:
# Make predictions
y_pred = best_model.predict(test_data)
y_pred = np.expm1(y_pred)

In [158]:
sample_submission = pd.read_csv(filepath_or_buffer='data/sample_submission.csv', nrows=1000)

In [161]:
sample_submission['Premium Amount'] = y_pred
sample_submission.to_csv('submission.csv', index=False)