In [1]:
from sklearn.linear_model import TweedieRegressor
import pandas as pd

In [3]:
import sklearn
print(sklearn.__version__)

1.5.2


In [4]:
import pandas as pd
from functools import partial
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    mean_tweedie_deviance,
)

def score_estimator(
        estimator,
        x_train,
        x_test,
        y_train,
        y_test,
        target,
        weights,
        tweedie_powers=None,
):
    """Evaluate an estimator on train and test sets with different metrics"""

    metrics = [
        ('D² explained', None),  # Use default scorer if it exists
        ('mean abs. error', mean_absolute_error),
        ('mean squared error', mean_squared_error),
    ]
    if tweedie_powers:
        metrics += [
            (
                'mean Tweedie dev p={:.4f}'.format(power),
                partial(mean_tweedie_deviance, power=power),
            )
            for power in tweedie_powers
        ]
    res = []
    for subset_label, x_df, y_df in [
        ('train', x_train, y_train),
        ('test', x_test, y_test),
    ]:
        y, _weights = y_df[target], y_df[weights]
        for score_label, metric in metrics:
            if isinstance(estimator, tuple) and len(estimator) == 2:
                # Score the model consisting of the product of frequency and
                # severity models.
                est_freq, est_sev = estimator
                y_pred = est_freq.predict(x_df) * est_sev.predict(x_df)
            else:
                y_pred = estimator.predict(x_df)

            if metric is None:
                if not hasattr(estimator, 'score'):
                    continue
                print('x_df shape: {}, y_df shape: {}'.format(x_df.shape, y_df.shape))
                score = estimator.score(x_df, y, sample_weight=_weights)
            else:
                score = metric(y, y_pred, sample_weight=_weights)
            
            res.append({'subset': subset_label, 'metric': score_label, 'score': score})

    res = (
        pd.DataFrame(res)
        .set_index(['metric', 'subset'])
        .score.unstack(-1)
        .round(4)
        .loc[:, ['train', 'test']]
    )

    return res

In [5]:
import pandas as pd
from functools import partial
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    mean_tweedie_deviance,
)

def score_estimator_tf(
        x_train,
        x_test,
        y_train,
        y_test,
        y_train_pred,
        y_test_pred,
        target,
        weights,
        tweedie_powers=None,
):
    """Evaluate an estimator on train and test sets with different metrics"""

    metrics = [
        ('D² explained', None),  # Use default scorer if it exists
        ('mean abs. error', mean_absolute_error),
        ('mean squared error', mean_squared_error),
    ]
    if tweedie_powers:
        metrics += [
            (
                'mean Tweedie dev p={:.4f}'.format(power),
                partial(mean_tweedie_deviance, power=power),
            )
            for power in tweedie_powers
        ]
    res = []
    for subset_label, x_df, y_df, y_pred in [
        ('train', x_train, y_train, y_train_pred),
        ('test', x_test, y_test, y_test_pred),
    ]:
        y, _weights = y_df[target], y_df[weights]
        for score_label, metric in metrics:
            score = metric(y, y_pred, sample_weight=_weights)
            res.append({'subset': subset_label, 'metric': score_label, 'score': score})

    res = (
        pd.DataFrame(res)
        .set_index(['metric', 'subset'])
        .score.unstack(-1)
        .round(4)
        .loc[:, ['train', 'test']]
    )

    return res

In [6]:
def train_test_split_fed(df_y, df_x, train_count = 10000):
    y_train = df_y[0:train_count]
    y_test = df_y[train_count+1:]
    x_train = df_x[0:train_count]
    x_test = df_x[train_count+1:]

    return y_train, y_test, x_train, x_test

In [7]:
df = pd.read_csv('data/insurance_claims_label_feature_fed.csv')

print('df shape: {}'.format(df.shape))
df = df[df['ClaimAmount'] > 0]
df.head()

df shape: (24944, 94)


Unnamed: 0,raw_id,example_id,event_date,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,...,X65,X66,X67,X68,X69,X70,X71,X72,X73,X74
0,1,1,20240901,1.0,0.75,F,7.0,1.0,61.0,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,50.0,2.257113
1,2,2,20240901,1.0,0.14,B,12.0,5.0,50.0,60.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,60.0,-1.045961
2,3,3,20240901,1.0,0.14,E,4.0,0.0,36.0,85.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,85.0,1.332797
3,4,4,20240901,2.0,0.62,F,10.0,0.0,51.0,100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,100.0,2.257113
4,5,5,20240901,1.0,0.31,A,5.0,0.0,45.0,50.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,50.0,-1.869529


In [8]:
columns_x = ['X{}'.format(idx) for idx in range(75)]
df_x = df[columns_x]

columns_y = ['PurePremium', 'Exposure']
df_y = df[columns_y]

print('df_x shape: {}'.format(df_x.shape))
df_x.head()

df_x shape: (24944, 75)


Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X65,X66,X67,X68,X69,X70,X71,X72,X73,X74
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,50.0,2.257113
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,60.0,-1.045961
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,85.0,1.332797
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,100.0,2.257113
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,50.0,-1.869529


In [9]:
# from sklearn.model_selection import train_test_split

# y_train, y_test, x_train, x_test = train_test_split(df_y, df_x, random_state=0)
y_train, y_test, x_train, x_test = train_test_split_fed(df_y, df_x, train_count=10000)

print('y_train shape: {}, \t y_test shape: {}, \n x_train shape: {}, \t x_test shape: {}'
      .format(y_train.shape, y_test.shape, x_train.shape, x_test.shape))

y_train shape: (10000, 2), 	 y_test shape: (14943, 2), 
 x_train shape: (10000, 75), 	 x_test shape: (14943, 75)


In [10]:
from sklearn.linear_model import TweedieRegressor

glm_pure_premium = TweedieRegressor(power=1.5, alpha=0.1, solver='newton-cholesky')
glm_pure_premium.fit(X=x_train, y=y_train['PurePremium'], sample_weight=y_train['Exposure'])

In [11]:
tweedie_powers = [1.5, 1.7, 1.8, 1.9, 1.99, 1.999, 1.9999]

scores_glm_pure_premium = score_estimator(
    glm_pure_premium,
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
    target='PurePremium',
    weights='Exposure',
    tweedie_powers=tweedie_powers,
)

scores = pd.concat(
    [scores_glm_pure_premium],
    axis=1,
    sort=True,
    keys=('TweedieRegressor'),
)
print('Evaluation of the Product Model and the Tweedie Regressor on target PurePremium')
with pd.option_context('display.expand_frame_repr', False):
    print(scores)

x_df shape: (10000, 75), y_df shape: (10000, 2)
x_df shape: (14943, 75), y_df shape: (14943, 2)
Evaluation of the Product Model and the Tweedie Regressor on target PurePremium
                                      T              
subset                            train          test
metric                                               
D² explained               7.420000e-02 -2.560000e-02
mean Tweedie dev p=1.5000  9.264830e+01  1.132393e+02
mean Tweedie dev p=1.7000  1.803830e+01  2.119190e+01
mean Tweedie dev p=1.8000  8.144500e+00  9.357400e+00
mean Tweedie dev p=1.9000  3.733000e+00  4.186900e+00
mean Tweedie dev p=1.9900  1.874000e+00  2.053600e+00
mean Tweedie dev p=1.9990  1.750400e+00  1.913500e+00
mean Tweedie dev p=1.9999  1.738500e+00  1.900100e+00
mean abs. error            2.786880e+03  3.052893e+03
mean squared error         7.198422e+08  6.350466e+08


  scores = pd.concat(
