In [3]:
import pandas as pd
import lightgbm as lgb

In [4]:
print(lgb.__version__)

4.5.0


In [2]:
def train_test_split_fed(df_y, df_x, train_count = 10000):
    y_train = df_y[0:train_count]
    y_test = df_y[train_count+1:]
    x_train = df_x[0:train_count]
    x_test = df_x[train_count+1:]

    return y_train, y_test, x_train, x_test

In [3]:
df = pd.read_csv('data/insurance_claims_label_feature_fed.csv')

print('df shape: {}'.format(df.shape))
df = df[df['ClaimAmount'] > 0]
df.head()

df shape: (24944, 94)


Unnamed: 0,raw_id,example_id,event_date,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,...,X65,X66,X67,X68,X69,X70,X71,X72,X73,X74
0,1,1,20240901,1.0,0.75,F,7.0,1.0,61.0,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,50.0,2.257113
1,2,2,20240901,1.0,0.14,B,12.0,5.0,50.0,60.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,60.0,-1.045961
2,3,3,20240901,1.0,0.14,E,4.0,0.0,36.0,85.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,85.0,1.332797
3,4,4,20240901,2.0,0.62,F,10.0,0.0,51.0,100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,100.0,2.257113
4,5,5,20240901,1.0,0.31,A,5.0,0.0,45.0,50.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,50.0,-1.869529


In [4]:
columns_x = ['X{}'.format(idx) for idx in range(75)]
df_x = df[columns_x]

columns_y = ['PurePremium', 'Exposure']
df_y = df[columns_y]

print('df_x shape: {}'.format(df_x.shape))
df_x.head()

df_x shape: (24944, 75)


Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X65,X66,X67,X68,X69,X70,X71,X72,X73,X74
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,50.0,2.257113
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,60.0,-1.045961
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,85.0,1.332797
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,100.0,2.257113
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,50.0,-1.869529


In [5]:
# from sklearn.model_selection import train_test_split

# y_train, y_test, x_train, x_test = train_test_split(df_y, df_x, random_state=0)
y_train, y_test, x_train, x_test = train_test_split_fed(df_y, df_x, train_count=20000)

print('y_train shape: {}, \t y_test shape: {}, \n x_train shape: {}, \t x_test shape: {}'
      .format(y_train.shape, y_test.shape, x_train.shape, x_test.shape))

y_train shape: (20000, 2), 	 y_test shape: (4943, 2), 
 x_train shape: (20000, 75), 	 x_test shape: (4943, 75)


In [6]:
dtrain = lgb.Dataset(x_train, label=y_train['PurePremium'])

In [7]:
params = {
    'objective': 'tweedie',
    'tweedie_variance_power': 1.5,
    'learning_rate': 0.1,
    'num_leaves': 31,
    'max_depth': -1,
    'min_data_in_leaf': 20,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambda_l2': 1
}

model = lgb.train(params, dtrain)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002516 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 479
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 75
[LightGBM] [Info] Start training from score 9.160343


In [8]:
y_test_pred = model.predict(x_test)
y_train_pred = model.predict(x_train)

In [9]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_tweedie_deviance

mse = mean_squared_error(y_train['PurePremium'], y_train_pred, sample_weight=y_train['Exposure'])
print("Train Mean Squared Error:", mse)

mae = mean_absolute_error(y_train['PurePremium'], y_train_pred, sample_weight=y_train['Exposure'])
print("Train Mean Absolute Error:", mae)

mte = mean_tweedie_deviance(y_test['PurePremium'], y_test_pred, sample_weight=y_test['Exposure'])
print("Test Mean Tweedie Deviance:", mte)

Train Mean Squared Error: 467847326.6336257
Train Mean Absolute Error: 4729.878474765
Test Mean Tweedie Deviance: 388782263.06982046
