<font size="3">Importing Required Libraries

In this section, we import the necessary libraries for data handling, visualization, preprocessing, and building the deep learning model.</font>

In [1]:
import pandas as pd
import numpy as np
import re
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.metrics import mean_absolute_error  , r2_score
import pickle

<font size="3">Loading and Exploring the Dataset

We load the dataset and check its structure. This helps us understand what kind of data we are working with.</font>

In [2]:
data_train = pd.read_csv('train.csv')
data_train.head()

Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,1,A,B,A,B,A,A,A,A,B,...,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843,2213.18
1,2,A,B,A,A,A,A,A,A,B,...,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496,1283.6
2,5,A,B,A,A,B,A,A,A,B,...,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425,3005.09
3,10,B,B,A,B,A,A,A,A,B,...,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642,939.85
4,11,A,B,A,B,A,A,A,A,B,...,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606,2763.85


<font size="3">Exploratory Data Analysis (EDA)

In this section, we check the basic structure of the dataset and look for missing values or abnormal data.</font>

In [3]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188318 entries, 0 to 188317
Columns: 132 entries, id to loss
dtypes: float64(15), int64(1), object(116)
memory usage: 189.7+ MB


In [4]:
data_train.isnull().sum()

id        0
cat1      0
cat2      0
cat3      0
cat4      0
         ..
cont11    0
cont12    0
cont13    0
cont14    0
loss      0
Length: 132, dtype: int64

In [5]:
data_train.describe()

Unnamed: 0,id,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
count,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0
mean,294135.982561,0.493861,0.507188,0.498918,0.491812,0.487428,0.490945,0.48497,0.486437,0.485506,0.498066,0.493511,0.49315,0.493138,0.495717,3037.337686
std,169336.084867,0.18764,0.207202,0.202105,0.211292,0.209027,0.205273,0.17845,0.19937,0.18166,0.185877,0.209737,0.209427,0.212777,0.222488,2904.086186
min,1.0,1.6e-05,0.001149,0.002634,0.176921,0.281143,0.012683,0.069503,0.23688,8e-05,0.0,0.035321,0.036232,0.000228,0.179722,0.67
25%,147748.25,0.34609,0.358319,0.336963,0.327354,0.281143,0.336105,0.350175,0.3128,0.35897,0.36458,0.310961,0.311661,0.315758,0.29461,1204.46
50%,294539.5,0.475784,0.555782,0.527991,0.452887,0.422268,0.440945,0.438285,0.44106,0.44145,0.46119,0.457203,0.462286,0.363547,0.407403,2115.57
75%,440680.5,0.623912,0.681761,0.634224,0.652072,0.643315,0.655021,0.591045,0.62358,0.56682,0.61459,0.678924,0.675759,0.689974,0.724623,3864.045
max,587633.0,0.984975,0.862654,0.944251,0.954297,0.983674,0.997162,1.0,0.9802,0.9954,0.99498,0.998742,0.998484,0.988494,0.844848,121012.25


<font size="3">cont and categorical patterns</font>

In [6]:
cat_pattern = re.compile('^cat([0-9]|[0-9][0-9]|[0-9][0-9][0-9])$')
cont_pattern = re.compile('^cont([0-9]|[0-9][0-9]|[0-9][0-9][0-9])$')
cat_columns = sorted([cat for cat in data_train.columns if 'cat' in cat] , key= lambda s : int(s[3:]))
cont_columns = sorted([cont for cont in data_train.columns if 'cont' in cont] , key=lambda s: int(s[4:]))
cat_columns
cont_columns

['cont1',
 'cont2',
 'cont3',
 'cont4',
 'cont5',
 'cont6',
 'cont7',
 'cont8',
 'cont9',
 'cont10',
 'cont11',
 'cont12',
 'cont13',
 'cont14']

In [7]:
data_train

Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,1,A,B,A,B,A,A,A,A,B,...,0.718367,0.335060,0.30260,0.67135,0.83510,0.569745,0.594646,0.822493,0.714843,2213.18
1,2,A,B,A,A,A,A,A,A,B,...,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496,1283.60
2,5,A,B,A,A,B,A,A,A,B,...,0.289648,0.315545,0.27320,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425,3005.09
3,10,B,B,A,B,A,A,A,A,B,...,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.321570,0.605077,0.602642,939.85
4,11,A,B,A,B,A,A,A,A,B,...,0.178193,0.247408,0.24564,0.22089,0.21230,0.204687,0.202213,0.246011,0.432606,2763.85
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188313,587620,A,B,A,A,A,A,A,A,B,...,0.242437,0.289949,0.24564,0.30859,0.32935,0.223038,0.220003,0.333292,0.208216,1198.62
188314,587624,A,A,A,A,A,B,A,A,A,...,0.334270,0.382000,0.63475,0.40455,0.47779,0.307628,0.301921,0.318646,0.305872,1108.34
188315,587630,A,B,A,A,A,A,A,B,B,...,0.345883,0.370534,0.24564,0.45808,0.47779,0.445614,0.443374,0.339244,0.503888,5762.64
188316,587632,A,B,A,A,A,A,A,A,B,...,0.704364,0.562866,0.34987,0.44767,0.53881,0.863052,0.852865,0.654753,0.721707,1562.87


In [8]:
# drop id
data_train.drop(columns=['id'] , inplace=True)

<font size="3">Splitting Data into Training and Testing Sets

We split the dataset into training and testing sets using a fixed random seed for reproducibility.</font>

In [9]:
X = data_train.iloc[:,:-1]
y = data_train.loss
x_train , x_test , y_train , y_test = train_test_split(X , y , random_state=42)

<font size="3">find cat index</font>

In [10]:
cat_index = [i for i in range(0,len(data_train.columns)) if cat_pattern.match(data_train.columns[i])]
cat_index

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115]

In [11]:
np.asarray(cat_index)

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115])

<font size="3">1)catboost model</font>

In [13]:
catboost_model = CatBoostRegressor(iterations=300 , learning_rate=0.05 , max_depth=6 , eval_metric='MAE' , task_type='GPU' , random_state=42)
catboost_model.fit(x_train , y_train , cat_features=np.asarray(cat_index)  , eval_set=(x_test , y_test))

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 1926.5620867	test: 1918.2623619	best: 1918.2623619 (0)	total: 128ms	remaining: 38.1s
1:	total: 235ms	remaining: 35s
2:	total: 340ms	remaining: 33.7s
3:	total: 449ms	remaining: 33.2s
4:	total: 550ms	remaining: 32.4s
5:	learn: 1757.2448208	test: 1751.2722175	best: 1751.2722175 (5)	total: 652ms	remaining: 32s
6:	total: 747ms	remaining: 31.3s
7:	total: 849ms	remaining: 31s
8:	total: 945ms	remaining: 30.6s
9:	total: 1.05s	remaining: 30.4s
10:	learn: 1643.5955196	test: 1638.1570093	best: 1638.1570093 (10)	total: 1.15s	remaining: 30.3s
11:	total: 1.25s	remaining: 30s
12:	total: 1.33s	remaining: 29.4s
13:	total: 1.41s	remaining: 28.9s
14:	total: 1.51s	remaining: 28.8s
15:	learn: 1564.6990187	test: 1559.6822430	best: 1559.6822430 (15)	total: 1.62s	remaining: 28.7s
16:	total: 1.69s	remaining: 28.2s
17:	total: 1.8s	remaining: 28.2s
18:	total: 1.9s	remaining: 28.1s
19:	total: 2s	remaining: 28s
20:	learn: 1507.7006471	test: 1503.9233645	best: 1503.9233645 (20)	total: 2.11s	remaining: 28s


<catboost.core.CatBoostRegressor at 0x14cbf645a00>

<font size="3">test data accuracy with catboost model</font>

In [14]:
y_pred = catboost_model.predict(x_test)
print(f'mean_absolute_error = {mean_absolute_error(y_test , y_pred)}')

mean_absolute_error = 1206.6615223310732


converte y to log

In [16]:
X = data_train.iloc[:,:-1]
y = np.log(data_train.loss)
x_train , x_test , y_train , y_test = train_test_split(X , y , random_state=42)
catboost_model = CatBoostRegressor(iterations=300 , learning_rate=0.05 , max_depth=6 , eval_metric='MAE' , task_type='GPU' , random_state=42)
catboost_model.fit(x_train , y_train , cat_features=np.asarray(cat_index) , eval_set=(x_test , y_test))

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.6474047	test: 0.6473399	best: 0.6473399 (0)	total: 112ms	remaining: 33.5s
1:	total: 207ms	remaining: 30.8s
2:	total: 314ms	remaining: 31.1s
3:	total: 415ms	remaining: 30.7s
4:	total: 480ms	remaining: 28.3s
5:	learn: 0.5969259	test: 0.5969873	best: 0.5969873 (5)	total: 601ms	remaining: 29.4s
6:	total: 671ms	remaining: 28.1s
7:	total: 756ms	remaining: 27.6s
8:	total: 827ms	remaining: 26.7s
9:	total: 920ms	remaining: 26.7s
10:	learn: 0.5605698	test: 0.5605869	best: 0.5605869 (10)	total: 986ms	remaining: 25.9s
11:	total: 1.07s	remaining: 25.6s
12:	total: 1.15s	remaining: 25.5s
13:	total: 1.26s	remaining: 25.7s
14:	total: 1.34s	remaining: 25.4s
15:	learn: 0.5345847	test: 0.5346705	best: 0.5346705 (15)	total: 1.45s	remaining: 25.7s
16:	total: 1.51s	remaining: 25.2s
17:	total: 1.57s	remaining: 24.7s
18:	total: 1.67s	remaining: 24.7s
19:	total: 1.77s	remaining: 24.8s
20:	learn: 0.5160519	test: 0.5162423	best: 0.5162423 (20)	total: 1.87s	remaining: 24.9s
21:	total: 1.96s	remaining: 

<catboost.core.CatBoostRegressor at 0x14c83256c70>

<font size="3">Train data accuracy with catboost model</font>

In [17]:
y_pred = catboost_model.predict(x_train)
print(np.exp(mean_absolute_error(y_train , y_pred)))

1.5222304586280626


<font size="3">test data accuracy with catboost model</font>

In [18]:
y_pred = catboost_model.predict(x_test)
print(f'mean_absolute_error = {np.exp(mean_absolute_error(y_test , y_pred))}')

mean_absolute_error = 1.5250882640126069


<font size="3">save model for prediction new data</font>

In [19]:
with open('Catboost_Model_Insurance.txt' , 'wb') as file:
    pickle.dump(catboost_model , file)