# Preprocessing for models

## Setup

In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import resample

In [2]:
# Import data
contracts = pd.read_csv('../../data/contracts_clean_final.csv')

  contracts = pd.read_csv('../../data/contracts_clean_final.csv')


In [3]:
# Clean data that is not needed for the model (or would be too hard to encode)
contracts = contracts.drop(['Unnamed: 0', 'SF-SYSTEM', 'ORTS-NAME', 'STRASSE', 'PARTY-ID', 'contract_year',
                            'Kreis', 'DAMAGE_FLOOD_ZONE', 'SUM_INSURED', 'PIPE_PREMIUM_AMOUNT'], axis=1)

## Analysis of missing unique values

In [24]:
# Check number of nas
print('Number of NaNs:')
contracts.isna().sum()

Number of NaNs:


CORPORATE_DEVISION              0
Bundesland                 611207
Typ                        611207
ORTPLZ                          0
CONSTRACTION_DESIGN       1083267
CONSTRUCTION_YEAR               0
WFL                         14991
ZONE                       873474
TYPE_OF_DEDUCTIBLE              0
DRAIN_PIPE_INSURED              0
PRODUCTLINE               1583600
PRIOR_DAMAGES                   0
UVV-KZ                          0
UNDERWRITER                     0
YEAR                            0
DAMAGE_HEAVY_RAIN_ZONE          0
LONGITUDE                  873522
LATITUDE                   873522
DAMAGE                          0
dtype: int64

In [5]:
# Check unique values of problematic variables
print('Number of observations per unique value:')
print(contracts['CORPORATE_DEVISION'].value_counts())
print(contracts['TYPE_OF_DEDUCTIBLE'].value_counts())

Number of observations per unique value:
CORPORATE_DEVISION
VHV    6328447
VGV    2869997
W&W     621742
H&H     533058
Name: count, dtype: int64
TYPE_OF_DEDUCTIBLE
0    10324735
3       23310
4        3322
5        1869
2           8
Name: count, dtype: int64


In [6]:
# Drop nas as they are not supported in the model
contracts_drop_all = contracts.dropna()

In [7]:
# Alternative approach: only drop nas in some columns
columns_to_exclude = ['PRODUCTLINE', 'CONSTRACTION_DESIGN'] 
columns_to_check = [col for col in contracts.columns if col not in columns_to_exclude]
contracts_drop = contracts.dropna(subset=columns_to_check)

In [8]:
# Unique values only disappear if you drop nas in productline and constraction design
print('Number of observations per unique value:')
print(contracts_drop['CORPORATE_DEVISION'].value_counts())
print(contracts_drop['TYPE_OF_DEDUCTIBLE'].value_counts())

Number of observations per unique value:
CORPORATE_DEVISION
VHV    5887776
VGV    1779063
W&W     556265
H&H     465288
Name: count, dtype: int64
TYPE_OF_DEDUCTIBLE
0    8661573
3      21895
4       3180
5       1736
2          8
Name: count, dtype: int64


In [9]:
print(len(contracts))
print(len(contracts_drop))
print(len(contracts_drop_all))

10353244
8688392
7163047


In [10]:
print(contracts[contracts['CORPORATE_DEVISION']=='W&W']['PRODUCTLINE'].unique())
print(contracts[contracts['CORPORATE_DEVISION']=='H&H']['PRODUCTLINE'].unique())
print(contracts[contracts['CORPORATE_DEVISION']=='W&W']['CONSTRACTION_DESIGN'].unique())
print(contracts[contracts['CORPORATE_DEVISION']=='H&H']['CONSTRACTION_DESIGN'].unique())

[nan]
[nan]
[nan]
[nan 'NORMAL_VENTURE' 'PREFAB_HOUSE']


## Model preprocessing

In [4]:
# In productline and constraction design, substitute nas with 'unknown'
contracts['PRODUCTLINE'] = contracts['PRODUCTLINE'].fillna('UNKNOWN')
contracts['CONSTRACTION_DESIGN'] = contracts['CONSTRACTION_DESIGN'].fillna('UNKNOWN')

In [6]:
# Convert all zone values to string (to reduce amount of unique values)
print('Original unique values:', contracts['ZONE'].nunique())
contracts['ZONE'] = contracts['ZONE'].astype('str')
print('New unique values:', contracts['ZONE'].nunique())

Original unique values: 32
New unique values: 24


In [7]:
# Drop nas for the model
contracts = contracts.dropna()

In [8]:
contracts.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9466361 entries, 0 to 10353243
Data columns (total 20 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   ANO_SID                 float64
 1   CORPORATE_DEVISION      object 
 2   Bundesland              object 
 3   Typ                     object 
 4   ORTPLZ                  int64  
 5   CONSTRACTION_DESIGN     object 
 6   CONSTRUCTION_YEAR       float64
 7   WFL                     float64
 8   ZONE                    object 
 9   TYPE_OF_DEDUCTIBLE      int64  
 10  DRAIN_PIPE_INSURED      int64  
 11  PRODUCTLINE             object 
 12  PRIOR_DAMAGES           int64  
 13  UVV-KZ                  int64  
 14  UNDERWRITER             object 
 15  YEAR                    int64  
 16  DAMAGE_HEAVY_RAIN_ZONE  float64
 17  LONGITUDE               float64
 18  LATITUDE                float64
 19  DAMAGE                  int64  
dtypes: float64(6), int64(7), object(7)
memory usage: 1.5+ GB


In [9]:
print('Number of observations per unique value:')
print(contracts['CORPORATE_DEVISION'].value_counts())
print(contracts['TYPE_OF_DEDUCTIBLE'].value_counts())
print(contracts['PRODUCTLINE'].value_counts())
print(contracts['CONSTRACTION_DESIGN'].value_counts())

Number of observations per unique value:
CORPORATE_DEVISION
VHV    5887776
VGV    2557032
W&W     556265
H&H     465288
Name: count, dtype: int64
TYPE_OF_DEDUCTIBLE
0    9439542
3      21895
4       3180
5       1736
2          8
Name: count, dtype: int64
PRODUCTLINE
Top        4302512
Sonst      2043268
UNKNOWN    1413122
Kompakt     713760
Basis       507899
Plus        306973
Premium     178827
Name: count, dtype: int64
CONSTRACTION_DESIGN
NORMAL_VENTURE               8320660
UNKNOWN                       963827
PREFAB_HOUSE                   86763
DESIGN_CLASS_I                 76741
PREDOMINANTLY_WOODEN_ROOF      14358
0                               1302
PREFAB_HOUSE_II                 1039
PREFAB_HOUSE_I                   475
PREFAB_HOUSE_III                 452
DESIGN_CLASS_III                 366
DESIGN_CLASS_IV                  184
CARAVAN_MOTORHOME                113
DESIGN_CLASS_II                   45
DESIGN_CLASS_V                    36
Name: count, dtype: int64


In [10]:
# Save new dataset to csv
contracts.to_csv('../../data/data_all_unique_values.csv')

In [16]:
# Drop id first
# Prepare binary variables
columns_to_encode = contracts.select_dtypes(include=['object']).columns
df_binary = pd.get_dummies(contracts, columns=columns_to_encode, drop_first=True)
df_binary.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8688392 entries, 0 to 10353243
Data columns (total 54 columns):
 #   Column                             Dtype  
---  ------                             -----  
 0   ORTPLZ                             int64  
 1   CONSTRUCTION_YEAR                  float64
 2   WFL                                float64
 3   TYPE_OF_DEDUCTIBLE                 int64  
 4   DRAIN_PIPE_INSURED                 int64  
 5   PRIOR_DAMAGES                      int64  
 6   UVV-KZ                             int64  
 7   YEAR                               int64  
 8   DAMAGE_HEAVY_RAIN_ZONE             float64
 9   LONGITUDE                          float64
 10  LATITUDE                           float64
 11  DAMAGE                             int64  
 12  CORPORATE_DEVISION_VGV             bool   
 13  CORPORATE_DEVISION_VHV             bool   
 14  CORPORATE_DEVISION_W&W             bool   
 15  Bundesland_Bayern                  bool   
 16  Bundesland_Berlin     

In [17]:
# Training data: 2014; testing data: 2015

# full data
train = df_binary[df_binary['YEAR']==2014]
test = df_binary[df_binary['YEAR']==2015]

In [18]:
# Resample classes to ratio 1:4
# Only resample training data, for every year you're running the model on
# Don't resample test data

class_0 = train[train['DAMAGE'] == 0]
class_1 = train[train['DAMAGE'] == 1]
print(f"Before rebalancing: Class 0 = {len(class_0)}, Class 1 = {len(class_1)}")

# Determine the number of samples needed for each class
n_samples_0 = len(class_0) // 2  # Undersample class 0 to half its size
n_samples_1 = n_samples_0 // 4      # Oversample class 1 to be 1/4 of the undersampled class 0

# Resample class 0 (undersample)
class_0_resampled = resample(class_0, 
                                replace=False, 
                                n_samples=n_samples_0, 
                                random_state=42)

# Resample class 1 (oversample)
class_1_resampled = resample(class_1, 
                                replace=True, 
                                n_samples=n_samples_1, 
                                random_state=42)

# Combine the resampled classes
train_resampled = pd.concat([class_0_resampled, class_1_resampled])

# Print lengths after rebalancing
print(f"After rebalancing: Class 0 = {len(train_resampled[train_resampled['DAMAGE'] == 0])}, Class 1 = {len(train_resampled[train_resampled['DAMAGE'] == 1])}")

Before rebalancing: Class 0 = 718984, Class 1 = 7958
After rebalancing: Class 0 = 359492, Class 1 = 89873


In [19]:
# Separate features and target variable
X_train = train_resampled.drop(['YEAR', 'DAMAGE'], axis=1)
y_train = train_resampled['DAMAGE']

X_test = test.drop(['YEAR', 'DAMAGE'], axis=1)
y_test = test['DAMAGE']