In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import resample

In [8]:
# Import data
contracts = pd.read_csv('../../data/contracts_clean_final.csv')
contracts.info()

  contracts = pd.read_csv('../../data/contracts_clean_final.csv')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10353244 entries, 0 to 10353243
Data columns (total 30 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Unnamed: 0              int64  
 1   ANO_SID                 float64
 2   CORPORATE_DEVISION      object 
 3   Bundesland              object 
 4   Kreis                   object 
 5   Typ                     object 
 6   ORTPLZ                  int64  
 7   ORTS-NAME               object 
 8   STRASSE                 object 
 9   SUM_INSURED             float64
 10  CONSTRACTION_DESIGN     object 
 11  CONSTRUCTION_YEAR       float64
 12  WFL                     float64
 13  ZONE                    object 
 14  SF-SYSTEM               float64
 15  TYPE_OF_DEDUCTIBLE      int64  
 16  DRAIN_PIPE_INSURED      int64  
 17  PRODUCTLINE             object 
 18  PRIOR_DAMAGES           int64  
 19  UVV-KZ                  int64  
 20  UNDERWRITER             object 
 21  PARTY-ID                objec

In [9]:
# Clean data that is not needed for the model (or would be too hard to encode)
contracts = contracts.drop(['Unnamed: 0', 'SF-SYSTEM', 'ANO_SID', 'ORTS-NAME', 'STRASSE', 'PARTY-ID', 'contract_year',
                            'Kreis', 'DAMAGE_FLOOD_ZONE'], axis=1)
contracts = contracts.dropna()

In [10]:
# Convert all zone values to string (to reduce amount of unique values)
print('Original unique values:', contracts['ZONE'].nunique())
contracts['ZONE'] = contracts['ZONE'].astype('str')
print('New unique values:', contracts['ZONE'].nunique())

Original unique values: 27
New unique values: 18


In [11]:
# Prepare binary variables
columns_to_encode = contracts.select_dtypes(include=['object']).columns
df_binary = pd.get_dummies(contracts, columns=columns_to_encode, drop_first=True)
df_binary.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7163047 entries, 0 to 10353243
Data columns (total 63 columns):
 #   Column                                         Dtype  
---  ------                                         -----  
 0   ORTPLZ                                         int64  
 1   SUM_INSURED                                    float64
 2   CONSTRUCTION_YEAR                              float64
 3   WFL                                            float64
 4   TYPE_OF_DEDUCTIBLE                             int64  
 5   DRAIN_PIPE_INSURED                             int64  
 6   PRIOR_DAMAGES                                  int64  
 7   UVV-KZ                                         int64  
 8   PIPE_PREMIUM_AMOUNT                            float64
 9   YEAR                                           int64  
 10  DAMAGE_HEAVY_RAIN_ZONE                         float64
 11  LONGITUDE                                      float64
 12  LATITUDE                                      

In [12]:
# Training data: 2014; testing data: 2015

# full data
train = df_binary[df_binary['YEAR']==2014]
test = df_binary[df_binary['YEAR']==2015]

In [15]:
# Resample classes to ratio 1:4
# Only resample training data, for every year you're running the model on
# Don't resample test data

class_0 = train[train['DAMAGE'] == 0]
class_1 = train[train['DAMAGE'] == 1]
print(f"Before rebalancing: Class 0 = {len(class_0)}, Class 1 = {len(class_1)}")

# Determine the number of samples needed for each class
n_samples_0 = len(class_0)
n_samples_1 = n_samples_0 // 4

# Resample class 1
class_1_resampled = resample(class_1, 
                                replace=True, 
                                n_samples=n_samples_1, 
                                random_state=1234)

# Combine the resampled class 1 with class 0
train_resampled = pd.concat([class_0, class_1_resampled])

# Print lengths after rebalancing
print(f"After rebalancing: Class 0 = {len(train_resampled[train_resampled['DAMAGE'] == 0])}, Class 1 = {len(train_resampled[train_resampled['DAMAGE'] == 1])}")

Before rebalancing: Class 0 = 614492, Class 1 = 6760
After rebalancing: Class 0 = 614492, Class 1 = 153623


In [16]:
# Separate features and target variable
X_train = train_resampled.drop(['YEAR', 'DAMAGE'], axis=1)
y_train = train_resampled['DAMAGE']

X_test = test.drop(['YEAR', 'DAMAGE'], axis=1)
y_test = test['DAMAGE']