# Data Cleaning

## Import packages

In [1]:
pip install pandas openpyxl

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install numpy

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install plotly

Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import numpy as np
import plotly.express as px

In [5]:
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): the blanket 'ignore' filter also hides any deprecation or
# chained-assignment warnings pandas may emit — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

## Import data

In [12]:
# Relative locations of the raw CSV exports.
# Forward slashes are used on purpose: the former backslash literals contained
# invalid escape sequences ('\c', '\m', '\d') that newer Python versions warn about,
# and backslash-separated paths only resolve on Windows.
contractPath = '../contract_all.csv'
damagePath = '../mergedData/damage.csv'

# Read both tables with pandas' default CSV settings
contract_all = pd.read_csv(contractPath)
damage = pd.read_csv(damagePath)

In [28]:
# Read the damage table from the data/csv folder.
# Forward slashes replace the former backslash literal, whose '\d' and '\c' were
# invalid escape sequences and which only resolved on Windows.
# NOTE(review): this overwrites the `damage` frame read from mergedData in the
# previous cell — confirm which file is the intended source.
damagePath = '../data/csv/damage.csv'
damage = pd.read_csv(damagePath)

In [29]:
# Print the damage frame's column names, non-null counts and dtypes
damage.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 394501 entries, 0 to 394500
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   DEPARTMENT_SEGMENT           394501 non-null  object 
 1   DEPARTMENT_RESPONSIBLE       394501 non-null  object 
 2   DEPARTMENT_CUSTOMER_SEGMENT  394501 non-null  object 
 3   ADDRESS_TYPE                 100241 non-null  object 
 4   DAMAGE_REPORTER              86642 non-null   float64
 5   EXPENSE                      394500 non-null  float64
 6   DAMAGE_CLUSTER_ID            305871 non-null  object 
 7   DATE_INITIAL_RESERVATION     394501 non-null  object 
 8   DATE_DAMAGE_EVENT            394501 non-null  object 
 9   DATE_COMPLETION_DATE         5105 non-null    object 
 10  PARTY_ID                     394480 non-null  object 
 11  CONTRACT_ID                  394501 non-null  int64  
 12  INITIAL_RESERVATION          394500 non-null  float64
 13 

In [14]:
# Print the contract frame's column names and dtypes
contract_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11534325 entries, 0 to 11534324
Data columns (total 31 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   CONTRACT_ID                  float64
 1   CORPORATE_DEVISION           object 
 2   RISK_LOCATION_ZIP_CODE       object 
 3   RISK_LOCATION_TOWN           object 
 4   RISK_LOCATION_STREET         object 
 5   SUM_INSURED                  object 
 6   CONSTRACTION_DESIGN          object 
 7   CONSTRUCTION_YEAR            int64  
 8   BASEMENT                     object 
 9   BUILDING_TYPE                object 
 10  LIVING_SPACE                 float64
 11  ZONE                         object 
 12  NO_CLAIMS_SCALE              float64
 13  TYPE_OF_DEDUCTIBLE           int64  
 14  DEDUCTIBLE_PERCENTAGE        float64
 15  DEDUCTIBLE_MIN               int64  
 16  DEDUCTIBLE_MAX               int64  
 17  DRAIN_PIPE_INSURED           object 
 18  PRODUCTLINE                  object 
 19

## Cleaning damage data

Drop these variables:
- The vast majority of data in ADDRESS_TYPE, DAMAGE_REPORTER, DATE_COMPLETION_DATE and CUMULEVENT are missing.
- Exclude variables with only one category (or with very few / irrelevant categories): 'DEPARTMENT_SEGMENT', 'DEPARTMENT_RESPONSIBLE', 'DEPARTMENT_CUSTOMER_SEGMENT', 'BUSINESS_TYPE_MARK', 'DAMAGE_BRANCH_DESIGNMENT', 'DAMAGE_BRANCH', 'SVCLUSTER', 'BUSINESS_MARK'
- Not relevant: 'PARTY_ID', 'RESTORATION_CANDIDAT'

In [30]:
# Drop columns
columnsToDrop = [
    # only one (or very few / not relevant) category
    'DEPARTMENT_SEGMENT',
    'DEPARTMENT_RESPONSIBLE',
    'DEPARTMENT_CUSTOMER_SEGMENT',
    'BUSINESS_TYPE_MARK',
    'BUSINESS_MARK',
    'DAMAGE_BRANCH_DESIGNMENT',
    'DAMAGE_BRANCH',
    'SVCLUSTER',
    # mostly missing
    'ADDRESS_TYPE',
    'DAMAGE_REPORTER',
    'DATE_COMPLETION_DATE',
    # not relevant
    'PARTY_ID',
    # mostly missing
    'CUMULEVENT',
    # not relevant
    'RESTORATION_CANDIDAT',
]

# Columns that are absent from the frame are skipped instead of raising
damage = damage.drop(labels=columnsToDrop, axis='columns', errors='ignore')

In [31]:
# Parse the date columns into datetimes; the mapping holds the extra
# pd.to_datetime keyword arguments used for each column.
# NOTE(review): only DATE_INITIAL_RESERVATION is parsed with dayfirst=True —
# confirm that DATE_DAMAGE_EVENT really is stored in a different layout.
dateParseOptions = {
    'DATE_DAMAGE_EVENT': {},
    'DATE_INITIAL_RESERVATION': {'dayfirst': True},
}
for dateColumn, parseOptions in dateParseOptions.items():
    damage[dateColumn] = pd.to_datetime(damage[dateColumn], **parseOptions)

# 'TOTAL_DAMAGE' uses the text marker 'UNDEFINED'; turn it into NaN, then cast to float
totalDamage = damage['TOTAL_DAMAGE'].replace('UNDEFINED', np.nan)
damage['TOTAL_DAMAGE'] = totalDamage.astype('float64')

# Identifier-like columns are stored as object
idColumns = ['BUSINESS_FIELD_ID', 'CONTRACT_ID', 'DAMAGE_TRAIT', 'DAMAGE_CAUSE', 'DAMAGE_ID', 'STATISTIC_NUMBER']
for idColumn in idColumns:
    damage[idColumn] = damage[idColumn].astype('object')

In [32]:
# Drop observations outside of Germany
# Keep the rows whose DAMAGE_NATION is 'D' (Germany); the column is then
# constant, so it is removed in the same step
isGerman = damage['DAMAGE_NATION'] == 'D'
damage = damage[isGerman].drop(columns=['DAMAGE_NATION'], errors='ignore')

# Remaining number of rows
len(damage)

305783

In [33]:
# Keep only rows whose damage event AND initial reservation both fall in 2014 or later
eventFrom2014 = damage['DATE_DAMAGE_EVENT'].dt.year >= 2014
reservationFrom2014 = damage['DATE_INITIAL_RESERVATION'].dt.year >= 2014
damage = damage[eventFrom2014 & reservationFrom2014]
len(damage)

247134

Drop negative values from financial variables:

In [34]:
# Financial columns that are checked for negative values
financialColumns = ['EXPENSE', 'INITIAL_RESERVATION', 'TOTAL_DAMAGE', 'RESIDUAL_AMOUNT', 'PAYOUT']

# Map each financial column to the number of its values that are below zero
negativeCounts = {name: (damage[name] < 0).sum() for name in financialColumns}

print(negativeCounts)

{'EXPENSE': 230, 'INITIAL_RESERVATION': 0, 'TOTAL_DAMAGE': 0, 'RESIDUAL_AMOUNT': 334, 'PAYOUT': 89}


In [35]:
# Flag, for every financial column, the values that are non-negative or missing
financials = damage[financialColumns]
mask = financials.ge(0) | financials.isna()

# Keep only rows in which every financial column passed the check,
# i.e. a row is dropped as soon as any one of its financial values is negative
rowsToKeep = mask.all(axis=1)
damage = damage[rowsToKeep]
len(damage)

246567

In [22]:
# # Exclude rows where EXPENSE, PAYOUT, and RESIDUAL_AMOUNT are all zero
# damage_all = damage_all[(damage_all['EXPENSE'] != 0) | (damage_all['PAYOUT'] != 0) | (damage_all['RESIDUAL_AMOUNT'] != 0)]
# print(len(damage_all))

# # Exclude rows where PIPE_PREMIUM_AMOUNT is na
# damage_all = damage_all.dropna(subset=['PIPE_PREMIUM_AMOUNT'])
# print(len(damage_all))

TODO Other changes:
- ZIP_CODE in Contract sometimes starts with a W. Does it stand for something? If not, change these entries to NA?
- No_claim_Scale is always 0 in contract14; in other years it is empty most of the time
- Damage_Event_Street is sometimes “Quartalsabrechnung” (quarterly settlement) or “Monatsabrechnung” (monthly settlement), especially if PAYOUT is really high
- Standardize yes/no values (ja, nein, j, n, 0, 1)

In [46]:
# Blank out zip codes that start with 'W' or 'O' or are not exactly 5 characters long
zipCodes = damage['DAMAGE_EVENT_ZIP_CODE']
hasLetterPrefix = zipCodes.str.startswith(('W', 'O'))
hasWrongLength = zipCodes.str.len() != 5
damage.loc[hasLetterPrefix | hasWrongLength, 'DAMAGE_EVENT_ZIP_CODE'] = np.nan

## Cleaning contract data

Drop these variables:
- The majority of data is missing for the variables BASEMENT, BUILDING_TYPE, NO_CLAIMS_SCALE, NO_CLAIMS_YEARS.
- Not relevant for our analysis: 'TYPE_OF_DEDUCTIBLE', 'DEDUCTIBLE_PERCENTAGE', 'DEDUCTIBLE_MIN', 'DEDUCTIBLE_MAX', 'INVENTORY_MANAGEMENT_SYSTEM', 'PARTY-ID', 'RISK_NATION'

In [23]:
# Columns removed from every yearly contract frame
# NOTE(review): 'PARTY-ID' is spelled with a hyphen here while the damage table uses
# 'PARTY_ID'; because of errors='ignore' a misspelled name is skipped silently — confirm.
columnsToDrop = [
    'TYPE_OF_DEDUCTIBLE',
    'DEDUCTIBLE_PERCENTAGE',
    'DEDUCTIBLE_MIN',
    'DEDUCTIBLE_MAX',
    'INVENTORY_MANAGEMENT_SYSTEM',
    'BASEMENT',
    'BUILDING_TYPE',
    'NO_CLAIMS_SCALE',
    'NO_CLAIMS_YEARS',
    'RISK_NATION',
    'PARTY-ID',
]

# Visit the globals contract14 ... contract24; names that do not exist are skipped,
# so this cell does nothing when those frames have not been created beforehand
for dfName in [f'contract{year}' for year in range(14, 25)]:
    if dfName not in globals():
        continue
    globals()[dfName] = globals()[dfName].drop(columns=columnsToDrop, errors='ignore')

In [None]:
# # Convert dates to date format
# contract['KEYDATE'] = pd.to_datetime(contract['KEYDATE'])

# # Convert ids to object
# contract['CONTRACT_ID'] = contract['CONTRACT_ID'].astype('object')

# Convert PREMIUM_AMOUNT to float
# damage_all['PREMIUM_AMOUNT'] = damage_all['PREMIUM_AMOUNT'].astype(float)

Drop negative values from financial variables (there are none):

In [25]:
# Financial columns of the contract frames that are checked for negative values
financialColumns = ['SUM_INSURED']

# Per contract frame: a Series holding, for each financial column, the count of values below zero
negativeCounts = {}

for dfName in (f'contract{year}' for year in range(14, 25)):
    # Frames that are not present in the global namespace are skipped
    if dfName not in globals():
        continue
    contractFrame = globals()[dfName]
    negativeCounts[dfName] = (contractFrame[financialColumns] < 0).sum()

print(negativeCounts) # nothing to drop

{'contract14': SUM_INSURED    0
dtype: int64, 'contract15': SUM_INSURED    0
dtype: int64, 'contract16': SUM_INSURED    0
dtype: int64, 'contract17': SUM_INSURED    0
dtype: int64, 'contract18': SUM_INSURED    0
dtype: int64, 'contract19': SUM_INSURED    0
dtype: int64, 'contract20': SUM_INSURED    0
dtype: int64, 'contract21': SUM_INSURED    0
dtype: int64, 'contract22': SUM_INSURED    0
dtype: int64, 'contract23': SUM_INSURED    0
dtype: int64, 'contract24': SUM_INSURED    0
dtype: int64}


In [26]:
# Exclude values where the construction year is in the future
# Set current year
currentYear = 2024


def blankFutureConstructionYears(df, latestYear):
    """Replace 'CONSTRUCTION_YEAR' values later than ``latestYear`` with NaN.

    The frame is modified in place and also returned. ``Series.where`` is used
    instead of assigning NaN through ``.loc``: writing NaN into an integer column
    by item assignment relies on an implicit upcast that pandas has deprecated,
    whereas ``where`` produces the float column directly. Years that are already
    missing stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a 'CONSTRUCTION_YEAR' column.
    latestYear : int
        Last year that is still accepted as a valid construction year.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient chaining.
    """
    years = df['CONSTRUCTION_YEAR']
    # Keep every value that is NOT greater than latestYear (NaN compares False, so it is kept as NaN)
    df['CONSTRUCTION_YEAR'] = years.where(~(years > latestYear))
    return df


# Loop through contract14 ... contract24 and set future 'CONSTRUCTION_YEAR' values to NaN;
# frames that are not present in the global namespace are skipped
for i in range(14, 25):
    df_name = f'contract{i}'
    if df_name in globals():
        # The frame is changed in place, so the global name keeps pointing at the cleaned data
        df = blankFutureConstructionYears(globals()[df_name], currentYear)
        # Print the number of rows with NaN 'CONSTRUCTION_YEAR' to verify
        # (this counts all missing years, not only the ones blanked out above)
        print(f"Number of NaN 'CONSTRUCTION_YEAR' in {df_name}: {df['CONSTRUCTION_YEAR'].isna().sum()}")

Number of NaN 'CONSTRUCTION_YEAR' in contract14: 85
Number of NaN 'CONSTRUCTION_YEAR' in contract15: 80
Number of NaN 'CONSTRUCTION_YEAR' in contract16: 71
Number of NaN 'CONSTRUCTION_YEAR' in contract17: 61
Number of NaN 'CONSTRUCTION_YEAR' in contract18: 56
Number of NaN 'CONSTRUCTION_YEAR' in contract19: 51
Number of NaN 'CONSTRUCTION_YEAR' in contract20: 48
Number of NaN 'CONSTRUCTION_YEAR' in contract21: 45
Number of NaN 'CONSTRUCTION_YEAR' in contract22: 41
Number of NaN 'CONSTRUCTION_YEAR' in contract23: 37
Number of NaN 'CONSTRUCTION_YEAR' in contract24: 36


TODO Other changes:
- ZIP_CODE in Contract sometimes starts with a W. Does it stand for something? If not, change these entries to NA?
- Sum_Insured is sometimes 0 or 1
- Construction_Design is sometimes 00; change it to NA?
- Construction Year is in the future or 0, 1, 1111; why? Change 111111 into NAs
- Living Space is sometimes 0 or 1; change it to NA?
- Standardize yes/no values (ja, nein, j, n, 0, 1)

In [None]:
# Set 'SUM_INSURED' to NaN where it's 0 or 1
# contract.loc[contract['SUM_INSURED'].isin([0, 1]), 'SUM_INSURED'] = np.nan

# Set 'LIVING_SPACE' to NaN where it's 0 or 1
# contract.loc[contract['LIVING_SPACE'].isin([0, 1]), 'LIVING_SPACE'] = np.nan

In [None]:
# # Exclude values where the construction year is in the future
# # Set the current year
# currentYear = 2024

# # Conditions for setting NaN
# # Future years or specific unrealistic years
# contract.loc[(contract['CONSTRUCTION_YEAR'] > currentYear) | 
#              (contract['CONSTRUCTION_YEAR'].isin([1111, 1, 0])), 'CONSTRUCTION_YEAR'] = np.nan

In [None]:
# # Replace values in the 'DRAIN_PIPE_INSURED' column
# contract['DRAIN_PIPE_INSURED'] = contract['DRAIN_PIPE_INSURED'].replace({'J': 1, 'N': 0})

# # Replace values in the 'UNDERINSURANCE_WAIVER' column
# contract['UNDERINSURANCE_WAIVER'] = contract['UNDERINSURANCE_WAIVER'].replace({'J': 1, 'N': 0})

In [None]:
# # Convert zip codes starting with 'W' or 'O', or those not 5 characters long to NaN
# contract.loc[contract['RISK_LOCATION_ZIP_CODE'].str.startswith(('W', 'O')) | 
#            (contract['RISK_LOCATION_ZIP_CODE'].str.len() != 5), 'RISK_LOCATION_ZIP_CODE'] = np.nan