# Analysis of results

## Setup

In [1]:
# Libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# Load the data set from CSV and print its column overview.
# Alternative input that is currently not used:
#   '../predictions/Random_Forest_with_rebalancing.csv'
data_path = '../data/data_all_unique_values.csv'
df = pd.read_csv(data_path)
df.info()

  df = pd.read_csv('../data/data_all_unique_values.csv')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8688392 entries, 0 to 8688391
Data columns (total 20 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Unnamed: 0              int64  
 1   CORPORATE_DEVISION      object 
 2   Bundesland              object 
 3   Typ                     object 
 4   ORTPLZ                  int64  
 5   CONSTRACTION_DESIGN     object 
 6   CONSTRUCTION_YEAR       float64
 7   WFL                     float64
 8   ZONE                    object 
 9   TYPE_OF_DEDUCTIBLE      int64  
 10  DRAIN_PIPE_INSURED      int64  
 11  PRODUCTLINE             object 
 12  PRIOR_DAMAGES           int64  
 13  UVV-KZ                  int64  
 14  UNDERWRITER             object 
 15  YEAR                    int64  
 16  DAMAGE_HEAVY_RAIN_ZONE  float64
 17  LONGITUDE               float64
 18  LATITUDE                float64
 19  DAMAGE                  int64  
dtypes: float64(5), int64(8), object(7)
memory usage: 1.3+ GB


In [26]:
# Cast every ZONE value to a string and report the number of distinct
# values before and after the cast.
# NOTE(review): the drop in distinct values suggests ZONE held the same zone
# under more than one Python type (e.g. int and str) - confirm.
zone_unique_before = df['ZONE'].nunique()
print('Original unique values:', zone_unique_before)

df['ZONE'] = df['ZONE'].astype('str')

zone_unique_after = df['ZONE'].nunique()
print('New unique values:', zone_unique_after)

Original unique values: 32
New unique values: 23


In [9]:
# Random 0.1 % subsample of df with a fixed seed, so the draw is repeatable
df_sub = df.sample(
    frac=0.001,
    random_state=1234,
)

## Data Visualisation

In [4]:
# Get a color palette
# (Plotly Express' qualitative "Plotly" color sequence; the plotting cells
# iterate over it and slice it like a list to color one bar per category)
palette = px.colors.qualitative.Plotly

In [18]:
# Division sizes and damage probability per corporate division.
#   Row 1: horizontal bars, number of observations per division.
#   Row 2: vertical bars, share of observations with DAMAGE == 1 per division.

# Count all observations and the observations with DAMAGE == 1 per division
total_counts = df.groupby('CORPORATE_DEVISION').size().reset_index(name='total_count')
damage_counts = df[df['DAMAGE'] == 1].groupby('CORPORATE_DEVISION').size().reset_index(name='damage_count')

# Left merge keeps divisions without any damage row; their missing count becomes 0
summary = total_counts.merge(damage_counts, on='CORPORATE_DEVISION', how='left').fillna(0)
summary['damage_prob'] = (summary['damage_count'] / summary['total_count'])
summary['devision_percent'] = (summary['total_count'] / summary['total_count'].sum())

# Create a subplot
fig = make_subplots(rows=2, cols=1, shared_xaxes=False,
                    vertical_spacing=0.1, subplot_titles=("Division Sizes", "Probability of Damage per Division"))

# Row 1: horizontal bar chart for division size (summary sorted ascending by size)
summary = summary.sort_values(by='total_count', ascending=True).reset_index(drop=True)
fig.add_trace(go.Bar(
    y=summary['CORPORATE_DEVISION'],
    x=summary['total_count'],
    orientation='h',
    name='Division Size'
), row=1, col=1)

# Row 2: bar chart for probability of damage (summary sorted descending by size)
summary = summary.sort_values(by='total_count', ascending=False).reset_index(drop=True)
fig.add_trace(go.Bar(
    x=summary['CORPORATE_DEVISION'],
    y=summary['damage_prob'],
    name='Probability of Damage'
), row=2, col=1)

# Update layout.
# In the horizontal chart of row 1 the counts are on the x-axis and the
# division names on the y-axis, so the size title belongs to `xaxis`.
fig.update_layout(height=600, width=800, showlegend=False,
                  title_text="Division Sizes and Probability of Damage",
                  xaxis_title="Total Size of Division",
                  yaxis_title="Corporate Division",
                  xaxis2_title="Corporate Division",
                  yaxis2_title="Probability of Damage")

fig.show()

In [20]:
# Division shares (one stacked horizontal bar) and damage probability per division.
#   Row 1: a single stacked bar with one segment per division (share in percent).
#   Row 2: vertical bars, share of observations with DAMAGE == 1 per division.

# Count all observations and the observations with DAMAGE == 1 per division
total_counts = df.groupby('CORPORATE_DEVISION').size().reset_index(name='total_count')
damage_counts = df[df['DAMAGE'] == 1].groupby('CORPORATE_DEVISION').size().reset_index(name='damage_count')

# Left merge keeps divisions without any damage row; their missing count becomes 0
summary = total_counts.merge(damage_counts, on='CORPORATE_DEVISION', how='left').fillna(0)
summary['damage_prob'] = (summary['damage_count'] / summary['total_count'])
summary['devision_percent'] = (summary['total_count'] / summary['total_count'].sum()) * 100

# Sort summary by the size of the division
summary = summary.sort_values(by='total_count', ascending=False).reset_index(drop=True)

# Create a subplot
# NOTE(review): shared_xaxes=True links the numeric percentage axis of row 1
# with the categorical division axis of row 2 - confirm this is intended.
fig = make_subplots(rows=2, cols=1, vertical_spacing=0.1, shared_xaxes=True,
                    subplot_titles=("Division Sizes", "Probability of Damage per Division"))

# Plot 1

# Create the bar segments.
# Colors are taken from the palette modulo its length: pairing the rows with
# the palette via zip() would silently drop every division beyond the number
# of palette colors.
n_colors = len(palette)
for i, (division, percentage) in enumerate(zip(summary['CORPORATE_DEVISION'], summary['devision_percent'])):
    fig.add_trace(go.Bar(
        x=[percentage],
        y=[''],
        name=division,
        orientation='h',
        marker=dict(color=palette[i % n_colors]),
        text=f"{percentage:.2f}%",
        textposition='inside',
        showlegend=True
    ), row=1, col=1)

# Update layout to remove gaps and set y-axis
fig.update_layout(
    barmode='stack',
    xaxis=dict(range=[0, 100]),
    showlegend=True,
    yaxis=dict(showticklabels=False),
    legend=dict(title='Corporate Division')
)

# Plot 2

# Add bar chart for probability of damage; one color per bar, in the same
# order as the segments of plot 1
bar_colors = [palette[i % n_colors] for i in range(len(summary))]
fig.add_trace(go.Bar(
    x=summary['CORPORATE_DEVISION'],
    y=summary['damage_prob'],
    name='Probability of Damage',
    marker=dict(color=bar_colors),
    showlegend=False
), row=2, col=1)

# Update layout
fig.update_layout(height=600, width=800,
                  title_text="Division Sizes and Probability of Damage",
                  xaxis2_title="Corporate Division",
                  yaxis2_title="Probability of Damage")

fig.show()

In [21]:
# Bundesland sizes and damage probability per Bundesland.
#   Row 1: horizontal bars, number of observations per Bundesland.
#   Row 2: vertical bars, share of observations with DAMAGE == 1 per Bundesland.

# Count all observations and the observations with DAMAGE == 1 per Bundesland
total_counts = df.groupby('Bundesland').size().reset_index(name='total_count')
damage_counts = df[df['DAMAGE'] == 1].groupby('Bundesland').size().reset_index(name='damage_count')

# Left merge keeps Bundesland values without any damage row; their missing count becomes 0
summary = total_counts.merge(damage_counts, on='Bundesland', how='left').fillna(0)
summary['damage_prob'] = (summary['damage_count'] / summary['total_count'])
summary['bundesland_percent'] = (summary['total_count'] / summary['total_count'].sum())

# Create a subplot
fig = make_subplots(rows=2, cols=1, shared_xaxes=False,
                    vertical_spacing=0.1, subplot_titles=("Bundesland Sizes", "Probability of Damage per Bundesland"))

# Row 1: horizontal bar chart (summary sorted ascending by size)
summary = summary.sort_values(by='total_count', ascending=True).reset_index(drop=True)
fig.add_trace(go.Bar(
    y=summary['Bundesland'],
    x=summary['total_count'],
    orientation='h',
    name='Bundesland Size'
), row=1, col=1)

# Row 2: bar chart for probability of damage (summary sorted descending by size)
summary = summary.sort_values(by='total_count', ascending=False).reset_index(drop=True)
fig.add_trace(go.Bar(
    x=summary['Bundesland'],
    y=summary['damage_prob'],
    name='Probability of Damage'
), row=2, col=1)

# Update layout (the figure title names Bundesland, matching the subplot titles)
fig.update_layout(height=600, width=800, showlegend=False,
                  title_text="Bundesland Sizes and Probability of Damage",
                  yaxis2_title="Probability of Damage")

fig.show()

In [22]:
# Line chart: probability of DAMAGE == 1 by construction year.
year_min, year_max = 1800, 2024

# Keep only rows whose construction year lies in [year_min, year_max] (both inclusive)
in_year_range = df['CONSTRUCTION_YEAR'].between(year_min, year_max)
df_years = df[in_year_range]

# Observations per year, and observations with DAMAGE == 1 per year
damaged_rows = df_years[df_years['DAMAGE'].eq(1)]
total_counts = df_years.groupby('CONSTRUCTION_YEAR').size().rename('total_count').reset_index()
damage_counts = damaged_rows.groupby('CONSTRUCTION_YEAR').size().rename('damage_count').reset_index()

# Combine both counts; years without any damage row get a damage count of 0
summary = total_counts.merge(damage_counts, on='CONSTRUCTION_YEAR', how='left').fillna(0)

# Share of observations with DAMAGE == 1 in each year
summary['damage_prob'] = summary['damage_count'] / summary['total_count']

# Keep only the years that are divisible by 5 (the other years are dropped, not aggregated)
summary = summary[summary['CONSTRUCTION_YEAR'].mod(5).eq(0)]

# Build the line chart
damage_line = go.Scatter(
    x=summary['CONSTRUCTION_YEAR'],
    y=summary['damage_prob'],
    mode='lines',
    name='Probability of Damage'
)
fig = go.Figure()
fig.add_trace(damage_line)

# Titles and the fixed x-axis range
fig.update_layout(
    title='Probability of Damage over Construction Years',
    xaxis_title='Construction Year',
    yaxis_title='Probability of Damage',
    xaxis=dict(range=[year_min, year_max])
)

fig.show()

In [28]:
# Violin plot of the log-transformed living space (WFL) against DAMAGE,
# drawn on the subsample df_sub.

# Apply the log transformation to the sample's own WFL column.
# log1p(x) = log(1 + x), so a living space of 0 maps to 0 instead of -inf.
# (Taking the log of the full df['WFL'] and assigning it here would compute
# the transform for every row of df and rely on index alignment to pick the
# sampled rows.)
df_sub_wfl = df_sub.copy()
df_sub_wfl['LOG_WFL'] = np.log1p(df_sub_wfl['WFL'])

# Violin plot with log-transformed wfl
fig = px.violin(df_sub_wfl,
                x='LOG_WFL',
                y='DAMAGE',
                orientation='h',
                title='Living Space vs Damage (Log-transformed)',
                labels={'DAMAGE':'Damage', 'LOG_WFL':'Living Space in sqm (Log-Transformed)'}
                )

# Adding the mean
fig.update_traces(meanline_visible=True)
fig.show()

In [34]:
# Zone sizes and damage probability per zone.
#   Row 1: horizontal bars, number of observations per ZONE value.
#   Row 2: vertical bars, share of observations with DAMAGE == 1 per ZONE value.
is_damaged = df['DAMAGE'].eq(1)

# Observations per zone, and observations with DAMAGE == 1 per zone
total_counts = df.groupby('ZONE').size().rename('total_count').reset_index()
damage_counts = df[is_damaged].groupby('ZONE').size().rename('damage_count').reset_index()

# Zones without any damage row get a damage count of 0
summary = total_counts.merge(damage_counts, on='ZONE', how='left').fillna(0)
summary = summary.assign(
    damage_prob=lambda s: s['damage_count'] / s['total_count'],
    zone_percent=lambda s: s['total_count'] / s['total_count'].sum(),
)

# Figure with two stacked panels
fig = make_subplots(
    rows=2,
    cols=1,
    shared_xaxes=False,
    vertical_spacing=0.1,
    subplot_titles=("Zone Sizes", "Probability of Damage per Zone"),
)

# Panel 1: zone sizes, rows ordered by ZONE descending
summary = summary.sort_values(by=['ZONE'], ascending=False).reset_index(drop=True)
size_bars = go.Bar(
    y=summary['ZONE'],
    x=summary['total_count'],
    orientation='h',
    name='Zone Size'
)
fig.add_trace(size_bars, row=1, col=1)

# Panel 2: damage probability, rows ordered by ZONE ascending
summary = summary.sort_values(by=['ZONE'], ascending=True).reset_index(drop=True)
prob_bars = go.Bar(
    x=summary['ZONE'],
    y=summary['damage_prob'],
    name='Probability of Damage'
)
fig.add_trace(prob_bars, row=2, col=1)

# Figure size and titles
fig.update_layout(
    height=600,
    width=800,
    showlegend=False,
    title_text="Zone Sizes and Probability of Damage",
    yaxis2_title="Probability of Damage",
)

fig.show()

In [43]:
# Horizontal violin plot of DAMAGE_HEAVY_RAIN_ZONE against DAMAGE,
# drawn on the subsample df_sub.
violin_labels = {'DAMAGE':'Damage', 'DAMAGE_HEAVY_RAIN_ZONE':'Heavy Rain Zone'}

fig = px.violin(
    df_sub,
    x='DAMAGE_HEAVY_RAIN_ZONE',
    y='DAMAGE',
    orientation='h',
    title='Heavy Rain Zone vs Damage',
    labels=violin_labels,
)

# Draw the mean line inside each violin
fig.update_traces(meanline_visible=True)
fig.show()

In [44]:
# Type-of-deductible sizes and damage probability per type of deductible.
#   Row 1: horizontal bars, number of observations per TYPE_OF_DEDUCTIBLE value.
#   Row 2: vertical bars, share of observations with DAMAGE == 1 per value.

# Count all observations and the observations with DAMAGE == 1 per type
total_counts = df.groupby('TYPE_OF_DEDUCTIBLE').size().reset_index(name='total_count')
damage_counts = df[df['DAMAGE'] == 1].groupby('TYPE_OF_DEDUCTIBLE').size().reset_index(name='damage_count')

# Left merge keeps types without any damage row; their missing count becomes 0
summary = total_counts.merge(damage_counts, on='TYPE_OF_DEDUCTIBLE', how='left').fillna(0)
summary['damage_prob'] = (summary['damage_count'] / summary['total_count'])
summary['type_percent'] = (summary['total_count'] / summary['total_count'].sum())

# Create a subplot
fig = make_subplots(rows=2, cols=1, shared_xaxes=False,
                    vertical_spacing=0.1, subplot_titles=("Type of Deductible Size", "Probability of Damage per Type of Deductible"))

# Row 1: horizontal bar chart (summary sorted descending by type).
# The trace is named after the deductible type it shows.
summary = summary.sort_values(by=['TYPE_OF_DEDUCTIBLE'], ascending=False).reset_index(drop=True)
fig.add_trace(go.Bar(
    y=summary['TYPE_OF_DEDUCTIBLE'],
    x=summary['total_count'],
    orientation='h',
    name='Type of Deductible Size'
), row=1, col=1)

# Row 2: bar chart for probability of damage (summary sorted ascending by type)
summary = summary.sort_values(by=['TYPE_OF_DEDUCTIBLE'], ascending=True).reset_index(drop=True)
fig.add_trace(go.Bar(
    x=summary['TYPE_OF_DEDUCTIBLE'],
    y=summary['damage_prob'],
    name='Probability of Damage'
), row=2, col=1)

# Update layout
fig.update_layout(height=600, width=800, showlegend=False,
                  title_text="Type of Deductible and Probability of Damage",
                  yaxis2_title="Probability of Damage")

fig.show()

In [45]:
# Display the current summary table
# (per the recorded output: one row per TYPE_OF_DEDUCTIBLE value with
# total_count, damage_count, damage_prob and type_percent)
summary

Unnamed: 0,TYPE_OF_DEDUCTIBLE,total_count,damage_count,damage_prob,type_percent
0,0,8661573,100187.0,0.011567,0.9969132
1,2,8,0.0,0.0,9.207688e-07
2,3,21895,83.0,0.003791,0.002520029
3,4,3180,14.0,0.004403,0.0003660056
4,5,1736,9.0,0.005184,0.0001998068


In [51]:
# Prior-damages sizes and damage probability per PRIOR_DAMAGES value.
#   Row 1: horizontal bars, number of observations per PRIOR_DAMAGES value.
#   Row 2: vertical bars, share of observations with DAMAGE == 1 per value.

# Count all observations and the observations with DAMAGE == 1 per value
total_counts = df.groupby('PRIOR_DAMAGES').size().reset_index(name='total_count')
damage_counts = df[df['DAMAGE'] == 1].groupby('PRIOR_DAMAGES').size().reset_index(name='damage_count')

# Left merge keeps values without any damage row; their missing count becomes 0
summary = total_counts.merge(damage_counts, on='PRIOR_DAMAGES', how='left').fillna(0)
summary['damage_prob'] = (summary['damage_count'] / summary['total_count'])
summary['type_percent'] = (summary['total_count'] / summary['total_count'].sum())

# Create a subplot; all titles and trace names refer to PRIOR_DAMAGES,
# the column that is actually plotted
fig = make_subplots(rows=2, cols=1, shared_xaxes=False,
                    vertical_spacing=0.1, subplot_titles=("Prior Damages Sizes", "Probability of Damage per Prior Damages"))

# Row 1: horizontal bar chart (summary sorted descending by PRIOR_DAMAGES)
summary = summary.sort_values(by=['PRIOR_DAMAGES'], ascending=False).reset_index(drop=True)
fig.add_trace(go.Bar(
    y=summary['PRIOR_DAMAGES'],
    x=summary['total_count'],
    orientation='h',
    name='Prior Damages Size'
), row=1, col=1)

# Row 2: bar chart for probability of damage (summary sorted ascending by PRIOR_DAMAGES)
summary = summary.sort_values(by=['PRIOR_DAMAGES'], ascending=True).reset_index(drop=True)
fig.add_trace(go.Bar(
    x=summary['PRIOR_DAMAGES'],
    y=summary['damage_prob'],
    name='Probability of Damage'
), row=2, col=1)

# Update layout
fig.update_layout(height=600, width=800, showlegend=False,
                  title_text="Prior Damages and Probability of Damage",
                  yaxis2_title="Probability of Damage")

fig.show()

In [52]:
# Display the current summary table
# (per the recorded output: one row per PRIOR_DAMAGES value with
# total_count, damage_count, damage_prob and type_percent)
summary

Unnamed: 0,PRIOR_DAMAGES,total_count,damage_count,damage_prob,type_percent
0,0,8630197,96034,0.011128,0.993302
1,1,44071,2767,0.062785,0.005072
2,2,10969,1001,0.091257,0.001262
3,3,2068,268,0.129594,0.000238
4,4,678,107,0.157817,7.8e-05
5,5,185,27,0.145946,2.1e-05
6,6,115,43,0.373913,1.3e-05
7,7,31,14,0.451613,4e-06
8,8,39,11,0.282051,4e-06
9,9,39,21,0.538462,4e-06


In [47]:
# Product line shares (one stacked horizontal bar) and damage probability per product line.
#   Row 1: a single stacked bar with one segment per product line (share in percent).
#   Row 2: vertical bars, share of observations with DAMAGE == 1 per product line.

# Count all observations and the observations with DAMAGE == 1 per product line
total_counts = df.groupby('PRODUCTLINE').size().reset_index(name='total_count')
damage_counts = df[df['DAMAGE'] == 1].groupby('PRODUCTLINE').size().reset_index(name='damage_count')

# Left merge keeps product lines without any damage row; their missing count becomes 0
summary = total_counts.merge(damage_counts, on='PRODUCTLINE', how='left').fillna(0)
summary['damage_prob'] = (summary['damage_count'] / summary['total_count'])
summary['line_percent'] = (summary['total_count'] / summary['total_count'].sum()) * 100

# Sort summary by the size of the product line
summary = summary.sort_values(by='total_count', ascending=False).reset_index(drop=True)

# Create a subplot
# NOTE(review): shared_xaxes=True links the numeric percentage axis of row 1
# with the categorical product line axis of row 2 - confirm this is intended.
fig = make_subplots(rows=2, cols=1, vertical_spacing=0.1, shared_xaxes=True,
                    subplot_titles=("Product Line Sizes", "Probability of Damage per Product Line"))

# Plot 1

# Create the bar segments.
# Colors are taken from the palette modulo its length: pairing the rows with
# the palette via zip() would silently drop every product line beyond the
# number of palette colors.
n_colors = len(palette)
for i, (product_line, percentage) in enumerate(zip(summary['PRODUCTLINE'], summary['line_percent'])):
    fig.add_trace(go.Bar(
        x=[percentage],
        y=[''],
        name=product_line,
        orientation='h',
        marker=dict(color=palette[i % n_colors]),
        text=f"{percentage:.2f}%",
        textposition='inside',
        showlegend=True
    ), row=1, col=1)

# Update layout to remove gaps and set y-axis
fig.update_layout(
    barmode='stack',
    xaxis=dict(range=[0, 100]),
    showlegend=True,
    yaxis=dict(showticklabels=False),
    legend=dict(title='Product Line')
)

# Plot 2

# Add bar chart for probability of damage; one color per bar, in the same
# order as the segments of plot 1
bar_colors = [palette[i % n_colors] for i in range(len(summary))]
fig.add_trace(go.Bar(
    x=summary['PRODUCTLINE'],
    y=summary['damage_prob'],
    name='Probability of Damage',
    marker=dict(color=bar_colors),
    showlegend=False
), row=2, col=1)

# Update layout
fig.update_layout(height=600, width=800,
                  title_text="Product Line and Probability of Damage",
                  xaxis2_title="Product Line",
                  yaxis2_title="Probability of Damage")

fig.show()

In [53]:
# Underwriter shares (one stacked horizontal bar) and damage probability per underwriter value.
#   Row 1: a single stacked bar with one segment per UNDERWRITER value (share in percent).
#   Row 2: vertical bars, share of observations with DAMAGE == 1 per UNDERWRITER value.

# Count all observations and the observations with DAMAGE == 1 per UNDERWRITER value
total_counts = df.groupby('UNDERWRITER').size().reset_index(name='total_count')
damage_counts = df[df['DAMAGE'] == 1].groupby('UNDERWRITER').size().reset_index(name='damage_count')

# Left merge keeps values without any damage row; their missing count becomes 0
summary = total_counts.merge(damage_counts, on='UNDERWRITER', how='left').fillna(0)
summary['damage_prob'] = (summary['damage_count'] / summary['total_count'])
# The column keeps the name 'line_percent' so the summary table has the same columns as before
summary['line_percent'] = (summary['total_count'] / summary['total_count'].sum()) * 100

# Sort summary by group size
summary = summary.sort_values(by='total_count', ascending=False).reset_index(drop=True)

# Create a subplot; all titles, the legend and the axis label refer to
# UNDERWRITER, the column that is actually plotted
# NOTE(review): shared_xaxes=True links the numeric percentage axis of row 1
# with the categorical underwriter axis of row 2 - confirm this is intended.
fig = make_subplots(rows=2, cols=1, vertical_spacing=0.1, shared_xaxes=True,
                    subplot_titles=("Underwriter Sizes", "Probability of Damage per Underwriter"))

# Plot 1

# Create the bar segments.
# Colors are taken from the palette modulo its length: pairing the rows with
# the palette via zip() would silently drop every value beyond the number of
# palette colors.
n_colors = len(palette)
for i, (underwriter, percentage) in enumerate(zip(summary['UNDERWRITER'], summary['line_percent'])):
    fig.add_trace(go.Bar(
        x=[percentage],
        y=[''],
        name=underwriter,
        orientation='h',
        marker=dict(color=palette[i % n_colors]),
        text=f"{percentage:.2f}%",
        textposition='inside',
        showlegend=True
    ), row=1, col=1)

# Update layout to remove gaps and set y-axis
fig.update_layout(
    barmode='stack',
    xaxis=dict(range=[0, 100]),
    showlegend=True,
    yaxis=dict(showticklabels=False),
    legend=dict(title='Underwriter')
)

# Plot 2

# Add bar chart for probability of damage; one color per bar, in the same
# order as the segments of plot 1
bar_colors = [palette[i % n_colors] for i in range(len(summary))]
fig.add_trace(go.Bar(
    x=summary['UNDERWRITER'],
    y=summary['damage_prob'],
    name='Probability of Damage',
    marker=dict(color=bar_colors),
    showlegend=False
), row=2, col=1)

# Update layout
fig.update_layout(height=600, width=800,
                  title_text="Underwriter and Probability of Damage",
                  xaxis2_title="Underwriter",
                  yaxis2_title="Probability of Damage")

fig.show()

In [54]:
# Display the current summary table
# (per the recorded output: one row per UNDERWRITER value with
# total_count, damage_count, damage_prob and line_percent)
summary

Unnamed: 0,UNDERWRITER,total_count,damage_count,damage_prob,line_percent
0,Y,8687969,100056,0.011517,99.995131
1,N,423,237,0.560284,0.004869
