# Analysis of results

## Setup

In [15]:
# Libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [16]:
# Import data
# df = pd.read_csv('../predictions/Random_Forest_with_rebalancing.csv')
df = pd.read_csv('../data/data_all_unique_values.csv')
df.info()


Columns (9) have mixed types. Specify dtype option on import or set low_memory=False.



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9466361 entries, 0 to 9466360
Data columns (total 21 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Unnamed: 0              int64  
 1   ANO_SID                 float64
 2   CORPORATE_DEVISION      object 
 3   Bundesland              object 
 4   Typ                     object 
 5   ORTPLZ                  int64  
 6   CONSTRACTION_DESIGN     object 
 7   CONSTRUCTION_YEAR       float64
 8   WFL                     float64
 9   ZONE                    object 
 10  TYPE_OF_DEDUCTIBLE      int64  
 11  DRAIN_PIPE_INSURED      int64  
 12  PRODUCTLINE             object 
 13  PRIOR_DAMAGES           int64  
 14  UVV-KZ                  int64  
 15  UNDERWRITER             object 
 16  YEAR                    int64  
 17  DAMAGE_HEAVY_RAIN_ZONE  float64
 18  LONGITUDE               float64
 19  LATITUDE                float64
 20  DAMAGE                  int64  
dtypes: float64(6), int64(8), object

In [17]:
# Convert all zone values to string (to reduce amount of unique values)
print('Original unique values:', df['ZONE'].nunique())
df['ZONE'] = df['ZONE'].astype('str')
print('New unique values:', df['ZONE'].nunique())

Original unique values: 32
New unique values: 24


In [18]:
df_sub = df.sample(frac=0.001, random_state=1234)

In [19]:
# Get a color palette
palette = px.colors.qualitative.Plotly

## Data Visualisation

In [20]:
# Calculate the total number of observations and the percentage of damage by corporate division
total_counts = df.groupby('CORPORATE_DEVISION').size().reset_index(name='total_count')
damage_counts = df[df['DAMAGE'] == 1].groupby('CORPORATE_DEVISION').size().reset_index(name='damage_count')
summary = total_counts.merge(damage_counts, on='CORPORATE_DEVISION', how='left').fillna(0)
summary['damage_prob'] = (summary['damage_count'] / summary['total_count'])
summary['devision_percent'] = (summary['total_count'] / summary['total_count'].sum())

# Sort summary
summary = summary.sort_values(by='total_count', ascending=True).reset_index(drop=True)

# Create a subplot
fig_division_1 = make_subplots(rows=2, cols=1, shared_xaxes=False, 
                    vertical_spacing=0.1, subplot_titles=("Division Sizes", "Probability of Damage per Division"))

# Add horizontal bar chart for division size
fig_division_1.add_trace(go.Bar(
    y=summary['CORPORATE_DEVISION'],
    x=summary['total_count'],
    orientation='h',
    name='Division Size'
), row=1, col=1)

# Sort summary
summary = summary.sort_values(by='total_count', ascending=False).reset_index(drop=True)

# Add bar chart for percentage of damage
fig_division_1.add_trace(go.Bar(
    x=summary['CORPORATE_DEVISION'],
    y=summary['damage_prob'],
    name='Probability of Damage'
), row=2, col=1)

# Update layout
fig_division_1.update_layout(height=600, width=800, showlegend=False, 
                  title_text="Division Sizes and Probability of Damage",
                  yaxis_title="Total Size of Division",
                  xaxis2_title="Corporate Division",
                  yaxis2_title="Probability of Damage")

fig_division_1.show()

In [21]:
# Calculate the total number of observations and the percentage of damage by corporate division
total_counts = df.groupby('CORPORATE_DEVISION').size().reset_index(name='total_count')
damage_counts = df[df['DAMAGE'] == 1].groupby('CORPORATE_DEVISION').size().reset_index(name='damage_count')
summary = total_counts.merge(damage_counts, on='CORPORATE_DEVISION', how='left').fillna(0)
summary['damage_prob'] = (summary['damage_count'] / summary['total_count'])
summary['devision_percent'] = (summary['total_count'] / summary['total_count'].sum()) * 100

# Sort summary by the size of the division
summary = summary.sort_values(by='total_count', ascending=False).reset_index(drop=True)

# Create a subplot
fig_division_2 = make_subplots(rows=2, cols=1, vertical_spacing=0.1, shared_xaxes=True,
                    subplot_titles=("Division Sizes", "Probability of Damage per Division"))

# Plot 1

# Create the bar segments
for i, (percentage, color) in enumerate(zip(summary['devision_percent'], palette)):
    fig_division_2.add_trace(go.Bar(
        x=[percentage],
        y=[''],
        name=summary['CORPORATE_DEVISION'][i],
        orientation='h',
        marker=dict(color=color),
        text=f"{percentage:.2f}%",
        textposition='inside',
        showlegend=True
    ), row=1, col=1)

# Update layout to remove gaps and set y-axis
fig_division_2.update_layout(
    barmode='stack',
    xaxis=dict(range=[0, 100]),
    showlegend=True,
    yaxis=dict(showticklabels=False),
    legend=dict(title='Corporate Division')
)

# Plot 2

# Add bar chart for percentage of damage
fig_division_2.add_trace(go.Bar(
    x=summary['CORPORATE_DEVISION'],
    y=summary['damage_prob'],
    name='Probability of Damage',
    marker=dict(color=palette[:len(summary['CORPORATE_DEVISION'])]),
    showlegend=False
), row=2, col=1)

# Update layout
fig_division_2.update_layout(height=600, width=800,
                  title_text="Division Sizes and Probability of Damage",
                  xaxis2_title="Corporate Division",
                  yaxis2_title="Probability of Damage")

fig_division_2.show()

In [22]:
# Calculate the total number of observations and the percentage of damage by category
total_counts = df.groupby('Bundesland').size().reset_index(name='total_count')
damage_counts = df[df['DAMAGE'] == 1].groupby('Bundesland').size().reset_index(name='damage_count')
summary = total_counts.merge(damage_counts, on='Bundesland', how='left').fillna(0)
summary['damage_prob'] = (summary['damage_count'] / summary['total_count'])
summary['bundesland_percent'] = (summary['total_count'] / summary['total_count'].sum())

# Sort summary
summary = summary.sort_values(by='total_count', ascending=True).reset_index(drop=True)

# Create a subplot
fig_bundesland_1 = make_subplots(rows=2, cols=1, shared_xaxes=False, 
                    vertical_spacing=0.1, subplot_titles=("Bundesland Sizes", "Probability of Damage per Bundesland"))

# Add horizontal bar chart
fig_bundesland_1.add_trace(go.Bar(
    y=summary['Bundesland'],
    x=summary['total_count'],
    orientation='h',
    name='Bundesland Size'
), row=1, col=1)

# Sort summary
summary = summary.sort_values(by='total_count', ascending=False).reset_index(drop=True)

# Add bar chart for percentage of damage
fig_bundesland_1.add_trace(go.Bar(
    x=summary['Bundesland'],
    y=summary['damage_prob'],
    name='Probability of Damage'
), row=2, col=1)

# Update layout
fig_bundesland_1.update_layout(height=600, width=800, showlegend=False, 
                  title_text="Division Sizes and Probability of Damage",
                  yaxis2_title="Probability of Damage")

fig_bundesland_1.show()

In [23]:
# Filter data to keep only relevant years
df_years = df[(df['CONSTRUCTION_YEAR'] >= 1800) & (df['CONSTRUCTION_YEAR'] <= 2024)]

# Calculate the total number of observations and the number of damages for each year
total_counts = df_years.groupby('CONSTRUCTION_YEAR').size().reset_index(name='total_count')
damage_counts = df_years[df_years['DAMAGE'] == 1].groupby('CONSTRUCTION_YEAR').size().reset_index(name='damage_count')

# Merge the counts into a single DataFrame
summary = total_counts.merge(damage_counts, on='CONSTRUCTION_YEAR', how='left').fillna(0)

# Calculate the probability of damage = 1
summary['damage_prob'] = summary['damage_count'] / summary['total_count']

# Filter to only show results every 5 years
summary = summary[summary['CONSTRUCTION_YEAR'] % 5 == 0]

# Create the line chart
fig_year_1 = go.Figure()

fig_year_1.add_trace(go.Scatter(
    x=summary['CONSTRUCTION_YEAR'],
    y=summary['damage_prob'],
    mode='lines',
    name='Probability of Damage'
))

# Update layout
fig_year_1.update_layout(
    title='Probability of Damage over Construction Years',
    xaxis_title='Construction Year',
    yaxis_title='Probability of Damage',
    xaxis=dict(range=[1800, 2024])
)

fig_year_1.show()

In [24]:
# Apply log transformation to wfl
df_sub_wfl = df_sub.copy()
df_sub_wfl['LOG_WFL'] = np.log(df['WFL'] + 1)  # Adding 1 to avoid log(0)

# Violin plot with log-transformed wfl
fig_wfl_1 = px.violin(df_sub_wfl,
                x='LOG_WFL',
                y='DAMAGE',
                orientation='h',
                title='Living Space vs Damage (Log-transformed)',
                labels={'DAMAGE':'Damage', 'LOG_WFL':'Living Space in sqm (Log-Transformed)'}
                )

# Adding the mean
fig_wfl_1.update_traces(meanline_visible=True)
fig_wfl_1.show()

In [25]:
# Calculate the total number of observations and the percentage of damage by category
total_counts = df.groupby('ZONE').size().reset_index(name='total_count')
damage_counts = df[df['DAMAGE'] == 1].groupby('ZONE').size().reset_index(name='damage_count')
summary = total_counts.merge(damage_counts, on='ZONE', how='left').fillna(0)
summary['damage_prob'] = (summary['damage_count'] / summary['total_count'])
summary['zone_percent'] = (summary['total_count'] / summary['total_count'].sum())

# Sort summary
summary = summary.sort_values(by=['ZONE'], ascending=False).reset_index(drop=True)

# Create a subplot
fig_zone_1 = make_subplots(rows=2, cols=1, shared_xaxes=False, 
                    vertical_spacing=0.1, subplot_titles=("Zone Sizes", "Probability of Damage per Zone"))

# Add horizontal bar chart
fig_zone_1.add_trace(go.Bar(
    y=summary['ZONE'],
    x=summary['total_count'],
    orientation='h',
    name='Zone Size'
), row=1, col=1)

# Sort summary
summary = summary.sort_values(by=['ZONE'], ascending=True).reset_index(drop=True)

# Add bar chart for percentage of damage
fig_zone_1.add_trace(go.Bar(
    x=summary['ZONE'],
    y=summary['damage_prob'],
    name='Probability of Damage'
), row=2, col=1)

# Update layout
fig_zone_1.update_layout(height=600, width=800, showlegend=False, 
                  title_text="Zone Sizes and Probability of Damage",
                  yaxis2_title="Probability of Damage")

fig_zone_1.show()

In [26]:
# Violin plot
fig_rain_1 = px.violin(df_sub,
                x='DAMAGE_HEAVY_RAIN_ZONE',
                y='DAMAGE',
                orientation='h',
                title='Heavy Rain Zone vs Damage',
                labels={'DAMAGE':'Damage', 'DAMAGE_HEAVY_RAIN_ZONE':'Heavy Rain Zone'}
                )

# Adding the mean
fig_rain_1.update_traces(meanline_visible=True)
fig_rain_1.show()

In [27]:
# Calculate the total number of observations and the percentage of damage by category
total_counts = df.groupby('TYPE_OF_DEDUCTIBLE').size().reset_index(name='total_count')
damage_counts = df[df['DAMAGE'] == 1].groupby('TYPE_OF_DEDUCTIBLE').size().reset_index(name='damage_count')
summary = total_counts.merge(damage_counts, on='TYPE_OF_DEDUCTIBLE', how='left').fillna(0)
summary['damage_prob'] = (summary['damage_count'] / summary['total_count'])
summary['type_percent'] = (summary['total_count'] / summary['total_count'].sum())

# Sort summary
summary = summary.sort_values(by=['TYPE_OF_DEDUCTIBLE'], ascending=False).reset_index(drop=True)

# Create a subplot
fig_deductible_1 = make_subplots(rows=2, cols=1, shared_xaxes=False, 
                    vertical_spacing=0.1, subplot_titles=("Type of Deductible Size", "Probability of Damage per Type of Deductible"))

# Add horizontal bar chart
fig_deductible_1.add_trace(go.Bar(
    y=summary['TYPE_OF_DEDUCTIBLE'],
    x=summary['total_count'],
    orientation='h',
    name='Zone Size'
), row=1, col=1)

# Sort summary
summary = summary.sort_values(by=['TYPE_OF_DEDUCTIBLE'], ascending=True).reset_index(drop=True)

# Add bar chart for percentage of damage
fig_deductible_1.add_trace(go.Bar(
    x=summary['TYPE_OF_DEDUCTIBLE'],
    y=summary['damage_prob'],
    name='Probability of Damage'
), row=2, col=1)

# Update layout
fig_deductible_1.update_layout(height=600, width=800, showlegend=False, 
                  title_text="Type of Deductible and Probability of Damage",
                  yaxis2_title="Probability of Damage")

fig_deductible_1.show()

In [28]:
summary

Unnamed: 0,TYPE_OF_DEDUCTIBLE,total_count,damage_count,damage_prob,type_percent
0,0,9439542,135580.0,0.014363,0.9971669
1,2,8,0.0,0.0,8.450977e-07
2,3,21895,83.0,0.003791,0.002312927
3,4,3180,14.0,0.004403,0.0003359263
4,5,1736,9.0,0.005184,0.0001833862


In [29]:
# Calculate the total number of observations and the percentage of damage by category
total_counts = df.groupby('PRIOR_DAMAGES').size().reset_index(name='total_count')
damage_counts = df[df['DAMAGE'] == 1].groupby('PRIOR_DAMAGES').size().reset_index(name='damage_count')
summary = total_counts.merge(damage_counts, on='PRIOR_DAMAGES', how='left').fillna(0)
summary['damage_prob'] = (summary['damage_count'] / summary['total_count'])
summary['type_percent'] = (summary['total_count'] / summary['total_count'].sum())

# Sort summary
summary = summary.sort_values(by=['PRIOR_DAMAGES'], ascending=False).reset_index(drop=True)

# Create a subplot
fig_prior_1 = make_subplots(rows=2, cols=1, shared_xaxes=False, 
                    vertical_spacing=0.1, subplot_titles=("Prior Damage Sizes", "Probability of Damage per Prior Damages"))

# Add horizontal bar chart
fig_prior_1.add_trace(go.Bar(
    y=summary['PRIOR_DAMAGES'],
    x=summary['total_count'],
    orientation='h',
    name='Prior Damage Size'
), row=1, col=1)

# Sort summary
summary = summary.sort_values(by=['PRIOR_DAMAGES'], ascending=True).reset_index(drop=True)

# Add bar chart for percentage of damage
fig_prior_1.add_trace(go.Bar(
    x=summary['PRIOR_DAMAGES'],
    y=summary['damage_prob'],
    name='Probability of Damage'
), row=2, col=1)

# Update layout
fig_prior_1.update_layout(height=600, width=800, showlegend=False, 
                  title_text="Prior Damage and Probability of Damage",
                  yaxis2_title="Probability of Damage")

fig_prior_1.show()

In [30]:
summary

Unnamed: 0,PRIOR_DAMAGES,total_count,damage_count,damage_prob,type_percent
0,0,9407633,131391,0.013966,0.993796
1,1,44359,2781,0.062693,0.004686
2,2,11145,1020,0.091521,0.001177
3,3,2130,271,0.12723,0.000225
4,4,685,107,0.156204,7.2e-05
5,5,185,27,0.145946,2e-05
6,6,115,43,0.373913,1.2e-05
7,7,31,14,0.451613,3e-06
8,8,39,11,0.282051,4e-06
9,9,39,21,0.538462,4e-06


In [32]:
# Calculate the total number of observations and the percentage of damage by corporate division
total_counts = df.groupby('PRODUCTLINE').size().reset_index(name='total_count')
damage_counts = df[df['DAMAGE'] == 1].groupby('PRODUCTLINE').size().reset_index(name='damage_count')
summary = total_counts.merge(damage_counts, on='PRODUCTLINE', how='left').fillna(0)
summary['damage_prob'] = (summary['damage_count'] / summary['total_count'])
summary['line_percent'] = (summary['total_count'] / summary['total_count'].sum()) * 100

# Sort summary by the size of the division
summary = summary.sort_values(by='total_count', ascending=False).reset_index(drop=True)

# Create a subplot
fig_line_1 = make_subplots(rows=2, cols=1, vertical_spacing=0.1, shared_xaxes=True,
                    subplot_titles=("Product Line Sizes", "Probability of Damage per Product Line"))

# Plot 1

# Create the bar segments
for i, (percentage, color) in enumerate(zip(summary['line_percent'], palette)):
    fig_line_1.add_trace(go.Bar(
        x=[percentage],
        y=[''],
        name=summary['PRODUCTLINE'][i],
        orientation='h',
        marker=dict(color=color),
        text=f"{percentage:.2f}%",
        textposition='inside',
        showlegend=True
    ), row=1, col=1)

# Update layout to remove gaps and set y-axis
fig_line_1.update_layout(
    barmode='stack',
    xaxis=dict(range=[0, 100]),
    showlegend=True,
    yaxis=dict(showticklabels=False),
    legend=dict(title='Product Line')
)

# Plot 2

# Add bar chart for percentage of damage
fig_line_1.add_trace(go.Bar(
    x=summary['PRODUCTLINE'],
    y=summary['damage_prob'],
    name='Probability of Damage',
    marker=dict(color=palette[:len(summary['PRODUCTLINE'])]),
    showlegend=False
), row=2, col=1)

# Update layout
fig_line_1.update_layout(height=600, width=800,
                  title_text="Product Line and Probability of Damage",
                  xaxis2_title="Product Line",
                  yaxis2_title="Probability of Damage")

fig_line_1.show()

In [33]:
# Calculate the total number of observations and the percentage of damage by corporate division
total_counts = df.groupby('UNDERWRITER').size().reset_index(name='total_count')
damage_counts = df[df['DAMAGE'] == 1].groupby('UNDERWRITER').size().reset_index(name='damage_count')
summary = total_counts.merge(damage_counts, on='UNDERWRITER', how='left').fillna(0)
summary['damage_prob'] = (summary['damage_count'] / summary['total_count'])
summary['line_percent'] = (summary['total_count'] / summary['total_count'].sum()) * 100

# Sort summary by the size of the division
summary = summary.sort_values(by='total_count', ascending=False).reset_index(drop=True)

# Create a subplot
fig_underwriter_1 = make_subplots(rows=2, cols=1, vertical_spacing=0.1, shared_xaxes=True,
                    subplot_titles=("Underwriter Sizes", "Probability of Damage per Underwriter"))

# Plot 1

# Create the bar segments
for i, (percentage, color) in enumerate(zip(summary['line_percent'], palette)):
    fig_underwriter_1.add_trace(go.Bar(
        x=[percentage],
        y=[''],
        name=summary['UNDERWRITER'][i],
        orientation='h',
        marker=dict(color=color),
        text=f"{percentage:.2f}%",
        textposition='inside',
        showlegend=True
    ), row=1, col=1)

# Update layout to remove gaps and set y-axis
fig_underwriter_1.update_layout(
    barmode='stack',
    xaxis=dict(range=[0, 100]),
    showlegend=True,
    yaxis=dict(showticklabels=False),
    legend=dict(title='Underwriter')
)

# Plot 2

# Add bar chart for percentage of damage
fig_underwriter_1.add_trace(go.Bar(
    x=summary['UNDERWRITER'],
    y=summary['damage_prob'],
    name='Probability of Damage',
    marker=dict(color=palette[:len(summary['UNDERWRITER'])]),
    showlegend=False
), row=2, col=1)

# Update layout
fig_underwriter_1.update_layout(height=600, width=800,
                  title_text="Underwriter and Probability of Damage",
                  xaxis2_title="Underwriter",
                  yaxis2_title="Probability of Damage")

fig_underwriter_1.show()

In [34]:
summary

Unnamed: 0,UNDERWRITER,total_count,damage_count,damage_prob,line_percent
0,Y,9465756,135373,0.014301,99.993609
1,N,605,313,0.517355,0.006391


## Create Report

In [35]:
# Save to file
with open("result_plots.html", "w", encoding="utf-8") as f:
    f.write(
        f"<html><head></head><body> \
        {fig_division_1.to_html(full_html=False)} \
        {fig_division_2.to_html(full_html=False)} \
        {fig_bundesland_1.to_html(full_html=False)} \
        {fig_year_1.to_html(full_html=False)} \
        {fig_wfl_1.to_html(full_html=False)} \
        {fig_zone_1.to_html(full_html=False)} \
        {fig_rain_1.to_html(full_html=False)} \
        {fig_deductible_1.to_html(full_html=False)} \
        {fig_prior_1.to_html(full_html=False)} \
        {fig_line_1.to_html(full_html=False)} \
        {fig_underwriter_1.to_html(full_html=False)} \
        </body></html>"
    )