# Analysis of results

## Setup

In [44]:
# Libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import scipy.stats as stats

In [28]:
# Import data
df = pd.read_csv('../../predictions/Random_Forest_with_rebalancing_2.csv')
df.info()


Columns (9) have mixed types. Specify dtype option on import or set low_memory=False.



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7961450 entries, 0 to 7961449
Data columns (total 22 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Unnamed: 0              int64  
 1   ANO_SID                 float64
 2   CORPORATE_DEVISION      object 
 3   Bundesland              object 
 4   Typ                     object 
 5   ORTPLZ                  int64  
 6   CONSTRACTION_DESIGN     object 
 7   CONSTRUCTION_YEAR       float64
 8   WFL                     float64
 9   ZONE                    object 
 10  TYPE_OF_DEDUCTIBLE      int64  
 11  DRAIN_PIPE_INSURED      int64  
 12  PRODUCTLINE             object 
 13  PRIOR_DAMAGES           int64  
 14  UVV-KZ                  int64  
 15  UNDERWRITER             object 
 16  YEAR                    int64  
 17  DAMAGE_HEAVY_RAIN_ZONE  float64
 18  LONGITUDE               float64
 19  LATITUDE                float64
 20  DAMAGE                  int64  
 21  Probability             float64

In [29]:
# Convert all zone values to string (to reduce amount of unique values)
print('Original unique values:', df['ZONE'].nunique())
df['ZONE'] = df['ZONE'].astype('str')
print('New unique values:', df['ZONE'].nunique())

Original unique values: 32
New unique values: 23


In [30]:
# Compute prediction
df['PREDICTION'] = np.where(df['Probability'] < 0.5, 0, 1)

In [31]:
# Compute prediction type
df['PREDICTION_TYPE'] = np.where((df['DAMAGE'] == 1) & (df['PREDICTION'] == 1), 'tp',
                                   np.where((df['DAMAGE'] == 0) & (df['PREDICTION'] == 0), 'tn',
                                            np.where((df['DAMAGE'] == 1) & (df['PREDICTION'] == 0), 'fn',
                                                     np.where((df['DAMAGE'] == 0) & (df['PREDICTION'] == 1), 'fp', np.nan))))

In [32]:
# Confusion matrix
counts = df['PREDICTION_TYPE'].value_counts()
counts

PREDICTION_TYPE
tn    7805714
fn      84544
fp      63401
tp       7791
Name: count, dtype: int64

In [33]:
# Extract counts for tp, fp, fn, and tn
TP = counts.get('tp', 0)
FP = counts.get('fp', 0)
FN = counts.get('fn', 0)
TN = counts.get('tn', 0)

# Calculate recall and precision
recall = TP / (TP + FN)
precision = TP / (TP + FP)

print(f"Recall: {recall}")
print(f"Precision: {precision}")

Recall: 0.08437753831158282
Precision: 0.10943645353410496


In [34]:
# Confusion matrix for 2023
counts = df[df['YEAR']==2023]['PREDICTION_TYPE'].value_counts()
counts

PREDICTION_TYPE
tn    798764
fn      8789
fp      7184
tp       883
Name: count, dtype: int64

In [35]:
# Extract counts for tp, fp, fn, and tn
TP = counts.get('tp', 0)
FP = counts.get('fp', 0)
FN = counts.get('fn', 0)
TN = counts.get('tn', 0)

# Calculate recall and precision for 2023
recall = TP / (TP + FN)
precision = TP / (TP + FP)

print(f"Recall: {recall}")
print(f"Precision: {precision}")

Recall: 0.0912944582299421
Precision: 0.10945828684765092


In [36]:
df_sub = df.sample(frac=0.001, random_state=1234)

In [37]:
# Get a color palette
palette = px.colors.qualitative.Plotly

## Descriptive Analysis

In [38]:
# Calculate the total number of observations and the percentage of damage by corporate division
total_counts = df.groupby('CORPORATE_DEVISION').size().reset_index(name='total_count')
damage_counts = df[df['DAMAGE'] == 1].groupby('CORPORATE_DEVISION').size().reset_index(name='damage_count')
summary = total_counts.merge(damage_counts, on='CORPORATE_DEVISION', how='left').fillna(0)
summary['damage_prob'] = (summary['damage_count'] / summary['total_count'])
summary['devision_percent'] = (summary['total_count'] / summary['total_count'].sum())

# Sort summary
summary = summary.sort_values(by='total_count', ascending=True).reset_index(drop=True)

# Create a subplot
fig_division_1 = make_subplots(rows=2, cols=1, shared_xaxes=False, 
                    vertical_spacing=0.1, subplot_titles=("Division Sizes", "Probability of Damage per Division"))

# Add horizontal bar chart for division size
fig_division_1.add_trace(go.Bar(
    y=summary['CORPORATE_DEVISION'],
    x=summary['total_count'],
    orientation='h',
    name='Division Size'
), row=1, col=1)

# Sort summary
summary = summary.sort_values(by='total_count', ascending=False).reset_index(drop=True)

# Add bar chart for percentage of damage
fig_division_1.add_trace(go.Bar(
    x=summary['CORPORATE_DEVISION'],
    y=summary['damage_prob'],
    name='Probability of Damage'
), row=2, col=1)

# Update layout
fig_division_1.update_layout(height=600, width=800, showlegend=False, 
                  title_text="Division Sizes and Probability of Damage",
                  yaxis_title="Total Size of Division",
                  xaxis2_title="Corporate Division",
                  yaxis2_title="Probability of Damage")

fig_division_1.show()

In [26]:
# Calculate the total number of observations and the percentage of damage by corporate division
total_counts = df.groupby('CORPORATE_DEVISION').size().reset_index(name='total_count')
damage_counts = df[df['DAMAGE'] == 1].groupby('CORPORATE_DEVISION').size().reset_index(name='damage_count')
pred_prob = df.groupby('CORPORATE_DEVISION')['Probability'].mean().reset_index(name='pred_prob')

summary = total_counts.merge(damage_counts, on='CORPORATE_DEVISION', how='left').fillna(0)
summary = summary.merge(pred_prob, on='CORPORATE_DEVISION', how='left').fillna(0)

summary['damage_prob'] = (summary['damage_count'] / summary['total_count'])
summary['devision_percent'] = (summary['total_count'] / summary['total_count'].sum()) * 100

# Sort summary by the size of the division
summary = summary.sort_values(by='total_count', ascending=False).reset_index(drop=True)

# Create a subplot
fig_division_2 = make_subplots(rows=3, cols=1, vertical_spacing=0.1, shared_xaxes=True,
                    subplot_titles=("Division Sizes", "Probability of Damage per Division", "Predicted Probability of Damage"))

# Plot 1

# Create the bar segments
for i, (percentage, color) in enumerate(zip(summary['devision_percent'], palette)):
    fig_division_2.add_trace(go.Bar(
        x=[percentage],
        y=[''],
        name=summary['CORPORATE_DEVISION'][i],
        orientation='h',
        marker=dict(color=color),
        text=f"{percentage:.2f}%",
        textposition='inside',
        showlegend=True
    ), row=1, col=1)

# Update layout to remove gaps and set y-axis
fig_division_2.update_layout(
    barmode='stack',
    xaxis=dict(range=[0, 100]),
    showlegend=True,
    yaxis=dict(showticklabels=False),
    legend=dict(title='Corporate Division')
)

# Plot 2

# Add bar chart for percentage of damage
fig_division_2.add_trace(go.Bar(
    x=summary['CORPORATE_DEVISION'],
    y=summary['damage_prob'],
    name='Probability of Damage',
    marker=dict(color=palette[:len(summary['CORPORATE_DEVISION'])]),
    showlegend=False
), row=2, col=1)

# PLot 3

# Add bar chart for percentage of damage
fig_division_2.add_trace(go.Bar(
    x=summary['CORPORATE_DEVISION'],
    y=summary['pred_prob'],
    name='Predicted Probability of Damage#',
    marker=dict(color=palette[:len(summary['CORPORATE_DEVISION'])]),
    showlegend=False
), row=3, col=1)

# Update layout
fig_division_2.update_layout(height=600, width=800,
                  title_text="Division Sizes and Probability of Damage",
                  yaxis2_title="Probability",
                  yaxis3_title="Predicted Probability"
                  )

fig_division_2.show()

In [52]:
# Create a contingency table
contingency_table = pd.crosstab(df['CORPORATE_DEVISION'], df['DAMAGE'])

# Initialize dictionaries to store results
odds_ratios = {}
p_values = {}

# Iterate over each category of CONSTRUCTION_DESIGN
for design in contingency_table.index:
    # Create a 2x2 table comparing each category to the rest
    table = pd.DataFrame({
        'DAMAGE_0': [contingency_table.loc[design, 0], contingency_table.drop(design).sum()[0]],
        'DAMAGE_1': [contingency_table.loc[design, 1], contingency_table.drop(design).sum()[1]]
    })

    # Ensure it's a 2x2 table
    if table.shape == (2, 2):
        # Calculate odds ratio
        oddsratio = (table.iloc[0, 1] * table.iloc[1, 0]) / (table.iloc[0, 0] * table.iloc[1, 1])

        # Perform Fisher's Exact Test
        _, p = stats.fisher_exact(table.values)
        
        # Store the results
        odds_ratios[design] = oddsratio
        p_values[design] = p

# Display results
print("Contingency Table:")
print(contingency_table)

print("\nOdds Ratios and p-values:")
for design in odds_ratios:
    print(f"CORPORATE_DEVISION = {design}: odds ratio = {odds_ratios[design]}, p-value = {p_values[design]}")

Contingency Table:
DAMAGE                    0      1
CORPORATE_DEVISION                
H&H                  418653   8017
VGV                 1628760  61878
VHV                 5314295  20775
W&W                  507407   1665

Odds Ratios and p-values:
CORPORATE_DEVISION = H&H: odds ratio = 1.6920787963943345, p-value = 0.0
CORPORATE_DEVISION = VGV: odds ratio = 7.783973453160334, p-value = 0.0
CORPORATE_DEVISION = VHV: odds ratio = 0.13956783739420156, p-value = 0.0
CORPORATE_DEVISION = W&W: odds ratio = 0.26642363851179607, p-value = 0.0


In [40]:
# Calculate the total number of observations and the percentage of damage by category
total_counts = df.groupby('Bundesland').size().reset_index(name='total_count')
damage_counts = df[df['DAMAGE'] == 1].groupby('Bundesland').size().reset_index(name='damage_count')
pred_prob = df.groupby('Bundesland')['Probability'].mean().reset_index(name='pred_prob')

summary = total_counts.merge(damage_counts, on='Bundesland', how='left').fillna(0)
summary = summary.merge(pred_prob, on='Bundesland', how='left').fillna(0)

summary['damage_prob'] = (summary['damage_count'] / summary['total_count'])
summary['bundesland_percent'] = (summary['total_count'] / summary['total_count'].sum())

# Sort summary
summary = summary.sort_values(by='total_count', ascending=True).reset_index(drop=True)

# Create a subplot
fig_bundesland_1 = make_subplots(rows=3, cols=1, shared_xaxes=False, 
                    vertical_spacing=0.1, subplot_titles=("Bundesland Sizes", "Probability of Damage per Bundesland", "Predicted Probability of Damage per Bundesland"))

# Plot 1
fig_bundesland_1.add_trace(go.Bar(
    y=summary['Bundesland'],
    x=summary['total_count'],
    orientation='h',
    name='Bundesland Size'
), row=1, col=1)

# Sort summary
summary = summary.sort_values(by='total_count', ascending=False).reset_index(drop=True)

# Plot 2
fig_bundesland_1.add_trace(go.Bar(
    x=summary['damage_prob'],
    y=summary['Bundesland'],
    orientation='h',
    name='Probability'
), row=2, col=1)

# Plot 3
fig_bundesland_1.add_trace(go.Bar(
    x=summary['pred_prob'],
    y=summary['Bundesland'],
    orientation='h',
    name='Predicted Probability'
), row=3, col=1)

# Update layout
fig_bundesland_1.update_layout(height=1000, width=800, showlegend=False, 
                  title_text="Division Sizes and Probability of Damage",
                  xaxis2_title="Probability",
                  xaxis3_title="Predicted Probability",
                  margin=dict(l=150),  # Increase left margin for longer labels
                  yaxis=dict(tickmode='linear'),  # Ensure all ticks are shown
                  yaxis2=dict(tickmode='linear'),
                  yaxis3=dict(tickmode='linear') 
                  )

fig_bundesland_1.show()

In [41]:
summary

Unnamed: 0,Bundesland,total_count,damage_count,pred_prob,damage_prob,bundesland_percent
0,Nordrhein-Westfalen,2014185,30084,0.096048,0.014936,0.252992
1,Bayern,966706,8556,0.064985,0.008851,0.121423
2,Niedersachsen,865996,11299,0.095424,0.013047,0.108774
3,Baden-Württemberg,814995,8083,0.083115,0.009918,0.102368
4,Hessen,575224,7170,0.092405,0.012465,0.072251
5,Rheinland-Pfalz,442380,5700,0.101688,0.012885,0.055565
6,Sachsen,339979,2267,0.055899,0.006668,0.042703
7,Berlin,336304,4517,0.07838,0.013431,0.042242
8,Schleswig-Holstein,335840,3626,0.084099,0.010797,0.042183
9,Brandenburg,268205,2031,0.065845,0.007573,0.033688


In [20]:
# PFilter years
df_years = df[(df['CONSTRUCTION_YEAR'] >= 1900) & (df['CONSTRUCTION_YEAR'] <= 2024)]

# Calculate the total number of observations and the number of damages for each year
total_counts = df_years.groupby('CONSTRUCTION_YEAR').size().reset_index(name='total_count')
damage_counts = df_years[df_years['DAMAGE'] == 1].groupby('CONSTRUCTION_YEAR').size().reset_index(name='damage_count')
pred_prob = df_years.groupby('CONSTRUCTION_YEAR')['Probability'].mean().reset_index(name='pred_prob')

summary = total_counts.merge(damage_counts, on='CONSTRUCTION_YEAR', how='left').fillna(0)
summary = summary.merge(pred_prob, on='CONSTRUCTION_YEAR', how='left').fillna(0)

summary['damage_prob'] = summary['damage_count'] / summary['total_count']

# Filter to only show results every 5 years
summary = summary[summary['CONSTRUCTION_YEAR'] % 5 == 0]

# Create a subplot
fig_year_1 = make_subplots(rows=2, cols=1, shared_xaxes=True, 
                    vertical_spacing=0.1, subplot_titles=("Probability of Damage over Construction Years", "Predicted Probability of Damage over Construction Years"))

# Plot 1
fig_year_1.add_trace(go.Scatter(
    x=summary['CONSTRUCTION_YEAR'],
    y=summary['damage_prob'],
    mode='lines',
    name='Probability of Damage'
), row=1, col=1)

# Plot 2
fig_year_1.add_trace(go.Scatter(
    x=summary['CONSTRUCTION_YEAR'],
    y=summary['pred_prob'],
    mode='lines',
    name='Predicted Probability of Damage'
), row=2, col=1)

# Update layout
fig_year_1.update_layout(
    title='Predicted Probability of Damage over Construction Years',
    xaxis_title='Construction Year',
    yaxis_title='Probability',
    yaxis2_title='Predicted Probability',
    xaxis=dict(range=[1900, 2024])
)

fig_year_1.show()

In [21]:
# Calculate the percentage for each century
century_counts = df['century'].value_counts(normalize=True).sort_index() * 100

# Combine the count and percentage into a single DataFrame
century_stats = pd.concat([df['century'].value_counts().sort_index(), century_counts], axis=1)
century_stats.columns = ['counts', 'percentage']

century_stats.tail(2)

Unnamed: 0_level_0,counts,percentage
century,Unnamed: 1_level_1,Unnamed: 2_level_1
20,7536990,94.668559
21,373723,4.694157


In [43]:
# Apply log transformation to wfl
df_sub_wfl = df_sub.copy()
df_sub_wfl['LOG_WFL'] = np.log(df['WFL'] + 1)  # Adding 1 to avoid log(0)

# Violin plot with log-transformed wfl
fig_wfl_1 = px.violin(df_sub_wfl,
                x='LOG_WFL',
                y='DAMAGE',
                orientation='h',
                title='Living Space vs Damage (Log-transformed - Subset)',
                labels={'DAMAGE':'Damage', 'LOG_WFL':'Living Space in sqm (Log-Transformed)'}
                )

# Adding the mean
fig_wfl_1.update_traces(meanline_visible=True)
fig_wfl_1.show()

In [44]:
# Apply log transformation to wfl
df_sub_wfl = df_sub.copy()
df_sub_wfl['LOG_WFL'] = np.log(df['WFL'] + 1)  # Adding 1 to avoid log(0)

# Violin plot with log-transformed wfl
fig_wfl_2 = px.violin(df_sub_wfl,
                x='LOG_WFL',
                y='PREDICTION',
                orientation='h',
                title='Living Space vs Damage Prediction (Log-transformed - Subset)',
                labels={'PREDICTION':'Damage Prediction', 'LOG_WFL':'Living Space in sqm (Log-Transformed)'}
                )

# Adding the mean
fig_wfl_2.update_traces(meanline_visible=True)
fig_wfl_2.show()

In [45]:
# Calculate the total number of observations and the percentage of damage by category
total_counts = df.groupby('ZONE').size().reset_index(name='total_count')
damage_counts = df[df['DAMAGE'] == 1].groupby('ZONE').size().reset_index(name='damage_count')
pred_prob = df.groupby('ZONE')['Probability'].mean().reset_index(name='pred_prob')

summary = total_counts.merge(damage_counts, on='ZONE', how='left').fillna(0)
summary = summary.merge(pred_prob, on='ZONE', how='left').fillna(0)

summary['damage_prob'] = (summary['damage_count'] / summary['total_count'])
summary['zone_percent'] = (summary['total_count'] / summary['total_count'].sum())

# Sort summary
summary = summary.sort_values(by=['ZONE'], ascending=False).reset_index(drop=True)

# Create a subplot
fig_zone_1 = make_subplots(rows=3, cols=1, shared_xaxes=False, 
                    vertical_spacing=0.1, subplot_titles=("Zone Sizes", "Probability of Damage per Zone", "Predicted Probability of Damage per Zone"))

# Plot 1
fig_zone_1.add_trace(go.Bar(
    x=summary['total_count'],
    y=summary['ZONE'],
    orientation='h',
    name='Zone Size'
), row=1, col=1)

# Sort summary
# summary = summary.sort_values(by=['ZONE'], ascending=True).reset_index(drop=True)

# Plot 2
fig_zone_1.add_trace(go.Bar(
    x=summary['damage_prob'],
    y=summary['ZONE'],
    orientation='h',
    name='Probability of Damage'
), row=2, col=1)

# Plot 3
fig_zone_1.add_trace(go.Bar(
    x=summary['pred_prob'],
    y=summary['ZONE'],
    orientation='h',
    name='Predicted Probability of Damage'
), row=3, col=1)

# Update layout
fig_zone_1.update_layout(height=1100, width=800, showlegend=False, 
                  title_text="Zone Sizes and Probability of Damage",
                  xaxis2_title="Probability",
                  xaxis3_title="Predicted Probability",
                  yaxis=dict(tickmode='linear'),  # Ensure all ticks are shown
                  yaxis2=dict(tickmode='linear'),
                  yaxis3=dict(tickmode='linear') 
                  )

fig_zone_1.show()

In [48]:
# Create a contingency table
contingency_table = pd.crosstab(df['ZONE'], df['DAMAGE'])

# Initialize dictionaries to store results
odds_ratios = {}
p_values = {}

# Iterate over each category of ZONE
for zone in contingency_table.index:
    # Create a 2x2 table comparing each category to the rest
    table = pd.DataFrame({
        'DAMAGE_0': [contingency_table.loc[zone, 0], contingency_table.drop(zone).sum()[0]],
        'DAMAGE_1': [contingency_table.loc[zone, 1], contingency_table.drop(zone).sum()[1]]
    })

    # Ensure it's a 2x2 table
    if table.shape == (2, 2):
        # Calculate odds ratio
        oddsratio = (table.iloc[0, 1] * table.iloc[1, 0]) / (table.iloc[0, 0] * table.iloc[1, 1])

        # Perform Fisher's Exact Test
        _, p = stats.fisher_exact(table.values)
        
        # Store the results
        odds_ratios[zone] = oddsratio
        p_values[zone] = p

# Display results
print("Contingency Table:")
print(contingency_table)

print("\nOdds Ratios and p-values:")
for zone in odds_ratios:
    print(f"ZONE = {zone}: odds ratio = {odds_ratios[zone]}, p-value = {p_values[zone]}")

Contingency Table:
DAMAGE        0      1
ZONE                  
0         25105   1499
0.0       91159    561
1       1310292   9861
1.0      431991   1592
2       1585398  20484
2.0      536213   5191
3       1083977  19378
3.0      389657   6481
4        589654  11550
4.0      183343   3280
5        575211   2991
5.0      162367    636
6         49811    194
6.0       21967     64
7         53111    240
7.0       22165     50
8         20497     97
8.0        8014     32
A        196924   1589
B        185739   2325
C        155677   2078
D        120844   1828
E         69999    334

Odds Ratios and p-values:
ZONE = 0: odds ratio = 5.156102522560383, p-value = 0.0
ZONE = 0.0: odds ratio = 0.5215658284562653, p-value = 1.46272223433946e-65
ZONE = 1: odds ratio = 0.5984966473130721, p-value = 0.0
ZONE = 1.0: odds ratio = 0.3020370737134611, p-value = 0.0
ZONE = 2: odds ratio = 1.1299526935676847, p-value = 6.7158550479763e-52
ZONE = 2.0: odds ratio = 0.8146143952501694, p-value = 5.6

In [46]:
# Violin plot
fig_rain_1 = px.violin(df_sub,
                x='DAMAGE_HEAVY_RAIN_ZONE',
                y='DAMAGE',
                orientation='h',
                title='Heavy Rain Zone vs Damage (Subset)',
                labels={'DAMAGE':'Damage', 'DAMAGE_HEAVY_RAIN_ZONE':'Heavy Rain Zone'}
                )

# Adding the mean
fig_rain_1.update_traces(meanline_visible=True)
fig_rain_1.show()

In [70]:
# Violin plot
fig_rain_2 = px.violin(df_sub,
                x='DAMAGE_HEAVY_RAIN_ZONE',
                y='PREDICTION',
                orientation='h',
                title='Heavy Rain Zone vs Predicted Damage (Subset)',
                labels={'PREDICTION':'Predicted Damage', 'DAMAGE_HEAVY_RAIN_ZONE':'Heavy Rain Zone'}
                )

# Adding the mean
fig_rain_2.update_traces(meanline_visible=True)
fig_rain_2.show()

In [71]:
# Categorize DAMAGE_HEAVY_RAIN_ZONE
df['DAMAGE_HEAVY_RAIN_ZONE_CATEGORY'] = pd.cut(df['DAMAGE_HEAVY_RAIN_ZONE'], bins=[-np.inf, 1.5, 2.5, np.inf], labels=[1, 2, 3])

# Calculate the total number of observations and the percentage of damage by DAMAGE_HEAVY_RAIN_ZONE_CATEGORY
total_counts_rain = df.groupby('DAMAGE_HEAVY_RAIN_ZONE_CATEGORY').size().reset_index(name='total_count')
damage_counts_rain = df[df['DAMAGE'] == 1].groupby('DAMAGE_HEAVY_RAIN_ZONE_CATEGORY').size().reset_index(name='damage_count')
pred_prob_rain = df.groupby('DAMAGE_HEAVY_RAIN_ZONE_CATEGORY')['Probability'].mean().reset_index(name='pred_prob')

summary_rain = total_counts_rain.merge(damage_counts_rain, on='DAMAGE_HEAVY_RAIN_ZONE_CATEGORY', how='left')
summary_rain = summary_rain.merge(pred_prob_rain, on='DAMAGE_HEAVY_RAIN_ZONE_CATEGORY', how='left')

summary_rain['damage_prob'] = (summary_rain['damage_count'] / summary_rain['total_count'])
summary_rain['line_percent'] = (summary_rain['total_count'] / summary_rain['total_count'].sum()) * 100

# Sort summary by the size of the division
summary_rain = summary_rain.sort_values(by='total_count', ascending=False).reset_index(drop=True)

# Create a subplot
fig_rain_3 = make_subplots(rows=3, cols=1, vertical_spacing=0.1, shared_xaxes=False,
                           subplot_titles=("DAMAGE_HEAVY_RAIN_ZONE Sizes", "Probability of Damage per Zone", "Predicted Probability of Damage per Zone"))

# Define a color palette for the plots
palette = ['#636EFA', '#EF553B', '#00CC96']

# Plot 1
# Create the bar segments
for i, (percentage, color) in enumerate(zip(summary_rain['line_percent'], palette)):
    fig_rain_3.add_trace(go.Bar(
        x=[percentage],
        y=[''],
        name=str(summary_rain['DAMAGE_HEAVY_RAIN_ZONE_CATEGORY'][i]),
        orientation='h',
        marker=dict(color=color),
        text=f"{percentage:.2f}%",
        textposition='inside',
        showlegend=True
    ), row=1, col=1)

# Update layout to remove gaps and set y-axis
fig_rain_3.update_layout(
    barmode='stack',
    xaxis=dict(range=[0, 100]),
    showlegend=True,
    yaxis=dict(showticklabels=False),
    legend=dict(title='DAMAGE_HEAVY_RAIN_ZONE')
)

# Plot 2
# Add bar chart for percentage of damage
fig_rain_3.add_trace(go.Bar(
    x=summary_rain['DAMAGE_HEAVY_RAIN_ZONE_CATEGORY'],
    y=summary_rain['damage_prob'],
    name='Probability of Damage',
    marker=dict(color=palette[:len(summary_rain['DAMAGE_HEAVY_RAIN_ZONE_CATEGORY'])]),
    showlegend=False
), row=2, col=1)

# Plot 3
# Add bar chart for predicted probability of damage
fig_rain_3.add_trace(go.Bar(
    x=summary_rain['DAMAGE_HEAVY_RAIN_ZONE_CATEGORY'],
    y=summary_rain['pred_prob'],
    name='Predicted Probability of Damage',
    marker=dict(color=palette[:len(summary_rain['DAMAGE_HEAVY_RAIN_ZONE_CATEGORY'])]),
    showlegend=False
), row=3, col=1)

# Update layout
fig_rain_3.update_layout(height=600, width=800,
                         title_text="DAMAGE_HEAVY_RAIN_ZONE and Probability of Damage",
                         yaxis2_title="Probability",
                         yaxis3_title="Predicted Probability",
                         xaxis3_title="DAMAGE_HEAVY_RAIN_ZONE"
                         )

fig_rain_3.show()









In [48]:
# Calculate the total number of observations and the percentage of damage by category
total_counts = df.groupby('TYPE_OF_DEDUCTIBLE').size().reset_index(name='total_count')
damage_counts = df[df['DAMAGE'] == 1].groupby('TYPE_OF_DEDUCTIBLE').size().reset_index(name='damage_count')
pred_prob = df.groupby('TYPE_OF_DEDUCTIBLE')['Probability'].mean().reset_index(name='pred_prob')

summary = total_counts.merge(damage_counts, on='TYPE_OF_DEDUCTIBLE', how='left').fillna(0)
summary = summary.merge(pred_prob, on='TYPE_OF_DEDUCTIBLE', how='left').fillna(0)

summary['damage_prob'] = (summary['damage_count'] / summary['total_count'])
summary['type_percent'] = (summary['total_count'] / summary['total_count'].sum())

# Sort summary
summary = summary.sort_values(by=['TYPE_OF_DEDUCTIBLE'], ascending=False).reset_index(drop=True)

# Create a subplot
fig_deductible_1 = make_subplots(rows=3, cols=1, shared_xaxes=False, 
                    vertical_spacing=0.1, subplot_titles=("Type of Deductible Size", "Probability of Damage per Type of Deductible",
                                                          "Predicted Probability of Damage per Type of Deductible"))

# Plot 1
fig_deductible_1.add_trace(go.Bar(
    y=summary['TYPE_OF_DEDUCTIBLE'],
    x=summary['total_count'],
    orientation='h',
    name='Zone Size'
), row=1, col=1)

# Sort summary
summary = summary.sort_values(by=['TYPE_OF_DEDUCTIBLE'], ascending=True).reset_index(drop=True)

# Plot 2
fig_deductible_1.add_trace(go.Bar(
    x=summary['TYPE_OF_DEDUCTIBLE'],
    y=summary['damage_prob'],
    name='Probability of Damage'
), row=2, col=1)

# Plot 3
fig_deductible_1.add_trace(go.Bar(
    x=summary['TYPE_OF_DEDUCTIBLE'],
    y=summary['pred_prob'],
    name='Predicted Probability of Damage'
), row=3, col=1)

# Update layout
fig_deductible_1.update_layout(height=600, width=800, showlegend=False, 
                  title_text="Type of Deductible and Probability of Damage",
                  yaxis2_title="Probability",
                  yaxis3_title="Predicted Probability"
                  )

fig_deductible_1.show()

In [49]:
summary

Unnamed: 0,TYPE_OF_DEDUCTIBLE,total_count,damage_count,pred_prob,damage_prob,type_percent
0,0,7934631,92229.0,0.083734,0.011624,0.996631
1,2,8,0.0,0.0617,0.0,1e-06
2,3,21895,83.0,0.047399,0.003791,0.00275
3,4,3180,14.0,0.049551,0.004403,0.000399
4,5,1736,9.0,0.04824,0.005184,0.000218


In [50]:
# Calculate the total number of observations and the percentage of damage by category
total_counts = df.groupby('PRIOR_DAMAGES').size().reset_index(name='total_count')
damage_counts = df[df['DAMAGE'] == 1].groupby('PRIOR_DAMAGES').size().reset_index(name='damage_count')
pred_prob = df.groupby('PRIOR_DAMAGES')['Probability'].mean().reset_index(name='pred_prob')

summary = total_counts.merge(damage_counts, on='PRIOR_DAMAGES', how='left').fillna(0)
summary = summary.merge(pred_prob, on='PRIOR_DAMAGES', how='left').fillna(0)

summary['damage_prob'] = (summary['damage_count'] / summary['total_count'])
summary['type_percent'] = (summary['total_count'] / summary['total_count'].sum())

# Sort summary
summary = summary.sort_values(by=['PRIOR_DAMAGES'], ascending=False).reset_index(drop=True)

# Create a subplot
fig_prior_1 = make_subplots(rows=3, cols=1, shared_xaxes=False, 
                    vertical_spacing=0.1, subplot_titles=("Prior Damage Sizes", "Probability of Damage per Prior Damages",
                                                          "Predicted Probability of Damage per Prior Damages"))

# Plot 1
fig_prior_1.add_trace(go.Bar(
    y=summary['PRIOR_DAMAGES'],
    x=summary['total_count'],
    orientation='h',
    name='Prior Damage Size'
), row=1, col=1)

# Sort summary
summary = summary.sort_values(by=['PRIOR_DAMAGES'], ascending=True).reset_index(drop=True)

# Plot 2
fig_prior_1.add_trace(go.Bar(
    x=summary['PRIOR_DAMAGES'],
    y=summary['damage_prob'],
    name='Probability of Damage'
), row=2, col=1)

# Plot 3
fig_prior_1.add_trace(go.Bar(
    x=summary['PRIOR_DAMAGES'],
    y=summary['pred_prob'],
    name='Predicted Probability of Damage'
), row=3, col=1)

# Update layout
fig_prior_1.update_layout(height=600, width=800, showlegend=False, 
                  title_text="Prior Damage and Probability of Damage",
                  yaxis2_title="Probability",
                  yaxis3_title="Predicted Probability",
                  xaxis2=dict(tickmode='linear'),  # Ensure all ticks are shown
                  xaxis3=dict(tickmode='linear'),
                  yaxis1=dict(tickmode='linear'),
                  )

fig_prior_1.show()

In [51]:
summary

Unnamed: 0,PRIOR_DAMAGES,total_count,damage_count,pred_prob,damage_prob,type_percent
0,0,7905853,88330,0.082773,0.011173,0.993017
1,1,42378,2639,0.194227,0.062273,0.005323
2,2,10308,925,0.217868,0.089736,0.001295
3,3,1932,249,0.260267,0.128882,0.000243
4,4,614,85,0.281211,0.138436,7.7e-05
5,5,166,25,0.247673,0.150602,2.1e-05
6,6,102,39,0.42749,0.382353,1.3e-05
7,7,28,13,0.48867,0.464286,4e-06
8,8,35,11,0.393986,0.314286,4e-06
9,9,34,19,0.528159,0.558824,4e-06


In [43]:
# Fisher test and odd-ratio

# Create a contingency table
contingency_table = pd.crosstab(df['PRIOR_DAMAGES'], df['DAMAGE'])

# Initialize dictionaries to store results
odds_ratios = {}
p_values = {}

# Iterate over each category of PRIOR_DAMAGES
for prior_damage in contingency_table.index:
    # Create a 2x2 table comparing each category to the rest
    table = pd.DataFrame({
        'DAMAGE_0': [contingency_table.loc[prior_damage, 0], contingency_table.drop(prior_damage).sum()[0]],
        'DAMAGE_1': [contingency_table.loc[prior_damage, 1], contingency_table.drop(prior_damage).sum()[1]]
    })

    # Ensure it's a 2x2 table
    if table.shape == (2, 2):
        # Calculate odds ratio
        oddsratio = (table.iloc[0, 1] * table.iloc[1, 0]) / (table.iloc[0, 0] * table.iloc[1, 1])

        # Perform Fisher's Exact Test
        _, p = stats.fisher_exact(table.values)
        
        # Store the results
        odds_ratios[prior_damage] = oddsratio
        p_values[prior_damage] = p

# Display results
print("Contingency Table:")
print(contingency_table)

print("\nOdds Ratios and p-values:")
for prior_damage in odds_ratios:
    print(f"PRIOR_DAMAGES = {prior_damage}: odds ratio = {odds_ratios[prior_damage]}, p-value = {p_values[prior_damage]}")

Contingency Table:
DAMAGE               0      1
PRIOR_DAMAGES                
0              7817523  88330
1                39739   2639
2                 9383    925
3                 1683    249
4                  529     85
5                  141     25
6                   63     39
7                   15     13
8                   24     11
9                   15     19

Odds Ratios and p-values:
PRIOR_DAMAGES = 0: odds ratio = 0.14555224429504746, p-value = 0.0
PRIOR_DAMAGES = 1: odds ratio = 5.796642679642354, p-value = 0.0
PRIOR_DAMAGES = 2: odds ratio = 8.476450793641689, p-value = 0.0
PRIOR_DAMAGES = 3: odds ratio = 12.640219638133352, p-value = 1.8269106363031414e-170
PRIOR_DAMAGES = 4: odds ratio = 13.705458681051839, p-value = 5.067789758693376e-62
PRIOR_DAMAGES = 5: odds ratio = 15.114377164211557, p-value = 2.5568531950096836e-20
PRIOR_DAMAGES = 6: odds ratio = 52.77929601241554, p-value = 3.735079044255938e-48
PRIOR_DAMAGES = 7: odds ratio = 73.87065560393694, p-value 

In [47]:
# Calculate the total number of observations and the percentage of damage by corporate division
total_counts = df.groupby('PRODUCTLINE').size().reset_index(name='total_count')
damage_counts = df[df['DAMAGE'] == 1].groupby('PRODUCTLINE').size().reset_index(name='damage_count')
pred_prob = df.groupby('PRODUCTLINE')['Probability'].mean().reset_index(name='pred_prob')

summary = total_counts.merge(damage_counts, on='PRODUCTLINE', how='left').fillna(0)
summary = summary.merge(pred_prob, on='PRODUCTLINE', how='left').fillna(0)

summary['damage_prob'] = (summary['damage_count'] / summary['total_count'])
summary['line_percent'] = (summary['total_count'] / summary['total_count'].sum()) * 100

# Sort summary by the size of the division
summary = summary.sort_values(by='total_count', ascending=False).reset_index(drop=True)

# Create a subplot
fig_line_1 = make_subplots(rows=3, cols=1, vertical_spacing=0.1, shared_xaxes=True,
                    subplot_titles=("Product Line Sizes", "Probability of Damage per Product Line", "Predicted Probability of Damage per Product Line"))

# Plot 1

# Create the bar segments
for i, (percentage, color) in enumerate(zip(summary['line_percent'], palette)):
    fig_line_1.add_trace(go.Bar(
        x=[percentage],
        y=[''],
        name=summary['PRODUCTLINE'][i],
        orientation='h',
        marker=dict(color=color),
        text=f"{percentage:.2f}%",
        textposition='inside',
        showlegend=True
    ), row=1, col=1)

# Update layout to remove gaps and set y-axis
fig_line_1.update_layout(
    barmode='stack',
    xaxis=dict(range=[0, 100]),
    showlegend=True,
    yaxis=dict(showticklabels=False),
    legend=dict(title='Product Line')
)

# Plot 2

# Add bar chart for percentage of damage
fig_line_1.add_trace(go.Bar(
    x=summary['PRODUCTLINE'],
    y=summary['damage_prob'],
    name='Probability of Damage',
    marker=dict(color=palette[:len(summary['PRODUCTLINE'])]),
    showlegend=False
), row=2, col=1)

# Plot 3

# Add bar chart for percentage of damage
fig_line_1.add_trace(go.Bar(
    x=summary['PRODUCTLINE'],
    y=summary['pred_prob'],
    name='Predicted Probability of Damage',
    marker=dict(color=palette[:len(summary['PRODUCTLINE'])]),
    showlegend=False
), row=3, col=1)

# Update layout
fig_line_1.update_layout(height=600, width=800,
                  title_text="Product Line and Probability of Damage",
                  xaxis3_title="Product Line",
                  yaxis2_title="Probability",
                  yaxis3_title="Predicted Probability"
                  )

fig_line_1.show()

In [46]:
# Create a contingency table
contingency_table = pd.crosstab(df['PRODUCTLINE'], df['DAMAGE'])

# Initialize dictionaries to store results
odds_ratios = {}
p_values = {}

# Iterate over each category of PRODUCT_LINE
for product_line in contingency_table.index:
    # Create a 2x2 table comparing each category to the rest
    table = pd.DataFrame({
        'DAMAGE_0': [contingency_table.loc[product_line, 0], contingency_table.drop(product_line).sum()[0]],
        'DAMAGE_1': [contingency_table.loc[product_line, 1], contingency_table.drop(product_line).sum()[1]]
    })

    # Ensure it's a 2x2 table
    if table.shape == (2, 2):
        # Calculate odds ratio
        oddsratio = (table.iloc[0, 1] * table.iloc[1, 0]) / (table.iloc[0, 0] * table.iloc[1, 1])

        # Perform Fisher's Exact Test
        _, p = stats.fisher_exact(table.values)
        
        # Store the results
        odds_ratios[product_line] = oddsratio
        p_values[product_line] = p

# Display results
print("Contingency Table:")
print(contingency_table)

print("\nOdds Ratios and p-values:")
for product_line in odds_ratios:
    print(f"PRODUCT_LINE = {product_line}: odds ratio = {odds_ratios[product_line]}, p-value = {p_values[product_line]}")

Contingency Table:
DAMAGE             0      1
PRODUCTLINE                
Basis         450591   2283
Kompakt       234644  10266
Plus          299777   7022
Premium       174226   4570
Sonst        1823896   6208
Top          3574644  50272
UNKNOWN      1311337  11714

Odds Ratios and p-values:
PRODUCT_LINE = Basis: odds ratio = 0.4173952878891009, p-value = 0.0
PRODUCT_LINE = Kompakt: odds ratio = 4.069973809598538, p-value = 0.0
PRODUCT_LINE = Plus: odds ratio = 2.0782854664300263, p-value = 0.0
PRODUCT_LINE = Premium: odds ratio = 2.299769110605957, p-value = 0.0
PRODUCT_LINE = Sonst: odds ratio = 0.23890451420669456, p-value = 0.0
PRODUCT_LINE = Top: odds ratio = 1.4358292520177751, p-value = 0.0
PRODUCT_LINE = UNKNOWN: odds ratio = 0.726606760328618, p-value = 9.231118892765034e-245


In [53]:
# Calculate the total number of observations and the percentage of damage by corporate division
total_counts = df.groupby('UNDERWRITER').size().reset_index(name='total_count')
damage_counts = df[df['DAMAGE'] == 1].groupby('UNDERWRITER').size().reset_index(name='damage_count')
pred_prob = df.groupby('UNDERWRITER')['Probability'].mean().reset_index(name='pred_prob')

summary = total_counts.merge(damage_counts, on='UNDERWRITER', how='left').fillna(0)
summary = summary.merge(pred_prob, on='UNDERWRITER', how='left').fillna(0)

summary['damage_prob'] = (summary['damage_count'] / summary['total_count'])
summary['line_percent'] = (summary['total_count'] / summary['total_count'].sum()) * 100

# Sort summary by the size of the division
summary = summary.sort_values(by='total_count', ascending=False).reset_index(drop=True)

# Create a subplot
fig_underwriter_1 = make_subplots(rows=3, cols=1, vertical_spacing=0.1, shared_xaxes=False,
                    subplot_titles=("Underwriter Sizes", "Probability of Damage per Underwriter", "Predicted Probability of Damage per Underwriter"))

# Plot 1

# Create the bar segments
for i, (percentage, color) in enumerate(zip(summary['line_percent'], palette)):
    fig_underwriter_1.add_trace(go.Bar(
        x=[percentage],
        y=[''],
        name=summary['UNDERWRITER'][i],
        orientation='h',
        marker=dict(color=color),
        text=f"{percentage:.2f}%",
        textposition='inside',
        showlegend=True
    ), row=1, col=1)

# Update layout to remove gaps and set y-axis
fig_underwriter_1.update_layout(
    barmode='stack',
    xaxis=dict(range=[0, 100]),
    showlegend=True,
    yaxis=dict(showticklabels=False),
    legend=dict(title='Underwriter')
)

# Plot 2

# Add bar chart for percentage of damage
fig_underwriter_1.add_trace(go.Bar(
    x=summary['UNDERWRITER'],
    y=summary['damage_prob'],
    name='Probability of Damage',
    marker=dict(color=palette[:len(summary['UNDERWRITER'])]),
    showlegend=False
), row=2, col=1)

# Plot 3

# Add bar chart for percentage of damage
fig_underwriter_1.add_trace(go.Bar(
    x=summary['UNDERWRITER'],
    y=summary['pred_prob'],
    name='Predicted Probability of Damage',
    marker=dict(color=palette[:len(summary['UNDERWRITER'])]),
    showlegend=False
), row=3, col=1)

# Update layout
fig_underwriter_1.update_layout(height=600, width=800,
                  title_text="Underwriter and Probability of Damage",
                  yaxis2_title="Probability",
                  yaxis3_title="Predicted Probability",
                  xaxis3_title="Underwriter"
                  )

fig_underwriter_1.show()

In [54]:
summary

Unnamed: 0,UNDERWRITER,total_count,damage_count,pred_prob,damage_prob,line_percent
0,Y,7961059,92117,0.08359,0.011571,99.995089
1,N,391,218,0.544735,0.557545,0.004911


In [55]:
# Calculate the total number of observations and the percentage of damage by corporate division
total_counts = df.groupby('CONSTRACTION_DESIGN').size().reset_index(name='total_count')
damage_counts = df[df['DAMAGE'] == 1].groupby('CONSTRACTION_DESIGN').size().reset_index(name='damage_count')
pred_prob = df.groupby('CONSTRACTION_DESIGN')['Probability'].mean().reset_index(name='pred_prob')

summary = total_counts.merge(damage_counts, on='CONSTRACTION_DESIGN', how='left').fillna(0)
summary = summary.merge(pred_prob, on='CONSTRACTION_DESIGN', how='left').fillna(0)

summary['damage_prob'] = (summary['damage_count'] / summary['total_count'])
summary['line_percent'] = (summary['total_count'] / summary['total_count'].sum()) * 100

# Sort summary by the size of the division
summary = summary.sort_values(by='total_count', ascending=False).reset_index(drop=True)

# Create a subplot
fig_design_1 = make_subplots(rows=3, cols=1, vertical_spacing=0.1, shared_xaxes=False,
                    subplot_titles=("Constraction Design Sizes", "Probability of Damage per Constraction Design", "Predicted Probability of Damage per Constraction Design"))

# Plot 1

# Create the bar segments
for i, (percentage, color) in enumerate(zip(summary['line_percent'], palette)):
    fig_design_1.add_trace(go.Bar(
        x=[percentage],
        y=[''],
        name=summary['CONSTRACTION_DESIGN'][i],
        orientation='h',
        marker=dict(color=color),
        text=f"{percentage:.2f}%",
        textposition='inside',
        showlegend=True
    ), row=1, col=1)

# Update layout to remove gaps and set y-axis
fig_design_1.update_layout(
    barmode='stack',
    xaxis=dict(range=[0, 100]),
    showlegend=True,
    yaxis=dict(showticklabels=False),
    legend=dict(title='Constraction Design')
)

# Plot 2

# Add bar chart for percentage of damage
fig_design_1.add_trace(go.Bar(
    x=summary['CONSTRACTION_DESIGN'],
    y=summary['damage_prob'],
    name='Probability of Damage',
    marker=dict(color=palette[:len(summary['CONSTRACTION_DESIGN'])]),
    showlegend=False
), row=2, col=1)

# Plot 3

# Add bar chart for percentage of damage
fig_design_1.add_trace(go.Bar(
    x=summary['CONSTRACTION_DESIGN'],
    y=summary['pred_prob'],
    name='Predicted Probability of Damage',
    marker=dict(color=palette[:len(summary['CONSTRACTION_DESIGN'])]),
    showlegend=False
), row=3, col=1)

# Update layout
fig_design_1.update_layout(height=1000, width=800,
                  title_text="Constraction Design and Probability of Damage",
                  yaxis2_title="Probability",
                  yaxis3_title="Predicted Probability",
                  xaxis3_title="Constraction Design"
                  )

fig_design_1.show()

In [51]:
# Create a contingency table
contingency_table = pd.crosstab(df['CONSTRACTION_DESIGN'], df['DAMAGE'])

# Initialize dictionaries to store results
odds_ratios = {}
p_values = {}

# Iterate over each category of CONSTRUCTION_DESIGN
for design in contingency_table.index:
    # Create a 2x2 table comparing each category to the rest
    table = pd.DataFrame({
        'DAMAGE_0': [contingency_table.loc[design, 0], contingency_table.drop(design).sum()[0]],
        'DAMAGE_1': [contingency_table.loc[design, 1], contingency_table.drop(design).sum()[1]]
    })

    # Ensure it's a 2x2 table
    if table.shape == (2, 2):
        # Calculate odds ratio
        oddsratio = (table.iloc[0, 1] * table.iloc[1, 0]) / (table.iloc[0, 0] * table.iloc[1, 1])

        # Perform Fisher's Exact Test
        _, p = stats.fisher_exact(table.values)
        
        # Store the results
        odds_ratios[design] = oddsratio
        p_values[design] = p

# Display results
print("Contingency Table:")
print(contingency_table)

print("\nOdds Ratios and p-values:")
for design in odds_ratios:
    print(f"CONSTRACTION_DESIGN = {design}: odds ratio = {odds_ratios[design]}, p-value = {p_values[design]}")

Contingency Table:
DAMAGE                           0      1
CONSTRACTION_DESIGN                      
0                             1230      2
CARAVAN_MOTORHOME               96      1
DESIGN_CLASS_I               66275    230
DESIGN_CLASS_II                 39      0
DESIGN_CLASS_III               125      1
DESIGN_CLASS_IV                112      2
DESIGN_CLASS_V                  33      0
NORMAL_VENTURE             6872480  84029
PREDOMINANTLY_WOODEN_ROOF    13103     35
PREFAB_HOUSE                 59232   1104
UNKNOWN                     856390   6931

Odds Ratios and p-values:
CONSTRACTION_DESIGN = 0: odds ratio = 0.1385561927272961, p-value = 0.00014515417611884546
CONSTRACTION_DESIGN = CARAVAN_MOTORHOME: odds ratio = 0.8877439287441968, p-value = 1.0
CONSTRACTION_DESIGN = DESIGN_CLASS_I: odds ratio = 0.29400017868672057, p-value = 8.774046591976072e-118
CONSTRACTION_DESIGN = DESIGN_CLASS_II: odds ratio = 0.0, p-value = 1.0
CONSTRACTION_DESIGN = DESIGN_CLASS_III: odds ratio = 

## Create Report

In [72]:
# Save to file
with open("result_plots.html", "w", encoding="utf-8") as f:
    f.write(
        f"<html><head></head><body> \
        {fig_division_2.to_html(full_html=False)} \
        {fig_bundesland_1.to_html(full_html=False)} \
        {fig_year_1.to_html(full_html=False)} \
        {fig_wfl_1.to_html(full_html=False)} \
        {fig_wfl_2.to_html(full_html=False)} \
        {fig_zone_1.to_html(full_html=False)} \
        {fig_rain_1.to_html(full_html=False)} \
        {fig_rain_2.to_html(full_html=False)} \
        {fig_rain_3.to_html(full_html=False)} \
        {fig_deductible_1.to_html(full_html=False)} \
        {fig_prior_1.to_html(full_html=False)} \
        {fig_line_1.to_html(full_html=False)} \
        {fig_underwriter_1.to_html(full_html=False)} \
        {fig_design_1.to_html(full_html=False)} \
        </body></html>"
    )

## Personas

Create plot for all categorical variables:

In [57]:
# Function to create summary table and subplots
def create_summary_and_plot(df, index_column, subplot_titles):
    summary = df.pivot_table(index=index_column, 
                             columns='PREDICTION_TYPE', 
                             aggfunc='size', 
                             fill_value=0)
    summary['TOTAL'] = summary.sum(axis=1)
    summary = summary.sort_values(by='TOTAL', ascending=True).reset_index()
    
    fig = make_subplots(rows=2, cols=3, shared_xaxes=False, 
                        vertical_spacing=0.3, horizontal_spacing=0.1, subplot_titles=subplot_titles)
    
    subplot_positions = {
        "tn": (1, 1),
        "fp": (1, 2),
        "fn": (2, 1),
        "tp": (2, 2),
        "TOTAL": (1, 3)
    }
    
    for col, pos in subplot_positions.items():
        if col in summary.columns:
            fig.add_trace(
                go.Bar(x=summary[index_column], y=summary[col], name=col),
                row=pos[0], col=pos[1]
            )
    
    fig.update_layout(height=1000, width=1400, title_text=f"Confusion Matrix vs {index_column.replace('_', ' ').title()}", showlegend=False)
    fig.update_xaxes(tickangle=45)
    return fig

# Create plots for each specified category and save to HTML
categories = ['CORPORATE_DEVISION', 'Bundesland', 'CONSTRACTION_DESIGN', 'ZONE', 'TYPE_OF_DEDUCTIBLE', 'PRODUCTLINE', 'PRIOR_DAMAGES', 'UNDERWRITER']
titles = ["True Negatives (tn)", "False Positives (fp)", "Total observations (TOTAL)", "False Negatives (fn)", "True Positives (tp)"]

figures = []
for category in categories:
    fig = create_summary_and_plot(df, category, titles)
    figures.append(plot(fig, output_type='div', include_plotlyjs='cdn'))

# Combine all figures into a single HTML file
html_content = '<html><head><title>Confusion Matrix vs Various Categories</title></head><body>'
for fig_div in figures:
    html_content += fig_div
html_content += '</body></html>'

# Save the HTML content to a file
with open('confusion_matrix_plots.html', 'w') as f:
    f.write(html_content)

In [58]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import plot
import numpy as np

# Ensure the data types are correct
df['CORPORATE_DEVISION'] = df['CORPORATE_DEVISION'].astype(str)
df['PREDICTION_TYPE'] = df['PREDICTION_TYPE'].astype(str)
df['Bundesland'] = df['Bundesland'].astype(str)
df['CONSTRUCTION_YEAR'] = df['CONSTRUCTION_YEAR'].astype(int)

# Function to create summary table and bar chart subplots
def create_summary_and_bar_plot(df, index_column, subplot_titles):
    summary = df.pivot_table(index=index_column, 
                             columns='PREDICTION_TYPE', 
                             aggfunc='size', 
                             fill_value=0)
    summary['TOTAL'] = summary.sum(axis=1)
    summary = summary.sort_values(by='TOTAL', ascending=True).reset_index()
    
    fig = make_subplots(rows=2, cols=3, shared_xaxes=False, 
                        vertical_spacing=0.3, horizontal_spacing=0.1, subplot_titles=subplot_titles)
    
    subplot_positions = {
        "tn": (1, 1),
        "fp": (1, 2),
        "fn": (2, 1),
        "tp": (2, 2),
        "TOTAL": (1, 3)
    }
    
    for col, pos in subplot_positions.items():
        if col in summary.columns:
            fig.add_trace(
                go.Bar(x=summary[index_column], y=summary[col], name=col),
                row=pos[0], col=pos[1]
            )
    
    fig.update_layout(height=1000, width=1400, title_text=f"Confusion Matrix vs {index_column.replace('_', ' ').title()}", showlegend=False)
    fig.update_xaxes(tickangle=45)
    return fig

# Function to create summary table and line chart subplots
def create_summary_and_line_plot(df, index_column, subplot_titles, log_scale=False, every_nth=False):
    summary = df.pivot_table(index=index_column, 
                             columns='PREDICTION_TYPE', 
                             aggfunc='size', 
                             fill_value=0)
    summary['TOTAL'] = summary.sum(axis=1)
    summary = summary.sort_values(by=index_column).reset_index()

    if every_nth:
        summary = summary.iloc[::every_nth, :]
    
    fig = make_subplots(rows=2, cols=3, shared_xaxes=False, 
                        vertical_spacing=0.3, horizontal_spacing=0.1, subplot_titles=subplot_titles)
    
    subplot_positions = {
        "tn": (1, 1),
        "fp": (1, 2),
        "fn": (2, 1),
        "tp": (2, 2),
        "TOTAL": (1, 3)
    }
    
    for col, pos in subplot_positions.items():
        if col in summary.columns:
            fig.add_trace(
                go.Scatter(x=summary[index_column], y=summary[col], mode='lines', name=col),
                row=pos[0], col=pos[1]
            )
    
    if log_scale:
        fig.update_xaxes(type="log")
    
    fig.update_layout(height=1000, width=1400, title_text=f"Confusion Matrix vs {index_column.replace('_', ' ').title()}", showlegend=False)
    fig.update_xaxes(tickangle=45)
    return fig

# Categorize DAMAGE_HEAVY_RAIN_ZONE
df['DAMAGE_HEAVY_RAIN_ZONE_CATEGORY'] = pd.cut(df['DAMAGE_HEAVY_RAIN_ZONE'], bins=[-np.inf, 1.5, 2.5, np.inf], labels=[1, 2, 3])

# Create plots for bar chart categories
bar_categories = ['CORPORATE_DEVISION', 'Bundesland', 'CONSTRACTION_DESIGN', 'ZONE', 'TYPE_OF_DEDUCTIBLE', 'PRODUCTLINE', 'PRIOR_DAMAGES', 'UNDERWRITER']
titles = ["True Negatives (tn)", "False Positives (fp)", "Total observations (TOTAL)", "False Negatives (fn)", "True Positives (tp)"]

figures = []
for category in bar_categories:
    fig = create_summary_and_bar_plot(df, category, titles)
    figures.append(plot(fig, output_type='div', include_plotlyjs='cdn'))

# Create bar chart for DAMAGE_HEAVY_RAIN_ZONE_CATEGORY
fig = create_summary_and_bar_plot(df, 'DAMAGE_HEAVY_RAIN_ZONE_CATEGORY', titles)
figures.append(plot(fig, output_type='div', include_plotlyjs='cdn'))

# Filter CONSTRUCTION_YEAR
df_construction_year = df[df['CONSTRUCTION_YEAR'] > 1900]

# Create line chart for CONSTRUCTION_YEAR, every 5th year
fig = create_summary_and_line_plot(df_construction_year, 'CONSTRUCTION_YEAR', titles, every_nth=5)
figures.append(plot(fig, output_type='div', include_plotlyjs='cdn'))

# Create line chart for WFL with logarithmic x-axis, every 10th value
fig = create_summary_and_line_plot(df, 'WFL', titles, log_scale=True, every_nth=10)
figures.append(plot(fig, output_type='div', include_plotlyjs='cdn'))

# Combine all figures into a single HTML file
html_content = '<html><head><title>Confusion Matrix vs Various Categories</title></head><body>'
for fig_div in figures:
    html_content += fig_div
html_content += '</body></html>'

# Save the HTML content to a file
with open('confusion_matrix_plots.html', 'w') as f:
    f.write(html_content)





In [59]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import plot
import numpy as np

# Ensure the data types are correct
df['CORPORATE_DEVISION'] = df['CORPORATE_DEVISION'].astype(str)
df['PREDICTION_TYPE'] = df['PREDICTION_TYPE'].astype(str)
df['Bundesland'] = df['Bundesland'].astype(str)
df['CONSTRUCTION_YEAR'] = df['CONSTRUCTION_YEAR'].astype(int)

# Function to create summary table and bar chart subplots with percentages
def create_summary_and_bar_plot_percentage(df, index_column, subplot_titles):
    summary = df.pivot_table(index=index_column, 
                             columns='PREDICTION_TYPE', 
                             aggfunc='size', 
                             fill_value=0)
    summary['TOTAL'] = summary.sum(axis=1)
    
    # Convert counts to percentages
    for col in summary.columns:
        summary[col] = (summary[col] / summary['TOTAL']) * 100
    
    summary = summary.sort_values(by='TOTAL', ascending=True).reset_index()
    
    fig = make_subplots(rows=2, cols=3, shared_xaxes=False, 
                        vertical_spacing=0.3, horizontal_spacing=0.1, subplot_titles=subplot_titles)
    
    subplot_positions = {
        "tn": (1, 1),
        "fp": (1, 2),
        "fn": (2, 1),
        "tp": (2, 2),
        "TOTAL": (1, 3)
    }
    
    for col, pos in subplot_positions.items():
        if col in summary.columns:
            fig.add_trace(
                go.Bar(x=summary[index_column], y=summary[col], name=col),
                row=pos[0], col=pos[1]
            )
    
    fig.update_layout(height=1000, width=1400, title_text=f"Confusion Matrix vs {index_column.replace('_', ' ').title()} (Percentages)", showlegend=False)
    fig.update_xaxes(tickangle=45)
    return fig

# Function to create summary table and line chart subplots with percentages
def create_summary_and_line_plot_percentage(df, index_column, subplot_titles, log_scale=False, every_nth=False):
    summary = df.pivot_table(index=index_column, 
                             columns='PREDICTION_TYPE', 
                             aggfunc='size', 
                             fill_value=0)
    summary['TOTAL'] = summary.sum(axis=1)
    
    # Convert counts to percentages
    for col in summary.columns:
        summary[col] = (summary[col] / summary['TOTAL']) * 100
    
    summary = summary.sort_values(by=index_column).reset_index()

    if every_nth:
        summary = summary.iloc[::every_nth, :]
    
    fig = make_subplots(rows=2, cols=3, shared_xaxes=False, 
                        vertical_spacing=0.3, horizontal_spacing=0.1, subplot_titles=subplot_titles)
    
    subplot_positions = {
        "tn": (1, 1),
        "fp": (1, 2),
        "fn": (2, 1),
        "tp": (2, 2),
        "TOTAL": (1, 3)
    }
    
    for col, pos in subplot_positions.items():
        if col in summary.columns:
            fig.add_trace(
                go.Scatter(x=summary[index_column], y=summary[col], mode='lines', name=col),
                row=pos[0], col=pos[1]
            )
    
    if log_scale:
        fig.update_xaxes(type="log")
    
    fig.update_layout(height=1000, width=1400, title_text=f"Confusion Matrix vs {index_column.replace('_', ' ').title()} (Percentages)", showlegend=False)
    fig.update_xaxes(tickangle=45)
    return fig

# Categorize DAMAGE_HEAVY_RAIN_ZONE
df['DAMAGE_HEAVY_RAIN_ZONE_CATEGORY'] = pd.cut(df['DAMAGE_HEAVY_RAIN_ZONE'], bins=[-np.inf, 1.5, 2.5, np.inf], labels=[1, 2, 3])

# Create plots for bar chart categories
bar_categories = ['CORPORATE_DEVISION', 'Bundesland', 'CONSTRACTION_DESIGN', 'ZONE', 'TYPE_OF_DEDUCTIBLE', 'PRODUCTLINE', 'PRIOR_DAMAGES', 'UNDERWRITER']
titles = ["True Negatives (tn)", "False Positives (fp)", "Total observations (TOTAL)", "False Negatives (fn)", "True Positives (tp)"]

figures = []
for category in bar_categories:
    fig = create_summary_and_bar_plot_percentage(df, category, titles)
    figures.append(plot(fig, output_type='div', include_plotlyjs='cdn'))

# Create bar chart for DAMAGE_HEAVY_RAIN_ZONE_CATEGORY
fig = create_summary_and_bar_plot_percentage(df, 'DAMAGE_HEAVY_RAIN_ZONE_CATEGORY', titles)
figures.append(plot(fig, output_type='div', include_plotlyjs='cdn'))

# Filter CONSTRUCTION_YEAR
df_construction_year = df[df['CONSTRUCTION_YEAR'] > 1900]

# Create line chart for CONSTRUCTION_YEAR, every 5th year
fig = create_summary_and_line_plot_percentage(df_construction_year, 'CONSTRUCTION_YEAR', titles, every_nth=5)
figures.append(plot(fig, output_type='div', include_plotlyjs='cdn'))

# Create line chart for WFL with logarithmic x-axis, every 10th value
fig = create_summary_and_line_plot_percentage(df, 'WFL', titles, log_scale=True, every_nth=10)
figures.append(plot(fig, output_type='div', include_plotlyjs='cdn'))

# Combine all figures into a single HTML file
html_content = '<html><head><title>Confusion Matrix vs Various Categories (Percentages)</title></head><body>'
for fig_div in figures:
    html_content += fig_div
html_content += '</body></html>'

# Save the HTML content to a file
with open('confusion_matrix_plots_percentages.html', 'w') as f:
    f.write(html_content)



