# Combine results of all models:
- K-Nearest Neighbors
- Logistic Regression
- Random Forest
- Naive Bayes

**Steps:**
- Get dataframes with predictions and id
- Merge dataframes using contract id (`ANO_SID`) and `YEAR`
- Compute column `PROBABILITY_MEAN` and `PREDICTION`
- Compute column `PREDICTION_TYPE` (tp, fp, tn, fn)
- Analyze results
    - Confusion matrix
    - Precision, recall

In [1]:
import pandas as pd
import numpy as np

In [46]:
# Import data: read each model's prediction file (KNN, logistic regression,
# random forest, naive Bayes) in that order.
knn, log_reg, rdm_forest, naive_bayes = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../knn_data/knn_results_all_2.csv',
        '../../predictions/predictions_lr2.csv',
        '../../predictions/Random_Forest_with_rebalancing.csv',
        '../../predictions/NaiveBayes.csv',
    )
)

In [51]:
# Give each model's probability column a model-specific name so the columns
# stay distinguishable once the frames are merged (random forest and naive
# Bayes both ship a column called 'Probability').
# `rename` ignores keys that are not present, so re-running this cell is harmless.
knn = knn.rename(columns={'PROBABILITY': 'PROBABILITY_KNN'})
log_reg = log_reg.rename(columns={'Prediction_Probability': 'PROBABILITY_LOGREG'})
rdm_forest = rdm_forest.rename(columns={'Probability': 'PROBABILITY_FOREST'})
naive_bayes = naive_bayes.rename(columns={'Probability': 'PROBABILITY_BAYES'})

In [52]:
# Merge dataframes: start from the random-forest frame (which carries the
# feature columns) and left-join every other model's probability on
# (ANO_SID, YEAR). Rows without a match in a right-hand frame get NaN there.
# NOTE(review): duplicate (ANO_SID, YEAR) pairs in a right-hand frame would
# multiply rows; consider `validate='many_to_one'` once key uniqueness is confirmed.
merge_keys = ['ANO_SID', 'YEAR']
results_df = (
    rdm_forest
    .merge(knn[['PROBABILITY_KNN', *merge_keys]], on=merge_keys, how='left')
    .merge(naive_bayes[['PROBABILITY_BAYES', *merge_keys]], on=merge_keys, how='left')
    .merge(log_reg[['PROBABILITY_LOGREG', *merge_keys]], on=merge_keys, how='left')
)

In [54]:
# Compute average probability: row-wise mean over the four model columns.
# pandas skips NaN by default, so a row missing some model's probability is
# averaged over the models that are present.
probability_columns = ['PROBABILITY_KNN', 'PROBABILITY_FOREST',
                       'PROBABILITY_LOGREG', 'PROBABILITY_BAYES']
results_df['PROBABILITY_MEAN'] = results_df[probability_columns].mean(axis='columns')

In [64]:
# Inspect every row whose YEAR is not 2014
results_df.loc[results_df['YEAR'].ne(2014)]

Unnamed: 0,ANO_SID,CORPORATE_DEVISION,Bundesland,Typ,ORTPLZ,CONSTRACTION_DESIGN,CONSTRUCTION_YEAR,WFL,ZONE,TYPE_OF_DEDUCTIBLE,...,LONGITUDE,LATITUDE,DAMAGE,PROBABILITY_FOREST,PROBABILITY_KNN,PROBABILITY_BAYES,PROBABILITY_LOGREG,PROBABILITY_MEAN,PREDICTION,PREDICTION_TYPE
0,4114028.0,VHV,Nordrhein-Westfalen,Stadt,42109,NORMAL_VENTURE,1967.565648,69.0,2.0,0,...,7.168175,51.285086,0,0.016225,0.000000,0.553543,0.344998,0.228692,0,tn
1,4114039.0,VHV,Nordrhein-Westfalen,Stadt,42277,NORMAL_VENTURE,1967.565648,65.0,4.0,0,...,7.221409,51.282877,0,0.004122,0.000000,0.552745,0.333919,0.222697,0,tn
2,4114045.0,VHV,Nordrhein-Westfalen,Stadt,42389,DESIGN_CLASS_I,1967.565648,75.0,1.0,0,...,7.257295,51.272306,0,0.000000,0.000000,0.542979,0.353608,0.224147,0,tn
3,4114055.0,VHV,Nordrhein-Westfalen,Kreis,42553,NORMAL_VENTURE,1967.565648,119.0,2.0,0,...,7.080255,51.312158,0,0.031360,0.000000,0.555107,0.279524,0.216498,0,tn
4,4114057.0,VHV,Nordrhein-Westfalen,Stadt,42113,NORMAL_VENTURE,1967.565648,100.0,4.0,0,...,7.112926,51.266648,0,0.062878,0.000000,0.555426,0.350548,0.242213,0,tn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6678146,145240797.0,VGV,Nordrhein-Westfalen,Stadt,47807,NORMAL_VENTURE,1987.000000,115.0,4.0,0,...,6.599406,51.301652,0,0.068521,0.143704,0.085753,0.824495,0.280618,0,tn
6678147,145240866.0,VGV,Nordrhein-Westfalen,Kreis,41844,NORMAL_VENTURE,1962.000000,145.0,3.0,0,...,6.268941,51.130919,0,0.050000,0.323105,0.557772,0.723705,0.413646,0,tn
6678148,145240867.0,VHV,Sachsen,Kreis,4758,NORMAL_VENTURE,1967.565648,48.0,2.0,0,...,13.407913,52.521681,0,0.000000,0.000000,0.553456,0.123781,0.169309,0,tn
6678149,145240892.0,VHV,Niedersachsen,Kreis,26316,NORMAL_VENTURE,1967.565648,150.0,4.0,0,...,8.098933,53.400089,0,0.020524,0.179953,0.554851,0.221454,0.244196,0,tn


In [57]:
# Compute prediction: 0 when the mean probability is below 0.5, otherwise 1
below_threshold = results_df['PROBABILITY_MEAN'].lt(0.5)
results_df['PREDICTION'] = np.where(below_threshold, 0, 1)

In [61]:
# Compute prediction type by comparing the actual label (DAMAGE) with the
# ensemble decision (PREDICTION):
#   tp: DAMAGE == 1 and PREDICTION == 1
#   tn: DAMAGE == 0 and PREDICTION == 0
#   fn: DAMAGE == 1 and PREDICTION == 0
#   fp: DAMAGE == 0 and PREDICTION == 1
actual_damage = results_df['DAMAGE'] == 1
actual_no_damage = results_df['DAMAGE'] == 0
predicted_damage = results_df['PREDICTION'] == 1
predicted_no_damage = results_df['PREDICTION'] == 0

# Start from a genuine missing value. The previous nested np.where mixed string
# labels with np.nan, which NumPy promotes to a string array, so unmatched rows
# ended up as the literal text 'nan' instead of a real NaN that isna() detects.
prediction_type = pd.Series(np.nan, index=results_df.index, dtype=object)

# The four masks are mutually exclusive, so the assignment order is irrelevant.
prediction_type[actual_damage & predicted_damage] = 'tp'
prediction_type[actual_no_damage & predicted_no_damage] = 'tn'
prediction_type[actual_damage & predicted_no_damage] = 'fn'
prediction_type[actual_no_damage & predicted_damage] = 'fp'

results_df['PREDICTION_TYPE'] = prediction_type

In [62]:
# Confusion matrix: number of rows per prediction type (tp, fp, tn, fn)
prediction_types = results_df.loc[:, 'PREDICTION_TYPE']
counts = prediction_types.value_counts()
counts

PREDICTION_TYPE
tn    6098742
fp     373888
tp     106327
fn      99194
Name: count, dtype: int64

In [63]:
# Pull the four confusion-matrix cells out of `counts`; a category that never
# occurred is treated as 0.
TP, FP, FN, TN = (counts.get(label, 0) for label in ('tp', 'fp', 'fn', 'tn'))

# Recall: share of actual damages (tp + fn) that were predicted.
# Precision: share of predicted damages (tp + fp) that were actual damages.
actual_positives = TP + FN
predicted_positives = TP + FP
recall = TP / actual_positives
precision = TP / predicted_positives

print(f"Recall: {recall}")
print(f"Precision: {precision}")

Recall: 0.5173534577975
Precision: 0.2214154076819758


In [65]:
# Summarize the combined frame: index range, column names and dtypes
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6678151 entries, 0 to 6678150
Data columns (total 27 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   ANO_SID                 float64
 1   CORPORATE_DEVISION      object 
 2   Bundesland              object 
 3   Typ                     object 
 4   ORTPLZ                  int64  
 5   CONSTRACTION_DESIGN     object 
 6   CONSTRUCTION_YEAR       float64
 7   WFL                     float64
 8   ZONE                    float64
 9   TYPE_OF_DEDUCTIBLE      int64  
 10  DRAIN_PIPE_INSURED      int64  
 11  PRODUCTLINE             object 
 12  PRIOR_DAMAGES           int64  
 13  UVV-KZ                  int64  
 14  UNDERWRITER             object 
 15  YEAR                    int64  
 16  DAMAGE_HEAVY_RAIN_ZONE  float64
 17  LONGITUDE               float64
 18  LATITUDE                float64
 19  DAMAGE                  int64  
 20  PROBABILITY_FOREST      float64
 21  PROBABILITY_KNN         float64

In [67]:
# Persist the combined predictions. `index=False` is not passed, so the
# DataFrame index is written as an additional leading column.
# NOTE(review): confirm downstream readers expect that extra column.
results_df.to_csv('../../predictions/combined_results.csv')