# Neural Network

## Imports

In [1]:
import dask.dataframe as dd
from dask_ml.preprocessing import DummyEncoder, StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from dask_ml.model_selection import train_test_split
import numpy as np
import pandas as pd
import os 

## Load the data

In [2]:
# Data Loading
# Print the working directory so the run records where the notebook was
# executed from.
print("Current working directory:", os.getcwd())

# Location of the contracts CSV. The original absolute path remains the
# default, so existing runs behave as before; set the CONTRACTS_CSV
# environment variable to point the notebook at a copy stored elsewhere.
file_path = os.environ.get(
    'CONTRACTS_CSV',
    '/Users/minarandolf/Capstone/Capstone-Project/datasets/data_all_unique_values.csv',
)

# Load the CSV file into a DataFrame.
# A missing file is fatal: every later cell reads `contracts`, so stop here
# with a clear message rather than printing and running into a NameError in
# the next cell.
try:
    contracts = pd.read_csv(file_path)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"The file {file_path} does not exist. "
        "Set the CONTRACTS_CSV environment variable to the location of the CSV."
    ) from err

# NOTE(review): the recorded output echoes the read_csv line, which looks like
# a pandas warning raised by that call (presumably mixed column types) --
# confirm and consider passing explicit dtypes.

# Quick overview of what was loaded: size, schema and the first rows.
print(f"Number of observations: {len(contracts)}")
print("Column names:")
print(contracts.columns)
print("Column types:")
print(contracts.dtypes)
print(contracts.head())

Current working directory: /Users/minarandolf/Capstone/Capstone-Project/models_scripts


  contracts = pd.read_csv(file_path)


Number of observations: 9466361
Column names:
Index(['Unnamed: 0', 'ANO_SID', 'CORPORATE_DEVISION', 'Bundesland', 'Typ',
       'ORTPLZ', 'CONSTRACTION_DESIGN', 'CONSTRUCTION_YEAR', 'WFL', 'ZONE',
       'TYPE_OF_DEDUCTIBLE', 'DRAIN_PIPE_INSURED', 'PRODUCTLINE',
       'PRIOR_DAMAGES', 'UVV-KZ', 'UNDERWRITER', 'YEAR',
       'DAMAGE_HEAVY_RAIN_ZONE', 'LONGITUDE', 'LATITUDE', 'DAMAGE'],
      dtype='object')
Column types:
Unnamed: 0                  int64
ANO_SID                   float64
CORPORATE_DEVISION         object
Bundesland                 object
Typ                        object
ORTPLZ                      int64
CONSTRACTION_DESIGN        object
CONSTRUCTION_YEAR         float64
WFL                       float64
ZONE                       object
TYPE_OF_DEDUCTIBLE          int64
DRAIN_PIPE_INSURED          int64
PRODUCTLINE                object
PRIOR_DAMAGES               int64
UVV-KZ                      int64
UNDERWRITER                object
YEAR                        int

## Data Preparation

In [3]:
# Discard every row that contains at least one missing value.
contracts = contracts.dropna()

# Cast ZONE to text and report the number of distinct values before and after;
# the cast is meant to reduce the number of unique zone values
# (presumably the column mixes numeric and text entries -- TODO confirm).
print(f"Original unique values: {contracts['ZONE'].nunique()}")
contracts['ZONE'] = contracts['ZONE'].astype('str')
print(f"New unique values: {contracts['ZONE'].nunique()}")

# One-hot encode all object-typed columns; drop_first=True omits the first
# level of each encoded column.
columns_to_encode = contracts.select_dtypes(include=['object']).columns
df_binary = pd.get_dummies(
    contracts,
    columns=columns_to_encode,
    drop_first=True,
)
df_binary.info()

# Report how many rows are left once preprocessing is done.
print(f"Number of observations after preprocessing: {df_binary.shape[0]}")

Original unique values: 32
New unique values: 23
<class 'pandas.core.frame.DataFrame'>
Index: 8688392 entries, 0 to 9466360
Data columns (total 72 columns):
 #   Column                                         Dtype  
---  ------                                         -----  
 0   Unnamed: 0                                     int64  
 1   ANO_SID                                        float64
 2   ORTPLZ                                         int64  
 3   CONSTRUCTION_YEAR                              float64
 4   WFL                                            float64
 5   TYPE_OF_DEDUCTIBLE                             int64  
 6   DRAIN_PIPE_INSURED                             int64  
 7   PRIOR_DAMAGES                                  int64  
 8   UVV-KZ                                         int64  
 9   YEAR                                           int64  
 10  DAMAGE_HEAVY_RAIN_ZONE                         float64
 11  LONGITUDE                                      float64
 12

In [None]:
# These sklearn imports shadow the dask_ml StandardScaler / train_test_split
# imported in the first cell; from here on the names refer to the sklearn
# versions, which are applied to the pandas DataFrame df_binary.
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Define the target variable and feature columns.
target_column = 'DAMAGE'
# 'Unnamed: 0' is pandas' default name for an unnamed CSV column (typically a
# row index saved along with the data); it is not a property of the contract,
# so it is kept out of the model inputs.
non_feature_columns = {target_column, 'Unnamed: 0'}
feature_columns = [col for col in df_binary.columns if col not in non_feature_columns]
# NOTE(review): ANO_SID looks like a record identifier -- confirm whether it
# should be excluded from the features as well.

# Split the data into features (X) and target (y).
X = df_binary[feature_columns]
y = df_binary[target_column]

# Split BEFORE scaling so that the test rows cannot influence the scaler.
# stratify=y keeps the share of damage cases equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize the features: mean/std are learned from the training rows only
# and then applied unchanged to the test rows. Both results are NumPy arrays.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Modelling

In [None]:
# Define the Neural Network Model
# `tf`, `Sequential` and `Dense` come from the import cell at the top of the
# notebook, so they are not imported again here.

# Seed TensorFlow so that weight initialisation is repeatable between runs.
tf.random.set_seed(42)

# Feed-forward binary classifier: two ReLU hidden layers (32 and 16 units)
# followed by a single sigmoid unit that outputs a probability.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Compile the model. The metric gets an explicit name so the training history
# always uses the keys 'recall' / 'val_recall', even when this cell is re-run
# in the same kernel (Keras would otherwise append a numeric suffix).
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.Recall(name='recall')],
)

# Print the model summary
model.summary()

# Training happens in the "Model Training" cell that follows. The fit() call
# that used to end this cell was missing its closing parenthesis (SyntaxError),
# duplicated that training run, and used the test set as validation data.

## Model Training

In [None]:
# Fit the network for 10 epochs in mini-batches of 32 samples.
# validation_split=0.2 makes Keras hold out 20% of the training arrays for
# validation; the per-epoch metrics end up in `history`.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

## Evaluation

In [None]:
# Plotting and metric helpers used only by the evaluation; they are imported
# here because the notebook's import cell does not provide them.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

# Evaluate the model on the test set (once).
# The model is compiled with a single metric, Recall, so evaluate() returns
# [loss, recall]; the second value is not an accuracy.
test_loss, test_recall = model.evaluate(X_test, y_test)
print(f'Test Loss: {test_loss:.4f}')
print(f'Test Recall: {test_recall:.2f}')

# Predict on the test set: probabilities first, then hard labels at 0.5.
y_pred_proba = model.predict(X_test).flatten()
y_pred = (y_pred_proba > 0.5).astype(int)

# Calculate additional metrics.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# AUC is a ranking metric and needs the predicted probabilities; passing the
# thresholded labels would collapse the ROC curve to a single operating point.
auc = roc_auc_score(y_test, y_pred_proba)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-Score: {f1:.2f}')
print(f'AUC: {auc:.2f}')

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(cm)

# Classification Report
print('Classification Report:')
print(classification_report(y_test, y_pred))

# Plot the confusion matrix
class_labels = ['No Damage', 'Damage']
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

# Plot training & validation recall values.
# 'val_recall' is computed on the validation split held out during fit(),
# not on the test set, hence the legend label.
fig, ax = plt.subplots()
ax.plot(history.history['recall'])
ax.plot(history.history['val_recall'])
ax.set_title('Model recall')
ax.set_ylabel('Recall')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Validation'], loc='upper left')
plt.show()