# Neural Network

## Imports

In [4]:
# Install required libraries
!pip install dask[dataframe] dask-ml tensorflow

import dask.dataframe as dd
from dask_ml.preprocessing import DummyEncoder, StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from dask_ml.model_selection import train_test_split
import numpy as np



In [5]:
import pandas as pd
from google.colab import drive

## Load the data

In [2]:
"""
from google.colab import drive
drive.mount('/content/drive')

# Define the file path and dtype
file_path = '/content/drive/My Drive/Capstone/Data/contract_classification.csv'
dtype = {
    'CORPORATE_DEVISION': 'category',
    'ORTS-NAME': 'category',
    'STRASSE': 'category',
    'CONSTRACTION_DESIGN': 'category',
    'ZONE': 'category',
    'PRODUCTLINE': 'category',
    'UNDERWRITER': 'category',
    'PARTY-ID': 'category'
}

# Load data in chunks
df = dd.read_csv(file_path, dtype=dtype, blocksize="64MB")

# Ensure that the categories are known
categorical_columns = list(dtype.keys())
df = df.categorize(columns=categorical_columns)
"""

Mounted at /content/drive


In [6]:
# Make the Drive folder that holds the dataset visible to this runtime
drive.mount('/content/drive')

# Read the complete CSV into an in-memory pandas DataFrame
file_path = '/content/drive/My Drive/Capstone/Data/contract_classification.csv'
df = pd.read_csv(file_path)

# Row count of the loaded frame
num_observations = df.shape[0]
print(f"Number of observations: {num_observations}")

# Names of all columns, printed under a heading line
column_names = df.columns
print("Column names:", column_names, sep="\n")

# dtype that pandas inferred for each column, printed under a heading line
column_types = df.dtypes
print("Column types:", column_types, sep="\n")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  df = pd.read_csv(file_path)


Number of observations: 11574439
Column names:
Index(['ANO_SID', 'CORPORATE_DEVISION', 'ORTPLZ', 'ORTS-NAME', 'STRASSE',
       'SUM_INSURED', 'CONSTRACTION_DESIGN', 'CONSTRUCTION_YEAR', 'WFL',
       'ZONE', 'SF-SYSTEM', 'TYPE_OF_DEDUCTIBLE', 'DRAIN_PIPE_INSURED',
       'PRODUCTLINE', 'PRIOR_DAMAGES', 'UVV-KZ', 'UNDERWRITER', 'PARTY-ID',
       'contract_year', 'PIPE_PREMIUM_AMOUNT', 'YEAR', 'DAMAGE'],
      dtype='object')
Column types:
ANO_SID                float64
CORPORATE_DEVISION      object
ORTPLZ                 float64
ORTS-NAME               object
STRASSE                 object
SUM_INSURED            float64
CONSTRACTION_DESIGN     object
CONSTRUCTION_YEAR      float64
WFL                    float64
ZONE                    object
SF-SYSTEM              float64
TYPE_OF_DEDUCTIBLE       int64
DRAIN_PIPE_INSURED       int64
PRODUCTLINE             object
PRIOR_DAMAGES            int64
UVV-KZ                   int64
UNDERWRITER             object
PARTY-ID                object

## Preprocess the data

In [None]:
"""
# Convert categorical columns to dummy variables
df = DummyEncoder(columns=categorical_columns).fit_transform(df)

# Handle missing values
df = df.fillna(0)

# Reduce memory usage by converting float64 to float32
float_columns = df.select_dtypes(include=['float64']).columns
df[float_columns] = df[float_columns].astype('float32')

# Convert to Dask arrays for compatibility with TensorFlow
df = df.to_dask_array(lengths=True)
"""

In [None]:
import dask.dataframe as dd
from dask_ml.preprocessing import DummyEncoder, StandardScaler
from dask_ml.model_selection import train_test_split

# Load the CSV lazily with Dask, declaring the string-valued columns as
# categoricals so every partition parses them with the same dtype.
file_path = '/content/drive/My Drive/Capstone/Data/contract_classification.csv'
dtype = {
    'CORPORATE_DEVISION': 'category',
    'ORTS-NAME': 'category',
    'STRASSE': 'category',
    'CONSTRACTION_DESIGN': 'category',
    'ZONE': 'category',
    'PRODUCTLINE': 'category',
    'UNDERWRITER': 'category',
    'PARTY-ID': 'category'
}
# Use one concrete list for every column selection below instead of passing
# `dtype.keys()` (a dict view) to each call.
categorical_columns = list(dtype)
df = dd.read_csv(file_path, dtype=dtype)

# Ensure that the categories are known before encoding
df = df.categorize(columns=categorical_columns)

# Convert categorical columns to dummy variables
# NOTE(review): STRASSE, ORTS-NAME and PARTY-ID look like high-cardinality
# identifiers; one-hot encoding them may create a very large number of
# columns -- confirm the encoded frame fits in memory before computing it.
df = DummyEncoder(columns=categorical_columns).fit_transform(df)

# Handle missing values: drop every row that still contains a NaN
df = df.dropna()

# Split the data into features and target
X = df.drop(columns=['DAMAGE'])
y = df['DAMAGE']

# Standardize the features.
# The scaler statistics are computed over all rows of X, i.e. over every
# year, including rows that are later used as the test set.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Keras trains in float32 by default, so downcast before materialising to
# halve the memory needed for the in-memory feature matrix.
X_scaled = X_scaled.astype('float32')

# Materialise the full scaled feature frame and the target as pandas objects.
# `df` stays a lazy Dask DataFrame.
X_scaled = X_scaled.compute()
y = y.compute()

def rolling_window_train_test_split(X, y, train_year, test_year, years=None):
    """Split features and target into one training year and one test year.

    Rows are selected with positional boolean masks, so the result does not
    depend on the index labels of ``X`` or ``y`` (labels coming from a Dask
    frame read with ``dd.read_csv`` restart in every partition and are
    therefore not unique).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one row per observation.
    y : pandas.Series
        Target values, positionally aligned with ``X``.
    train_year, test_year
        Year values that select the training rows and the test rows.
    years : array-like, optional
        Raw year of every row, positionally aligned with ``X``. When omitted,
        ``X['YEAR']`` is used, which is only meaningful if that column has
        not been rescaled.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``years``, ``X`` and ``y`` do not all have the same length.
    """
    if years is None:
        years = X['YEAR']
    # A plain array drops any index labels, so masking is purely positional.
    years = np.asarray(years)
    if not (len(years) == len(X) == len(y)):
        raise ValueError(
            f"Length mismatch: years={len(years)}, X={len(X)}, y={len(y)}"
        )

    train_mask = years == train_year
    test_mask = years == test_year
    return X.loc[train_mask], X.loc[test_mask], y.loc[train_mask], y.loc[test_mask]


# Example: Split for the years 2014 (train) and 2015 (test)
train_year = 2014
test_year = 2015

# The YEAR column inside X_scaled has been standardised, so take the raw years
# from the lazy Dask frame. Row order matches X_scaled and y because all three
# are computed from the same `df`. Note that this re-executes the Dask graph.
row_years = df['YEAR'].compute().to_numpy()

X_train, X_test, y_train, y_test = rolling_window_train_test_split(
    X_scaled, y, train_year, test_year, years=row_years
)


## Modelling

In [None]:
"""
years = df['contract_year'].unique().compute()
years.sort()

recalls = []

for i in range(len(years) - 1):
    train_year = years[i]
    test_year = years[i + 1]

    # Split the data based on the year
    train_data = df[df['contract_year'] == train_year]
    test_data = df[df['contract_year'] == test_year]

    # Split the data into features and target
    X_train = train_data.drop('DAMAGE', axis=1)
    y_train = train_data['DAMAGE']
    X_test = test_data.drop('DAMAGE', axis=1)
    y_test = test_data['DAMAGE']

    # Convert to Dask arrays for compatibility with TensorFlow
    X_train = X_train.to_dask_array(lengths=True)
    X_test = X_test.to_dask_array(lengths=True)
    y_train = y_train.to_dask_array(lengths=True)
    y_test = y_test.to_dask_array(lengths=True)

    # Build the neural network
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['Recall'])

    # Train the model
    model.fit(X_train, y_train, epochs=10, batch_size=256, validation_data=(X_test, y_test))

    # Evaluate the model
    _, recall = model.evaluate(X_test, y_test)
    recalls.append(recall)

    print(f'Trained on {train_year}, tested on {test_year}, Recall: {recall}')

# Print the average recall
average_recall = np.mean(recalls)
print(f'Average Recall: {average_recall}')
"""

In [None]:
# Define and Train the Neural Network Model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

# Define the neural network model: two hidden ReLU layers followed by a single
# sigmoid unit. The input width is declared with an explicit Input layer
# rather than the deprecated `input_dim=` argument on the first Dense layer.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Compile the model.
# The metric name is pinned to 'recall' so the history keys stay 'recall' and
# 'val_recall' even when this cell is re-run in the same kernel; an unnamed
# metric can be auto-renamed (e.g. 'recall_1') on later instantiations.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.Recall(name='recall')],
)

# Print the model summary
model.summary()

# Train the model; the test-year split is also used as validation data, so
# validation metrics are reported after every epoch.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test),
)


## Evaluation

In [None]:
"""
# Evaluate the overall performance
average_recall = np.mean(recalls)
print(f'Average Recall over all years: {average_recall}')
"""

In [None]:
# Score the trained model on the test split and report its recall
loss, recall = model.evaluate(X_test, y_test)
print(f'Test Recall: {recall}')

# Draw the per-epoch recall curves recorded during training:
# first the training curve, then the validation curve.
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
for history_key in ('recall', 'val_recall'):
    ax.plot(history.history[history_key])
ax.set_title('Model recall')
ax.set_ylabel('Recall')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Test'], loc='upper left')
plt.show()