# Neural Network

## Imports

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, recall_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

## Load the data

In [7]:
# Load the CSV file
file_path = '/Users/minarandolf/Desktop/Capstone Project/contract_classification.csv'
df = pd.read_csv(file_path)

# Convert the 'contract_year' column to datetime
df['contract_year'] = pd.to_datetime(df['contract_year'], format='%Y-%m-%d')
df['year'] = df['contract_year'].dt.year

# Ensure all categorical columns are treated as such
categorical_columns = ['CORPORATE_DEVISION', 'ORTS-NAME', 'STRASSE', 'CONSTRACTION_DESIGN', 'ZONE', 'PRODUCTLINE', 'UNDERWRITER', 'PARTY-ID']
df[categorical_columns] = df[categorical_columns].astype(str)

print(f"Number of observations: {len(df)}")
print("Column names:", df.columns)
print("Column types:", df.dtypes)

  df = pd.read_csv(file_path)


Number of observations: 11574439
Column names: Index(['ANO_SID', 'CORPORATE_DEVISION', 'ORTPLZ', 'ORTS-NAME', 'STRASSE',
       'SUM_INSURED', 'CONSTRACTION_DESIGN', 'CONSTRUCTION_YEAR', 'WFL',
       'ZONE', 'SF-SYSTEM', 'TYPE_OF_DEDUCTIBLE', 'DRAIN_PIPE_INSURED',
       'PRODUCTLINE', 'PRIOR_DAMAGES', 'UVV-KZ', 'UNDERWRITER', 'PARTY-ID',
       'contract_year', 'PIPE_PREMIUM_AMOUNT', 'YEAR', 'DAMAGE', 'year'],
      dtype='object')
Column types: ANO_SID                       float64
CORPORATE_DEVISION             object
ORTPLZ                        float64
ORTS-NAME                      object
STRASSE                        object
SUM_INSURED                   float64
CONSTRACTION_DESIGN            object
CONSTRUCTION_YEAR             float64
WFL                           float64
ZONE                           object
SF-SYSTEM                     float64
TYPE_OF_DEDUCTIBLE              int64
DRAIN_PIPE_INSURED              int64
PRODUCTLINE                    object
PRIOR_DAMAGES   

## Define Preprocessing Function

In [8]:
def preprocess_data(df):
    # Handle missing values
    df.fillna('missing', inplace=True)

    # Convert categorical columns to dummy variables
    df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

    # Separate features and target
    X = df.drop('DAMAGE', axis=1)
    y = df['DAMAGE']

    # Standardize the features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    return X, y

## Define Neural Network Model

In [9]:
def create_model(input_dim):
    model = Sequential()
    model.add(Dense(64, input_dim=input_dim, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['Recall'])
    return model

## Rolling Window Estimation

In [None]:
years = sorted(df['year'].unique())

recall_scores = []

for i in range(len(years) - 1):
    train_year = years[i]
    test_year = years[i + 1]

    train_df = df[df['year'] == train_year]
    test_df = df[df['year'] == test_year]

    X_train, y_train = preprocess_data(train_df)
    X_test, y_test = preprocess_data(test_df)

    # Convert to TensorFlow datasets
    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(32)
    test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(32)

    # Create and train the model
    model = create_model(X_train.shape[1])
    model.fit(train_dataset, epochs=10, verbose=1)

    # Evaluate the model
    loss, recall = model.evaluate(test_dataset, verbose=1)
    print(f"Year {train_year} to {test_year} - Recall: {recall:.4f}")
    recall_scores.append(recall)

print("Average Recall over all years:", np.mean(recall_scores))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.fillna('missing', inplace=True)
