# Incident Analysis 

In [1]:
import re
import math
import numpy as np
import pandas as pd

from pprint import pprint
from sklearn.model_selection import train_test_split

# Neural network model
import tensorflow as tf
import tensorflow_hub as hub
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

### Directory and Client Setup
Configure with data directory and current client.

The program generates models for each client individually for privacy reasons.

In [2]:
DATA_DIR = r'../../data'

# Change client in these statements
COMPANY = 'geotec'
from clients import Geotec as client

Generate the set of text based and categorical predictors for later steps.

In [3]:
ATTRS = set(j for i in client for j in i.keys())
TEXT_PREDICTORS = set(['description'])
CATEGORICAL_PREDICTORS = set(i for i in client.incident_mapping 
                             if i not in TEXT_PREDICTORS) - set(['id'])

## Data Loading
Uses the mapping between excel columns and predictors (provided in `clients.py`) to extract the required columns.

In [4]:
# Get comma separated list of the required excel columns from each sheet.
incident_excel_cols = ','.join(client.incident_mapping.values())
action_excel_cols = ','.join(client.action_mapping.values())
factor_excel_cols = ','.join(client.factor_mapping.values())

# Get the corresponding column names in the order that they appear in the excel sheet.
def sort_cols(mapping_dict):
    return [k for k, v in sorted(mapping_dict.items(), key=lambda x: x[1])]

incident_names = sort_cols(client.incident_mapping)
action_names = sort_cols(client.action_mapping)
factor_names = sort_cols(client.factor_mapping)

In [5]:
# Load the data
incidents = pd.read_excel(f'{DATA_DIR}/{COMPANY}-incidents.xlsx', 
                          usecols=incident_excel_cols, skiprows=9, names=incident_names)
actions = pd.read_excel(f'{DATA_DIR}/{COMPANY}-actions.xlsx', 
                        usecols=action_excel_cols, skiprows=6, names=action_names)
factors = pd.read_excel(f'{DATA_DIR}/{COMPANY}-factors.xlsx', 
                        usecols=factor_excel_cols, skiprows=5, names=factor_names)

def process_ids(df):
    """Drops cols with no id's, sets integer id's, and mades id the index."""
    df = df.dropna(subset=['id'])
    df.loc[:,'id'] = df.loc[:,'id'].astype('int64')
    return df.set_index('id')

incidents = process_ids(incidents)
actions = process_ids(actions)
full_factors = process_ids(factors)
factors = full_factors.drop(['factor-text'], axis=1)

  warn(msg)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


## One-Hot Encoding
Performs one-hot encoding on the categorical attributes to prepare the instances for input into ML algorithms.

In [7]:
def one_hot_categoricals(df, categoricals: list[str], 
                         dummy_na=True, na_sentinel=None) -> tuple[pd.DataFrame, dict]:
    # Remove the index ID (replaced on return)
    df = df.reset_index()

    # Stores the mapping between new columns and existing categories
    mappings = {}
    for column in categoricals:
        # Get integer categories
        factorised, mapping = pd.factorize(df[column], na_sentinel=na_sentinel)
        mappings[column] = mapping

        # One-hot encode
        dummies = pd.get_dummies(factorised, dummy_na=dummy_na, prefix=column)

        # Replace the existing categorical col with the one-hots
        df = df.drop([column], axis=1)
        df = df.reset_index(drop=True)
        dummies = dummies.reset_index(drop=True)
        df = pd.concat([df, dummies], axis=1)
    
    return df.set_index('id'), mappings

In [8]:
# Create a working copy of the incidents dataframe for ML
df = incidents.copy()
df, mapping = one_hot_categoricals(df, CATEGORICAL_PREDICTORS)

## Target Engineering

In [9]:
# Count the number of actions associated with each incident
temp = incidents.join(actions, on='id')
n_actions = temp.groupby('id').count()['action-id']

# Repeat for factors
temp = incidents.join(factors, on='id')
n_factors = temp.groupby('id').count()['factor-level']

In [10]:
# One-hot encode the factor levels - possible memory issues with ridiculous number of factors
temp, factor_mapping = one_hot_categoricals(temp, ['factor-level'], dummy_na=False, na_sentinel=-1)

# factor_codes has one row for each instance, and 0-many columns with 1's representing
# boolean flags of whether each factor (column) is present for that instance
factor_cols = [col for col in temp.columns if 'factor-level_' in col]
factor_codes = temp[factor_cols].groupby('id').sum()

del temp

In [11]:
# Append the target columns
df = df.join(factor_codes, on='id')
df['n_actions'] = n_actions
df['n_factors'] = n_factors

## Text Embedding
Uses a prebuilt text embedding model from Google to convert each text based predictor into a 50-dimensional vector, where nearby vectors in this 50-D space are assumed to be generated from strings with similar meaning.

In [12]:
# Load the embedding model
embedding_dim = 50
embedding_model = "https://tfhub.dev/google/nnlm-en-dim50/2"
embedding_layer = hub.KerasLayer(embedding_model, input_shape=[], dtype=tf.string, trainable=True)

2021-12-16 09:38:48.713734: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] could not open file to read NUMA node: /sys/bus/pci/devices/0000:08:00.0/numa_node
Your kernel may have been built without NUMA support.
2021-12-16 09:38:48.809817: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] could not open file to read NUMA node: /sys/bus/pci/devices/0000:08:00.0/numa_node
Your kernel may have been built without NUMA support.
2021-12-16 09:38:48.810350: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] could not open file to read NUMA node: /sys/bus/pci/devices/0000:08:00.0/numa_node
Your kernel may have been built without NUMA support.
2021-12-16 09:38:48.815734: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate

In [13]:
def embed_text(df, texts: list[str]) -> pd.DataFrame:
    """
    Takes a dataframe and a list of n text columns. Returns a dataframe with
    those n text columns replaced by 50n columns containing the aforementioned
    text embedding.
    """
    for column in texts:
        res = np.zeros((len(df[column]), embedding_dim))

        # Iterate over all the rows of the text column
        for idx, txt in df.reset_index()[column].items():
            if type(txt) is not str:
                continue
            
            # If the cell is text, perform and store the embedding
            res[idx] = embedding_layer([txt])[0]

        # Add the embedding back into the dataframe
        df = df.drop([column], axis=1)
        col_names = [f'{column}_{i}' for i in range(50)]
        df[col_names] = res

    return df

In [14]:
# Perform the embedding and convert all the columns to floats now that all are numeric.
df = embed_text(df, TEXT_PREDICTORS)
df = df.astype(float)

In [15]:
# Optionally save to excel for viewing
# df.to_excel('test.xlsx')

## Modelling
### Train / Val / Test Split
Split the data for training, validation, and testing of the ML algorithm.

In [16]:
train, temp = train_test_split(df, train_size=0.7, random_state=22)
val, test = train_test_split(temp, train_size=0.5, random_state=22)

In [17]:
def split_xy(df):
    """
    Takes a dataframe formatted from above, and splits it into the required
    numeric and categorical predictors and targets for analysis.

    Note that the categorical and numeric return values will not always have the
    same amount of rows, since instances without factors are not included in the
    categorical dataset (since the categories are factor levels and these
    have no factors.)
    """
    # Retain only columns with factors for the categorical dataset
    cat_df = df[df['factor-level_-1'] == 0]
    cat_df = cat_df.drop('factor-level_-1', axis=1)

    # Split the predictors and targets
    y_cat = cat_df[[col for col in cat_df.columns if 'factor-level_' in col]]
    y_num = df[['n_actions', 'n_factors']]
    X_cat = cat_df[[col for col in cat_df.columns if col not in y_num.columns and col not in y_cat.columns]]
    X_num = df[[col for col in df.columns if col not in y_num.columns and col not in y_cat.columns]]

    return X_cat, X_num, y_cat, y_num

In [18]:
# Perform the split on each of the train, validation, and test datasets
X_cat_train, X_num_train, y_cat_train, y_num_train = split_xy(train)
X_cat_val, X_num_val, y_cat_val, y_num_val = split_xy(val)
X_cat_test, X_num_test, y_cat_test, y_num_test = split_xy(test)

### NN Models
#### Numeric Targets
A model to predict the number of factors and actions for a provided instance.

In [19]:
# Uses an input layer, two fully-connected hidden layers, and a 2D output layer
num_model = Sequential([
    Dense(200, activation='relu'),
    Dense(80, activation='relu'),
    Dense(2, activation='relu')  # ReLU since the n_{actions, factors} are +'ve
])

num_model.compile(optimizer='adam', loss='mse')

In [20]:
# Train the neural network
history = num_model.fit(X_num_train,
                        y_num_train,
                        epochs=200,
                        batch_size=32,
                        validation_data=(X_num_val, y_num_val),
                        callbacks=[EarlyStopping(monitor='val_loss', patience=3)])

Epoch 1/200


2021-12-16 09:39:10.170270: I tensorflow/stream_executor/cuda/cuda_blas.cc:1774] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200


#### Categorical Targets

In [21]:
# Train a very similar model to predict the most likely factor levels
cat_model = Sequential([
    Dense(200, activation='relu'),
    Dense(80, activation='relu'),
    # Use a softmax activation on the output for categorical probabilities
    Dense(len(y_cat_train.columns), activation='softmax')
])

cat_model.compile(optimizer='adam', loss='mse')

In [22]:
# Train the categorical model stopping at minimum validation loss
history = cat_model.fit(X_cat_train,
                        y_cat_train,
                        epochs=200,
                        batch_size=32,
                        validation_data=(X_cat_val, y_cat_val),
                        callbacks=[EarlyStopping(monitor='val_loss', patience=3)])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200


## Testing

### Functions to Get Predictions for an Instance

In [23]:
def factor_text(factor_code):
    """Gets the last recorded factor description for a given factor code."""
    return full_factors[full_factors['factor-level'] == factor_code].iloc[-1]['factor-text']

def reverse_factor_mapping(indices):
    """
    Takes a set of integer factor codes (factor codes which have been 
    transformed from the original floating point codes into 0...n integers 
    through one-hot encoding) and returns the corresponding original factor
    codes and factor text.
    """
    reverse_map = factor_mapping['factor-level']
    factor_codes = [reverse_map[idx] for idx in indices]

    return [(code, factor_text(code)) for code in factor_codes]

def get_predictions(cat_xs, num_xs, top=5, get_truth=True, incident_src=None):
    """
    Takes two dataframes of categorical and numeric test instances and gets the 
    neural network predictions from both models.
    
    Returns a dictionary with:
        Keys:
            The keys from the input dataframe.
        Values:
            A dictionary containing:
                - description: the provided text description
                - predicted factors: a list of the top 5 (default) factors most
                    likely to be included based on the provided input data (as
                    predicted by the categorical neural network model.)
                - predicted number of actions: as titled
                - predicted number of factors: as titled
                - [if get_truth] true factors: the true factor numbers and text
                - [if get_truth] true number of actions: as titled
                - [if get_truth] true number of factors: as titled
    """
    cat_predictions = cat_model.predict(cat_xs)
    num_predictions = num_model.predict(num_xs)

    if incident_src is None:
        incident_src = incidents

    res = {}
    for idx, pred in enumerate(cat_predictions):
        item_id = cat_xs.index[idx]

        # Get the indices of the factors with the largest probability
        top_indices = np.argsort(pred)[:-top-1:-1]

        # Get the probabilities of each index
        top_probs = cat_predictions[idx][top_indices]
        
        # Get the relevant factor text
        pred_text = reverse_factor_mapping(top_indices)

        # Add the information to the dictionary
        res[item_id] = {
            'description': incident_src.loc[item_id]['description'], 
            'predicted factors': list(zip(pred_text, top_probs)), 
            'predicted number of actions': num_predictions[idx][0],
            'predicted number of factors': num_predictions[idx][1]
        }

        if get_truth:
            # Get information about the correct factor labels
            true_row = df.loc[item_id][[col for col in df.columns if 'factor-level_' in col]]
            true_factors = true_row[true_row == 1].index
            true_codes = [int(factor.split('_')[-1]) for factor in true_factors]

            true_text = reverse_factor_mapping(true_codes)

            res[item_id]['true factors'] = true_text

            res[item_id]['true number of actions'] = df.loc[item_id, 'n_actions']
            res[item_id]['true number of factors'] = df.loc[item_id, 'n_factors']

    return res

In [29]:
# Samples from the test dataset and prints the output for testing.
cat_sample = X_cat_test.sample()
num_sample = X_num_test.loc[cat_sample.index]

pprint(get_predictions(cat_sample, num_sample))

{6322: {'description': 'Left Bank: Dewatering Pump – Diesel lost to ground '
                       '(less than 5 litres). The pump was out of service had '
                       'been placed in a staging area on the left bank in '
                       'readiness for the Stage 2 River Diversion. The pump '
                       'was positioned on unlevel ground causing the fuel in '
                       'the fuel tank (within the skid of the pump) to move to '
                       'the lower end near the fuel filling point. Fuel '
                       'escaped from the small breather hole in the filler '
                       'cap.',
        'predicted factors': [((1.2, 'Inattention to details of job'),
                               0.3214381),
                              ((1.1, 'Job planning or instruction inadequate'),
                               0.26404),
                              ((2.1, 'Rules, procedures or SWMS not followed'),
                               0

#### List of all the available factor levels

In [25]:
[(i, factor_text(i)) for i in factor_mapping['factor-level']]

[(4.0, 'Just and Fair Culture Matter'),
 (2.1, 'Rules, procedures or SWMS not followed'),
 (10.0, 'Environmental factors, weather'),
 (1.2, 'Inattention to details of job'),
 (13.0, 'Other Contributing Factors'),
 (1.1, 'Job planning or instruction inadequate'),
 (3.2, 'Guarding or protective devices not provided or ineffective'),
 (3.1, 'Design of plant, facilities, or equipment'),
 (3.0, 'Rules, procedures or JSA not followed'),
 (6.0, 'Housekeeping congested, incorrect storage'),
 (3.5, 'Improper vehicle operation'),
 (11.0, 'Inadequate knowledge or skill'),
 (5.0, '5. Incorrect body position in relation to work'),
 (12.0, 'Member of the public'),
 (3.3, 'Plant or equipment operated incorrectly'),
 (7.0, 'Incorrect or lack of Personal Protective Equipment'),
 (8.0, 'Inadequate knowledge or skill'),
 (2.2, 'Rules, procedures or SWMS inadequate'),
 (3.6, 'Maintenance, Inspection not adequate'),
 (2.0, 'Job planning or instruction inadequate'),
 (3.4, 'Incorrect tools or mechanical aid

#### Function to create instances with fake descriptions 

In [26]:
def create_test_instance(description):
    test_incident = pd.DataFrame({'description': [description]}, columns=incidents.columns)
    test_incident.index.rename('id', inplace=True)

    dummy_cols = [col for col in X_test.columns if not re.match("description_", col)]
    dummy_cols += ['description']
    test_instance = pd.DataFrame({'description': [description]}, columns=dummy_cols)

    test_instance = embed_text(test_instance, ['description']).fillna(0)

    return test_incident, test_instance

#### Create and get predictions for fake instances

In [27]:
test_incident, test_instance = create_test_instance('bob was not wearing safety goggles')

NameError: name 'X_test' is not defined

In [None]:
pprint(
    get_factor_predictions(test_instance, 
                           incident_src=test_incident, 
                           get_true_factors=False)
)

[0.05491083 0.06921741 0.05070435 0.0982523  0.0951962  0.07404085
 0.02881607 0.03856122 0.06665833 0.03609989 0.02325817 0.01883799
 0.08037991 0.06353504 0.02461661 0.02037402 0.02442184 0.02081852
 0.05791007 0.01842437 0.01744263 0.0175233 ]
[ 3  4 12  5  1]
{0: {'description': 'bob was not wearing safety goggles',
     'predicted_factors': [(1.2, 'Inattention to details of job'),
                           (13.0, 'Improper vehicle operation'),
                           (5.0,
                            '5. Incorrect body position in relation to work'),
                           (1.1, 'Job planning or instruction inadequate'),
                           (2.1, 'Rules, procedures or SWMS not followed')]}}
