# Setup

In [28]:
import pandas as pd
import numpy as np
import pickle
import time
from sklearn.preprocessing import StandardScaler
from copy import deepcopy

# ANN 
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Reshape, LSTM
from tensorflow.keras import Input
from tensorflow.keras.initializers import Constant

from custom_methods import model_eval

datapath = '../Data/'

startTime = time.time()

print(tf.__version__)
print(tf.keras.__version__)

2.3.0
2.4.0


## Import Data

In [2]:
filename = 'ann_ready.pickle'
infile = open(datapath+filename,'rb')
data = pickle.load(infile)
X = data['X']
Y = data['Y']
infile.close()

X.shape

(85217, 59, 17)

## Take subset of data

In [3]:
NUM_PEOPLE = 85217
NUM_TIMESTEPS = 59
NUM_VARS = X.shape[2]

X = X[:NUM_PEOPLE, -NUM_TIMESTEPS:, :]
Y = Y[:NUM_PEOPLE, :]

# Print info
print(f'Format: (num_data_objects, max_sequence_length, num_vars) = {X.shape}')
print(f'\nX: shape = {X.shape}')
print(X[1, :3, :5])
print(f'\nY: shape = {Y.shape}')
print(Y[1])

Format: (num_data_objects, max_sequence_length, num_vars) = (85217, 59, 17)

X: shape = (85217, 59, 17)
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]

Y: shape = (85217, 2)
['10001-114314' False]


# Model

## Parameters

In [4]:
early_stopping = EarlyStopping(
    monitor='loss',
    min_delta=0.001,
    patience=2,
    restore_best_weights=True
)

model_params = {
    'OPTIMIZER':'adam', 
    'LOSS':'binary_crossentropy', 
    'METRICS':['binary_crossentropy'],
    #'CLASS_WEIGHT':class_weight,
    'BATCH_SIZE':500,
    'EPOCHS':1000,
    'CALLBACKS':[early_stopping],
    'VERBOSITY':0
}

## Model Structure

In [21]:
tf.constant([8, 9], dtype=float)

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([8., 9.], dtype=float32)>

In [47]:
inputs = tf.random.normal([32, 10, 8])
print(inputs.shape)
lstm = tf.keras.layers.LSTM(4)
output = lstm(inputs)
print(output.shape)

(32, 10, 8)
(32, 4)


In [50]:
def make_model(x_shape, model_params, output_bias=None):
    if output_bias is not None:
        output_bias = Constant(output_bias)
        
    model = Sequential()

    model.add(Input(shape = (x_shape[1], x_shape[2])))
    print(model.output_shape)

    # Reshape data
    model.add(Reshape(target_shape=(x_shape[2], x_shape[1],)))
    print(model.output_shape)

    # Create 1st hidden layer
    print(type(np.array(x_shape[2])))
    model.add(LSTM(units=np.array(x_shape[2])))

    # Create reshape layer
    model.add(Reshape(target_shape=(x_shape[2],)))

    # Create 2nd Hidden Layer
    model.add(Dense(units=64, activation='relu'))

    # Create output layer
    model.add(Dense(units=1, activation='sigmoid', bias_initializer=output_bias))

    # Compile ANN
    model.compile(
        optimizer=model_params['OPTIMIZER'], 
        loss=model_params['LOSS'], 
        metrics=model_params['METRICS']
    )

    return model

In [51]:
make_model(x_shape=X.shape, model_params=model_params).summary()

(None, 59, 17)
(None, 17, 59)
<class 'numpy.ndarray'>


NotImplementedError: Cannot convert a symbolic Tensor (lstm_12/strided_slice:0) to a numpy array. This error may indicate that you're trying to pass a Tensor to a NumPy call, which is not supported

## K-Folds Method

In [None]:
def k_folds_ann(model_params:dict, x:np.ndarray, y:np.ndarray, num_folds:int, scale_axis:int, scaler):
    '''
    11/16/21
    Data Format: [num_data_objects, max_sequence_length, num_vars]
    Note: axis count begins at 0
    '''
    
    models = [None]*num_folds
    scalers = np.reshape([None]*num_folds*x.shape[2], (num_folds,x.shape[2]))
    class_weights = [None]*num_folds
    predictions = pd.DataFrame(columns=['ID', 'CMIS_MATCH', 'prediction'])
    
    shuffled_index = np.arange(x.shape[0])
    np.random.shuffle(shuffled_index)
    num_per_fold = int(np.round(len(shuffled_index)/num_folds))
    
    for fold in range(num_folds):
        
        # Find Data Splits
        if fold != num_folds - 1:
            fold_test_index = shuffled_index[fold*num_per_fold: (fold+1)*num_per_fold]
        else:
            fold_test_index = shuffled_index[fold*num_per_fold:]
        fold_train_index = [z for z in shuffled_index if z not in fold_test_index]
       
        x_train = x[fold_train_index].copy()
        y_train = y[fold_train_index].copy()
        x_test = x[fold_test_index].copy()
        y_test = y[fold_test_index].copy()
        
        # CLass Weight
        pos = int(y_train[:,1].sum())
        if pos == 0:
            pos = 1
        total = len(y_train)
        neg = total - pos

        weight_for_0 = (1 / neg) * (total / 2.0)
        weight_for_1 = (1 / pos) * (total / 2.0)
        class_weights[fold] = {0.0:weight_for_0, 1.0:weight_for_1}
        
        # Initial Output Bias
        output_bias = np.log([pos/neg])
        
        # Scaling
        for var in range(X.shape[2]):
            scalers[fold, var] = deepcopy(scaler)
            x_train[:, :, var] = scalers[fold, var].fit_transform(x_train[:, :, var])
            x_test[:, :, var] = scalers[fold, var].transform(x_test[:, :, var])
        
        # Create and Compile new Fold Model
        models[fold] = make_model(
            x_shape = x_train.shape,
            model_params = model_params,
            output_bias = output_bias
        )
        
        # Fit model
        models[fold].fit(
            x = tf.convert_to_tensor(x_train, np.float32),
            y = tf.convert_to_tensor(y_train[:,1], np.float32), 
            batch_size = model_params['BATCH_SIZE'], 
            epochs = model_params['EPOCHS'],
            class_weight = class_weights[fold],
            callbacks = model_params['CALLBACKS'],
            verbose = model_params['VERBOSITY'],
        )
        
        # Make Predictions for Fold
        fold_predictions = models[fold].predict(x=tf.convert_to_tensor(x_test, np.float32))
        try:
            fold_predictions = pd.concat(
                [
                    pd.Series(y_test[:,0], name='ID'), 
                    pd.Series(y_test[:,1], name='CMIS_MATCH'), 
                    pd.Series(np.transpose(fold_predictions)[0], name='prediction')
                ], 
                axis=1, 
                join='inner',
                ignore_index=False
            )
        except:
            print(fold_predictions[:2])
            return
        
        predictions = predictions.append(fold_predictions)
        
    # Aggregate things to return
    results = {'models':models, 'scalers':scalers, 'class_weights':class_weights, 'predictions':predictions}
    
    return results

# Run K-Folds

In [None]:
NUM_FOLDS = 4

results = k_folds_ann(
    model_params=model_params, 
    x=X, 
    y=Y, 
    num_folds=NUM_FOLDS, 
    scale_axis=2, 
    scaler=StandardScaler()
)

In [None]:
print(len(results['predictions']))
results['predictions'].head()

In [None]:
len(results['predictions']) / results['predictions']['ID'].nunique()

## Convert Predictions
* Output currently has prediction for each person-place combo
* Take maximum risk prediction for each person.  
* If person is labelled as both positive and negative, label them as positive.

In [None]:
predictions = results['predictions'].copy()
predictions['SPA_PER_ID'] = predictions['ID'].apply(lambda x: x.split('-')[0]).astype('int')

matches = predictions.groupby('SPA_PER_ID')['CMIS_MATCH'].any()
predictions = pd.concat([predictions.groupby('SPA_PER_ID')['prediction'].max(), matches], axis=1, join='inner', ignore_index=False)
predictions = predictions.reset_index()
del matches

print(len(predictions))
predictions.head()

# Time and Save

In [None]:
filename = 'ann_predictions.pickle'
outfile = open(datapath + filename, 'wb')
pickle.dump(predictions, outfile)
outfile.close()

In [None]:
from custom_methods.calc_time import calc_time_from_sec

calc_time_from_sec(time.time()-startTime)