# Setup

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
import time
from sklearn.preprocessing import StandardScaler

from custom_methods import model_eval

datapath = '../Data/'

startTime = time.time()

In [2]:
filename = 'ann_ready.pickle'
infile = open(datapath+filename,'rb')
data = pickle.load(infile)
X = data['X']
Y = data['Y']
infile.close()

print(type(X))
print('Format: [num_data_objects, max_sequence_length, num_vars]')
print('\nX')
print(X[1])
print('\nY')
print(Y[1])

<class 'numpy.ndarray'>
Format: [num_data_objects, max_sequence_length, num_vars]

X
[[0.0000e+00 0.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [0.0000e+00 0.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [0.0000e+00 0.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 ...
 [0.0000e+00 0.0000e+00 0.0000e+00 ... 1.0000e+00 2.0000e+00 5.2212e+04]
 [0.0000e+00 0.0000e+00 0.0000e+00 ... 1.0000e+00 2.0000e+00 5.2212e+04]
 [0.0000e+00 0.0000e+00 0.0000e+00 ... 1.0000e+00 2.0000e+00 5.2212e+04]]

Y
['10001-114314' False]


## Split Training, Testing (To Remove)

In [3]:
'''
scalers = {}
for i in range(X.shape[1]):
    scalers[i] = StandardScaler()
    X[:, i, :] = scalers[i].fit_transform(X[:, i, :])

TEST_FRAC = 1/3

# Split Train/Test sets
df_train, df_test = model_eval.split_on_people(df, id_col=id_col, test_frac=TEST_FRAC)

# Split X and Y
X_train = df_train.drop('CMIS_MATCH', axis=1)
Y_train = df_train[['CMIS_MATCH', id_col]].groupby(id_col).first()

X_test = df_test.drop('CMIS_MATCH', axis=1)
Y_test = df_test[['CMIS_MATCH', id_col]].groupby(id_col).first()
'''

"\nscalers = {}\nfor i in range(X.shape[1]):\n    scalers[i] = StandardScaler()\n    X[:, i, :] = scalers[i].fit_transform(X[:, i, :])\n\nTEST_FRAC = 1/3\n\n# Split Train/Test sets\ndf_train, df_test = model_eval.split_on_people(df, id_col=id_col, test_frac=TEST_FRAC)\n\n# Split X and Y\nX_train = df_train.drop('CMIS_MATCH', axis=1)\nY_train = df_train[['CMIS_MATCH', id_col]].groupby(id_col).first()\n\nX_test = df_test.drop('CMIS_MATCH', axis=1)\nY_test = df_test[['CMIS_MATCH', id_col]].groupby(id_col).first()\n"

# Model

## TODO: Set Initial Bias

## Build Model

In [4]:
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, SpatialDropout1D
from keras.callbacks import EarlyStopping

early_stopping= EarlyStopping(
    monitor='loss',
    min_delta=0.001,
    patience=2,
    restore_best_weights=True
)

model = Sequential()

# Create input layer + 1st hidden layer
# TODO: add scaling to input layer instead of data preprocessing
model.add(
    Dense(
        units = 64,
        input_shape = (X.shape[1], X.shape[2]),
        activation='relu'
    )
)

# Create 2nd hidden layer
model.add(
    Dense(
        units = 32, 
        activation='relu', 
    )
)

# Create output layer
model.add(
    Dense(
        units = 1, 
        activation='sigmoid', 
    )
)

# Compile ANN
model.compile(
    optimizer='adam', 
    loss='binary_crossentropy', 
    metrics=['accuracy']
)

# Print model summary
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 59, 64)            1152      
_________________________________________________________________
dense_1 (Dense)              (None, 59, 32)            2080      
_________________________________________________________________
dense_2 (Dense)              (None, 59, 1)             33        
Total params: 3,265
Trainable params: 3,265
Non-trainable params: 0
_________________________________________________________________
None


## K-Folds

In [5]:
'''

n = 50

X_sample = X[:n].copy()
# Y_sample = Y[:10].copy().reset_index().values
Y_sample = Y[:n].copy()

Y_sample[:3]
'''

'\n\nn = 50\n\nX_sample = X[:n].copy()\n# Y_sample = Y[:10].copy().reset_index().values\nY_sample = Y[:n].copy()\n\nY_sample[:3]\n'

In [8]:
def k_folds_ann(model, model_params:dict, x:np.ndarray, y:np.ndarray, num_folds:int, scale_axis:int, scaler):
    '''
    09/10/21
    Data Format: [num_data_objects, max_sequence_length, num_vars]
    Note: axis count begins at 0
    '''
    import copy
    
    models = [None]*num_folds
    scalers = np.reshape([None]*num_folds*x.shape[2], (num_folds,x.shape[2]))
    class_weights = [None]*num_folds
    predictions = pd.DataFrame(columns=['ID', 'CMIS_MATCH', 'prediction'])
    
    shuffled_index = np.arange(x.shape[0])
    np.random.shuffle(shuffled_index)
    num_per_fold = int(np.round(len(shuffled_index)/num_folds))
    
    for fold in range(num_folds):
        
        # Find Data Splits
        if fold != num_folds - 1:
            fold_test_index = shuffled_index[fold*num_per_fold: (fold+1)*num_per_fold]
        else:
            fold_test_index = shuffled_index[fold*num_per_fold:]
        fold_train_index = [z for z in shuffled_index if z not in fold_test_index]
       
        x_train = x[fold_train_index].copy()
        y_train = y[fold_train_index].copy()
        x_test = x[fold_train_index].copy()
        y_test = y[fold_train_index].copy()
        
        # CLass Weight
        pos = int(y_train[:,1].sum())
        if pos == 0:
            pos = 1
        total = len(y_train)
        neg = total - pos

        weight_for_0 = (1 / neg) * (total / 2.0)
        weight_for_1 = (1 / pos) * (total / 2.0)
        class_weights[fold] = {0.0:weight_for_0, 1.0:weight_for_1}
        
        # Scaling
        for var in range(X.shape[2]):
            scalers[fold, var] = copy.copy(scaler)
            x_train[:, :, var] = scalers[fold, var].fit_transform(x_train[:, :, var])
            x_test[:, :, var] = scalers[fold, var].transform(x_test[:, :, var])
        
        # Create (Copy of) Model
        models[fold] = tf.keras.models.clone_model(model)
            
        # Compile Model
        models[fold].compile(
            optimizer = model_params['OPTIMIZER'], 
            loss = model_params['LOSS'], 
            metrics = model_params['METRICS']
        )
        
        # Fit model
        models[fold].fit(
            x = tf.convert_to_tensor(x_train, np.float32),
            y = tf.convert_to_tensor(y_train[:,1], np.float32), 
            batch_size = model_params['BATCH_SIZE'], 
            epochs = model_params['EPOCHS'],
            class_weight = class_weights[fold],
            callbacks = model_params['CALLBACKS'],
            verbose = model_params['VERBOSITY'],
        )
        
        # Make Predictions for Fold
        fold_predictions = models[fold].predict(x=tf.convert_to_tensor(x_test, np.float32))
        try:
            fold_predictions = pd.concat(
                [
                    pd.Series(y_test[:,0], name='ID'), 
                    pd.Series(y_test[:,1], name='CMIS_MATCH'), 
                    pd.Series(np.transpose(fold_predictions)[0], name='prediction')
                ], 
                axis=1, 
                join='inner',
                ignore_index=False
            )
        except:
            print(fold_predictions[:2])
            return
        
        predictions = predictions.append(fold_predictions)
        
    # Aggregate things to return
    results = {'models':models, 'scalers':scalers, 'class_weights':class_weights, 'predictions':predictions}
    
    return results

## Run

In [9]:
NUM_FOLDS = 4

model_params = {
    'OPTIMIZER':'adam', 
    'LOSS':'binary_crossentropy', 
    'METRICS':['accuracy'],
    #'CLASS_WEIGHT':class_weight,
    'BATCH_SIZE':1,
    'EPOCHS':5,
    'CALLBACKS':[early_stopping],
    'VERBOSITY':0
    }

results = k_folds_ann(model=model, model_params=model_params, x=X, y=Y, num_folds=NUM_FOLDS, scale_axis=2, scaler=StandardScaler())

[[[0.48216754]
  [0.46457064]
  [0.46434337]
  [0.47049972]
  [0.4548512 ]
  [0.4515954 ]
  [0.44615895]
  [0.44503668]
  [0.43597454]
  [0.43122956]
  [0.42315102]
  [0.41749558]
  [0.4091851 ]
  [0.4064115 ]
  [0.40172604]
  [0.40275282]
  [0.39038762]
  [0.38466597]
  [0.38085762]
  [0.37503627]
  [0.36560294]
  [0.35896286]
  [0.35047138]
  [0.34663132]
  [0.3384894 ]
  [0.3366785 ]
  [0.33131748]
  [0.32901925]
  [0.3221467 ]
  [0.3163021 ]
  [0.3103571 ]
  [0.30435425]
  [0.2990538 ]
  [0.29273546]
  [0.29017347]
  [0.28450269]
  [0.27453363]
  [0.26829067]
  [0.2631693 ]
  [0.2561535 ]
  [0.2512868 ]
  [0.24284253]
  [0.23550475]
  [0.22848684]
  [0.21950468]
  [0.21108648]
  [0.20028341]
  [0.19162926]
  [0.18390629]
  [0.18015644]
  [0.17231244]
  [0.15748665]
  [0.14471573]
  [0.12876353]
  [0.11275852]
  [0.09345707]
  [0.43915868]
  [0.44869986]
  [0.8001285 ]]

 [[0.5941407 ]
  [0.5692766 ]
  [0.5703401 ]
  [0.58034104]
  [0.5666581 ]
  [0.72834706]
  [0.32295722]
  [0.573

In [None]:
print(len(results['predictions']))
results['predictions'].head()

# Convert Predictions
* Take maximum risk prediction for each person.  
* If person is labelled as both positive and negative, label them as positive.

In [None]:
predictions = results['predictions'].copy()
predictions['SPA_PER_ID'] = predictions['ID'].apply(lambda x: x.split('-')[0])

matches = predictions.groupby('SPA_PER_ID')['CMIS_MATCH'].any()
predictions = pd.concat([predictions.groupby('SPA_PER_ID')['prediction'].max(), matches], axis=1, join='inner', ignore_index=False)
predictions = predictions.reset_index()
del matches

predictions.head()

# Time and Save

In [None]:
filename = 'ann_predictions.pickle'
outfile = open(datapath + filename, 'wb')
pickle.dump(predictions, outfile)
outfile.close()

In [None]:
from custom_methods.calc_time import calc_time_from_sec

calc_time_from_sec(time.time()-startTime)