# Setup

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
import time
from sklearn.preprocessing import StandardScaler

from custom_methods import model_eval

datapath = '../Data/'

startTime = time.time()

In [2]:
filename = 'ann_ready.pickle'
infile = open(datapath+filename,'rb')
data = pickle.load(infile)
X = data['X']
Y = data['Y']
infile.close()

print(type(X))
print('Format: [num_data_objects, max_sequence_length, num_vars]')
print('\nX')
print(X[1])
print('\nY')
print(Y[1])

<class 'numpy.ndarray'>
Format: [num_data_objects, max_sequence_length, num_vars]

X
[[0.0000e+00 0.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [0.0000e+00 0.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [0.0000e+00 0.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 ...
 [0.0000e+00 0.0000e+00 0.0000e+00 ... 1.0000e+00 2.0000e+00 5.2212e+04]
 [0.0000e+00 0.0000e+00 0.0000e+00 ... 1.0000e+00 2.0000e+00 5.2212e+04]
 [0.0000e+00 0.0000e+00 0.0000e+00 ... 1.0000e+00 2.0000e+00 5.2212e+04]]

Y
['10001-114314' False]


## Split Training, Testing (To Remove)

In [3]:
'''
scalers = {}
for i in range(X.shape[1]):
    scalers[i] = StandardScaler()
    X[:, i, :] = scalers[i].fit_transform(X[:, i, :])

TEST_FRAC = 1/3

# Split Train/Test sets
df_train, df_test = model_eval.split_on_people(df, id_col=id_col, test_frac=TEST_FRAC)

# Split X and Y
X_train = df_train.drop('CMIS_MATCH', axis=1)
Y_train = df_train[['CMIS_MATCH', id_col]].groupby(id_col).first()

X_test = df_test.drop('CMIS_MATCH', axis=1)
Y_test = df_test[['CMIS_MATCH', id_col]].groupby(id_col).first()
'''

"\nscalers = {}\nfor i in range(X.shape[1]):\n    scalers[i] = StandardScaler()\n    X[:, i, :] = scalers[i].fit_transform(X[:, i, :])\n\nTEST_FRAC = 1/3\n\n# Split Train/Test sets\ndf_train, df_test = model_eval.split_on_people(df, id_col=id_col, test_frac=TEST_FRAC)\n\n# Split X and Y\nX_train = df_train.drop('CMIS_MATCH', axis=1)\nY_train = df_train[['CMIS_MATCH', id_col]].groupby(id_col).first()\n\nX_test = df_test.drop('CMIS_MATCH', axis=1)\nY_test = df_test[['CMIS_MATCH', id_col]].groupby(id_col).first()\n"

# Model

## TODO: Set Initial Bias

## Build Model

In [4]:
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, LSTM, SpatialDropout1D
from keras.callbacks import EarlyStopping

early_stopping= EarlyStopping(
    monitor='loss',
    min_delta=0.001,
    patience=2,
    restore_best_weights=True
)

model = Sequential()

# Create input layer + LSTM
# TODO: add scaling to input layer instead of data preprocessing
model.add(
    LSTM(
        units = 64,
        input_shape = (X.shape[1], X.shape[2]),
    )
)

# Create output layer
model.add(
    Dense(
        units = 1, 
        activation='sigmoid', 
    )
)

# Compile ANN
model.compile(
    optimizer='adam', 
    loss='binary_crossentropy', 
    metrics=['accuracy']
)

# Print model summary
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 64)                20992     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 21,057
Trainable params: 21,057
Non-trainable params: 0
_________________________________________________________________
None


## K-Folds

In [5]:
'''

n = 50

X_sample = X[:n].copy()
# Y_sample = Y[:10].copy().reset_index().values
Y_sample = Y[:n].copy()

Y_sample[:3]
'''

'\n\nn = 50\n\nX_sample = X[:n].copy()\n# Y_sample = Y[:10].copy().reset_index().values\nY_sample = Y[:n].copy()\n\nY_sample[:3]\n'

In [6]:
def k_folds_ann(model, model_params:dict, x:np.ndarray, y:np.ndarray, num_folds:int, scale_axis:int, scaler):
    '''
    09/10/21
    Data Format: [num_data_objects, max_sequence_length, num_vars]
    Note: axis count begins at 0
    '''
    import copy
    
    models = [None]*num_folds
    scalers = np.reshape([None]*num_folds*x.shape[2], (num_folds,x.shape[2]))
    class_weights = [None]*num_folds
    predictions = pd.DataFrame(columns=['ID', 'CMIS_MATCH', 'prediction'])
    
    shuffled_index = np.arange(x.shape[0])
    np.random.shuffle(shuffled_index)
    num_per_fold = int(np.round(len(shuffled_index)/num_folds))
    
    for fold in range(num_folds):
        
        # Find Data Splits
        if fold != num_folds - 1:
            fold_test_index = shuffled_index[fold*num_per_fold: (fold+1)*num_per_fold]
        else:
            fold_test_index = shuffled_index[fold*num_per_fold:]
        fold_train_index = [z for z in shuffled_index if z not in fold_test_index]
       
        x_train = x[fold_train_index].copy()
        y_train = y[fold_train_index].copy()
        x_test = x[fold_train_index].copy()
        y_test = y[fold_train_index].copy()
        
        # CLass Weight
        pos = int(y_train[:,1].sum())
        if pos == 0:
            pos = 1
        total = len(y_train)
        neg = total - pos

        weight_for_0 = (1 / neg) * (total / 2.0)
        weight_for_1 = (1 / pos) * (total / 2.0)
        class_weights[fold] = {0.0:weight_for_0, 1.0:weight_for_1}
        
        # Scaling
        for var in range(X.shape[2]):
            scalers[fold, var] = copy.copy(scaler)
            x_train[:, :, var] = scalers[fold, var].fit_transform(x_train[:, :, var])
            x_test[:, :, var] = scalers[fold, var].transform(x_test[:, :, var])
        
        # Create (Copy of) Model
        models[fold] = tf.keras.models.clone_model(model)
            
        # Compile Model
        models[fold].compile(
            optimizer = model_params['OPTIMIZER'], 
            loss = model_params['LOSS'], 
            metrics = model_params['METRICS']
        )
        
        # Fit model
        models[fold].fit(
            x = tf.convert_to_tensor(x_train, np.float32),
            y = tf.convert_to_tensor(y_train[:,1], np.float32), 
            batch_size = model_params['BATCH_SIZE'], 
            epochs = model_params['EPOCHS'],
            class_weight = class_weights[fold],
            callbacks = model_params['CALLBACKS'],
            verbose = model_params['VERBOSITY'],
        )
        
        # Make Predictions for Fold
        fold_predictions = models[fold].predict(x=tf.convert_to_tensor(x_test, np.float32))
        fold_predictions = pd.concat(
            [
                pd.Series(y_test[:,0], name='ID'), 
                pd.Series(y_test[:,1], name='CMIS_MATCH'), 
                pd.Series(np.transpose(fold_predictions)[0], name='prediction')
            ], 
            axis=1, 
            join='inner',
            ignore_index=False
        )
        predictions = predictions.append(fold_predictions)
        
    # Aggregate things to return
    results = {'models':models, 'scalers':scalers, 'class_weights':class_weights, 'predictions':predictions}
    
    return results

## Run

In [7]:
NUM_FOLDS = 4

model_params = {
    'OPTIMIZER':'adam', 
    'LOSS':'binary_crossentropy', 
    'METRICS':['accuracy'],
    #'CLASS_WEIGHT':class_weight,
    'BATCH_SIZE':1,
    'EPOCHS':5,
    'CALLBACKS':[early_stopping],
    'VERBOSITY':0
    }

results = k_folds_ann(model=model, model_params=model_params, x=X, y=Y, num_folds=NUM_FOLDS, scale_axis=2, scaler=StandardScaler())

In [8]:
print(len(results['predictions']))
results['predictions'].head()

255651


Unnamed: 0,ID,CMIS_MATCH,prediction
0,258252-16023,False,0.402066
1,161480-101098,False,0.406842
2,45136-90459,False,0.393607
3,87469-122895,False,0.348661
4,177126-98814,False,0.372426


# Convert Predictions
* Take maximum risk prediction for each person.  
* If person is labelled as both positive and negative, label them as positive.

In [9]:
predictions = results['predictions'].copy()
predictions['SPA_PER_ID'] = predictions['ID'].apply(lambda x: x.split('-')[0])

matches = predictions.groupby('SPA_PER_ID')['CMIS_MATCH'].any()
predictions = pd.concat([predictions.groupby('SPA_PER_ID')['prediction'].max(), matches], axis=1, join='inner', ignore_index=False)
predictions = predictions.reset_index()
del matches

predictions.head()

Unnamed: 0,SPA_PER_ID,prediction,CMIS_MATCH
0,100002,0.564541,False
1,10001,0.586517,False
2,100011,0.565278,False
3,100012,0.551122,False
4,10002,0.614243,False


# Time and Save

In [10]:
filename = 'lstm_predictions.pickle'
outfile = open(datapath + filename, 'wb')
pickle.dump(predictions, outfile)
outfile.close()

In [11]:
from custom_methods.calc_time import calc_time_from_sec

calc_time_from_sec(time.time()-startTime)

hours:minutes:seconds = 3:49:9.42461109161377
