# 1. Importing Libraries

In [2]:
import pandas as pd
import numpy as np

import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from pytorch_tabnet.tab_model import TabNetClassifier

import os

### Testing Tensorflow GPU

In [3]:
# Reports whether the installed TensorFlow build was compiled with CUDA support.
# NOTE(review): this does not show that a GPU device is visible at runtime;
# tf.config.list_physical_devices('GPU') would - confirm which check is intended.
tf.test.is_built_with_cuda()

True

# 2. Project Variables

In [4]:
from functions import *

In [5]:
# Directory holding the per-task CSV files (read as '<task>_train.csv' and '<task>_test.csv')
DATA_DIR = '../train-test-data'
# Number of folds handed to train_kfold
NUM_FOLDS = 10
# Task identifiers to run; each one is used both as the CSV file-name prefix
# and as the name of the label column inside that CSV
TASKS_TO_RUN = ['2aii', '2aiii']

# 3. Model Training

In [12]:
# A utility method to train with k-fold
def train_kfold(num_fold, task, included_cols, train_func):
    """Run stratified k-fold cross-validation for one task and summarise the results.

    Parameters
    ----------
    num_fold : int
        Number of stratified folds.
    task : str
        Name of the label column; also the prefix of the training CSV, which is
        read from ``<DATA_DIR>/<task>_train.csv``.
    included_cols : list of str
        Accepted for interface compatibility only. It is not forwarded to
        ``train_func``, which is always called as ``train_func(train, test, task)``.
    train_func : callable
        Called once per fold with ``(train_rows, validation_rows, task)`` and
        expected to return ``(loss, accuracy, sensitivity, specificity)``.

    Returns
    -------
    dict
        One entry per metric name (``'ACCURACY'``, ``'SENSITIVITY'``,
        ``'SPECIFICITY'``, ``'LOSS'``), each a dict with the per-fold values
        under ``'ALL'`` and their ``'MEAN'`` and ``'STDEV'``.
    """
    # Read train csv
    train_df = pd.read_csv(os.path.join(DATA_DIR, f'{task}_train.csv'), index_col=0)

    # Per-fold results, keyed by the metric name used in the returned dict.
    # The loss is reported as well instead of being collected and discarded.
    per_fold = {'ACCURACY': [], 'SENSITIVITY': [], 'SPECIFICITY': [], 'LOSS': []}

    # Fixed random_state keeps the fold assignment identical between runs
    kfold = StratifiedKFold(n_splits=num_fold, shuffle=True, random_state=42)
    features = train_df.drop(task, axis=1)
    labels = train_df[task]  # 1-D label vector to stratify on

    for train_idx, val_idx in kfold.split(features, labels):
        train = train_df.iloc[train_idx]
        test = train_df.iloc[val_idx]

        loss, accuracy, sensitivity, specificity = train_func(train, test, task)

        per_fold['LOSS'].append(loss)
        per_fold['ACCURACY'].append(accuracy)
        per_fold['SENSITIVITY'].append(sensitivity)
        per_fold['SPECIFICITY'].append(specificity)

    return {
        name: {
            'ALL': values,
            'MEAN': np.mean(values),
            'STDEV': np.std(values)
        }
        for name, values in per_fold.items()
    }

In [13]:
# Train neural network
def train_nn(train, test, task):
    """Fit a small dense binary classifier on ``train`` and score it on ``test``.

    The feature names come from the module-level ``included_cols`` list, which
    is not a parameter: it must be defined before this function is called.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Rows used for fitting and for evaluation; both are converted with
        ``df_to_dataset`` (helper not defined in this notebook).
    task : str
        Passed to ``df_to_dataset`` together with each frame; presumably the
        label column name - confirm against that helper.

    Returns
    -------
    tuple
        ``(loss, accuracy, sensitivity, specificity)`` measured on ``test``.
        Sensitivity is TP / (TP + FN) and specificity is TN / (TN + FP); either
        one is NaN when its denominator is zero.
    """
    # Generate feature columns
    feature_columns = [tf.feature_column.numeric_column(col) for col in included_cols]

    # Generating a tensorflow dataset
    train_ds = df_to_dataset(train, task)
    test_ds = df_to_dataset(test, task)

    # Building model
    model = tf.keras.Sequential([
        tf.keras.layers.DenseFeatures(feature_columns),
        tf.keras.layers.Dense(14, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy',
                           tf.keras.metrics.TruePositives(),
                           tf.keras.metrics.TrueNegatives(),
                           tf.keras.metrics.FalsePositives(),
                           tf.keras.metrics.FalseNegatives()
                          ])

    # Fitting Model
    model.fit(train_ds, epochs=10, verbose=1)

    # Evaluate Model: the loss comes first, then the metrics in compile order
    loss, accuracy, tp, tn, fp, fn = model.evaluate(test_ds, verbose=0)

    # Guard the ratios so a split without positives (or without negatives)
    # yields NaN instead of aborting the run with ZeroDivisionError
    sensitivity = tp / (tp + fn) if (tp + fn) else float('nan')
    specificity = tn / (tn + fp) if (tn + fp) else float('nan')

    return loss, accuracy, sensitivity, specificity

In [14]:
def train_tabnet(train, test, task):
    """Fit a TabNetClassifier on ``train`` and score its predictions on ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Rows used for fitting and for evaluation; both are converted with
        ``df_to_nparray`` (helper not defined in this notebook).
    task : str
        Passed to ``df_to_nparray`` together with each frame; presumably the
        label column name - confirm against that helper.

    Returns
    -------
    tuple
        ``(-1, accuracy, sensitivity, specificity)``. No loss is computed here,
        so ``-1`` fills the loss slot. 'INCREASED RISK' is treated as the
        positive class. A ratio whose denominator is zero is returned as NaN.

    Raises
    ------
    ValueError
        If a predicted or actual label is neither 'INCREASED RISK' nor
        'REDUCED RISK'. Such pairs used to be counted as true negatives.
    """
    X_train, y_train = df_to_nparray(train, task)
    X_test, y_test = df_to_nparray(test, task)

    model = TabNetClassifier()
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    positive = 'INCREASED RISK'
    negative = 'REDUCED RISK'

    tp = fp = tn = fn = 0
    for p, a in zip(preds, y_test):
        if p == positive and a == positive:
            tp += 1
        elif p == positive and a == negative:
            fp += 1
        elif p == negative and a == positive:
            fn += 1
        elif p == negative and a == negative:
            tn += 1
        else:
            # Fail loudly rather than inflating the true-negative count
            raise ValueError(f'Unexpected label pair: predicted={p!r}, actual={a!r}')

    total = tp + tn + fp + fn
    # Guard every ratio so an empty or single-class split gives NaN
    # instead of raising ZeroDivisionError
    accuracy = (tp + tn) / total if total else float('nan')
    sensitivity = tp / (tp + fn) if (tp + fn) else float('nan')
    specificity = tn / (tn + fp) if (tn + fp) else float('nan')

    return -1, accuracy, sensitivity, specificity

0.6732673267326733

In [17]:
# Feature columns used by the neural network (read by train_nn as a global)
included_cols = [
    'CHILD_SEX',
    'IDD_SCORE',
    'AGE',
    'HHID_count',
    'HH_AGE',
    'FOOD_EXPENSE_WEEKLY',
    'NON-FOOD_EXPENSE_WEEKLY',
    'HDD_SCORE',
    'FOOD_INSECURITY',
    'YoungBoys',
    'YoungGirls',
    'AverageMonthlyIncome',
    'BEN_4PS',
    'AREA_TYPE',
    'FOOD_EXPENSE_WEEKLY_pc',
    'NON-FOOD_EXPENSE_WEEKLY_pc',
    'AverageMonthlyIncome_pc',
]

# Cross-validation summary for each task, keyed by task name
metrics = {}

for task in TASKS_TO_RUN:
    metric = train_kfold(NUM_FOLDS, task, included_cols, train_nn)
    metrics[task] = metric

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
E

In [18]:
# Print the mean cross-validation accuracy, sensitivity and specificity
# stored in `metrics` for every task, one line per task.
for task in TASKS_TO_RUN:
    print(f'{task}: ACCURACY: {metrics[task]["ACCURACY"]["MEAN"]} SENSITIVITY: {metrics[task]["SENSITIVITY"]["MEAN"]} SPECIFICITY: {metrics[task]["SPECIFICITY"]["MEAN"]}')

2aii: ACCURACY: 0.706881719827652 SENSITIVITY: 0.7921538461538462 SPECIFICITY: 0.2833333333333333
2aiii: ACCURACY: 0.6767741978168488 SENSITIVITY: 0.819927536231884 SPECIFICITY: 0.15476190476190474


In [8]:
# Show the per-fold accuracy values recorded for task '2aii'.
# NOTE(review): this needs the nested `metrics` returned by train_kfold; the
# Model Evaluation cells later rebind `metrics` to a flat per-task dict, so
# this lookup only works before they run.
metrics['2aii']['ACCURACY']['ALL']

[61.29032373428345,
 67.7419364452362,
 74.19354915618896,
 69.9999988079071,
 73.33333492279053,
 80.0000011920929,
 80.0000011920929,
 73.33333492279053,
 83.33333134651184,
 83.33333134651184]

# 4. Model Evaluation

### Note: This section evaluates the models on the held-out test set. Run it only at the end.

In [87]:
def train_and_test(task, included_cols, models):
    """Train the final network for ``task`` on the full training CSV and score it on the test CSV.

    Parameters
    ----------
    task : str
        Prefix of the CSV files read from ``DATA_DIR`` (``<task>_train.csv`` and
        ``<task>_test.csv``); also passed to ``df_to_dataset`` with each frame.
    included_cols : list of str
        Names of the numeric feature columns fed to the network.
    models : dict
        Mutated in place: the fitted Keras model is stored under ``models[task]``.

    Returns
    -------
    dict
        ``'ACCURACY'`` (multiplied by 100), ``'SENSITIVITY'`` and
        ``'SPECIFICITY'`` (both left as fractions). Sensitivity or specificity
        is NaN when its denominator is zero.
    """
    train_df = pd.read_csv(os.path.join(DATA_DIR, f'{task}_train.csv'), index_col=0)
    test_df = pd.read_csv(os.path.join(DATA_DIR, f'{task}_test.csv'), index_col=0)

    # Generate feature columns
    feature_columns = [tf.feature_column.numeric_column(col) for col in included_cols]

    # Generating a tensorflow dataset
    train_ds = df_to_dataset(train_df, task)
    test_ds = df_to_dataset(test_df, task)

    # Building model
    # NOTE(review): four hidden layers here versus one in train_nn, so the
    # cross-validation figures describe a different architecture - confirm
    # this is intended.
    models[task] = tf.keras.Sequential([
        tf.keras.layers.DenseFeatures(feature_columns),
        tf.keras.layers.Dense(14, activation='relu'),
        tf.keras.layers.Dense(14, activation='relu'),
        tf.keras.layers.Dense(14, activation='relu'),
        tf.keras.layers.Dense(14, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    models[task].compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy',
                           tf.keras.metrics.TruePositives(),
                           tf.keras.metrics.TrueNegatives(),
                           tf.keras.metrics.FalsePositives(),
                           tf.keras.metrics.FalseNegatives()
                          ])

    # Fitting Model
    models[task].fit(train_ds,
                     epochs=10,
                     verbose=1)

    # Evaluate Model: scores is [loss, accuracy, tp, tn, fp, fn]
    scores = models[task].evaluate(test_ds, verbose=0)
    tp, tn, fp, fn = scores[2:]

    # Guard the ratios so a test set without positives (or without negatives)
    # yields NaN instead of raising ZeroDivisionError
    sensitivity = tp / (tp + fn) if (tp + fn) else float('nan')
    specificity = tn / (tn + fp) if (tn + fp) else float('nan')

    metrics = {
        'ACCURACY': scores[1]*100,
        'SENSITIVITY': sensitivity,
        'SPECIFICITY': specificity
    }
    return metrics

In [88]:
# Feature columns fed to the final network
included_cols = [
    'CHILD_SEX',
    'IDD_SCORE',
    'AGE',
    'HHID_count',
    'HH_AGE',
    'FOOD_EXPENSE_WEEKLY',
    'NON-FOOD_EXPENSE_WEEKLY',
    'HDD_SCORE',
    'FOOD_INSECURITY',
    'YoungBoys',
    'YoungGirls',
    'AverageMonthlyIncome',
    'BEN_4PS',
    'AREA_TYPE',
    'FOOD_EXPENSE_WEEKLY_pc',
    'NON-FOOD_EXPENSE_WEEKLY_pc',
    'AverageMonthlyIncome_pc',
]

# Test-set metrics per task, and the fitted model per task (filled by train_and_test)
metrics = {}
models = {}

for task in TASKS_TO_RUN:
    metric = train_and_test(task, included_cols, models)
    metrics[task] = metric

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [89]:
# Print the test-set metrics stored in `metrics` for every task, one line per task.
# ACCURACY is printed as stored (train_and_test multiplies it by 100), while
# SENSITIVITY and SPECIFICITY are fractions.
for task in TASKS_TO_RUN:
    print(f'{task} - ACCURACY: {metrics[task]["ACCURACY"]} SENSITIVITY: {metrics[task]["SENSITIVITY"]} SPECIFICITY: {metrics[task]["SPECIFICITY"]}')

2aii - ACCURACY: 64.68647122383118 SENSITIVITY: 0.7 SPECIFICITY: 0.39622641509433965
2aiii - ACCURACY: 74.25742745399475 SENSITIVITY: 0.9079497907949791 SPECIFICITY: 0.125
