In [1]:
import tensorflow as tf
import zipfile
tfkl = tf.keras.layers
import csv
import numpy as np
import pandas as pd
import os as os
import matplotlib.pyplot as plt
#from keras.layers import Input, Embedding, Flatten, Dense, BatchNormalization, Dropout, concatenate
#from keras.models import Model
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Load in Data

In [2]:
# loading in USMDB data
data = []
ages = []
states = []
genders = []

with open("../data/usmdb/usmdb.csv", "r") as file:
    reader = csv.reader(file,delimiter=',')
    for row_index, row in enumerate(reader):
        if row_index == 0:
            print(row)
        if row_index >= 1:
            state, gender, year, age, rate = row
            year = int(year)
            try:
                age = int(age)
            except:
                age = -1
            if state not in states:
                states.append(state)
            state = states.index(state)
            if gender not in genders:
                genders.append(gender)
            gender = genders.index(gender)
            try:
                rate = float(rate)
            except:
                rate = -1
            if rate > 1:
                rate = 1
            # get rid of years, ages, not in health data and other cleaning
            if age != -1 and rate != -1 and age <= 99 and age >=18 and year >= 1993:
                data.append([state, gender, year, age, rate])

data = np.array(data)

['PopName', 'Sex', 'Year', 'Age', 'mx']


In [3]:
print(data[:,0])
print(data)
print(states)
print(genders)

[ 0.  0.  0. ... 49. 49. 49.]
[[0.0000e+00 0.0000e+00 1.9930e+03 1.8000e+01 2.7000e-04]
 [0.0000e+00 0.0000e+00 1.9930e+03 1.9000e+01 0.0000e+00]
 [0.0000e+00 0.0000e+00 1.9930e+03 2.0000e+01 1.0700e-03]
 ...
 [4.9000e+01 1.0000e+00 2.0200e+03 9.7000e+01 3.8917e-01]
 [4.9000e+01 1.0000e+00 2.0200e+03 9.8000e+01 4.2072e-01]
 [4.9000e+01 1.0000e+00 2.0200e+03 9.9000e+01 4.5292e-01]]
['AK', 'AL', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']
['f', 'm']


In [4]:
df = pd.read_csv("../data/brfss/brfss_agg.csv", index_col=0)

Last minute cleaning. Will put chunk below into cleaning file soon.

In [5]:
# removing missing - should add this to cleaning dataset in the future
index_missing = df[ (df['sex'] == '7')].index
df.drop(index_missing, inplace=True)

# Mapping from FIPS code to state abbreviation
fips_to_abbreviation = {
    1: 'AL', 2: 'AK', 4: 'AZ', 5: 'AR', 6: 'CA', 8: 'CO', 9: 'CT', 10: 'DE', 12: 'FL',
    13: 'GA', 15: 'HI', 16: 'ID', 17: 'IL', 18: 'IN', 19: 'IA', 20: 'KS', 21: 'KY',
    22: 'LA', 23: 'ME', 24: 'MD', 25: 'MA', 26: 'MI', 27: 'MN', 28: 'MS', 29: 'MO',
    30: 'MT', 31: 'NE', 32: 'NV', 33: 'NH', 34: 'NJ', 35: 'NM', 36: 'NY', 37: 'NC',
    38: 'ND', 39: 'OH', 40: 'OK', 41: 'OR', 42: 'PA', 44: 'RI', 45: 'SC', 46: 'SD',
    47: 'TN', 48: 'TX', 49: 'UT', 50: 'VT', 51: 'VA', 53: 'WA', 54: 'WV', 55: 'WI',
    56: 'WY'
}

# Convert FIPS codes to state abbreviations
df['state'] = df['state'].map(fips_to_abbreviation)

print(df)


      state    age     sex  year        bmi
0        AL  18-24  female  1993  21.704250
1        AL  18-24  female  1994  23.100608
2        AL  18-24  female  1995  23.313510
3        AL  18-24  female  1996  24.502406
4        AL  18-24  female  1997  23.347746
...     ...    ...     ...   ...        ...
38849    WY    80+    male  2017  27.069832
38850    WY    80+    male  2018  26.646026
38851    WY    80+    male  2019  26.365848
38852    WY    80+    male  2020  26.726692
38853    WY    80+    male  2021  26.220133

[37570 rows x 5 columns]


In [6]:
df['year'] = pd.to_numeric(df['year'], errors='coerce')
df['state'] = df['state'].astype('category')
df['state'] = df['state'].apply(lambda x: states.index(x))
df['state'] = df['state'].astype('int64')
df['age'] = df['age'].astype('category')
df['sex'] = df['sex'].map({'female': 0, 'male': 1})
print(df.dtypes)

health_data = df.to_numpy()

state       int64
age      category
sex         int64
year        int64
bmi       float64
dtype: object


# Train no covariate model

In [7]:
# # training and test sets 
# training_index = np.logical_and(data[:, 2] >= 1998, data[:, 2] <= 2010)
# training_data = data[training_index, :]
# print(training_data.shape)

# test_index = np.logical_and(data[:, 2] > 2010, data[:, 2] <= 2015)
# test_data = data[test_index, :]

In [8]:
# training_data = tf.convert_to_tensor(training_data)
# test_data = tf.convert_to_tensor(test_data)
# # cast tensor to type float32
# training_data = tf.cast(training_data, tf.float32)
# test_data = tf.cast(test_data, tf.float32)
# num_train = training_data.shape[0]
# num_test = test_data.shape[0]

In [9]:
# # define function to fetch and process data entries from training or test data 
# def get_data(index, mode):
#     if mode == "train":
#         # randomly selects index from training data between 0 and num_train
#         rand_index = tf.random.uniform([],minval=0, maxval=num_train, dtype=tf.int32) 
#         entry = training_data[rand_index, :]
#     elif mode == "not_random":
#         # selects specified index from test data 
#         entry = test_data[index, :]
#     else: 
#         # for any other value of mode, randomly selects index from test
#         rand_index = tf.random.uniform([],minval=0, maxval=num_test, dtype=tf.int32)
#         entry = test_data[rand_index, :]
#     state, gender, year, age, rate = entry[0], entry[1], entry[2], entry[3], entry[4]
#     year = (year - 1998)/21
#     age = tf.cast(age, tf.int32)
#     state = tf.cast(state, tf.int32)
#     gender = tf.cast(gender, tf.int32)
#     year = tf.reshape(year, [1])
#     age = tf.reshape(age, [1])
#     state = tf.reshape(state, [1])
#     gender = tf.reshape(gender, [1])
#     rate = tf.reshape(rate, [1])
#     return (year, age, state, gender), rate

In [10]:
# use get_data function to set up training and test tensorflow datasets 
# dataset_train = tf.data.Dataset.from_tensor_slices(np.arange(10000))
# dataset_train = dataset_train.repeat()
# dataset_train = dataset_train.map(lambda x: get_data(x, mode="train"), num_parallel_calls=4)
# dataset_train = dataset_train.batch(256)
# dataset_train = dataset_train.prefetch(buffer_size=512)

# dataset_test = tf.data.Dataset.from_tensor_slices(np.arange(10000))
# dataset_test = dataset_test.repeat()
# dataset_test = dataset_test.map(lambda x: get_data(x, mode="test"), num_parallel_calls=4)
# dataset_test = dataset_test.batch(256)
# dataset_test = dataset_test.prefetch(buffer_size=512)

# dataset_test2 = tf.data.Dataset.from_tensor_slices(np.arange(68000))
# dataset_test2 = dataset_test2.map(lambda x: get_data(x, mode="not_random"), num_parallel_calls=4)
# dataset_test2 = dataset_test2.batch(256)
# dataset_test2 = dataset_test2.prefetch(buffer_size=512)

In [11]:
# defining inputs 
# year = tfkl.Input(shape=(1,), dtype='float32', name='Year')
# age =  tfkl.Input(shape=(1,), dtype='int32', name='Age')
# state = tfkl.Input(shape=(1,), dtype='int32', name='State')
# gender = tfkl.Input(shape=(1,), dtype='int32', name='Gender')

# # defining embedding layers 
# age_embed = tfkl.Embedding(input_dim=100, output_dim=5, input_length=1, name='Age_embed')(age)
# age_embed = tfkl.Flatten()(age_embed)

# gender_embed = tfkl.Embedding(input_dim=2, output_dim=5, input_length=1, name='Gender_embed')(gender)
# gender_embed = tfkl.Flatten()(gender_embed)

# state_embed = tfkl.Embedding(input_dim=50, output_dim=5, input_length=1, name='State_embed')(state)
# state_embed = tfkl.Flatten()(state_embed)

# # create feature vector that concatenates all inputs 
# x = tfkl.Concatenate()([year, age_embed, gender_embed, state_embed])
# x1 = x

# # setting up middle layers 
# x = tfkl.Dense(128, activation='tanh')(x)
# x = tfkl.BatchNormalization()(x)
# x = tfkl.Dropout(0.05)(x)

# x = tfkl.Dense(128, activation='tanh')(x)
# x = tfkl.BatchNormalization()(x)
# x = tfkl.Dropout(0.05)(x)

# x = tfkl.Dense(128, activation='tanh')(x)
# x = tfkl.BatchNormalization()(x)
# x = tfkl.Dropout(0.05)(x)

# x = tfkl.Dense(128, activation='tanh')(x)
# x = tfkl.BatchNormalization()(x)
# x = tfkl.Dropout(0.05)(x)

# # setting up output layer 
# x = tfkl.Concatenate()([x1, x])
# x = tfkl.Dense(128, activation='tanh')(x)
# x = tfkl.BatchNormalization()(x)
# x = tfkl.Dropout(0.05)(x)
# x = tfkl.Dense(1, activation='sigmoid', name='final')(x)

# # creating the model 
# model = tf.keras.Model(inputs=[year, age, state, gender], outputs=[x])

In [12]:
# compiling the model
# model.compile(loss='mse', optimizer='adam')

In [13]:
# callbacks = [tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.25, patience=3, verbose=0, mode="auto", min_delta=1e-8, cooldown=0, min_lr=0.0)]
# model.fit(dataset_train, steps_per_epoch=1000, validation_data=dataset_test, validation_steps=500, epochs=30, verbose=2, callbacks=callbacks)

# Train covariate model

Merge health and mortality data

In [14]:
# Reshape the data to make state, age, gender, and year indexable

age_groups = []
states = []
years = []
shaped_data = np.empty((50, 13, 2, 29))
shaped_data[:, :, :, :] = np.nan
for row in health_data:
    state = row[0]
    age = row[1]
    gender = row[2]
    year = row[3]
    bmi = row[4]

    if not state in states:
        states.append(state)
    state = states.index(state)

    if not age in age_groups:
        age_groups.append(age)
    age = age_groups.index(age)

    if not year in years:
        years.append(year)
    year = years.index(year)

    shaped_data[state, age, gender, year] = bmi

In [15]:
# Impute missing values using the mean of all states

imputed_data = shaped_data.copy()
matching = np.isnan(shaped_data)
mean_values = np.repeat(np.nanmean(shaped_data, axis=0)[np.newaxis, :, :, :], 50, 0)
imputed_data[matching] = mean_values[matching]

In [16]:
combined_data = []
for i, row in enumerate(data):
    state = row[0]
    gender = row[1]
    year = row[2]
    age = row[3]
    rate = row[4]
    
    age_index = (age - 20) // 5
    if age_index == -1:
        age_index = 0
    if age_index > 12:
        age_index = 12

    state_index = states.index(int(state))

    year_index = years.index(year)

    bmi = imputed_data[state_index, int(age_index), int(gender), year_index - 5]

    combined_data.append([state, gender, year , age, bmi, rate])

combined_data = np.array(combined_data)
print(combined_data.shape)

(229600, 6)


Train Covariate Model

In [17]:
# Normalize the bmi variable
combined_data[:, 4] = (combined_data[:, 4] - np.min(combined_data[:, 4])) / (np.max(combined_data[:, 4]) - np.min(combined_data[:, 4]))

In [18]:
# training and test sets 
training_index = np.logical_and(combined_data[:, 2] >= 1998, combined_data[:, 2] <= 2010)
training_data = combined_data[training_index, :]
print(training_data.shape)

test_index = np.logical_and(combined_data[:, 2] > 2010, combined_data[:, 2] <= 2015)
test_data = combined_data[test_index, :]

(106600, 6)


In [19]:
training_data = tf.convert_to_tensor(training_data)
test_data = tf.convert_to_tensor(test_data)
# cast tensor to type float32
training_data = tf.cast(training_data, tf.float32)
test_data = tf.cast(test_data, tf.float32)
num_train = training_data.shape[0]
num_test = test_data.shape[0]

In [20]:
# define function to fetch and process data entries from training or test data 
def get_data(index, mode, include_year=True, include_bmi=True, include_state=True):
    if mode == "train":
        # randomly selects index from training data between 0 and num_train
        rand_index = tf.random.uniform([],minval=0, maxval=num_train, dtype=tf.int32) 
        entry = training_data[rand_index, :]
    elif mode == "not_random":
        # selects specified index from test data 
        entry = test_data[index, :]
    else: 
        # for any other value of mode, randomly selects index from test
        rand_index = tf.random.uniform([],minval=0, maxval=num_test, dtype=tf.int32)
        entry = test_data[rand_index, :]
    state, gender, year, age, bmi, rate = entry[0], entry[1], entry[2], entry[3], entry[4], entry[5]
    age = tf.cast(age, tf.int32)
    gender = tf.cast(gender, tf.int32)
    age = tf.reshape(age, [1])
    gender = tf.reshape(gender, [1])
    rate = tf.reshape(rate, [1])

     # Conditionally include bmi and state
    inputs = [age, gender]
    if include_year:
        year = (year - 1998)/21
        year = tf.reshape(year, [1])
        inputs.append(year)
    if include_bmi:
        bmi = tf.reshape(bmi, [1])
        inputs.append(bmi)
    if include_state:
        state = tf.reshape(tf.cast(state, tf.int32), [1])
        inputs.append(state)

    return tuple(inputs), rate

In [21]:
# use get_data function to set up training and test tensorflow datasets 
def create_dataset(mode, range, include_year=True, include_bmi=True, include_state=True):
    dataset = tf.data.Dataset.from_tensor_slices(np.arange(range))
    dataset = dataset.repeat()
    dataset = dataset.map(lambda x: get_data(x, mode=mode, include_year=include_year, include_bmi=include_bmi, include_state=include_state), 
                          num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.batch(256)
    dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
    return dataset

dataset_test2 = create_dataset(mode="not_random", range=68000)

# dataset_test = tf.data.Dataset.from_tensor_slices(np.arange(10000))
# dataset_test = dataset_test.repeat()
# dataset_test = dataset_test.map(lambda x: get_data(x, mode="test"), num_parallel_calls=4)
# dataset_test = dataset_test.batch(256)
# dataset_test = dataset_test.prefetch(buffer_size=512)

# dataset_test2 = tf.data.Dataset.from_tensor_slices(np.arange(68000))
# dataset_test2 = dataset_test2.map(lambda x: get_data(x, mode="not_random"), num_parallel_calls=4)
# dataset_test2 = dataset_test2.batch(256)
# dataset_test2 = dataset_test2.prefetch(buffer_size=512)

In [22]:
def redefine_model(include_year=True, include_bmi=True, include_state=True):
    # defining inputs 
    age =  tfkl.Input(shape=(1,), dtype='int32', name='Age')
    gender = tfkl.Input(shape=(1,), dtype='int32', name='Gender')
    # conditional inputs
    if include_year:
        year = tfkl.Input(shape=(1,), dtype='float32', name='Year')
    if include_state:
        state = tfkl.Input(shape=(1,), dtype='int32', name='State')
    if include_bmi:
        bmi = tfkl.Input(shape=(1,), dtype='float32', name='Bmi')

    # defining embedding layers 
    age_embed = tfkl.Embedding(input_dim=100, output_dim=5, input_length=1, name='Age_embed')(age)
    age_embed = tfkl.Flatten()(age_embed)

    gender_embed = tfkl.Embedding(input_dim=2, output_dim=5, input_length=1, name='Gender_embed')(gender)
    gender_embed = tfkl.Flatten()(gender_embed)
    
    if include_state:
        state_embed = tfkl.Embedding(input_dim=50, output_dim=5, input_length=1, name='State_embed')(state)
        state_embed = tfkl.Flatten()(state_embed)

    # create feature vector that concatenates all inputs 
    # x = tfkl.Concatenate()([year, age_embed, gender_embed, state_embed, bmi])
    x = tfkl.Concatenate()([age_embed, gender_embed] + ([year] if include_year else []) + ([state_embed] if include_state else []) + ([bmi] if include_bmi else []))
    x1 = x

    # setting up middle layers 
    x = tfkl.Dense(128, activation='tanh')(x)
    x = tfkl.BatchNormalization()(x)
    x = tfkl.Dropout(0.05)(x)

    x = tfkl.Dense(128, activation='tanh')(x)
    x = tfkl.BatchNormalization()(x)
    x = tfkl.Dropout(0.05)(x)

    x = tfkl.Dense(128, activation='tanh')(x)
    x = tfkl.BatchNormalization()(x)
    x = tfkl.Dropout(0.05)(x)

    x = tfkl.Dense(128, activation='tanh')(x)
    x = tfkl.BatchNormalization()(x)
    x = tfkl.Dropout(0.05)(x)

    # setting up output layer 
    x = tfkl.Concatenate()([x1, x])
    x = tfkl.Dense(128, activation='tanh')(x)
    x = tfkl.BatchNormalization()(x)
    x = tfkl.Dropout(0.05)(x)
    outputs = tfkl.Dense(1, activation='sigmoid', name='final')(x)

    # creating the model 
    inputs = [age, gender]
    if include_year:
        inputs.append(year)
    if include_state:
        inputs.append(state)
    if include_bmi:
        inputs.append(bmi)
    # model = tf.keras.Model(inputs=[year, age, state, gender, bmi], outputs=[x])
    model = tf.keras.Model(inputs=inputs, outputs=outputs)

    # compiling the model
    model.compile(loss='mse', optimizer='adam')

    return model

In [23]:
def get_loss(loss, val_loss):
    loss = np.array(loss)
    val_loss = np.array(val_loss)
    loss = np.mean(loss)
    val_loss = np.mean(val_loss)
    return loss, val_loss

In [25]:
def comparitive_training(specs, epochs=30, runs=5):
    results = []
    for spec_name, include_year, include_bmi, include_state in specs:
        loss = []
        val_loss = []
        dataset_train = create_dataset(mode="train", range=10000, include_year=include_year, include_bmi=include_bmi, include_state=include_state)
        dataset_test = create_dataset(mode="test", range=10000, include_year=include_year, include_bmi=include_bmi, include_state=include_state)
        for i in range(runs):
            model = redefine_model(include_year=include_year, include_bmi=include_bmi, include_state=include_state)
            callbacks = [tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.25, patience=3, verbose=0, mode="auto", min_delta=1e-8, cooldown=0, min_lr=0.0)]
            history = model.fit(dataset_train, steps_per_epoch=1000, validation_data=dataset_test, validation_steps=500, epochs=epochs, verbose=2, callbacks=callbacks)
            loss.append(history.history['loss'][-1])
            val_loss.append(history.history['val_loss'][-1])
        avg_loss, avg_val_loss = get_loss(loss, val_loss)
        results.append((spec_name, avg_loss, avg_val_loss))
    return results

In [None]:
# Specifications
specs = [
    ("No BMI", True, False, True),
    ("With BMI", True, True, True),
    ("No State or BMI", True, False, False),
    ("BMI, no State", True, True, False)
]

In [26]:
results = comparitive_training(specs, epochs=1, runs=1)

1000/1000 - 6s - loss: 0.0128 - val_loss: 3.2029e-04 - lr: 0.0010 - 6s/epoch - 6ms/step
1000/1000 - 6s - loss: 0.0124 - val_loss: 1.4067e-04 - lr: 0.0010 - 6s/epoch - 6ms/step
1000/1000 - 6s - loss: 0.0134 - val_loss: 4.5284e-04 - lr: 0.0010 - 6s/epoch - 6ms/step
1000/1000 - 5s - loss: 0.0132 - val_loss: 2.6595e-04 - lr: 0.0010 - 5s/epoch - 5ms/step


In [27]:
print(results)

[('No BMI', 0.012779438868165016, 0.0003202946681994945), ('With BMI', 0.012358171865344048, 0.00014066557923797518), ('No State or BMI', 0.01343628391623497, 0.00045283869258128107), ('BMI, no State', 0.013175425119698048, 0.00026595068629831076)]


In [190]:
# Generate LaTeX table
latex_table = "\\begin{table}[H]\n\\centering\n\\begin{tabular}{lcc}\n\\hline\n"
latex_table += "Model Specification & Average Loss & Average Validation Loss \\\\ \\hline\n"
for spec_name, avg_loss, avg_val_loss in results:
    latex_table += f"{spec_name} & {avg_loss:.6f} & {avg_val_loss:.6f} \\\\\n"
latex_table += "\\hline\n\\end{tabular}\n\\caption{Model Performance}\n\\end{table}"

print(latex_table)

\begin{table}[H]
\centering
\begin{tabular}{lcc}
\hline
Model Specification & Average Loss & Average Validation Loss \\ \hline
No BMI & 0.000033 & 0.000032 \\
With BMI & 0.000063 & 0.000045 \\
No State or BMI & 0.000065 & 0.000055 \\
BMI, no State & 0.000061 & 0.000053 \\
\hline
\end{tabular}
\caption{Model Performance}
\end{table}


BMI 1st run: loss: 6.5036e-05 - val_loss: 7.7524e-05 
BMI 2nd run: loss: 7.0120e-05 - val_loss: 5.9770e-05
BMI no lag: loss: 6.7715e-05 - val_loss: 5.2124e-05
No State No BMI: loss: 6.5259e-05 - val_loss: 5.4376e-05
State No BMI: loss: 3.5489e-05 - val_loss: 3.0721e-05