In [1]:
# Import Pandas as Dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

# Load Data

In [2]:
food_access_raw_df = pd.read_csv("https://gtbootcamp20230221.s3.amazonaws.com/FoodAccessResearchAtlasData2019.csv", dtype={'CensusTract': str})
pd.set_option('display.max_rows', None)
food_access_raw_df.head()

Unnamed: 0,CensusTract,State,County,Urban,Pop2010,OHU2010,GroupQuartersFlag,NUMGQTRS,PCTGQTRS,LILATracts_1And10,...,TractSeniors,TractWhite,TractBlack,TractAsian,TractNHOPI,TractAIAN,TractOMultir,TractHispanic,TractHUNV,TractSNAP
0,1001020100,Alabama,Autauga County,1,1912,693,0,0.0,0.0,0,...,221.0,1622.0,217.0,14.0,0.0,14.0,45.0,44.0,6.0,102.0
1,1001020200,Alabama,Autauga County,1,2170,743,0,181.0,8.34,1,...,214.0,888.0,1217.0,5.0,0.0,5.0,55.0,75.0,89.0,156.0
2,1001020300,Alabama,Autauga County,1,3373,1256,0,0.0,0.0,0,...,439.0,2576.0,647.0,17.0,5.0,11.0,117.0,87.0,99.0,172.0
3,1001020400,Alabama,Autauga County,1,4386,1722,0,0.0,0.0,0,...,904.0,4086.0,193.0,18.0,4.0,11.0,74.0,85.0,21.0,98.0
4,1001020500,Alabama,Autauga County,1,10766,4082,0,181.0,1.68,0,...,1126.0,8666.0,1437.0,296.0,9.0,48.0,310.0,355.0,230.0,339.0


In [3]:
# Create a subset dataframe with selected columns
food_access_raw_df = food_access_raw_df[["CensusTract", "State", "County", "Urban", "Pop2010", "OHU2010", "PovertyRate",
                                    "MedianFamilyIncome", "LAhalfand10", "LAPOP1_10", "LAPOP05_10", "LAPOP1_20", "LALOWI1_10",
                                     "LALOWI05_10", "LALOWI1_20", "lapophalfshare", "lalowihalfshare", "lakidshalfshare",
                                     "laseniorshalfshare", "lawhitehalfshare", "lablackhalfshare", "laasianhalfshare",
                                     "lanhopihalfshare", "laaianhalfshare", "laomultirhalfshare", "lahisphalfshare", "lahunvhalfshare", "lasnaphalfshare", "lapop10share", "lalowi10share", 
                                     "lakids10share", "laseniors10share", "lawhite10share", "lablack10share",
                                     "laasian10share", "lanhopi10share", "laaian10share", "laomultir10share", "lahisp10share", "lahunv10share",
                                     "lasnap10share", "TractLOWI", "TractKids", "TractSeniors", "TractWhite",
                                     "TractBlack", "TractAsian", "TractNHOPI", "TractAIAN", "TractOMultir",
                                     "TractHispanic", "TractHUNV", "TractSNAP"]]
food_access_raw_df.tail()

Unnamed: 0,CensusTract,State,County,Urban,Pop2010,OHU2010,PovertyRate,MedianFamilyIncome,LAhalfand10,LAPOP1_10,...,TractSeniors,TractWhite,TractBlack,TractAsian,TractNHOPI,TractAIAN,TractOMultir,TractHispanic,TractHUNV,TractSNAP
72526,56043000200,Wyoming,Washakie County,0,3326,1317,9.7,67254.0,1,902.0,...,593.0,3106.0,6.0,15.0,0.0,27.0,172.0,309.0,61.0,64.0
72527,56043000301,Wyoming,Washakie County,1,2665,1154,11.6,64152.0,1,,...,399.0,2377.0,5.0,23.0,0.0,40.0,220.0,446.0,88.0,41.0
72528,56043000302,Wyoming,Washakie County,1,2542,1021,16.3,69605.0,1,155.0,...,516.0,2312.0,11.0,10.0,1.0,26.0,182.0,407.0,23.0,64.0
72529,56045951100,Wyoming,Weston County,0,3314,1322,17.5,74500.0,1,840.0,...,499.0,3179.0,15.0,10.0,1.0,47.0,62.0,91.0,47.0,34.0
72530,56045951300,Wyoming,Weston County,1,3894,1699,17.3,76838.0,1,1376.0,...,650.0,3706.0,6.0,10.0,2.0,44.0,126.0,125.0,34.0,110.0


In [5]:
# Calculate percentage population of each group
food_access_raw_df[["TractLOWI_PCT", "TractKids_PCT", "TractSeniors_PCT", "TractWhite_PCT",
                "TractBlack_PCT", "TractAsian_PCT", "TractNHOPI_PCT", "TractAIAN_PCT",
                "TractOMultir_PCT", "TractHispanic_PCT"]] = food_access_raw_df[["TractLOWI", "TractKids", "TractSeniors", "TractWhite",
                                     "TractBlack", "TractAsian", "TractNHOPI", "TractAIAN", "TractOMultir",
                                     "TractHispanic"]]\
    .apply(lambda x: x/food_access_raw_df['Pop2010']*100).round(2)


# Display Dataframe
food_access_raw_df.head()

Unnamed: 0,CensusTract,State,County,Urban,Pop2010,OHU2010,PovertyRate,MedianFamilyIncome,LAhalfand10,LAPOP1_10,...,TractLOWI_PCT,TractKids_PCT,TractSeniors_PCT,TractWhite_PCT,TractBlack_PCT,TractAsian_PCT,TractNHOPI_PCT,TractAIAN_PCT,TractOMultir_PCT,TractHispanic_PCT
0,1001020100,Alabama,Autauga County,1,1912,693,11.3,81250.0,1,1896.0,...,23.8,26.52,11.56,84.83,11.35,0.73,0.0,0.73,2.35,2.3
1,1001020200,Alabama,Autauga County,1,2170,743,17.9,49000.0,1,1261.0,...,36.96,27.93,9.86,40.92,56.08,0.23,0.0,0.23,2.53,3.46
2,1001020300,Alabama,Autauga County,1,3373,1256,15.0,62609.0,1,1552.0,...,38.72,26.5,13.02,76.37,19.18,0.5,0.15,0.33,3.47,2.58
3,1001020400,Alabama,Autauga County,1,4386,1722,2.8,70607.0,1,1363.0,...,21.02,23.14,20.61,93.16,4.4,0.41,0.09,0.25,1.69,1.94
4,1001020500,Alabama,Autauga County,1,10766,4082,15.2,96334.0,1,2643.0,...,20.82,29.37,10.46,80.49,13.35,2.75,0.08,0.45,2.88,3.3


In [6]:
# Calculate percentage TractHUNV and TractSNAP  group
food_access_raw_df[["TractHUNV_PCT", "TractSNAP_PCT"]] = food_access_raw_df[["TractHUNV", "TractSNAP"]]\
    .apply(lambda x: x/food_access_raw_df['OHU2010']*100).round(2)


# Display Dataframe
food_access_raw_df.head()

Unnamed: 0,CensusTract,State,County,Urban,Pop2010,OHU2010,PovertyRate,MedianFamilyIncome,LAhalfand10,LAPOP1_10,...,TractSeniors_PCT,TractWhite_PCT,TractBlack_PCT,TractAsian_PCT,TractNHOPI_PCT,TractAIAN_PCT,TractOMultir_PCT,TractHispanic_PCT,TractHUNV_PCT,TractSNAP_PCT
0,1001020100,Alabama,Autauga County,1,1912,693,11.3,81250.0,1,1896.0,...,11.56,84.83,11.35,0.73,0.0,0.73,2.35,2.3,0.87,14.72
1,1001020200,Alabama,Autauga County,1,2170,743,17.9,49000.0,1,1261.0,...,9.86,40.92,56.08,0.23,0.0,0.23,2.53,3.46,11.98,21.0
2,1001020300,Alabama,Autauga County,1,3373,1256,15.0,62609.0,1,1552.0,...,13.02,76.37,19.18,0.5,0.15,0.33,3.47,2.58,7.88,13.69
3,1001020400,Alabama,Autauga County,1,4386,1722,2.8,70607.0,1,1363.0,...,20.61,93.16,4.4,0.41,0.09,0.25,1.69,1.94,1.22,5.69
4,1001020500,Alabama,Autauga County,1,10766,4082,15.2,96334.0,1,2643.0,...,10.46,80.49,13.35,2.75,0.08,0.45,2.88,3.3,5.63,8.3


In [7]:
# Create a subset dataframe with selected columns
first_food_access_df = food_access_raw_df[["Pop2010", "OHU2010", "PovertyRate",
                                    "MedianFamilyIncome", "TractLOWI", "LAhalfand10", "TractLOWI_PCT", "TractKids_PCT", "TractSeniors_PCT", "TractWhite_PCT",
                "TractBlack_PCT", "TractAsian_PCT", "TractNHOPI_PCT", "TractAIAN_PCT",
                "TractOMultir_PCT", "TractHispanic_PCT", "TractHUNV_PCT", "TractSNAP_PCT"]]
first_food_access_df.tail()

Unnamed: 0,Pop2010,OHU2010,PovertyRate,MedianFamilyIncome,TractLOWI,LAhalfand10,TractLOWI_PCT,TractKids_PCT,TractSeniors_PCT,TractWhite_PCT,TractBlack_PCT,TractAsian_PCT,TractNHOPI_PCT,TractAIAN_PCT,TractOMultir_PCT,TractHispanic_PCT,TractHUNV_PCT,TractSNAP_PCT
72526,3326,1317,9.7,67254.0,765.0,1,23.0,26.58,17.83,93.39,0.18,0.45,0.0,0.81,5.17,9.29,4.63,4.86
72527,2665,1154,11.6,64152.0,774.0,1,29.04,25.29,14.97,89.19,0.19,0.86,0.0,1.5,8.26,16.74,7.63,3.55
72528,2542,1021,16.3,69605.0,789.0,1,31.04,24.15,20.3,90.95,0.43,0.39,0.04,1.02,7.16,16.01,2.25,6.27
72529,3314,1322,17.5,74500.0,955.0,1,28.82,19.76,15.06,95.93,0.45,0.3,0.03,1.42,1.87,2.75,3.56,2.57
72530,3894,1699,17.3,76838.0,1095.0,1,28.12,23.57,16.69,95.17,0.15,0.26,0.05,1.13,3.24,3.21,2.0,6.47


In [8]:
# Drop rows with NAN values
first_food_access_df = first_food_access_df.dropna()
 
# Reset the index
first_food_access_df = first_food_access_df.reset_index(drop = True)

first_food_access_df.tail()

Unnamed: 0,Pop2010,OHU2010,PovertyRate,MedianFamilyIncome,TractLOWI,LAhalfand10,TractLOWI_PCT,TractKids_PCT,TractSeniors_PCT,TractWhite_PCT,TractBlack_PCT,TractAsian_PCT,TractNHOPI_PCT,TractAIAN_PCT,TractOMultir_PCT,TractHispanic_PCT,TractHUNV_PCT,TractSNAP_PCT
71777,3326,1317,9.7,67254.0,765.0,1,23.0,26.58,17.83,93.39,0.18,0.45,0.0,0.81,5.17,9.29,4.63,4.86
71778,2665,1154,11.6,64152.0,774.0,1,29.04,25.29,14.97,89.19,0.19,0.86,0.0,1.5,8.26,16.74,7.63,3.55
71779,2542,1021,16.3,69605.0,789.0,1,31.04,24.15,20.3,90.95,0.43,0.39,0.04,1.02,7.16,16.01,2.25,6.27
71780,3314,1322,17.5,74500.0,955.0,1,28.82,19.76,15.06,95.93,0.45,0.3,0.03,1.42,1.87,2.75,3.56,2.57
71781,3894,1699,17.3,76838.0,1095.0,1,28.12,23.57,16.69,95.17,0.15,0.26,0.05,1.13,3.24,3.21,2.0,6.47


In [9]:
# Get dataframe information
first_food_access_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71782 entries, 0 to 71781
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Pop2010             71782 non-null  int64  
 1   OHU2010             71782 non-null  int64  
 2   PovertyRate         71782 non-null  float64
 3   MedianFamilyIncome  71782 non-null  float64
 4   TractLOWI           71782 non-null  float64
 5   LAhalfand10         71782 non-null  int64  
 6   TractLOWI_PCT       71782 non-null  float64
 7   TractKids_PCT       71782 non-null  float64
 8   TractSeniors_PCT    71782 non-null  float64
 9   TractWhite_PCT      71782 non-null  float64
 10  TractBlack_PCT      71782 non-null  float64
 11  TractAsian_PCT      71782 non-null  float64
 12  TractNHOPI_PCT      71782 non-null  float64
 13  TractAIAN_PCT       71782 non-null  float64
 14  TractOMultir_PCT    71782 non-null  float64
 15  TractHispanic_PCT   71782 non-null  float64
 16  Trac

In [10]:
first_food_access_df['LAhalfand10'].value_counts()

1    49047
0    22735
Name: LAhalfand10, dtype: int64

In [11]:
# Split our preprocessed data into our features and target arrays
y = first_food_access_df['LAhalfand10']
X = first_food_access_df.drop(['LAhalfand10'], axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [12]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [13]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 =  8
hidden_nodes_layer2 = 12
hidden_nodes_layer3 = 16

nn_model = tf.keras.models.Sequential()

# First hidden layer
nn_model.add(tf.keras.layers.Dense(hidden_nodes_layer1, activation="relu", input_dim= 17))

# Second hidden layer
nn_model.add(tf.keras.layers.Dense(hidden_nodes_layer2, activation="relu"))

# Second hidden layer
nn_model.add(tf.keras.layers.Dense(hidden_nodes_layer3, activation="relu"))

# Output layer
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8)                 144       
                                                                 
 dense_1 (Dense)             (None, 12)                108       
                                                                 
 dense_2 (Dense)             (None, 16)                208       
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 477
Trainable params: 477
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Compile the model
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [15]:
# Train the model
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=80)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


In [16]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

561/561 - 1s - loss: 0.4791 - accuracy: 0.7740 - 688ms/epoch - 1ms/step
Loss: 0.4791339635848999, Accuracy: 0.7739886045455933


In [17]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh', 'sigmoid'])
    
    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=30,
        step=5), activation=activation, input_dim=17))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 5)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=30,
            step=5),
            activation=activation))
    
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    
    return nn_model

In [18]:
# Import the kerastuner library
import keras_tuner as kt

tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=50,
    hyperband_iterations=2)

In [19]:
# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled,y_train,epochs=50,validation_data=(X_test_scaled,y_test))

Trial 180 Complete [00h 02m 39s]
val_accuracy: 0.7791708707809448

Best val_accuracy So Far: 0.7813997268676758
Total elapsed time: 01h 52m 24s
INFO:tensorflow:Oracle triggered exit


In [20]:
# Get top 3 model hyperparameters and print the values
top_hyper = tuner.get_best_hyperparameters(3)
for param in top_hyper:
    print(param.values)

{'activation': 'relu', 'first_units': 21, 'num_layers': 5, 'units_0': 16, 'units_1': 26, 'units_2': 1, 'units_3': 1, 'units_4': 11, 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 2, 'tuner/round': 2, 'tuner/trial_id': '0069'}
{'activation': 'tanh', 'first_units': 26, 'num_layers': 2, 'units_0': 21, 'units_1': 6, 'units_2': 26, 'units_3': 26, 'units_4': 26, 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 2, 'tuner/round': 2, 'tuner/trial_id': '0068'}
{'activation': 'relu', 'first_units': 26, 'num_layers': 1, 'units_0': 11, 'units_1': 16, 'units_2': 6, 'units_3': 16, 'units_4': 16, 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 1, 'tuner/round': 1, 'tuner/trial_id': '0164'}


In [21]:
# Evaluate the top 3 models against the test dataset
top_model = tuner.get_best_models(3)
for model in top_model:
    model_loss, model_accuracy = model.evaluate(X_test_scaled,y_test,verbose=2)
    print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

561/561 - 1s - loss: 0.4720 - accuracy: 0.7814 - 721ms/epoch - 1ms/step
Loss: 0.4719739258289337, Accuracy: 0.7813997268676758
561/561 - 1s - loss: 0.4711 - accuracy: 0.7801 - 740ms/epoch - 1ms/step
Loss: 0.47105151414871216, Accuracy: 0.7800624370574951
561/561 - 1s - loss: 0.4706 - accuracy: 0.7795 - 714ms/epoch - 1ms/step
Loss: 0.4705938994884491, Accuracy: 0.77950519323349
