In [1]:
# The purpose of this notebook is to explore hyperparameter optimization with
# Keras, using the crop rotation implementation as the base
# https://keras.io/guides/keras_tuner/getting_started/

import pandas as pd
import numpy as np

# import keras
# from keras.models import Sequential
# from keras.layers import Dense, Dropout, Activation, Embedding, Conv1D, GlobalMaxPooling1D

import keras_tuner as kt
from tensorflow import keras

In [2]:
df = pd.read_csv('../data/2008_2018_CDL.csv')

In [3]:
def isolate_years_from_CDL_data(CDL_data):
    """
    Uses regex and filter to isolate yearly data from CDL dataset
    imported with provided SQL query
    
    Arguments:
        CDL_data (pd.DataFrame) : a 2d pandas dataframe with yearly
        CDL data; columns must include year e.g. '2010', rows are 
        field observations
        
    Returns:
        (pd.DataFrame) : a 2d pandas dataframe with only the yearly CDL
        crop data
    """
    return CDL_data.filter(regex = '20\d\d', axis=1)

def remove_ignore_classes_from_yearly_CDL_data(yearly_CDL_data):
    """
    Removes (hard-coded) non-crop classes from yearly CDL dataset
    
    Arguments:
        yearly_CDL_data (pd.DataFrame) : a 2d pandas dataframe with
        yearly CDL data, as generated by isolate_years_from_CDL_data;
        columns are years e.g. '2010', rows are field observations
        
    Returns:
        cleaned_CDL_data (pd.DataFrame) : a 2d pandas dataframe that
        removes observations with ignore_classes    
    """
    ignore_classes = range(81, 196) 
    
    return yearly_CDL_data[~yearly_CDL_data.isin(ignore_classes)].dropna().astype(int)
    
    
def consolidate_nonsupported_crops_to_fallow_from_clean_CDL_data(cleaned_CDL_data):
    """
    Consolidates non-supported crops to fallow class
    
    Arguments:
        cleaned_CDL_data (pd.Dataframe) : a 2d pandas dataframe with cleaned, yearly
        CDL data, as generated by remove_ignore_classes_from_yearly_CDL_data; columns
        are years e.g. '2010', rows are field observations
    
    Returns:
        consoildated_CDL_data (pd.DataFrame) : a 2d pandas dataframe that
        only contains supported crops and fallow classes
     
    Notes :
    
    The non-supported crops consolidated into `fallow` (i.e. `61` designation)
    crop codes are : 
    11 : Tobacco, 13 : Popcorn, 14 : Mint, 25 : Other Small Grains, 26 : DblCrop WW/Soy,
    27 : Rye, 29 : Millet, 30 : Speltz, 32 : Flaxseed, 33 : Safflower, 35 : Mustard,
    37 : Other Hay/Non Alfalfa, 38 : Camelina, 39 : Buckwheat, 42 : Dry Beans, 44 : Other Crops,
    46 : Sweet Potatoes, 47 : Misc Vegs and Fruits, 48 : Watermelons, 49 : Onions,
    50 : Cucumbers, 51 : Chick Peas, 52 : Lentils, 53 : Peas, 55 : Caneberries, 56 : Hops,
    57 : Herbs, 58 : Clover/Wildflowers, 59 : Sod/Grass Seed, 60 : Swithgrass, 
    61 : Fallow/Idle Cropland, 63 : Forest, 64 : Shrubland, 65 : Barren, 66 Cherries, 
    67 : Peaches, 68 : Apples, 69 : Grapes, 70 : Christmas Trees, 71 : Other Tree Crops,
    72 : Citrus, 74 : Pecans, 75 : Almonds, 76 : Walnuts, 77 : Pears, 81 : Clouds/No Data,\
    82 : Developed, 83 : Water, 87 : Wetlands, 88 : Nonag/Undefined, 92 : Aquaculture,
    111 : Open Water, 112 : Perennial Ice/Snow, 121 : Developed/Open Space,
    122 : Developed/Low Intensity, 123 : Developed/Med Intensity, 124 : Developed/High Intensity,
    131 : Barren, 141 : Deciduous Forest, 142 : Evergreen Forest, 143 : Mixed Forest,
    152 : Shrubland, 176 : Grassland/Pasture, 190 : Woody Wetlands, 195 : Herbaceous Wetlands,
    204 : Pistacios, 205 : Triticale, 206 : Carrots, 207 : Asparagus, 208 : Garlic,
    209 : Canteloups, 210 : Prunes, 211 : Olives, 212 : Oranges, 213 : Honeydew Melons,
    214 : Broccoli, 216 : Peppers, 217 : Pomegranates, 218 : Nectarines, 219 : Greens,
    220 : Plums, 221 :  Strawberries, 222 : Squash, 223 : Apricots, 224 : Vetch,
    225 : Dbl Crop WW/Corn, 226 : Dbl Cropo Oats/Corn, 227 : Lettuce, 229 : Pumpkins,
    230 : Dbl Crop Lettuce/Durum Wht, 231 : Dbl Crop Lettuce/Cantaloupe, 
    232 : Dbl Crop Lettuce/Cotton, 233 : Dbl Crop Lettuce/Barley, 234 : Dbl Crop Durum Wht/Sorghum,
    235 : Dbl Crop Barley/Sorghum, 236 : Dbl Crop WinWht/Sorghum, 237 : Dbl Crop Barley/Corn,
    238 : Dbl Crop WinWht/Cotton, 239 : Dbl Crop Soybeans/Cotton, 240 : Dbl Crop Soybeans/Oats,
    241 : Dbl Crop Corn/Soybeans, 242 : Blueberries, 243 : Cabbage, 244 : Cauliflower, 245 : Celery,
    246 : Radishes, 247 : Turnips, 248 : Eggplants, 249 : Gourds, 250 : Cranberries, 254 : Dbl Crop Barley/Soybeans

    """
    
    return cleaned_CDL_data.replace([11, 13, 14, 25, 26, 27, 29, 30, 32, 33, 35, 37, 38, 39,
                               42, 44, 46, 47, 48, 49, 50, 51, 52, 53, 55, 56, 57, 58,
                               59, 60, 61, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74,
                               75, 76, 77, 81, 82, 83, 87, 88, 92, 111, 112, 121, 122,
                               123, 124, 131, 141, 142, 143, 152, 176, 190, 195, 204,
                               205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215,
                               216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226,
                               227, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238,
                               239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
                               250, 254], 61)

def create_dummies_for_train_and_testing(train_data, test_data):
    """
    Creates dummy categorical variables for validation and testing from CDL categories
    
    Arguments :
        train_data (pd.series) : a 1d pandas series of integer encoded
        categorical labels (i.e. y) that will be used for training
        test_data (pd.series) : a 1d pandas series of integer encoded categorical 
        lables (i.e. y) that will be use for testing
        
    Returns :
        train_y (pd.Series) : a 2d pandas dataframe of one-hot encoded
        categorical lables for training
        test_y (pd.Series) : a 2d pandas dataframe of one-hot encoded
        categorical lables for testing
        
    Notes
    
    Pool both target sets (i.e. training and testing targets) into one set in order to get all possible dummies given
    the data, We should probably add an additional category at somepoint to handle unknown classifications
    """
    integer_encoded_labels = pd.concat([train_data, test_data])
    dummy_lables = pd.get_dummies(integer_encoded_labels)
    train_y = dummy_lables.iloc[:train_data.shape[0],:]
    test_y = dummy_lables.iloc[train_data.shape[0]:,:]
    
    return train_y, test_y
    

In [4]:
yearly_CDL_data = isolate_years_from_CDL_data(df)
cleaned_CDL_data = remove_ignore_classes_from_yearly_CDL_data(yearly_CDL_data)
consolidated_CDL_data = consolidate_nonsupported_crops_to_fallow_from_clean_CDL_data(cleaned_CDL_data)

"""
Split data into train and test sets, in this case, 2008-2017 for the train set (i.e. using 2008-2016 data to 
predict 2017) and 2009-2018 for the test set (i.e. i.e. using 2009-2017 data to predict 2018). While this 
makes a suboptimal divide for training/testing of 50/50, it was empirically determined to generate the highest
accuracy. As future years are added, this should be rechecked
"""
train_y, test_y = create_dummies_for_train_and_testing(consolidated_CDL_data['2017'],
                                                         consolidated_CDL_data['2018'])

train_x = consolidated_CDL_data.drop('2018', axis = 1)
test_x = consolidated_CDL_data.drop('2008', axis = 1)


In [11]:

# Set parameters of the Deep Convolutional Temporal Network (DCTN)

max_features = 62 # max class/category value; used for defining spatial embeddings
embedding_dims = 10 # number of divisions within classes for converting from categorical to continuous classes
maxlen = 9 # maximum length of the input, in this case number of years in the range
filters = 50 # empirically determined for highest accuracy/computation time trade-off
kernel_size = 5 # maximum length for facilitating two 1D Convolusional Layers
hidden_dims = 200 # empirically determined for highest accuracy/computation time trade-off
batch_size = 200 # empirically determined for optimal performance given current set
epochs = 1 # learning converges earlier, but 10 provides a safe cushion for this set

# Define DCTN architecture
model = Sequential()
model.add(Embedding(max_features, embedding_dims, input_length = maxlen))

model.add(Conv1D(filters, kernel_size, padding = 'valid', activation = 'relu', 
                 strides = 1, kernel_initializer = 'glorot_normal'))
model.add(Dropout(rate = 0.1))

model.add(GlobalMaxPooling1D())

model.add(Dense(hidden_dims, activation = 'relu', kernel_initializer = 'glorot_normal'))
model.add(Dropout(rate = 0.2))

model.add(Dense(train_y.shape[1], activation ='softmax'))
model.summary()
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 9, 10)             620       
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 5, 50)             2550      
_________________________________________________________________
dropout_3 (Dropout)          (None, 5, 50)             0         
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 50)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 200)               10200     
_________________________________________________________________
dropout_4 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 21)               

In [13]:
model.fit(trainx.drop('2017', axis = 1), train_y, batch_size = batch_size, epochs = 1, validation_split = 0.2)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1955996 samples, validate on 488999 samples
Epoch 1/1


<keras.callbacks.callbacks.History at 0x7f99600bedd0>

In [5]:
def model_builder(hp):
    model = keras.models.Sequential()
    model.add(keras.layers.Embedding(62, 10, input_length = 9))
    model.add(keras.layers.Conv1D(50, 5, padding = 'valid', activation = 'relu', 
                     strides = hp.Choice('units', [1,2,3]), kernel_initializer = 'glorot_normal'))
    model.add(keras.layers.Dropout(rate = 0.1))

    model.add(keras.layers.GlobalMaxPooling1D())

    model.add(keras.layers.Dense(200, activation = 'relu', kernel_initializer = 'glorot_normal'))
    model.add(keras.layers.Dropout(rate = 0.2))

    model.add(keras.layers.Dense(train_y.shape[1], activation ='softmax'))
    #model.summary()
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
#sparse_categorical_crossentropy
    return model

In [12]:
# tuner = kt.RandomSearch(model_builder, objective = 'val_loss', max_trials = 5, directory='../data/')

tuner = kt.RandomSearch(
    hypermodel=model_builder,
    # No objective to specify.
    # Objective is the return value of `HyperModel.fit()`.
    max_trials=3,
    overwrite=True,
    directory="my_dir",
    project_name="custom_eval",
)

In [13]:
tuner.search(train_x.drop('2017', axis = 1).loc[:1000], train_y.loc[:1000], epochs=1, validation_split = 0.2)
best_model = tuner.get_best_models()[0]

#NOT SUPPORTED IN TF 2.0, please create and compile the model under distribution strategy scope instead of passing it to compile

#https://www.tensorflow.org/guide/distributed_training#using_tfdistributestrategy_with_keras


Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
1                 |?                 |units

Train on 308 samples, validate on 78 samples

AttributeError: 'Sequential' object has no attribute 'distribute_strategy'

In [21]:
train_x.drop('2017', axis = 1).loc[:1000]

Unnamed: 0,2008,2009,2010,2011,2012,2013,2014,2015,2016
1,61,61,1,5,1,5,5,5,1
2,61,61,1,5,1,5,5,5,5
3,5,5,61,5,5,5,5,5,5
4,5,5,5,1,1,1,5,5,5
5,5,5,5,5,1,5,61,5,5
...,...,...,...,...,...,...,...,...,...
995,61,61,61,61,61,61,61,61,61
996,61,61,61,61,61,61,61,61,61
997,61,61,61,61,61,61,61,61,61
998,61,61,61,61,61,61,61,61,61
