### This notebook contains the code to generate the submission for the "Pump it Up: Data Mining the Water Table" competition.

We use the preprocessed training data and corresponding values, as well as test data. We need to predict the ordinal variable 'status_group', with values 0, 1, 2. The error metric used in the competition is the classification rate (fraction of predictions that are correct).

In this script we train a tensorflow deep learning model.

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
tf.random.set_seed(42)  # For reproducibility, but it only works to like 2 digits

In [None]:
X_train = pd.read_csv('../prep_data/X_train.csv')
y_train = pd.read_csv('../prep_data/y_train.csv')
X_val = pd.read_csv('../prep_data/X_val.csv')
y_val = pd.read_csv('../prep_data/y_val.csv')

In [4]:
# Define the model
n_units = 64
n_layers = 2
model = keras.Sequential()
# Add the first layer based on input shape
model.add(layers.Dense(n_units, activation="relu", input_shape=[X_train.shape[1]]))
model.add(layers.Dropout(0.3))
model.add(layers.BatchNormalization())
# Add the remaining layers
for _ in range(n_layers-1):
    model.add(layers.Dense(n_units, activation="relu"))
    model.add(layers.Dropout(0.3))
    model.add(layers.BatchNormalization())
# Add the output layer, for classification 0,1,2
model.add(layers.Dense(3, activation="softmax"))

model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Set early stopping
early_stopping = keras.callbacks.EarlyStopping(
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)

# Fit the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=1000,
    callbacks=[early_stopping],
    verbose=0
)

In [5]:
y_pred = model.predict(X_val)
y_pred = np.round(y_pred)

# Translate the one-hot encoding to the class
y_pred_class = np.argmax(y_pred, axis=1)

class_rate = np.mean(y_pred_class == y_val.values.ravel())
print(f"Classification rate: {class_rate}")

Classification rate: 0.7425084175084176


In [6]:
# Optimise
def fit_model(n_units, n_layers):
    model = keras.Sequential()
    model.add(layers.Dense(n_units, activation="relu", input_shape=[X_train.shape[1]]))
    model.add(layers.Dropout(0.3))
    model.add(layers.BatchNormalization())
    for _ in range(n_layers-1):
        model.add(layers.Dense(n_units, activation="relu"))
        model.add(layers.Dropout(0.3))
        model.add(layers.BatchNormalization())
    model.add(layers.Dense(3, activation="softmax"))

    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        batch_size=512,
        epochs=1000,
        callbacks=[early_stopping],
        verbose=0
    )

    y_pred = model.predict(X_val)
    y_pred = np.round(y_pred)
    y_pred_class = np.argmax(y_pred, axis=1)
    class_rate = np.mean(y_pred_class == y_val.values.ravel())
    return class_rate

param_grid = {
    'n_units': [64, 128, 256, 512],
    'n_layers': [1, 2, 3, 4]
}

scores = []
for n_units in param_grid['n_units']:
    for n_layers in param_grid['n_layers']:
        class_rate = fit_model(n_units, n_layers)
        print(f"n_units: {n_units}, n_layers: {n_layers}, class_rate: {class_rate}")
        scores.append(class_rate)
print(max(scores))

n_units: 64, n_layers: 1, class_rate: 0.7511784511784512
n_units: 64, n_layers: 2, class_rate: 0.7563973063973064
n_units: 64, n_layers: 3, class_rate: 0.7442760942760943
n_units: 64, n_layers: 4, class_rate: 0.7510942760942761
n_units: 128, n_layers: 1, class_rate: 0.7548821548821549
n_units: 128, n_layers: 2, class_rate: 0.7452861952861953
n_units: 128, n_layers: 3, class_rate: 0.7497474747474747
n_units: 128, n_layers: 4, class_rate: 0.7474747474747475
n_units: 256, n_layers: 1, class_rate: 0.7527777777777778
n_units: 256, n_layers: 2, class_rate: 0.7582491582491583
n_units: 256, n_layers: 3, class_rate: 0.7538720538720539
n_units: 256, n_layers: 4, class_rate: 0.7592592592592593
n_units: 512, n_layers: 1, class_rate: 0.7487373737373737
n_units: 512, n_layers: 2, class_rate: 0.7489057239057239
n_units: 512, n_layers: 3, class_rate: 0.7536195286195286
n_units: 512, n_layers: 4, class_rate: 0.757996632996633
0.7592592592592593


In [7]:
# The best score was achieved with n_units=256 and n_layers=4, by a small margin
# Let's then also check if adding even more layers helps
param_grid = {
    'n_units': [256],
    'n_layers': [5, 6]
}

scores = []
for n_units in param_grid['n_units']:
    for n_layers in param_grid['n_layers']:
        class_rate = fit_model(n_units, n_layers)
        print(f"n_units: {n_units}, n_layers: {n_layers}, class_rate: {class_rate}")
        scores.append(class_rate)
print(max(scores))

n_units: 256, n_layers: 5, class_rate: 0.757996632996633
n_units: 256, n_layers: 6, class_rate: 0.757996632996633
0.757996632996633


In [8]:
# Seems like n_units=256 and n_layers=4 is still the best, but the difference is small
# Let's then use this model to predict the test data
model_fin = keras.Sequential()
model_fin.add(layers.Dense(256, activation="relu", input_shape=[X_train.shape[1]]))
model_fin.add(layers.Dropout(0.3))
model_fin.add(layers.BatchNormalization())
for _ in range(4):
    model_fin.add(layers.Dense(256, activation="relu"))
    model_fin.add(layers.Dropout(0.3))
    model_fin.add(layers.BatchNormalization())
model_fin.add(layers.Dense(3, activation="softmax"))

model_fin.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
early_stopping = keras.callbacks.EarlyStopping(
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)

history = model_fin.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=1000,
    callbacks=[early_stopping],
    verbose=0
)

y_pred = model_fin.predict(X_val)
y_pred = np.round(y_pred)
y_pred_class = np.argmax(y_pred, axis=1)
class_rate = np.mean(y_pred_class == y_val.values.ravel())
print(f"Classification rate: {class_rate}")

Classification rate: 0.7587542087542087


In [None]:
# Load test data
X_test = pd.read_csv('../prep_data/X_test.csv')

# Prepare submission
output = pd.DataFrame(X_test["id"])
X_test.drop(columns=["id"], inplace=True)

y_test = model_fin.predict(X_test)
y_test = np.round(y_test)
y_test = np.argmax(y_test, axis=1)
output["status_group"] = y_test
# Map to right strings again
output["status_group"] = output["status_group"].map({0: "non functional", 1: "functional needs repair", 2: "functional"})
output.head()



Unnamed: 0,id,status_group
0,50785,non functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional


In [None]:
# Save to csv
output.to_csv('../submissions/submission_deep_tf.csv', index=False)

### Final note:

After submission, the resulting score was 0.7568. This is quite close to the score we got on the validation data here. It is also slightly worse than the XGBoost model.