### This notebook contains the code to generate the submission for the "Pump it Up: Data Mining the Water Table" competition.

We use the preprocessed training data and corresponding values, as well as test data. We need to predict the ordinal variable 'status_group', with values 0, 1, 2. The error metric used in the competition is the classification rate (fraction of predictions that are correct).

In this script we train a pytorch deep learning model. We check if it performs the same as the tensorflow model, for its best parameters of 256 units and 4 layers.

In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

In [None]:
X_train = pd.read_csv('../prep_data/X_train.csv')
y_train = pd.read_csv('../prep_data/y_train.csv')
X_val = pd.read_csv('../prep_data/X_val.csv')
y_val = pd.read_csv('../prep_data/y_val.csv')

# Prepare data for PyTorch
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long).squeeze()  # Ensure it's 1D
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.long).squeeze()  # Ensure it's 1D

# Create DataLoader instances
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [3]:
# Define the model
class MyModel(nn.Module):
    def __init__(self, input_dim, n_units, n_layers, output_dim=3):
        super(MyModel, self).__init__()

        # First layer
        self.layers = nn.Sequential(
            nn.Linear(input_dim, n_units),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.BatchNorm1d(n_units)
        )

        # Additional layers
        for _ in range(n_layers - 1):
            self.layers.append(nn.Linear(n_units, n_units))
            self.layers.append(nn.ReLU())
            self.layers.append(nn.Dropout(0.3))
            self.layers.append(nn.BatchNorm1d(n_units))

        # Output layer
        self.output_layer = nn.Linear(n_units, output_dim)

    def forward(self, x):
        x = self.layers(x)
        x = self.output_layer(x)
        return nn.functional.softmax(x, dim=1)

input_dim = X_train.shape[1]
output_dim = 3  # 3 classes, one-hot encoded 0,1,2
n_units = 256
n_layers = 4
model = MyModel(input_dim, n_units, n_layers, output_dim)
model

MyModel(
  (layers): Sequential(
    (0): Linear(in_features=194, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): Linear(in_features=256, out_features=256, bias=True)
    (5): ReLU()
    (6): Dropout(p=0.3, inplace=False)
    (7): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): Linear(in_features=256, out_features=256, bias=True)
    (9): ReLU()
    (10): Dropout(p=0.3, inplace=False)
    (11): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): Linear(in_features=256, out_features=256, bias=True)
    (13): ReLU()
    (14): Dropout(p=0.3, inplace=False)
    (15): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (output_layer): Linear(in_features=256, out_features=3, bias=True)
)

In [4]:
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Early stopping
patience = 10
min_delta = 0.001
best_loss = float('inf')
patience_counter = 0

# Training loop
for epoch in range(1000):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Validation phase
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

    val_loss /= len(val_loader)

    # Early stopping logic
    if val_loss < best_loss - min_delta:
        best_loss = val_loss
        patience_counter = 0
        best_model_state = model.state_dict()
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print("Early stopping triggered")
        break

# Load the best model state
model.load_state_dict(best_model_state)

Early stopping triggered


<All keys matched successfully>

In [10]:
# Function to predict on validation data
def predict(model, data_loader):
    model.eval()  # Set the model to evaluation mode
    all_predictions = []

    with torch.no_grad():
        for batch in data_loader:
            # Check whether we have validation data with labels or unlabeled test data
            if isinstance(batch, tuple) or isinstance(batch, list):
                # If the batch is a tuple or list, the first element is inputs, second labels
                inputs = batch[0]
            else:
                # If the batch is not a tuple or list, it's just the inputs
                inputs = batch

            outputs = model(inputs)
            predictions = torch.argmax(outputs, dim=1)  # Get predicted class indices
            all_predictions.extend(predictions.cpu().numpy().tolist())  # Convert to list of integers

    return all_predictions

y_pred = predict(model, val_loader)
class_rate = np.mean(y_pred == y_val.values.ravel())
print(f"Classification rate: {class_rate}")

Classification rate: 0.7237373737373738


In [None]:
# Load test data
X_test = pd.read_csv('../prep_data/X_test.csv')

# Prepare submission
output = pd.DataFrame(X_test["id"])
X_test.drop(columns=["id"], inplace=True)

# Prepare data for PyTorch
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
test_dataset = TensorDataset(X_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

y_test = predict(model, test_loader)
output["status_group"] = y_test
# Map to right strings again
output["status_group"] = output["status_group"].map({0: "non functional", 1: "functional needs repair", 2: "functional"})
output.head()

Unnamed: 0,id,status_group
0,50785,non functional
1,51630,functional
2,17168,non functional
3,45559,non functional
4,49871,functional


In [None]:
# Save to csv
output.to_csv('../submissions/submission_deep_pt.csv', index=False)

### Final note:

After submission, the resulting score was  0.7184. This is fairly close to the score we got on the validation data here. It is a bit worse than the tensorflow model with the same hyperparameters though (and much worse than XGBoost). But it could of course be possible that pytorch performs better with a different set of hyperparameters, although we won't be testing that here.