In [1]:
%pip install nbformat
import nbformat

# Read your notebook (assuming version 4 for example purposes)
nb = nbformat.read("ml_project.ipynb", as_version=4)

# Normalize the notebook to add missing id fields and other updates
nbformat.validator.validate(nb)

# Write the normalized notebook back to a file
nbformat.write(nb, "ml_project_normalized.ipynb")

%run ml_project_normalized.ipynb

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
dropped columns
cleaned intake time
cleaned intake condition
cleaned age and sex
cleaned breed
cleaned color
dropped columns


  dt_series = pd.to_datetime(df['intake_time'], errors='coerce')


cleaned intake time
cleaned intake condition
cleaned age and sex
cleaned breed
cleaned color
Done running ml_project.ipynb.


In [2]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.metrics import make_scorer, balanced_accuracy_score

def train_sk_classifier(X_train, y_train, X_test, random_state=42):
  """
  Tunes and trains a scikit-learn MLP inside a preprocessing pipeline.

  The pipeline applies `apply_freq_encode` (via FunctionTransformer), then
  one-hot encodes the result, then fits an MLPClassifier. Hyperparameters are
  chosen with a randomized search scored by balanced accuracy.

  Parameters:
      X_train (pd.DataFrame): Training features.
      y_train (pd.Series or np.array): Training target values.
      X_test (pd.DataFrame): Test features.
      random_state (int): Seed used for both the MLP and the randomized
          search, so repeated runs sample the same candidates.

  Returns:
      best_estimator: Best (refitted) pipeline from RandomizedSearchCV.
      test_predictions: Predicted labels for X_test.

  Side effects:
      Prints search results and writes the predictions to
      'neural_net_preds.csv' (overwritten on every call).
  """

  # Construct pipeline:
  pipeline = Pipeline([
    ('freq', FunctionTransformer(apply_freq_encode, validate=False)),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ('mlp', MLPClassifier(max_iter=300, random_state=random_state))
  ])

  # Search space for MLPClassifier.
  # The learning-rate *schedule* ('mlp__learning_rate') is only used by the
  # 'sgd' solver, so with 'adam' it produced duplicate candidates. The initial
  # step size is what adam actually uses, and these values match the ones
  # searched by the PyTorch implementation in this notebook.
  param_distributions = {
    'mlp__hidden_layer_sizes': [(50,), (100,), (100, 50)],
    'mlp__activation': ['relu', 'tanh'],
    'mlp__solver': ['adam'],
    'mlp__alpha': [0.0001, 0.001],
    'mlp__learning_rate_init': [0.001, 0.01]
  }

  # Balanced accuracy scorer
  balanced_acc = make_scorer(balanced_accuracy_score)

  randomized_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    scoring=balanced_acc,
    verbose=3,
    n_jobs=-1,
    random_state=random_state  # without this the sampled candidates change every run
  )

  randomized_search.fit(X_train, y_train)

  print('Best parameters:', randomized_search.best_params_)
  print('Best cross-validation balanced accuracy:', randomized_search.best_score_)

  # NOTE(review): this re-scores the selected pipeline on the same training
  # data that was used to select it, so the number is likely optimistic as a
  # generalization estimate — confirm whether a separate hold-out is wanted.
  cv_scores = cross_val_score(
    randomized_search.best_estimator_,
    X_train, y_train,
    cv=5,
    scoring=balanced_acc,
    verbose=3
  )
  print('Generalization balanced accuracy (via cross_val_score):', cv_scores.mean())

  # predict() delegates to the refitted best estimator.
  test_predictions = randomized_search.predict(X_test)

  # Save results to CSV (ids are 1-based positions within X_test)
  df_preds = pd.DataFrame({
    'Id': range(1, len(test_predictions) + 1),
    'Outcome Type': test_predictions
  })
  df_preds.to_csv('neural_net_preds.csv', index=False)
  print('Predictions saved to neural_net_preds.csv')


  return randomized_search.best_estimator_, test_predictions

In [3]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import ParameterSampler
from sklearn.metrics import balanced_accuracy_score
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import pandas as pd

class SimpleMLP(nn.Module):
  """
  Fully connected network: Linear -> activation for each hidden width,
  followed by a final Linear layer that emits `output_dim` raw scores.

  Parameters:
      input_dim (int): Number of input features.
      hidden_layers (iterable of int): Width of each hidden layer, in order.
      output_dim (int): Number of outputs of the last layer.
      activation (str): 'relu' selects nn.ReLU; any other value selects nn.Tanh.
  """

  def __init__(self, input_dim, hidden_layers, output_dim, activation='relu'):
    super().__init__()
    # Only the exact string 'relu' picks ReLU; everything else falls back to tanh.
    activation_cls = nn.ReLU if activation == 'relu' else nn.Tanh

    # widths[i] -> widths[i + 1] describes every hidden Linear layer.
    widths = [input_dim, *hidden_layers]
    stack = []
    for fan_in, fan_out in zip(widths[:-1], widths[1:]):
      stack += [nn.Linear(fan_in, fan_out), activation_cls()]

    # Output layer has no activation.
    stack.append(nn.Linear(widths[-1], output_dim))
    self.model = nn.Sequential(*stack)

  def forward(self, x):
    """Run `x` through the layer stack and return the final layer's output."""
    scores = self.model(x)
    return scores

def _fit_mlp(X_tensor, y_tensor, input_dim, output_dim, params, n_epochs, batch_size=64):
  """Train one SimpleMLP with Adam and cross-entropy loss; return it in eval mode."""
  model = SimpleMLP(input_dim, params['hidden_layer_sizes'], output_dim, params['activation'])
  # 'alpha' is applied as Adam's weight_decay.
  optimizer = torch.optim.Adam(model.parameters(), lr=params['learning_rate'], weight_decay=params['alpha'])
  loss_fn = nn.CrossEntropyLoss()
  loader = DataLoader(TensorDataset(X_tensor, y_tensor), batch_size=batch_size, shuffle=True)

  model.train()
  for _ in range(n_epochs):
    for xb, yb in loader:
      optimizer.zero_grad()
      loss = loss_fn(model(xb), yb)
      loss.backward()
      optimizer.step()

  model.eval()
  return model


def train_classifier(X_train, y_train, X_test, val_fraction=0.2, n_epochs=20, random_state=42):
  """
  Trains a PyTorch MLP using frequency + one-hot encoded features.

  Hyperparameters are chosen by a small random search. Each candidate is
  trained on part of the training rows and scored (balanced accuracy) on a
  held-out part it never saw; the winning configuration is then refitted on
  all training rows before predicting on the test set. Scoring candidates on
  the rows they were trained on would favour whichever network overfits most.

  Parameters:
      X_train (pd.DataFrame): Training features.
      y_train (np.ndarray): Encoded target labels (integer class indices).
      X_test (pd.DataFrame): Test features.
      val_fraction (float): Share of training rows held out for model
          selection. If it yields no rows (or all rows), candidates are
          scored on the training rows instead and no refit is done.
      n_epochs (int): Training epochs per fitted network.
      random_state (int): Seed for the hold-out split, the parameter sampler
          and torch's global RNG (weight init and batch shuffling).

  Returns:
      best_model: Trained PyTorch model (in eval mode).
      test_predictions: Model predictions (encoded labels) on test data.

  Side effects:
      Calls torch.manual_seed, prints the selection result, and writes the
      encoded predictions to 'neural_net_preds.csv' (overwritten on every call).
  """
  # Accept a pandas Series as well as an ndarray for the labels.
  y_train = np.asarray(y_train)
  torch.manual_seed(random_state)

  # NOTE(review): apply_freq_encode is defined outside this cell and is applied
  # to train and test independently — confirm that is the intended behaviour.
  X_train_freq = apply_freq_encode(X_train)
  X_test_freq = apply_freq_encode(X_test)

  # One Hot Encoding (categories learned from the training data only)
  encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
  X_train_encoded = encoder.fit_transform(X_train_freq)
  X_test_encoded  = encoder.transform(X_test_freq)

  input_dim  = X_train_encoded.shape[1]
  output_dim = len(np.unique(y_train))

  # Using PyTorch tensors
  X_train_tensor = torch.tensor(X_train_encoded, dtype=torch.float32)
  y_train_tensor = torch.tensor(y_train, dtype=torch.long)
  X_test_tensor  = torch.tensor(X_test_encoded, dtype=torch.float32)

  # Seeded hold-out split used only for model selection.
  n_samples = len(y_train)
  order = np.random.default_rng(random_state).permutation(n_samples)
  n_val = int(n_samples * val_fraction)
  use_holdout = 0 < n_val < n_samples
  if use_holdout:
    val_idx, fit_idx = order[:n_val], order[n_val:]
  else:
    # Not enough rows to split: fall back to scoring on the training rows.
    val_idx = fit_idx = order

  fit_rows = torch.as_tensor(fit_idx, dtype=torch.long)
  val_rows = torch.as_tensor(val_idx, dtype=torch.long)
  X_fit, y_fit = X_train_tensor[fit_rows], y_train_tensor[fit_rows]
  X_val, y_val = X_train_tensor[val_rows], y_train[val_idx]

  # Train with random search
  param_grid = {
    'hidden_layer_sizes': [[50], [100], [100, 50]],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001],
    'learning_rate': [0.01, 0.001],
  }

  param_list = list(ParameterSampler(param_grid, n_iter=10, random_state=random_state))

  best_score = -1
  best_params = None
  best_model = None

  for params in param_list:
    candidate = _fit_mlp(X_fit, y_fit, input_dim, output_dim, params, n_epochs)

    # Evaluate the candidate on the selection rows
    with torch.no_grad():
      val_pred = candidate(X_val).argmax(dim=1).numpy()
    score = balanced_accuracy_score(y_val, val_pred)

    if score > best_score:
      best_score = score
      best_params = params
      best_model = candidate

  print("Best parameters:", best_params)
  print("Best validation balanced accuracy:", best_score)

  # Refit the winning configuration on every training row so the final model
  # also learns from the rows that were held out during selection.
  if use_holdout:
    best_model = _fit_mlp(X_train_tensor, y_train_tensor, input_dim, output_dim, best_params, n_epochs)

  # predict and save
  best_model.eval()
  with torch.no_grad():
    test_predictions = best_model(X_test_tensor).argmax(dim=1).numpy()

  df_preds = pd.DataFrame({
    'Id': range(1, len(test_predictions) + 1),
    'Outcome Type': test_predictions
  })
  df_preds.to_csv('neural_net_preds.csv', index=False)
  print('Predictions saved to neural_net_preds.csv')

  return best_model, test_predictions


In [4]:
from sklearn.preprocessing import LabelEncoder
def _split_species(species):
  """
  Slice df_train / df_test down to one `animal_type` value.

  Returns (train_rows, X_train, y_train, test_rows, X_test), where the feature
  frames have 'animal_type' (and, for training, 'outcome_type') removed.
  """
  train_rows = df_train.loc[df_train['animal_type'] == species].copy()
  test_rows = df_test.loc[df_test['animal_type'] == species].copy()
  return (
    train_rows,
    train_rows.drop(columns=['animal_type', 'outcome_type']),
    train_rows['outcome_type'],
    test_rows,
    test_rows.drop(columns=['animal_type']),
  )


# One independent dataset (and, below, one model) per species.
train_dog, X_train_dog, y_train_dog, test_dog, X_test_dog = _split_species('Dog')
train_cat, X_train_cat, y_train_cat, test_cat, X_test_cat = _split_species('Cat')

# Each species gets its own LabelEncoder so predictions can be mapped back
# to that species' outcome labels afterwards.
le_dog, le_cat = LabelEncoder(), LabelEncoder()
y_train_dog_encoded = le_dog.fit_transform(y_train_dog)
y_train_cat_encoded = le_cat.fit_transform(y_train_cat)

print("Training model for Dog data:")
best_estimator_dog, dog_predictions_encoded = train_classifier(
  X_train_dog, y_train_dog_encoded, X_test_dog
)
dog_predictions = le_dog.inverse_transform(dog_predictions_encoded)

print("\nTraining model for Cat data:")
best_estimator_cat, cat_predictions_encoded = train_classifier(
  X_train_cat, y_train_cat_encoded, X_test_cat
)
cat_predictions = le_cat.inverse_transform(cat_predictions_encoded)

# Merge the per-species predictions (helper defined outside this cell).
combine_predictions(dog_predictions, cat_predictions, X_test_dog, X_test_cat)

Training model for Dog data:


KeyboardInterrupt: 