In [1]:
import nbformat

# Read your notebook (assuming version 4 for example purposes)
nb = nbformat.read("ml_project.ipynb", as_version=4)

# Normalize the notebook to add missing id fields and other updates
nbformat.validator.validate(nb)

# Write the normalized notebook back to a file
nbformat.write(nb, "ml_project_normalized.ipynb")

%run ml_project_normalized.ipynb

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


  from pandas.core.computation.check import NUMEXPR_INSTALLED


dropped columns
cleaned intake time
cleaned intake condition
cleaned age and sex
cleaned color
cleaned breed
dropped columns


  dt_series = pd.to_datetime(df['intake_time'], errors='coerce')


cleaned intake time
cleaned intake condition
cleaned age and sex
cleaned color
cleaned breed
Done running ml_project.ipynb.


In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight

def _resolve_categorical_indices(X_train, cat_cols):
  """Translate categorical column names into positional indices within X_train."""
  if cat_cols is None or len(cat_cols) == 0:
    return []
  if not hasattr(X_train, 'columns'):
    # A bare array has no column names, so the entries must already be positions
    return [int(col) for col in cat_cols]

  first_position = {}
  for position, name in enumerate(X_train.columns):
    first_position.setdefault(name, position)  # first match wins, like list.index

  missing = [col for col in cat_cols if col not in first_position]
  if missing:
    raise ValueError(f"cat_cols not present in X_train: {missing}")
  return [first_position[col] for col in cat_cols]


def train_XRT_classifier(X_train, y_train, X_test, cat_cols=None, target_samples_per_class=3000):
  """
  Trains an Extremely Randomized Trees model using class weighting and RandomizedSearchCV.

  The categorical columns listed in `cat_cols` are one-hot encoded inside the
  pipeline; every other column is passed to the model unchanged.

  Parameters:
    X_train (pd.DataFrame or np.ndarray): Training features. All columns not
      listed in `cat_cols` must already be numeric.
    y_train (array-like): Encoded training labels.
    X_test (pd.DataFrame or np.ndarray): Test features with the same columns,
      in the same order, as X_train.
    cat_cols (list or None): Categorical columns to one-hot encode. Column
      names when X_train is a DataFrame, positional indices when it is an
      array. None or an empty list means nothing is encoded.
    target_samples_per_class (int): Target number of samples per rare class (currently unused).

  Returns:
    best_estimator: Fitted pipeline (preprocessing + model) with the best parameters found.
    test_predictions: Predictions on the test set.

  Raises:
    ValueError: If a name in `cat_cols` is not a column of X_train.
  """
  # The ColumnTransformer below selects columns by position
  cat_indices = _resolve_categorical_indices(X_train, cat_cols)

  # Compute class weights for the imbalanced training set
  class_labels = np.unique(y_train)
  class_weights = compute_class_weight('balanced', classes=class_labels, y=y_train)
  class_weight_dict = dict(zip(class_labels, class_weights))
  print("Class weights:", class_weight_dict)

  # One-hot encode categorical features; categories unseen during fit are
  # encoded as all zeros instead of raising
  categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
  ])

  # remainder='passthrough' keeps the non-categorical features. The default
  # ('drop') would silently discard them and train on the categoricals alone.
  preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, cat_indices)],
    remainder='passthrough'
  )

  # Full pipeline: encode the features, then fit the Extra-Trees classifier
  pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xrt', ExtraTreesClassifier(class_weight=class_weight_dict, random_state=42, n_jobs=-1))
  ])

  # Hyperparameter search space (the 'xrt__' prefix targets the classifier step)
  param_dist = {
    'xrt__n_estimators': [100, 200, 300],
    'xrt__max_depth': [10, 20, None],
    'xrt__min_samples_split': [2, 5, 10],
    'xrt__min_samples_leaf': [1, 2, 4]
  }

  search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=5,
    cv=5,
    scoring='balanced_accuracy',
    verbose=2,
    random_state=42,
    n_jobs=2
  )

  search.fit(X_train, y_train)

  print('Best parameters:', search.best_params_)
  print('Best cross-validation accuracy:', search.best_score_)

  # Score with the same built-in metric as the search so the two numbers are
  # comparable and the function does not depend on a scorer object defined
  # outside of it.
  cv_scores = cross_val_score(search.best_estimator_, X_train, y_train, cv=5, verbose=3, scoring='balanced_accuracy')
  print('Generalization Balanced accuracy (via cross_val_score):', cv_scores.mean())

  # Final test prediction
  test_predictions = search.best_estimator_.predict(X_test)

  return search.best_estimator_, test_predictions


In [None]:
import pandas as pd 
from sklearn.preprocessing import LabelEncoder

# Split features from the target. Work on copies so the frames loaded earlier
# are never modified and this cell gives the same result when re-run.
X_train = df_train.drop(columns=['outcome_type']).copy()
y_train = df_train['outcome_type']
X_test = df_test.copy()

# The label encoder is only used for the outcome labels
le = LabelEncoder()
y_train = le.fit_transform(y_train)
print('Encoding mapping:', le.classes_)

# Report rare classes (fewer than 5% of the training rows). Informational
# only: rare_classes is not passed to the training function.
rare_classes = [
  label for label, count in pd.Series(y_train).value_counts().items()
  if count < 0.05 * len(y_train)
]
print("Rare classes:")
for cls in rare_classes:
  print(f"  {cls}: {le.classes_[cls]}")

# Categorical features, grouped by how they are encoded
cat_cols_onehot = ['intake_type', 'intake_condition', 'animal_type', 'sex_upon_intake']
cat_cols_freq   = ['primary_color']

# Frequency encode selected high-cardinality features
for col in cat_cols_freq:
  # Fit on train, apply same mapping to test
  freq_map = X_train[col].value_counts()
  X_train[col] = X_train[col].map(freq_map)
  # Unseen categories in test get frequency 0
  X_test[col]  = X_test[col].map(freq_map).fillna(0)

# The one-hot columns are left as-is: train_XRT_classifier encodes the
# columns named in cat_cols inside its own pipeline, so encoding them here
# as well would remove the columns it looks up.

# Align columns: same columns in the same order as X_train, because the
# pipeline selects the categorical columns by position
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Train model and get test predictions
best_model, test_predictions = train_XRT_classifier(
  X_train=X_train,
  y_train=y_train,
  X_test=X_test,
  cat_cols=cat_cols_onehot
)

# Decode integer predictions back to string labels
decoded_preds = le.inverse_transform(test_predictions)

# Save predictions using your utility function
save_predictions(decoded_preds, model_name='x_rand_trees')

Encoding mapping: ['Adoption' 'Died' 'Euthanasia' 'Return to Owner' 'Transfer']
Rare classes:
  2: Euthanasia
  1: Died
Class weights: {0: 0.40387689848121505, 1: 21.35542747358309, 2: 6.445636416352566, 3: 1.3394589383623547, 4: 0.6347361809045227}

[INFO] Starting training with 111155 samples
Fitting 5 folds for each of 5 candidates, totalling 25 fits


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core.computation.check import NUMEXPR_INSTALLED


KeyboardInterrupt: 