In [None]:
import nbformat

# Read your notebook (assuming version 4 for example purposes)
nb = nbformat.read("ml_project.ipynb", as_version=4)

# Normalize the notebook to add missing id fields and other updates
nbformat.validator.validate(nb)

# Write the normalized notebook back to a file
nbformat.write(nb, "ml_project_normalized.ipynb")

%run ml_project_normalized.ipynb

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier  # Import the Balanced Random Forest
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split
from sklearn.metrics import make_scorer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.multiclass import OneVsRestClassifier
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt

In [None]:
def train_bal_RF_classifier(X_train, y_train, X_test, cat_cols, target_samples_per_class=3000,
                            num_cols=None, cv=None):
    """Tune a one-hot + BalancedRandomForestClassifier pipeline and predict on X_test.

    A RandomizedSearchCV (10 sampled candidates, scored on balanced accuracy)
    is fitted on the training data; the refitted best pipeline is then
    re-scored with cross_val_score and used to predict X_test.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features. Column positions are looked up from
        ``X_train.columns`` and handed to the ColumnTransformer as integer
        indices, so X_test is expected to use the same column order.
    y_train : array-like
        Training labels.
    X_test : pandas.DataFrame
        Features to predict with the best pipeline.
    cat_cols : list of str
        Columns of X_train to one-hot encode.
    target_samples_per_class : int, default 3000
        Not used by this function; kept so existing callers keep working.
    num_cols : list of str, optional
        Columns of X_train to pass to the forest unchanged. When omitted,
        only ``cat_cols`` reach the model: the ColumnTransformer drops every
        column that is not listed in one of its transformers.
    cv : int or cross-validation splitter, optional
        Cross-validation strategy for the search and for cross_val_score.
        When omitted, the notebook-level ``skf`` object is used.

    Returns
    -------
    tuple
        ``(best_estimator, test_predictions)`` where ``best_estimator`` is the
        refitted pipeline and ``test_predictions`` its predictions for X_test.

    Raises
    ------
    ValueError
        If a name in ``cat_cols`` or ``num_cols`` is not a column of X_train.
    NameError
        If ``cv`` is omitted and no global ``skf`` has been defined.
    """
    all_cols = X_train.columns.tolist()
    cat_indices = [all_cols.index(col) for col in cat_cols]

    # Fall back to the notebook-level splitter only when none is supplied.
    folds = skf if cv is None else cv

    # Define categorical transformer with one-hot encoding
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-Hot Encode categorical features
    ])

    # Combine transformations into a single preprocessor. Columns that are not
    # listed in any transformer are dropped by the ColumnTransformer.
    transformers = [('cat', categorical_transformer, cat_indices)]
    if num_cols:
        num_indices = [all_cols.index(col) for col in num_cols]
        transformers.append(('num', 'passthrough', num_indices))
    preprocessor = ColumnTransformer(transformers=transformers)

    # Create pipeline with the BalancedRandomForestClassifier.
    # The forest stays single-threaded because the search below already fans
    # out over all cores; n_jobs=-1 at both levels oversubscribes the CPUs.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('brf', BalancedRandomForestClassifier(random_state=42, n_jobs=1))  # Balanced Random Forest
    ])

    # Define hyperparameter search space.
    # 'sqrt' replaces the deprecated 'auto' alias, which meant sqrt(n_features)
    # for classifiers and is rejected by newer scikit-learn releases.
    param_dist = {
        'brf__criterion' : ['entropy', 'gini'],
        'brf__n_estimators': [100, 200, 300],
        'brf__max_depth': [10, 20, None],
        'brf__min_samples_split': [2, 5, 10],
        'brf__min_samples_leaf': [1, 2, 4],
        'brf__max_features' : [None, 2, 3, 5, 'sqrt', 'log2']
    }

    # Randomized Search with cross-validation
    search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_dist,
        n_iter=10,
        cv=folds,
        scoring='balanced_accuracy',
        verbose=2,
        random_state=42,
        n_jobs=-1
    )

    print(f"\n[INFO] Starting training with {len(X_train)} samples")
    search.fit(X_train, y_train)
    print("[INFO] Training complete.")

    print("Best parameters:", search.best_params_)
    print("Best CV balanced accuracy:", search.best_score_)

    # Optionally evaluate again with cross_val_score if needed.
    # Best-effort only: a failure here is reported but does not block the
    # final prediction.
    try:
        cv_scores = cross_val_score(
            search.best_estimator_,
            X_train,
            y_train,
            cv=folds,
            scoring='balanced_accuracy',
            verbose=3,
            n_jobs=-1  # parallelise over folds, since the forest itself is single-threaded
        )
        print("Generalization accuracy (via cross_val_score):", cv_scores.mean())
    except Exception as e:
        print(f"Cross-validation scoring failed: {e}")

    # Final test prediction
    test_predictions = search.best_estimator_.predict(X_test)

    # NOTE: feature importances are not exposed by the fitted Pipeline itself.
    # They live on its final step:
    #   search.best_estimator_.named_steps['brf'].feature_importances_
    # and describe the columns produced by the preprocessor
    # (search.best_estimator_.named_steps['preprocessor'].get_feature_names_out()),
    # not the raw X_train.columns.
    return search.best_estimator_, test_predictions

In [None]:
# Alternate feature set to experiment with; every step is applied to the
# training frame first and then to the test frame.
df_train, df_test = bucket_seasons(df_train), bucket_seasons(df_test)
df_train, df_test = bucket_days(df_train), bucket_days(df_test)

# Store breed as plain strings in both frames.
for frame in (df_train, df_test):
    frame['breed'] = frame['breed'].astype(str)

# Remove the columns that are not used by this feature set.
dropped_cols = ['size', 'intake_hour', 'intake_month']
df_train = df_train.drop(columns=dropped_cols)
df_test = df_test.drop(columns=dropped_cols)

In [None]:
# Categorical feature columns. 'primary_color' is currently included; remove
# it from this list to experiment with dropping colour.
cat_cols = [
    'intake_type',
    'intake_condition',
    'animal_type',
    'sex_upon_intake',
    'breed',
    'season',
    'is_mix',
    'time_of_day',
    'primary_color',
]

# Numerical feature columns.
num_cols = ['log_age', 'intake_year']
# freq_cols = ['primary_color'] # for trees

In [None]:
# Separate the features from the target column.
X = df_train.drop(columns=['outcome_type'])
y = df_train['outcome_type']

# Hold out 20% of the rows for evaluation. The split is stratified on the
# target so that both parts keep the same class proportions and every class
# appears in the held-out labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

In [None]:
# Encode the target variable.
# The encoder is fitted on the training labels only; the held-out labels are
# transformed with that same mapping. Re-fitting on y_test would build a
# second mapping that can assign different integers to the same class.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)
print('Encoding mapping:', le.classes_)

best_model, test_predictions = train_bal_RF_classifier(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    cat_cols=cat_cols
)


# Apply thresholds to the predicted probabilities
# thresholded_preds = apply_thresholds(y_val_proba, thresholds)

# Compare the encoded held-out labels with the encoded predictions.
# (Use le.inverse_transform(...) if the original label names are needed.)
classification_report_with_accuracy_score(y_test, test_predictions)
#print(importance)

Encoding mapping: ['Adoption' 'Died' 'Euthanasia' 'Return to Owner' 'Transfer']

[INFO] Starting training with 88924 samples
Fitting 5 folds for each of 10 candidates, totalling 50 fits


Exception ignored in: Could not import runpy module
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.8/runpy.py", line 19, in <module>
  File "/usr/lib/python3.8/runpy.py", line 185, in _run_module_as_main
Traceback (most recent call last):
  File "/usr/lib/python3.8/runpy.py", line 185, in _run_module_as_main
<function _releaseLock at 0x7f7414cae1f0>
Traceback (most recent call last):
  File "/usr/lib/python3.8/logging/__init__.py", line 227, in _releaseLock
    from pkgutil import read_code, get_importer
  File "/usr/lib/python3.8/pkgutil.py", line 137, in <module>
    mod_name, mod_spec, code = _get_module_details(mod_name, _Error)
  File "/usr/lib/python3.8/runpy.py", line 111, in _get_module_details
    mod_name, mod_spec, code = _get_module_details(mod_name, _Error)
  File "/usr/lib/python3.8/runpy.py", line 111, in _get_module_details
    __import__(pkg_name)
  File "/u/nneoma/.local/lib/python3.8/site-packages/joblib/__init__.py", l

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGINT(-2), SIGINT(-2), SIGINT(-2), SIGINT(-2), SIGINT(-2), EXIT(1), SIGINT(-2)}