In [None]:
%pip install nbformat
%pip install imbalanced-learn

In [None]:
import nbformat

# Load the notebook and convert it to the version-4 in-memory representation.
nb = nbformat.read("ml_project.ipynb", as_version=4)

# normalize() is nbformat's dedicated API for repairs such as filling in
# missing cell `id` fields. It returns (number_of_changes, normalized_copy),
# so `nb` is rebound to the normalized notebook. validate() is only a checker;
# relying on it to repair a notebook emits warnings that recommend normalize().
n_changes, nb = nbformat.validator.normalize(nb)

# Raise if the normalized notebook still does not conform to the schema.
nbformat.validator.validate(nb)

# Save the normalized notebook under a new name; the input file is not modified.
nbformat.write(nb, "ml_project_normalized.ipynb")

In [None]:
from collections import Counter

from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

def train_classifier(X_train, y_train, X_test, rare_classes, categorical_features):
  """
  Tunes and fits a CatBoost classifier behind an oversampling step.

  The rare classes are oversampled up to the size of the largest class in
  ``y_train``. When categorical features are declared, SMOTENC is used instead
  of plain SMOTE: plain SMOTE interpolates every column numerically, which
  cannot handle string categories and turns integer-coded categories into
  fractional values that CatBoost does not accept as categorical features.
  Hyperparameters are chosen with a randomized search (10 candidates, 3-fold
  cross-validation, accuracy scoring).

  Parameters:
    X_train (pd.DataFrame): Training features.
    y_train (pd.Series or np.array): Training target values.
    X_test (pd.DataFrame): Test features.
    rare_classes (list): List of integer-encoded classes to be oversampled.
    categorical_features (list): List of column names or indices for categorical
      features. May be empty or None, in which case plain SMOTE is used.

  Returns:
    best_estimator: The best pipeline (sampler + classifier) found by
      RandomizedSearchCV, refit on the full training data.
    test_predictions: The predicted labels for X_test from the best estimator.
  """

  # Count labels once with Counter so that both pandas Series and plain numpy
  # arrays are supported (``.value_counts()`` exists only on pandas objects).
  majority_count = max(Counter(y_train).values())

  # Oversample only the rare classes, each up to the size of the largest class.
  # NOTE(review): the targets come from the full training set, so inside the
  # smaller cross-validation folds the rare classes end up larger than the
  # fold's own majority class - confirm this is acceptable for tuning.
  oversampling_targets = {cls: majority_count for cls in rare_classes}

  has_categoricals = categorical_features is not None and len(categorical_features) > 0
  if has_categoricals:
    # SMOTENC copies categorical values from neighbours instead of interpolating.
    # Passing column names (rather than indices) needs a recent imbalanced-learn.
    sampler = SMOTENC(
      categorical_features=categorical_features,
      sampling_strategy=oversampling_targets,
      random_state=42
    )
  else:
    sampler = SMOTE(sampling_strategy=oversampling_targets, random_state=42)

  # CatBoost classifier
  catboost_clf = CatBoostClassifier(
    verbose=0,
    cat_features=categorical_features,
    random_state=42
  )

  # imblearn's Pipeline applies the sampler during fit only, so validation
  # folds and X_test are never resampled. The step keeps its original name
  # 'smote' so existing lookups via named_steps continue to work.
  pipeline = Pipeline([
    ('smote', sampler),
    ('clf', catboost_clf)
  ])

  # Hyperparameter search space; the 'clf__' prefix routes each entry to the
  # CatBoost step of the pipeline.
  param_dist = {
    'clf__iterations': [100, 300, 500],
    'clf__depth': [4, 6, 8, 10],
    'clf__learning_rate': [0.01, 0.05, 0.1],
    'clf__l2_leaf_reg': [1, 3, 5, 7]
  }

  # Randomized search
  search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring='accuracy',
    verbose=1,
    random_state=42,
    n_jobs=-1
  )

  # Fit model
  search.fit(X_train, y_train)

  # Predict on the test set with the refit best estimator
  test_predictions = search.predict(X_test)

  return search.best_estimator_, test_predictions


In [None]:
# Separate the 'outcome_type' label column from the feature columns of each split.
X_train, y_train = df_train.drop(columns=['outcome_type']), df_train['outcome_type']
X_test, y_test = df_test.drop(columns=['outcome_type']), df_test['outcome_type']