In [1]:
# Base on wandb documentation: Scikit-learn integration
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

from sklearn.exceptions import ConvergenceWarning
import warnings
import pickle
import wandb
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix, roc_curve, mean_squared_error

warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [2]:
def load_and_preprocess_data():
    data = 'CaseStudy_training_data.xlsx'
    df = pd.read_excel(data, sheet_name='Model_data')
    
    # Handle missing values and duplicates
    df_cleaned = df.dropna()
    df_cleaned = df_cleaned.drop_duplicates()
    df_cleaned = df_cleaned.drop(['ID'], axis=1)
    df_cleaned = df_cleaned.rename(columns={'rx ds': 'rx_ds'})
    
    # Feature Engineering
    df_features = df_cleaned.copy()
    df_features['rx_ds_bucket'] = pd.qcut(df_cleaned['rx_ds'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
    
    # One-Hot Encoding
    df_one_hot = pd.get_dummies(df_features['rx_ds_bucket'], prefix='rx_ds_bucket')
    df_features = pd.concat([df_features, df_one_hot], axis=1)
    df_features.drop(['rx_ds_bucket'], axis=1, inplace=True)
    
    # Splitting the data
    X = df_features.drop(['OD', 'rx_ds'], axis=1)
    y = df_features['OD']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    return X_train, X_test, y_train, y_test

def train_decision_tree(X_train, y_train, X_test, hyperparameters):
    model = DecisionTreeClassifier(**hyperparameters)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model, y_pred

def calculate_performance_metrics(y_test, y_pred, model, X_test):
    metrics = {}
    metrics["accuracy"] = accuracy_score(y_test, y_pred)
    metrics["precision"] = precision_score(y_test, y_pred)
    metrics["recall"] = recall_score(y_test, y_pred)
    metrics["f1"] = f1_score(y_test, y_pred)
    metrics["roc_auc"] = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    TN, FP, FN, TP = conf_matrix.ravel()
    
    metrics["ppv"] = TP / (TP + FP)
    metrics["npv"] = TN / (TN + FN)
    metrics["specificity"] = TN / (TN + FP)
    
    return metrics

def log_to_wandb(metrics, model, X_train, X_test, y_train, y_test):
    # Log metrics
    wandb.log(metrics)
    
    # Save and log the model
    os.makedirs('models', exist_ok=True)
    model_path = "models/tree_model.pkl"
    with open(model_path, "wb") as f:
        pickle.dump(model, f)
    
    artifact = wandb.Artifact("tree_model", type="model")
    artifact.add_file(model_path)
    wandb.log_artifact(artifact)
    
    # Save and log the data
    os.makedirs('data', exist_ok=True)
    datasets = {"training_data": X_train, "training_labels": y_train, "test_data": X_test, "test_labels": y_test}
    
    for name, df in datasets.items():
        df.to_csv(f'data/{name}.csv', index=False)
    
    artifact = wandb.Artifact('train_val_sets', type='dataset', metadata={"Source": "CaseStudy_training_data.xlsx"})
    artifact.add_dir('data')
    wandb.log_artifact(artifact)
    
    # Plot plots to Weights & Biases
    label_names = ["Not-OD", "OD"]
    y_pred_proba = model.predict_proba(X_test)
    
    wandb.sklearn.plot_class_proportions(y_train, y_test, label_names)
    wandb.sklearn.plot_summary_metrics(model, X_train, y_train, X_test, y_test)
    wandb.sklearn.plot_roc(y_test, y_pred_proba, labels=label_names)
    wandb.sklearn.plot_precision_recall(y_test, y_pred_proba, labels=label_names)
    wandb.sklearn.plot_confusion_matrix(y_test, y_pred_proba.argmax(axis=1), labels=label_names)


In [3]:
# Random Search Hyperparameters for Decision Tree
sweep_config = {
    'method': 'random',
    'metric': {
      'name': 'recall',  # you might want to adjust this based on your specific use case
      'goal': 'maximize'
    },
    'parameters': {
        'criterion': {
            'values': ['gini', 'entropy']
        },
        'splitter': {
            'values': ['best', 'random']
        },
        'max_depth': {
            'values': [None, 10, 20, 30, 40, 50]  # you might want to adjust these values
        },
        'min_samples_split': {
            'values': [2, 5, 10]  # you might want to adjust these values
        },
        'min_samples_leaf': {
            'values': [1, 2, 4]  # you might want to adjust these values
        },
        'max_features': {
            'values': [None, 'sqrt', 'log2']  # you might want to adjust these values
        },
        'max_leaf_nodes': {
            'values': [None, 10, 20, 30, 40, 50]  # you might want to adjust these values
        },
        'min_impurity_decrease': {
            'values': [0.0, 0.1, 0.2]  # you might want to adjust these values
        },
        'min_weight_fraction_leaf': {
            'values': [0.0, 0.1, 0.2]  # you might want to adjust these values
        },
        'class_weight': {
            'values': ['balanced', None]
        },
        'ccp_alpha': {
            'values': [0.0, 0.1, 0.2]  # you might want to adjust these values
        }
    }
}


In [6]:
def sweep():
    # Initialize wandb
    run = wandb.init()
    
    # Load and preprocess data
    X_train, X_test, y_train, y_test = load_and_preprocess_data()
    
    # Get hyperparameters from wandb
    hyperparameters = run.config
    
    # Train the logistic regression model
    model, y_pred = train_decision_tree(
        X_train, y_train, X_test, hyperparameters)
    
    # Calculate performance metrics
    metrics = calculate_performance_metrics(y_test, y_pred, model, X_test)
    
    # Log to wandb
    log_to_wandb(metrics, model, X_train, X_test, y_train, y_test)
    
    # Finish the wandb run
    run.finish()


In [7]:
os.environ['WANDB_NOTEBOOK_NAME'] = '06_tree_sweep.ipynb' # Failed to detect the name of this notebook...

sweep_id = wandb.sweep(sweep=sweep_config, project='tree-sweep')
wandb.agent(sweep_id, function=sweep, count=5)  # Adjust 'count' as needed


Create sweep with ID: tknx65iv
Sweep URL: https://wandb.ai/idiazl/tree-sweep/sweeps/tknx65iv


[34m[1mwandb[0m: Agent Starting Run: oe3rsuwh with config:
[34m[1mwandb[0m: 	ccp_alpha: 0.1
[34m[1mwandb[0m: 	class_weight: None
[34m[1mwandb[0m: 	criterion: entropy
[34m[1mwandb[0m: 	max_depth: 40
[34m[1mwandb[0m: 	max_features: None
[34m[1mwandb[0m: 	max_leaf_nodes: 50
[34m[1mwandb[0m: 	min_impurity_decrease: 0
[34m[1mwandb[0m: 	min_samples_leaf: 2
[34m[1mwandb[0m: 	min_samples_split: 5
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0.2
[34m[1mwandb[0m: 	splitter: random


  _warn_prf(average, modifier, msg_start, len(result))
  metrics["ppv"] = TP / (TP + FP)
[34m[1mwandb[0m: Adding directory to artifact (./data)... Done. 0.0s
  _warn_prf(average, modifier, msg_start, len(result))


VBox(children=(Label(value='0.021 MB of 0.021 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
f1,▁
npv,▁
precision,▁
recall,▁
roc_auc,▁
specificity,▁

0,1
accuracy,0.705
f1,0.0
npv,0.705
ppv,
precision,0.0
recall,0.0
roc_auc,0.71589
specificity,1.0


[34m[1mwandb[0m: Agent Starting Run: mggmjjz0 with config:
[34m[1mwandb[0m: 	ccp_alpha: 0
[34m[1mwandb[0m: 	class_weight: None
[34m[1mwandb[0m: 	criterion: entropy
[34m[1mwandb[0m: 	max_depth: 20
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	max_leaf_nodes: 30
[34m[1mwandb[0m: 	min_impurity_decrease: 0
[34m[1mwandb[0m: 	min_samples_leaf: 2
[34m[1mwandb[0m: 	min_samples_split: 5
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0.2
[34m[1mwandb[0m: 	splitter: random


  _warn_prf(average, modifier, msg_start, len(result))
  metrics["ppv"] = TP / (TP + FP)
[34m[1mwandb[0m: Adding directory to artifact (./data)... Done. 0.0s
  _warn_prf(average, modifier, msg_start, len(result))


VBox(children=(Label(value='0.022 MB of 0.022 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
f1,▁
npv,▁
precision,▁
recall,▁
roc_auc,▁
specificity,▁

0,1
accuracy,0.705
f1,0.0
npv,0.705
ppv,
precision,0.0
recall,0.0
roc_auc,0.54934
specificity,1.0


[34m[1mwandb[0m: Agent Starting Run: ibxx6hzv with config:
[34m[1mwandb[0m: 	ccp_alpha: 0.2
[34m[1mwandb[0m: 	class_weight: None
[34m[1mwandb[0m: 	criterion: gini
[34m[1mwandb[0m: 	max_depth: 50
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	max_leaf_nodes: 40
[34m[1mwandb[0m: 	min_impurity_decrease: 0
[34m[1mwandb[0m: 	min_samples_leaf: 1
[34m[1mwandb[0m: 	min_samples_split: 10
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0.1
[34m[1mwandb[0m: 	splitter: random


  _warn_prf(average, modifier, msg_start, len(result))
  metrics["ppv"] = TP / (TP + FP)
[34m[1mwandb[0m: Adding directory to artifact (./data)... Done. 0.0s
  _warn_prf(average, modifier, msg_start, len(result))


VBox(children=(Label(value='0.021 MB of 0.021 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
f1,▁
npv,▁
precision,▁
recall,▁
roc_auc,▁
specificity,▁

0,1
accuracy,0.705
f1,0.0
npv,0.705
ppv,
precision,0.0
recall,0.0
roc_auc,0.5
specificity,1.0


[34m[1mwandb[0m: Agent Starting Run: 6oo7n81o with config:
[34m[1mwandb[0m: 	ccp_alpha: 0.1
[34m[1mwandb[0m: 	class_weight: None
[34m[1mwandb[0m: 	criterion: gini
[34m[1mwandb[0m: 	max_depth: None
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	max_leaf_nodes: 50
[34m[1mwandb[0m: 	min_impurity_decrease: 0.1
[34m[1mwandb[0m: 	min_samples_leaf: 1
[34m[1mwandb[0m: 	min_samples_split: 10
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0.1
[34m[1mwandb[0m: 	splitter: best


  _warn_prf(average, modifier, msg_start, len(result))
  metrics["ppv"] = TP / (TP + FP)
[34m[1mwandb[0m: Adding directory to artifact (./data)... Done. 0.0s
  _warn_prf(average, modifier, msg_start, len(result))


VBox(children=(Label(value='0.021 MB of 0.021 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
f1,▁
npv,▁
precision,▁
recall,▁
roc_auc,▁
specificity,▁

0,1
accuracy,0.705
f1,0.0
npv,0.705
ppv,
precision,0.0
recall,0.0
roc_auc,0.5
specificity,1.0


[34m[1mwandb[0m: Agent Starting Run: qwksee6j with config:
[34m[1mwandb[0m: 	ccp_alpha: 0.1
[34m[1mwandb[0m: 	class_weight: None
[34m[1mwandb[0m: 	criterion: entropy
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	max_leaf_nodes: 50
[34m[1mwandb[0m: 	min_impurity_decrease: 0
[34m[1mwandb[0m: 	min_samples_leaf: 2
[34m[1mwandb[0m: 	min_samples_split: 10
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0
[34m[1mwandb[0m: 	splitter: random


  _warn_prf(average, modifier, msg_start, len(result))
  metrics["ppv"] = TP / (TP + FP)
[34m[1mwandb[0m: Adding directory to artifact (./data)... Done. 0.0s
  _warn_prf(average, modifier, msg_start, len(result))


VBox(children=(Label(value='0.021 MB of 0.021 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
f1,▁
npv,▁
precision,▁
recall,▁
roc_auc,▁
specificity,▁

0,1
accuracy,0.705
f1,0.0
npv,0.705
ppv,
precision,0.0
recall,0.0
roc_auc,0.5
specificity,1.0
