In [1]:
import os
import pickle
import pandas as pd
import wandb
import wandb.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split


In [2]:
def load_and_preprocess_data(data_path='CaseStudy_training_data.xlsx', sheet_name='Model_data',
                             test_size=0.2, random_state=42):
    """Load the case-study sheet, engineer features and return a train/test split.

    Pipeline:
      1. Read ``sheet_name`` from the Excel workbook at ``data_path``.
      2. Drop rows with missing values, drop duplicate rows, remove the ``ID``
         column and rename ``'rx ds'`` to ``'rx_ds'``.
      3. Add ``rx_ds_bucket`` (quartiles of ``rx_ds``), ``binary_sum`` (row-wise
         sum of every column other than ``OD``, ``rx_ds`` and ``rx_ds_bucket``)
         and ``rx_ds_to_binary_sum`` (``rx_ds / binary_sum``).
      4. One-hot encode ``rx_ds_bucket`` and drop the original bucket column.
      5. Split features / ``OD`` target with ``train_test_split``.

    Args:
        data_path: Path of the Excel workbook to read.
        sheet_name: Name of the sheet holding the modelling data.
        test_size: Fraction of rows forwarded to ``train_test_split`` as the test set.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    df = pd.read_excel(data_path, sheet_name=sheet_name)

    # Handle missing values and duplicates
    df_cleaned = (
        df.dropna()
        .drop_duplicates()
        .drop(['ID'], axis=1)
        .rename(columns={'rx ds': 'rx_ds'})
    )

    # Feature Engineering
    df_features = df_cleaned.copy()
    df_features['rx_ds_bucket'] = pd.qcut(df_cleaned['rx_ds'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])

    binary_cols = [col for col in df_features.columns if col not in ['OD', 'rx_ds', 'rx_ds_bucket']]
    df_features['binary_sum'] = df_features[binary_cols].sum(axis=1)

    # A row whose summed columns are all zero would make the ratio inf (or NaN
    # for 0 / 0) and put non-finite values into the feature matrix. Mask the
    # zero denominators and report the ratio as 0.0 for those rows; every other
    # row keeps exactly rx_ds / binary_sum.
    nonzero_sum = df_features['binary_sum'].where(df_features['binary_sum'] != 0)
    df_features['rx_ds_to_binary_sum'] = (df_features['rx_ds'] / nonzero_sum).fillna(0.0)

    # One-Hot Encoding
    df_one_hot = pd.get_dummies(df_features['rx_ds_bucket'], prefix='rx_ds_bucket')
    df_features = pd.concat([df_features, df_one_hot], axis=1).drop(['rx_ds_bucket'], axis=1)

    # Splitting the data
    X = df_features.drop('OD', axis=1)
    y = df_features['OD']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

def train_logistic_regression(X_train, y_train, X_test, hyperparameters):
    """Fit a LogisticRegression and predict labels for the test features.

    Args:
        X_train: Training features passed to ``fit``.
        y_train: Training labels passed to ``fit``.
        X_test: Features passed to ``predict``.
        hyperparameters: Mapping unpacked as keyword arguments into
            ``LogisticRegression``.

    Returns:
        tuple: ``(fitted_model, predictions_for_X_test)``.
    """
    classifier = LogisticRegression(**hyperparameters)
    classifier.fit(X_train, y_train)
    return classifier, classifier.predict(X_test)

def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def calculate_performance_metrics(y_test, y_pred, model, X_test):
    """Compute classification metrics for the test split.

    Args:
        y_test: True labels.
        y_pred: Predicted labels for ``X_test``.
        model: Fitted classifier exposing ``predict_proba``; column 1 of its
            output is used as the score for ROC AUC.
        X_test: Test features passed to ``model.predict_proba``.

    Returns:
        dict: ``accuracy``, ``precision``, ``recall``, ``f1``, ``roc_auc``,
        ``ppv``, ``npv`` and ``specificity``. The three confusion-matrix
        ratios are 0.0 when their denominator is zero.
    """
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]),
    }

    # ravel() flattens the 2x2 matrix row by row: TN, FP, FN, TP.
    TN, FP, FN, TP = confusion_matrix(y_test, y_pred).ravel()

    # A model that never predicts one of the classes makes a denominator zero;
    # plain division would then yield NaN with a RuntimeWarning. Report 0.0
    # instead so "ppv" stays consistent with the "precision" value above.
    metrics["ppv"] = _safe_ratio(TP, TP + FP)
    metrics["npv"] = _safe_ratio(TN, TN + FN)
    metrics["specificity"] = _safe_ratio(TN, TN + FP)

    return metrics

def log_to_wandb(metrics, model, X_train, X_test, y_train, y_test):
    """Send metrics, the pickled model, the CSV splits and diagnostic plots to W&B.

    Writes ``models/log_model.pkl`` and ``data/<split>.csv`` relative to the
    current working directory and logs them as artifacts, then emits the
    ``wandb.sklearn`` plots for the fitted model.

    Args:
        metrics: Dict of metric name -> value passed to ``wandb.log``.
        model: Fitted classifier; pickled to disk and used for ``predict_proba``.
        X_train, X_test: Feature splits, written to CSV and used for plots.
        y_train, y_test: Label splits, written to CSV and used for plots.
    """
    # Metrics first
    wandb.log(metrics)

    # Pickle the model to disk, then attach that file as a "model" artifact
    os.makedirs('models', exist_ok=True)
    model_path = "models/log_model.pkl"
    with open(model_path, "wb") as model_file:
        pickle.dump(model, model_file)

    model_artifact = wandb.Artifact("log_model", type="model")
    model_artifact.add_file(model_path)
    wandb.log_artifact(model_artifact)

    # Write each split to data/<name>.csv, then attach the whole directory
    os.makedirs('data', exist_ok=True)
    named_splits = (
        ("training_data", X_train),
        ("training_labels", y_train),
        ("test_data", X_test),
        ("test_labels", y_test),
    )
    for name, split in named_splits:
        split.to_csv(f'data/{name}.csv', index=False)

    data_artifact = wandb.Artifact('train_val_sets', type='dataset', metadata={"Source": "CaseStudy_training_data.xlsx"})
    data_artifact.add_dir('data')
    wandb.log_artifact(data_artifact)

    # Diagnostic plots in Weights & Biases
    label_names = ["Not-OD", "OD"]
    y_pred_proba = model.predict_proba(X_test)
    # Column index with the highest probability, used as the predicted class
    predicted_classes = y_pred_proba.argmax(axis=1)

    wandb.sklearn.plot_class_proportions(y_train, y_test, label_names)
    wandb.sklearn.plot_summary_metrics(model, X_train, y_train, X_test, y_test)
    wandb.sklearn.plot_roc(y_test, y_pred_proba, labels=label_names)
    wandb.sklearn.plot_precision_recall(y_test, y_pred_proba, labels=label_names)
    wandb.sklearn.plot_confusion_matrix(y_test, predicted_classes, labels=label_names)


In [3]:
# Random Search Hyperparameters
# Assigns `sweep_config`; the keys under 'parameters' are the keyword arguments
# later unpacked into LogisticRegression, and 'recall' is the metric to maximize.
sweep_config = {
    'method': 'random',
    'metric': {'name': 'recall', 'goal': 'maximize'},
    'parameters': {
        'C': {'values': [0.1, 1, 10]},
        'max_iter': {'values': [100, 200, 300]},
        'penalty': {'values': ['l1', 'l2']},
        'solver': {'values': ['liblinear', 'saga']},
        'class_weight': {'values': ['balanced', None]},
    },
}


In [6]:
# Grid Search Hyperparameters
# Assigns `sweep_config`; the keys under 'parameters' are the keyword arguments
# later unpacked into LogisticRegression, and 'recall' is the metric to maximize.
sweep_config = {
    'method': 'grid',
    'metric': {'name': 'recall', 'goal': 'maximize'},
    'parameters': {
        'C': {'values': [0.1, 1, 10]},
        'max_iter': {'values': [100, 200, 300]},
        'penalty': {'values': ['l1', 'l2']},
        'solver': {'values': ['liblinear', 'saga']},
        'class_weight': {'values': ['balanced', None]},
    },
}


In [8]:
# Bayesian Search Hyperparameters
# Assigns `sweep_config`; C and max_iter are sampled from ranges here rather
# than picked from fixed value lists, and 'recall' is the metric to maximize.
sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'recall', 'goal': 'maximize'},
    'parameters': {
        'C': {'distribution': 'uniform', 'min': 0.1, 'max': 10},
        'max_iter': {'distribution': 'int_uniform', 'min': 100, 'max': 300},
        'penalty': {'values': ['l1', 'l2']},
        'solver': {'values': ['liblinear', 'saga']},
        'class_weight': {'values': ['balanced', None]},
    },
}


In [4]:
def sweep():
    """Run a single sweep trial: load data, train, evaluate and log to W&B.

    Takes no arguments so it can be passed as the ``function`` of
    ``wandb.agent``; the hyperparameters for the trial are read from the
    config of the run created by ``wandb.init()``.

    Raises:
        Any exception raised by the data, training, metric or logging steps is
        propagated after the run has been closed.
    """
    # Using the run as a context manager guarantees it is finished even when a
    # step below raises, instead of leaving the run open.
    with wandb.init() as run:
        # Load and preprocess data
        X_train, X_test, y_train, y_test = load_and_preprocess_data()

        # Get hyperparameters from wandb
        hyperparameters = run.config

        # Train the logistic regression model
        model, y_pred = train_logistic_regression(X_train, y_train, X_test, hyperparameters)

        # Calculate performance metrics
        metrics = calculate_performance_metrics(y_test, y_pred, model, X_test)

        # Log to wandb
        log_to_wandb(metrics, model, X_train, X_test, y_train, y_test)



In [9]:
# wandb failed to detect the name of this notebook, so it is set explicitly.
os.environ.update(WANDB_NOTEBOOK_NAME='03_wandb_log_sweep.ipynb')

# Register the sweep defined by `sweep_config`, then let an agent execute `sweep`.
sweep_id = wandb.sweep(
    sweep=sweep_config,
    project='wandb-sweep',
)
wandb.agent(
    sweep_id,
    function=sweep,
    count=1,  # Adjust 'count' as needed
)


Create sweep with ID: aao41419
Sweep URL: https://wandb.ai/dev_ml_ops/wandb-sweep/sweeps/aao41419


[34m[1mwandb[0m: Agent Starting Run: bxfijlij with config:
[34m[1mwandb[0m: 	C: 5.56533472706193
[34m[1mwandb[0m: 	class_weight: balanced
[34m[1mwandb[0m: 	max_iter: 225
[34m[1mwandb[0m: 	penalty: l1
[34m[1mwandb[0m: 	solver: saga


[34m[1mwandb[0m: Adding directory to artifact (./data)... Done. 0.0s


0,1
accuracy,▁
f1,▁
npv,▁
ppv,▁
precision,▁
recall,▁
roc_auc,▁
specificity,▁

0,1
accuracy,0.525
f1,0.44444
npv,0.76136
ppv,0.33929
precision,0.33929
recall,0.64407
roc_auc,0.59959
specificity,0.47518
