In [1]:
# Based on the wandb documentation: Scikit-learn integration
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn import datasets, cluster
from sklearn.linear_model import LogisticRegression
from sklearn.exceptions import ConvergenceWarning
import warnings
import pickle
import wandb

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix, roc_curve, mean_squared_error

warnings.filterwarnings("ignore", category=ConvergenceWarning)

# import weave

In [2]:
# wandb reported "Failed to detect the name of this notebook", so the notebook
# file name is supplied explicitly through the WANDB_NOTEBOOK_NAME variable.
os.environ['WANDB_NOTEBOOK_NAME'] = '02_opiod_wandb.ipynb'

In [3]:
# Log this session in to Weights & Biases.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33midiazl[0m ([33mdev_ml_ops[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
# Read the 'Model_data' sheet of the case-study workbook into a DataFrame.
data = 'CaseStudy_training_data.xlsx'
df = pd.read_excel(data, sheet_name='Model_data')

3. Data cleaning

In [5]:
# Drop incomplete and duplicated rows, discard the 'ID' column and
# normalise the 'rx ds' column name to 'rx_ds' — all in one chain.
df_cleaned = (
    df.dropna()
    .drop_duplicates()
    .drop(columns=['ID'])
    .rename(columns={'rx ds': 'rx_ds'})
)

4. Feature Engineering

In [6]:
# Bucket 'rx_ds' into quartiles (Q1..Q4) and attach the result as a new
# 'rx_ds_bucket' column on a copy of the cleaned frame.
df_features = df_cleaned.assign(
    rx_ds_bucket=pd.qcut(df_cleaned['rx_ds'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
)

In [7]:
# Create a new feature that is the sum of all the binary features.
# The column list is taken from df_cleaned (not df_features) so that re-running
# this cell, or running it after the one-hot-encoding cell, cannot sweep
# previously derived columns (binary_sum, the ratio, bucket dummies) into the sum.
binary_cols = [col for col in df_cleaned.columns if col not in ('OD', 'rx_ds')]
df_features['binary_sum'] = df_features[binary_cols].sum(axis=1)

# Create a new feature that is the ratio of 'rx_ds' to the sum of binary features.
# NOTE(review): rows whose binary_sum is 0 produce inf/NaN here — confirm that no
# such rows exist, or decide how they should be imputed before modelling.
df_features['rx_ds_to_binary_sum'] = df_features['rx_ds'] / df_features['binary_sum']

In [8]:
# One-hot encode 'rx_ds_bucket': build the dummy columns, append them to the
# feature frame and drop the original categorical column.
df_one_hot = pd.get_dummies(df_features['rx_ds_bucket'], prefix='rx_ds_bucket')
df_features = pd.concat([df_features, df_one_hot], axis=1).drop(columns=['rx_ds_bucket'])

## Runs for model training

### 1. Classification - Logistic Regression

In [9]:
from sklearn.model_selection import train_test_split

# Work on a private copy of the engineered features for the classifier.
df_log = df_features.copy()

# Target is 'OD'; the raw 'rx_ds' value is excluded from the predictors.
y = df_log['OD']
X = df_log.drop(columns=['OD', 'rx_ds'])

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Defining a function to calculate the metrics for the classifier
def calculate_metrics(y_test, y_pred, model, X_test):
    """Compute evaluation metrics for a fitted binary classifier.

    Parameters
    ----------
    y_test : array-like
        True target values. Assumed to be encoded as 0 (negative) / 1 (positive).
    y_pred : array-like
        Hard class predictions for the same rows as ``y_test``.
    model : estimator
        Fitted classifier exposing ``predict_proba``; column 1 of its output is
        used as the positive-class score for ROC AUC.
    X_test : array-like
        Feature rows passed to ``model.predict_proba``.

    Returns
    -------
    dict
        Keys: accuracy, precision, recall, f1, roc_auc, ppv, npv, specificity.
        A ratio whose denominator is zero is reported as ``nan``.
    """
    def _ratio(numerator, denominator):
        # Return nan instead of triggering a divide-by-zero warning.
        return float(numerator) / float(denominator) if denominator else float("nan")

    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]),
    }

    # labels=[0, 1] pins the matrix to 2x2 even when only one class is present in
    # y_test/y_pred; otherwise ravel() would not unpack into four values.
    conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
    TN, FP, FN, TP = conf_matrix.ravel()

    metrics["ppv"] = _ratio(TP, TP + FP)
    metrics["npv"] = _ratio(TN, TN + FN)
    metrics["specificity"] = _ratio(TN, TN + FP)

    return metrics

In [21]:
# Hyperparameters for the logistic-regression classifier.
# (The variable keeps its original name because later cells reference it.)
ridge_params = {
    'penalty': 'l1',
    'solver': 'liblinear',
    'C': 5,
    'class_weight': 'balanced',
    'max_iter': 5000,
    'random_state': 42
}

# New wandb project and run.
# The hyperparameters are handed to wandb.init through `config=` so they are
# stored with the run; rebinding `wandb.config = ...` only replaces the module
# attribute and does not update the run's config.
run = wandb.init(
    project='wandb-sklearn',
    name="classifier_firday_nght",
    config=ridge_params,
)

VBox(children=(Label(value='0.001 MB of 0.005 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.217764…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112398822236476, max=1.0…

In [22]:
# Build the classifier from the configured hyperparameters, train it on the
# training split, then predict the held-out split.
log_model = LogisticRegression(**ridge_params).fit(X_train, y_train)
y_pred_logistic = log_model.predict(X_test)

### Logging metrics

In [23]:
# Score the held-out predictions and send the resulting metric dict to wandb.
y_pred = log_model.predict(X_test)
log_metrics = calculate_metrics(
    y_test=y_test,
    y_pred=y_pred,
    model=log_model,
    X_test=X_test,
)

wandb.log(log_metrics)

#### Logging artifacts

In [24]:
# Serialise the fitted classifier under ./models
os.makedirs('models', exist_ok=True)
with open("models/log_model.pkl", mode="wb") as model_file:
    pickle.dump(log_model, model_file)

# Attach the pickle to the active run as a versioned "model" artifact
artifact = wandb.Artifact(name="log_mode", type="model")
artifact.add_file(local_path="models/log_model.pkl")
wandb.log_artifact(artifact)

<Artifact log_mode>

In [17]:
# Save the data: one CSV per split, each holding the features plus the target.
# The dict and loop variables have their own names so that this cell no longer
# overwrites the raw `df` frame or shadows the imported `sklearn.datasets` module.
os.makedirs('data', exist_ok=True)
split_frames = {
    "training": X_train.join(y_train),
    "validation": X_test.join(y_test),
}

for split_name, split_df in split_frames.items():
    split_df.to_csv(f'data/{split_name}.csv', index=False)

# Log the `data` directory as a dataset artifact
artifact = wandb.Artifact('train_val_sets', type='dataset', metadata={"Source": "CaseStudy_training_data.xlsx"})
artifact.add_dir('data')
wandb.log_artifact(artifact)

[34m[1mwandb[0m: Adding directory to artifact (./data)... Done. 0.0s


<Artifact train_val_sets>

In [25]:
# Close the logistic-regression run; wandb prints the run summary below.
wandb.finish()

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1,▁
npv,▁
ppv,▁
precision,▁
recall,▁
roc_auc,▁
specificity,▁

0,1
accuracy,0.715
f1,0.6069
npv,0.86842
ppv,0.51163
precision,0.51163
recall,0.74576
roc_auc,0.78002
specificity,0.70213


### 1. Regression model

In [26]:
df_model_linear = df_features.copy()

# Using rx_ds as target (for regression).
# Columns computed from rx_ds itself (its quantile-bucket dummies and the
# rx_ds / binary_sum ratio) are removed from the predictors: keeping them
# leaks the target into the features.
target_derived_cols = ['rx_ds_to_binary_sum'] + [
    col for col in df_model_linear.columns if str(col).startswith('rx_ds_bucket_')
]
X = df_model_linear.drop(['rx_ds'] + target_derived_cols, axis=1)
y = df_model_linear['rx_ds']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [27]:
# Start a new run named "regression" in the 'wandb-sklearn' project.
run = wandb.init(project='wandb-sklearn', name="regression")

In [29]:
# Regression: fit a Ridge model (alpha = 1.0) on the training split and
# predict the held-out split.
reg = Ridge(alpha=1.0).fit(X_train, y_train)
y_pred = reg.predict(X_test)

### Residual Plot

Measures and plots the predicted target values (y-axis) vs the difference between actual and predicted target values (x-axis), as well as the distribution of the residual error.

Generally, the residuals of a well-fit model should be randomly distributed because good models will account for most phenomena in a data set, except for random error.

In [30]:
# Log the residual plot for the fitted Ridge model, computed on the training split.
wandb.sklearn.plot_residuals(reg, X_train, y_train)



### Outlier Candidate

Measures a datapoint's influence on regression model via Cook's distance. Instances with heavily skewed influences could potentially be outliers. Useful for outlier detection.

In [31]:
# Log the outlier-candidate plot for the fitted Ridge model, computed on the training split.
wandb.sklearn.plot_outlier_candidates(reg, X_train, y_train)



### All-in-one: Regression plot
Using this all in one API one can:
* Log summary of metrics
* Log learning curve
* Log outlier candidates
* Log residual plot

In [32]:
# Log the summary metrics and all regression plots for the Ridge model in one call.
wandb.sklearn.plot_regressor(reg, X_train, X_test, y_train, y_test, model_name='Ridge')

[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting Ridge.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged learning curve.
[34m[1mwandb[0m: Logged outlier candidates.
[34m[1mwandb[0m: Logged residuals.


### 2. Classifier RandomForest

In [43]:
# Rebuild the classification split (target: OD) for this section. The regression
# section rebinds X, y and the train/test variables to the rx_ds target, so
# without this the classifier is fit on the wrong target when the notebook is
# run top to bottom.
X = df_features.drop(['OD', 'rx_ds'], axis=1)
y = df_features['OD']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model, get predictions - Classification
model = RandomForestClassifier(random_state=42)  # seeded so the run is reproducible
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_probas = model.predict_proba(X_test)

# Feature importances and their indices sorted from most to least important.
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

In [44]:
# Start a new run named "classification" in the 'wandb-sklearn' project.
run = wandb.init(project='wandb-sklearn', name="classification")

### Class Proportions

Plots the distribution of target classes in training and test sets. Useful for detecting imbalanced classes and ensuring that one class doesn't have a disproportionate influence on the model.

In [20]:
from sklearn.utils.class_weight import compute_class_weight

# Binary labels for the plots. Series.unique() returns values in order of first
# appearance, so they are sorted here to line up with the class order used for
# the predict_proba columns.
labels = np.sort(y_train.unique())
wandb.sklearn.plot_class_proportions(y_train, y_test, labels)

### Learning Curve

Trains model on datasets of varying lengths and generates a plot of cross validated scores vs dataset size, for both training and test sets.

In [47]:
# Log the learning curve for the classifier, computed on the training split.
wandb.sklearn.plot_learning_curve(model, X_train, y_train)

### ROC

ROC curves plot the true positive rate (y-axis) against the false positive rate (x-axis). The ideal point is `TPR = 1` and `FPR = 0`, at the top left of the plot. Typically we calculate the area under the ROC curve (AUC-ROC); the greater the AUC-ROC, the better.

In [48]:
# Log the ROC curve from the held-out targets and the predicted class probabilities.
wandb.sklearn.plot_roc(y_test, y_probas, labels)



### Precision Recall Curve

Computes the tradeoff between precision and recall for different thresholds. A high area under the curve represents both high recall and high precision, where high precision relates to a low false positive rate, and high recall relates to a low false negative rate.

In [49]:
# Log the precision-recall curve from the held-out targets and the predicted class probabilities.
wandb.sklearn.plot_precision_recall(y_test, y_probas, labels)

### Feature Importances

Evaluates and plots the importance of each feature for the classification task. Only works with classifiers that have a `feature_importances_` attribute, like trees.

In [51]:
# Log the feature importances, passing the column names the model was fitted
# with so the chart shows feature names instead of positional indices.
wandb.sklearn.plot_feature_importances(model, feature_names=list(model.feature_names_in_))

X does not have valid feature names, but RandomForestClassifier was fitted with feature names


### All-in-one: Classifier Plot

Using this all in one API one can:
* Log feature importance
* Log learning curve
* Log confusion matrix
* Log summary metrics
* Log class proportions
* Log calibration curve
* Log roc curve
* Log precision recall curve

In [52]:
# Log all classifier plots and summary metrics in one call. The fitted column
# names are passed so the feature-importance chart is labelled by feature name
# rather than by positional index.
wandb.sklearn.plot_classifier(model,
                              X_train, X_test,
                              y_train, y_test,
                              y_pred, y_probas,
                              labels,
                              is_binary=True,
                              model_name='RandomForest',
                              feature_names=list(model.feature_names_in_))

# Close the classification run.
wandb.finish()

[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting RandomForest.
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
[34m[1mwandb[0m: Logged feature importances.
[34m[1mwandb[0m: Logged confusion matrix.
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision-recall curve.




## 3. Clustering

In [15]:
# Build the clustering input explicitly instead of reusing whichever `X` the
# previous section left in the kernel: all engineered features except the
# 'OD' target and the raw 'rx_ds' value.
X = df_features.drop(['OD', 'rx_ds'], axis=1)

# Train model, get predictions - Clustering.
# n_init is set explicitly to 10 to keep the behaviour of the recorded run and
# to silence the FutureWarning about the changing default.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=1)
cluster_labels = kmeans.fit_predict(X)

  super()._check_params_vs_input(X, default_n_init=10)


In [16]:
# Start a new run named "clustering" in the 'wandb-sklearn' project.
run = wandb.init(project='wandb-sklearn', name="clustering")

### Elbow Plot

Measures and plots the percentage of variance explained as a function of the number of clusters, along with training times. Useful in picking the optimal number of clusters.

In [17]:
# Log the elbow curve for the KMeans model over the clustering input X.
wandb.sklearn.plot_elbow_curve(kmeans, X)



### Silhouette Plot

Measures & plots how close each point in one cluster is to points in the neighboring clusters. The thickness of the clusters corresponds to the cluster size. The vertical line represents the average silhouette score of all the points.

In [24]:
# Human-readable names for the two "OD" classes, indexed by the class value (0, 1)
names = ["not_addict", "addict"]

def get_label_descriptions(classes):
    """Return an array holding the descriptive name of every value in `classes`.

    Each value is used as an index into the module-level `names` list.
    """
    described = []
    for class_value in classes:
        described.append(names[class_value])
    return np.array(described)

# Extract descriptive labels for the "OD" column.
# Read straight from df_features rather than from the kernel-global `y`, which
# holds the continuous rx_ds target after the regression section has run.
classes = get_label_descriptions(df_features['OD'])

In [25]:
# Log the silhouette plot for the KMeans model, passing the descriptive OD names as labels.
wandb.sklearn.plot_silhouette(kmeans, X, classes)



## All in one: Clusterer Plot

Using this all-in-one API you can:
* Log elbow curve
* Log silhouette plot

In [26]:
# Log the elbow curve and the silhouette plot for the KMeans model in one call.
wandb.sklearn.plot_clusterer(kmeans, X, cluster_labels, classes, 'KMeans')

# Close the clustering run.
wandb.finish()

[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting KMeans.
[34m[1mwandb[0m: Logged elbow curve.
[34m[1mwandb[0m: Logged silhouette plot.


