<a href="https://colab.research.google.com/github/olonok69/LLM_Notebooks/blob/main/mlflow/deep_learning/MLFlow_pytorch_flavor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MLflow
https://mlflow.org/docs/latest/introduction/index.html


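Machine-learning projects are hard to track, reproduce, and share as the number of experiments, models, and tools grows. MLflow is a solution to many of these issues in this dynamic landscape, offering tools and simplifying processes to streamline the ML lifecycle and foster collaboration among ML practitioners.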



# MLflow Pytorch Guide
https://mlflow.org/docs/latest/deep-learning/pytorch/index.html

# ngrok
Connect localhost to the internet for testing applications and APIs.
Bring secure connectivity to apps and APIs in localhost and dev/test environments with just one command or function call. Typical uses:
- Webhook testing
- Developer Previews
- Mobile backend testing

https://ngrok.com/


In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): no later cell in the visible notebook reads from Drive — confirm this cell is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install the metric and model-summary helpers imported below (torchmetrics, torchinfo).
! pip install  torchmetrics torchinfo --quiet

In [3]:
# Install MLflow and pyngrok (both used below) together with several additional packages.
# NOTE(review): evaluate, bitsandbytes, accelerate, datasets and transformers are not imported
# anywhere in the visible notebook — confirm they are required before keeping them.
!pip install mlflow pyngrok evaluate  bitsandbytes accelerate datasets transformers==4.39.3 --quiet
# Start the MLflow UI/tracking server in the background on port 5000; later cells log to it.
get_ipython().system_raw("mlflow ui --port 5000 &")

In [4]:

from pyngrok import ngrok
from getpass import getpass

# Terminate any ngrok process (and its open tunnels) left over from an earlier run.
ngrok.kill()

In [5]:
# Read the ngrok auth token from Colab's user secrets (stored under the name 'NGROK')
# so the token never appears in the notebook itself.
from google.colab import userdata
NGROK_AUTH_TOKEN  = userdata.get('NGROK')

ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Open an HTTPS tunnel to port 5000 (http://localhost:5000), where the MLflow UI was started.
ngrok_tunnel = ngrok.connect(addr="5000", proto="http", bind_tls=True)
# The printed public URL is how the MLflow UI is reached from outside the Colab VM.
print("MLflow Tracking UI:", ngrok_tunnel.public_url)

MLflow Tracking UI: https://4eb0-34-124-229-114.ngrok-free.app


In [6]:
# Disable tokenizers warnings when constructing pipelines
%env TOKENIZERS_PARALLELISM=false

import warnings

# Silence ALL UserWarnings for the rest of the session. The intent was to hide noisy
# setuptools and pydantic warnings, but the filter is not restricted to those modules.
warnings.filterwarnings("ignore", category=UserWarning)

env: TOKENIZERS_PARALLELISM=false


In [7]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchinfo import summary
from torchmetrics import Accuracy
from torchvision import datasets
from torchvision.transforms import ToTensor

import mlflow


# Prepare the Data

In [8]:
def _load_fashion_mnist(train):
    """Build one FashionMNIST split under ./data with download enabled and a ToTensor transform.

    Args:
        train: True selects the training split, False the test split.
    """
    return datasets.FashionMNIST(
        root="data",
        train=train,
        download=True,
        transform=ToTensor(),
    )


# Training split first, then the held-out test split.
training_data = _load_fashion_mnist(train=True)
test_data = _load_fashion_mnist(train=False)

In [9]:
# Both loaders use the same mini-batch size; name it once instead of repeating the literal.
BATCH_SIZE = 64

train_dataloader = DataLoader(dataset=training_data, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(dataset=test_data, batch_size=BATCH_SIZE)

## Define the Model

In [10]:
class ImageClassifier(nn.Module):
    """Small convolutional classifier: two 3x3 conv + ReLU stages, flatten, linear head.

    Args:
        in_channels: number of channels in the input images. Defaults to 1,
            matching the original hard-coded value.
        num_classes: number of output logits. Defaults to 10, matching the
            original hard-coded value.

    The head is an `nn.LazyLinear`, so its input width is inferred from the
    first batch passed through the model; until that first forward pass its
    parameters are uninitialized.
    """

    def __init__(self, in_channels=1, num_classes=10):
        super().__init__()
        # Layer order is unchanged, so state_dict keys stay compatible with
        # checkpoints saved from the previous definition.
        self.model = nn.Sequential(
            nn.Conv2d(in_channels, 16, kernel_size=3),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3),
            nn.ReLU(),
            nn.Flatten(),
            nn.LazyLinear(num_classes),
        )

    def forward(self, x):
        """Return the raw (unnormalized) class logits for the batch `x`."""
        return self.model(x)

In [11]:
# Point the MLflow client at the local server started earlier with `mlflow ui --port 5000`.
mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [12]:
# Make "mlflow-pytorch" the active experiment; runs started below are recorded under it.
mlflow.set_experiment("mlflow-pytorch")

<Experiment: artifact_location='mlflow-artifacts:/507816787851751421', creation_time=1715711920065, experiment_id='507816787851751421', last_update_time=1715711920065, lifecycle_stage='active', name='mlflow-pytorch', tags={}>

In [13]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device


'cuda'

In [14]:
def train(dataloader, model, loss_fn, metrics_fn, optimizer, epoch, log_every=100):
    """Train the model on a single pass of the dataloader.

    Every `log_every` batches the current batch loss and accuracy are logged to
    the active MLflow run and printed.

    Args:
        dataloader: an instance of `torch.utils.data.DataLoader`, containing the training data.
        model: an instance of `torch.nn.Module`, the model to be trained.
        loss_fn: a callable, the loss function.
        metrics_fn: a callable, the metrics function.
        optimizer: an instance of `torch.optim.Optimizer`, the optimizer used for training.
        epoch: an integer, the current (zero-based) epoch number.
        log_every: a positive integer, the number of batches between two logged
            points. Defaults to 100, the previously hard-coded interval.
    """
    num_batches = len(dataloader)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        pred = model(X)
        loss = loss_fn(pred, y)
        accuracy = metrics_fn(pred, y)

        # Backpropagation.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % log_every == 0:
            # Log plain Python floats: MLflow metric values are numeric, and the
            # previous formatted strings also truncated the stored precision.
            loss_value = loss.item()
            accuracy_value = float(accuracy)
            # Global batch index. The previous `batch // 100 * (epoch + 1)` produced
            # overlapping steps across epochs (e.g. step 0 in every epoch), so later
            # epochs overwrote or interleaved with earlier points in the metric chart.
            step = epoch * num_batches + batch
            mlflow.log_metric("loss", loss_value, step=step)
            mlflow.log_metric("accuracy", accuracy_value, step=step)
            print(f"loss: {loss_value:2f} accuracy: {accuracy_value:2f} [{batch} / {num_batches}]")

In [15]:
def evaluate(dataloader, model, loss_fn, metrics_fn, epoch):
    """Evaluate the model on a single pass of the dataloader.

    The per-batch loss and metric are averaged over all batches, logged to the
    active MLflow run as `eval_loss` / `eval_accuracy` at step `epoch`, and printed.

    Args:
        dataloader: an instance of `torch.utils.data.DataLoader`, containing the eval data.
        model: an instance of `torch.nn.Module`, the model to be evaluated.
        loss_fn: a callable, the loss function.
        metrics_fn: a callable, the metrics function.
        epoch: an integer, the current epoch number; used as the MLflow step.

    Raises:
        ValueError: if `dataloader` yields no batches, since the averages would be undefined.
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError("dataloader is empty; nothing to evaluate")

    model.eval()
    eval_loss, eval_accuracy = 0.0, 0.0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            eval_loss += loss_fn(pred, y).item()
            # Accumulate as a Python float so the logged value is a number, not a tensor.
            eval_accuracy += float(metrics_fn(pred, y))

    eval_loss /= num_batches
    eval_accuracy /= num_batches
    # Log numeric values directly instead of pre-formatted strings.
    mlflow.log_metric("eval_loss", eval_loss, step=epoch)
    mlflow.log_metric("eval_accuracy", eval_accuracy, step=epoch)

    print(f"Eval metrics: \nAccuracy: {eval_accuracy:.2f}, Avg loss: {eval_loss:2f} \n")

In [16]:
# Training Parameters
epochs = 3
loss_fn = nn.CrossEntropyLoss()
metric_fn = Accuracy(task="multiclass", num_classes=10).to(device)
model = ImageClassifier().to(device)

# Dry run: the classifier ends in nn.LazyLinear, whose weights do not exist until the
# first forward pass. Push one training image through the model (no gradients needed)
# so every parameter is materialized before the optimizer and the model summary see them.
with torch.no_grad():
    model(training_data[0][0].unsqueeze(0).to(device))

optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

In [17]:
from datetime import datetime

# Timestamped run name so repeated executions are distinguishable in the MLflow UI.
name = "pytorch_" +datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
with mlflow.start_run(run_name = name) as run:
    # Read the hyperparameters from the live objects rather than repeating literals,
    # so the logged values cannot drift from what is actually used for training.
    params = {
        "epochs": epochs,
        "learning_rate": optimizer.param_groups[0]["lr"],
        "batch_size": train_dataloader.batch_size,
        "loss_function": loss_fn.__class__.__name__,
        "metric_function": metric_fn.__class__.__name__,
        "optimizer": optimizer.__class__.__name__,
    }
    # Log training parameters.
    mlflow.log_params(params)

    # Log model summary.
    with open("model_summary.txt", "w") as f:
        f.write(str(summary(model)))
    mlflow.log_artifact("model_summary.txt")

    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train(train_dataloader, model, loss_fn, metric_fn, optimizer, epoch=t)
        # Pass the real epoch index: with a constant epoch=0 every evaluation was logged
        # at step 0, so the eval curves collapsed to a single point.
        evaluate(test_dataloader, model, loss_fn, metric_fn, epoch=t)

    # Save the trained model to MLflow.
    mlflow.pytorch.log_model(model, "model")

Epoch 1
-------------------------------
loss: 2.297867 accuracy: 0.109375 [0 / 938]
loss: 2.025223 accuracy: 0.578125 [100 / 938]
loss: 1.514050 accuracy: 0.593750 [200 / 938]
loss: 1.245676 accuracy: 0.640625 [300 / 938]
loss: 0.898259 accuracy: 0.687500 [400 / 938]
loss: 0.853767 accuracy: 0.734375 [500 / 938]
loss: 0.854323 accuracy: 0.703125 [600 / 938]
loss: 0.702305 accuracy: 0.750000 [700 / 938]
loss: 0.754247 accuracy: 0.734375 [800 / 938]
loss: 0.755127 accuracy: 0.765625 [900 / 938]
Eval metrics: 
Accuracy: 0.75, Avg loss: 0.695753 

Epoch 2
-------------------------------
loss: 0.620304 accuracy: 0.796875 [0 / 938]
loss: 0.733818 accuracy: 0.718750 [100 / 938]
loss: 0.475674 accuracy: 0.828125 [200 / 938]
loss: 0.751344 accuracy: 0.734375 [300 / 938]
loss: 0.684074 accuracy: 0.625000 [400 / 938]
loss: 0.652751 accuracy: 0.750000 [500 / 938]
loss: 0.702949 accuracy: 0.703125 [600 / 938]
loss: 0.630992 accuracy: 0.750000 [700 / 938]
loss: 0.685438 accuracy: 0.734375 [800 / 938



In [19]:
# Build the runs:/ URI of the "model" artifact logged by the training run above,
# then load it through MLflow's generic pyfunc interface.
logged_model = f"runs:/{run.info.run_id}/model"
loaded_model = mlflow.pyfunc.load_model(logged_model)

Downloading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

2024/05/14 20:00:30 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


In [23]:
# Load the same logged model through the PyTorch flavor instead of the pyfunc wrapper.
# NOTE(review): presumably this returns the original nn.Module — the following cell inspects its type.
pytorch_loaded =  mlflow.pytorch.load_model(logged_model)

Downloading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

2024/05/14 20:33:56 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


In [25]:
# Show what kind of object the PyTorch flavor returned.
type(pytorch_loaded)

In [20]:
# Take the image of the first training sample, prepend a batch axis, and pass it
# to the pyfunc model as a NumPy array.
first_image = training_data[0][0]
batched_image = first_image[None, :]
outputs = loaded_model.predict(batched_image.numpy())
outputs

array([[-5.479626 , -6.9544983, -3.7984595, -3.887539 , -1.5908383,
         4.6727657, -1.4280988,  5.6618443,  4.0271525, 10.21223  ]],
      dtype=float32)

In [21]:
import numpy as np
# Index of the largest score in each row, i.e. the predicted class id per sample.
np.argmax(outputs, axis=1)

array([9])

In [22]:
# Label stored with the first training sample, to compare against the predicted class id.
training_data[0][1]

9

In [46]:
# Shut down ngrok, closing the public tunnel to the MLflow UI.
ngrok.kill()