# Tracking Changes in Your Pipeline

## Step 1: Data Preparation

In [1]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import numpy as np

# Generate synthetic data for demonstration purposes
X, y = make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to PyTorch tensors and create data loaders
train_data = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
test_data = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)


## Step 2: Model training with PyTorch

In [2]:
# Define a simple neural network for binary classification
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(10, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 2)  # Output layer for 2 classes

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Initialize the model, loss function, and optimizer
model = SimpleNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')


Epoch 1/10, Loss: 0.6526679396629333
Epoch 2/10, Loss: 0.6071622371673584
Epoch 3/10, Loss: 0.5397849678993225
Epoch 4/10, Loss: 0.562313973903656
Epoch 5/10, Loss: 0.4002085030078888
Epoch 6/10, Loss: 0.41115787625312805
Epoch 7/10, Loss: 0.3581184446811676
Epoch 8/10, Loss: 0.3598690927028656
Epoch 9/10, Loss: 0.3118450343608856
Epoch 10/10, Loss: 0.5227107405662537


## Step 3: Drift detection setup with NannyML

In [3]:
import nannyml as nml
import pandas as pd


# For simplicity, we'll use the same dataset as reference and target here.
# In a real-world scenario, target_data should be your new, incoming data.
reference_data = pd.DataFrame(X_test, columns=[f'feature_{i}' for i in range(X_test.shape[1])])
target_data = pd.DataFrame(X_test, columns=[f'feature_{i}' for i in range(X_test.shape[1])])

# Calculate drift using NannyML (for demonstration, we're using univariate drift calculation)
reference_data['timestamp'] = pd.to_datetime('2020-01-01')
target_data['timestamp'] = pd.to_datetime('2020-02-01')

drift_calculator = nml.UnivariateDriftCalculator(column_names=reference_data.columns[:-1], timestamp_column_name='timestamp')
drift_calculator.fit(reference_data)
drift_results = drift_calculator.calculate(target_data)

# Visualize drift results
drift_figures = drift_results.plot(kind='drift')
drift_figures.show()

## Step 4:  Model Registry with MLFlow  

In [4]:
import mlflow
import mlflow.pytorch

# Set MLFlow tracking URI and experiment name
mlflow.set_tracking_uri('http://127.0.0.1:5556')  # Adjust to your MLFlow server
mlflow.set_experiment('MLBook_Experiment2')

# Start an MLFlow run and log the model
with mlflow.start_run(run_name='simple_nn'):
    mlflow.pytorch.log_model(model, 'model')
    mlflow.log_param('epochs', num_epochs)
    mlflow.log_metric('final_loss', loss.item())


MlflowException: API request to http://127.0.0.1:5556/api/2.0/mlflow/experiments/get-by-name failed with exception HTTPConnectionPool(host='127.0.0.1', port=5556): Max retries exceeded with url: /api/2.0/mlflow/experiments/get-by-name?experiment_name=MLBook_Experiment2 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x3466a9570>: Failed to establish a new connection: [Errno 61] Connection refused'))

## Step 5: Fine-Tuning Pipeline

In [None]:
# Generate or load new data for fine-tuning
# Here, we'll simulate new data for simplicity
X_new, y_new = make_classification(n_samples=500, n_features=10, n_classes=2, random_state=42)
new_data = TensorDataset(torch.tensor(X_new, dtype=torch.float32), torch.tensor(y_new, dtype=torch.long))
new_data_loader = DataLoader(new_data, batch_size=64, shuffle=True)

# Adjust the optimizer for fine-tuning (e.g., lower learning rate)
fine_tune_optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Fine-tuning loop
fine_tune_epochs = 5
for epoch in range(fine_tune_epochs):
    for inputs, labels in new_data_loader:
        fine_tune_optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        fine_tune_optimizer.step()
    print(f'Fine-tuning Epoch {epoch+1}/{fine_tune_epochs}, Loss: {loss.item()}')


Fine-tuning Epoch 1/5, Loss: 1.0046367645263672
Fine-tuning Epoch 2/5, Loss: 0.8660054802894592
Fine-tuning Epoch 3/5, Loss: 0.8357763886451721
Fine-tuning Epoch 4/5, Loss: 0.8215194940567017
Fine-tuning Epoch 5/5, Loss: 1.1421597003936768


## Step 6: Evaluation and Model Registry

In [None]:
# Evaluate the fine-tuned model (simplified evaluation for demonstration)
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the fine-tuned model on the test data: {100 * correct / total}%')

# Log the fine-tuned model in MLFlow
with mlflow.start_run(run_name='simple_nn_fine_tuned'):
    mlflow.pytorch.log_model(model, 'model_fine_tuned')
    mlflow.log_param('fine_tune_epochs', fine_tune_epochs)
    mlflow.log_metric('fine_tune_final_loss', loss.item())
    mlflow.log_metric('fine_tune_accuracy', 100 * correct / total)


Accuracy of the fine-tuned model on the test data: 83.5%
