## Helper Functions

In [16]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

def get_Xy(df):
   return df.drop('popularity', axis=1), df['popularity']

def get_spotify(Xy=True, size=2000):
    df = pd.read_csv("data/spotify.csv")
    df = df[['popularity', 'danceability', 'energy', 'loudness', 'tempo', 'valence', 'acousticness', 'instrumentalness', 'liveness', 'speechiness']]
    df = df.dropna()
    df = df.drop_duplicates()
    df = df.sample(n=size).reset_index(drop=True)
    if Xy:
        return get_Xy(df)
    return df

def get_random_example(label_name='popularity', as_pandas=False):
    '''
    If not as_pandas:
        Returns list(feature_names), list(feature_values), label_name, label_value
    If as_pandas:
        Returns features dataframe, label series
    '''
    raw_data = get_spotify(Xy=False)
    sample = raw_data.sample(n=1)
    if not as_pandas:
        features = sample.drop(label_name, axis=1)
        label = sample[label_name]
        return list(features.columns), list(features.iloc[0]), label_name, float(label.values[0])
    return sample.drop('popularity', axis=1), label_name, sample['popularity'].iloc[0]

def get_data_loaders(X_train, X_test, y_train, y_test, batch_size=32):
  # Convert to Tensors
  X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
  y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
  X_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.float32)
  y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

  # Create Datasets
  train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
  test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

  # Create DataLoaders
  train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
  test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

  return train_loader, test_loader

def train_loop(train_loader, model, optimizer, criterion, device):
  model.train()
  epoch_train_loss = 0.0
  for batch, labels in train_loader:
    batch, labels = batch.to(device), labels.to(device)
    optimizer.zero_grad()
    # Forward Pass
    outputs = model(batch)
    loss = criterion(outputs, labels)
    # Update Parameters
    loss.backward()
    optimizer.step()
    # Save loss
    epoch_train_loss += loss.item() * batch.size(0)
  return epoch_train_loss / len(train_loader.dataset)

def test_loop(test_loader, model, criterion, device):
  model.eval()
  epoch_test_loss = 0.0
  with torch.no_grad():
    for batch, labels in test_loader:
      batch, labels = batch.to(device), labels.to(device)
      # Calculate predictions
      outputs = model(batch)
      # Save loss
      loss = criterion(outputs, labels)
      epoch_test_loss += loss.item() * batch.size(0)
  return epoch_test_loss / len(test_loader.dataset)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


## MLFlow Setup

**Option 1**  
`$ mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./mlruns --host 127.0.0.1 --port 5000`

**Option 2**  
`$ mlflow ui`

In [14]:
!pip install mlflow



## Simple Sklearn Experiment

In [17]:
import mlflow
mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [35]:
mlflow.set_experiment("Simple-Sklearn-Experiment")
with mlflow.start_run():
    # Get Data
    data = get_spotify(Xy=False)
    X,y = get_Xy(data)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train Model
    n_estimators = 100
    max_depth = 20
    model = RandomForestRegressor(n_estimators=n_estimators, max_depth=20, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Log Parameters
    mlflow.log_param("model_type", "Random Forest Regressor")
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)

    # Log Metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)

    # Log Model
    mlflow.sklearn.log_model(
        sk_model=model, 
        artifact_path="spotify_model",
        input_example=X_train,
    )
    
    # Log Dataset
    data.to_csv("data/data.csv", index=False)
    mlflow.log_artifact("data/data.csv", artifact_path="datasets")

2024/09/03 21:27:36 INFO mlflow.tracking._tracking_service.client: 🏃 View run enchanting-gnat-619 at: http://127.0.0.1:5000/#/experiments/1/runs/afbd09651ad843e896c43a57ddf53c03.
2024/09/03 21:27:36 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.


## Simple PyTorch Experiment

In [34]:
mlflow.set_experiment("Simple-PyTorch-Experiment")
with mlflow.start_run():
  # Get Data
  X, y = get_spotify(size=20000)
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  train_loader, test_loader = get_data_loaders(X_train, X_test, y_train, y_test)

  # Define Simple PyTorch Model
  model = nn.Sequential(
      nn.Linear(len(X_train.columns), 32),
      nn.ReLU(),
      nn.Dropout(0.5),
      nn.Linear(32, 32),
      nn.ReLU(),       
      nn.Linear(32, 1)
  ).to(device)

  # Setup Training
  num_epochs = 5
  learning_rate = 0.001
  criterion = nn.MSELoss()
  optimizer = optim.Adam(model.parameters(), lr=learning_rate)

  # Log Parameters
  mlflow.log_param("num_epochs", num_epochs)
  mlflow.log_param("learning_rate", learning_rate)
  mlflow.log_param("criterion", 'MSELoss')
  mlflow.log_param("optimizer", 'Adam')

  # Training/Validation Loop
  for epoch in range(num_epochs):
    train_loss = train_loop(train_loader, model, optimizer, criterion, device)
    test_loss = train_loop(test_loader, model, optimizer, criterion, device)

    # Log Metrics
    mlflow.log_metric("train_loss", train_loss, step=epoch)
    mlflow.log_metric("test_loss", test_loss, step=epoch)
    
    print(f"[Epoch {epoch+1}/{num_epochs}] TrainLoss: {train_loss:.2f}; TestLoss: {test_loss:.2f}")
  
  # Log model
  mlflow.pytorch.log_model(model, "model")

[Epoch 1/5] TrainLoss: 518.36; TestLoss: 472.60
[Epoch 2/5] TrainLoss: 475.30; TestLoss: 456.23
[Epoch 3/5] TrainLoss: 462.22; TestLoss: 444.70
[Epoch 4/5] TrainLoss: 453.72; TestLoss: 436.64
[Epoch 5/5] TrainLoss: 445.52; TestLoss: 424.86


2024/09/03 21:26:25 INFO mlflow.tracking._tracking_service.client: 🏃 View run calm-auk-934 at: http://127.0.0.1:5000/#/experiments/2/runs/c0b376036438422987253c51b6e08d52.
2024/09/03 21:26:25 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.


## Register the Model
1. In the UI, find the run with the model
2. Click "Register Model"


## Predictions (Productionize)

### Simple Load a Model

In [30]:
model_name = "Spotify"
# model_uri = f"models:/{model_name}/1"
# model_uri = f"models:/{model_name}/latest"
model_uri = f"models:/{model_name}@champion"
# model_uri = f"runs:/{mlflow_run_id}/{run_relative_path_to_model}" # you can load from a run
model = mlflow.sklearn.load_model(model_uri)

x, label_name, y_true = get_random_example(as_pandas=True)
y_pred = model.predict(x)[0]
print(f"[{label_name}] y_true={y_true}; y_pred={y_pred:.2f}")

[popularity] y_true=15; y_pred=30.00


### Serve the Model Using MLFlow

`$ mlflow models serve -m mlartifacts/418269113200786049/0f2dbf69a8ab435093d6da9daab43109/artifacts/pokemon_hp_model --port 5001`

`mlflow models serve -m "models:/Spotify@champion" -p 5001 --host 127.0.0.1`

`export MLFLOW_TRACKING_URI=http://127.0.0.1:5000`

In [31]:
import requests
import json

def predict(feature_names, feature_values, base_url="http://127.0.0.1:5001"):
    url = f"{base_url}/invocations"
    data = {
        "dataframe_split": {
            "columns": feature_names,
            "data": [feature_values]
        }
    }
    response = requests.post(url, data=json.dumps(data), headers={"Content-Type": "application/json"})
    return response.json()['predictions'][0]

In [33]:
feature_names, feature_values, label_name, label_value = get_random_example()
prediction = predict(feature_names, feature_values)
print(f"[{label_name}] y_true={label_value}; y_pred={prediction:.2f}")

[popularity] y_true=20.0; y_pred=31.98


### Serve the Model with Docker

Make sure docker is running on your local computer.

`$ mlflow models build-docker -m models:/Spotify@champion -n my_ml_model`

`docker run -p 5002:8080 my_ml_model`

In [102]:
feature_names, feature_values, label_name, label_value = get_random_example()
prediction = predict(feature_names, feature_values, base_url="http://127.0.0.1:5002")
print(f"[{label_name}] y_true={label_value}; y_pred={prediction:.2f}")

[popularity] y_true=47.0; y_pred=29.39
