# MLOps Zoomcamp wandb workshop

In [9]:
import os, sys
from dotenv import load_dotenv
from pathlib import Path
import requests

In [12]:
from typing import List

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Directory layout used throughout the notebook. Everything is anchored at the
# current working directory at the moment this cell runs.
base_dir = Path.cwd()
scripts_dir, data_dir, output_dir = (
    base_dir / subdir for subdir in ("homework", "data", "output")
)

In [35]:
# Notebook-level switches.
# When True, the raw data files are downloaded again even if the data directory
# is already present.
redownload_data: bool = False

In [4]:
# Load the wandb settings from the environment. `load_dotenv()` runs first so that
# values kept in a local .env file are presumably picked up as well (see the
# python-dotenv documentation for its exact lookup/override rules).
load_dotenv()
WANDB_PROJECT_NAME = os.getenv("WANDB_PROJECT_NAME")
WANDB_USERNAME = os.getenv("WANDB_USERNAME")

# Fail fast on missing settings: os.getenv returns None for an unset variable, and
# that None would otherwise be interpolated as the literal text "None" into the
# artifact name and the command lines built further down in the notebook.
_missing_wandb_settings = [
    name
    for name, value in (
        ("WANDB_PROJECT_NAME", WANDB_PROJECT_NAME),
        ("WANDB_USERNAME", WANDB_USERNAME),
    )
    if not value
]
if _missing_wandb_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_wandb_settings)
    )

print(f"WANDB_PROJECT_NAME: {WANDB_PROJECT_NAME}")
print(f"WANDB_USERNAME: {WANDB_USERNAME}")

WANDB_PROJECT_NAME: mlops-zoomcamp-workshop
WANDB_USERNAME: ochapeau


# Q1

In [5]:
# Print the version reported by the wandb command-line tool
!wandb --version

wandb, version 0.15.3


# Q2

## Downloading data

In [6]:
# Monthly `green_tripdata` parquet files for 2022-01 through 2022-03, all served
# from the same base URL.
_TRIP_DATA_ROOT = "https://d37ci6vzurychx.cloudfront.net/trip-data"
data_urls = [
    f"{_TRIP_DATA_ROOT}/green_tripdata_2022-{month:02d}.parquet"
    for month in (1, 2, 3)
]

In [25]:
def download_data(
    data_urls: List[str],
    data_dir: Path,
    chunk_size: int = 1024 * 1024,
    timeout: float = 60,
) -> None:
    """Download every URL in ``data_urls`` into ``data_dir``.

    Each file is streamed to disk under the last path component of its URL.

    Args:
        data_urls: URLs of the files to fetch.
        data_dir: Destination directory; created (with parents) if missing.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Timeout in seconds handed to ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            check happens before the destination file is opened, so an error
            page is never saved under the data file's name.
    """
    data_dir = Path(data_dir)
    data_dir.mkdir(parents=True, exist_ok=True)
    for url in data_urls:
        filename = data_dir / Path(url).name
        # The context manager releases the streamed connection even when the
        # status check or the write fails.
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(filename, "wb") as file:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    # Skip empty chunks
                    if chunk:
                        file.write(chunk)
        print(f"{filename} downloaded!")

In [29]:
# Make sure the data directory is usable. The raw files are fetched only when
# the directory has to be created by this cell.
if data_dir.is_file():
    # A regular file is sitting where the directory should be.
    print(f"Error: {data_dir} is a file!", file=sys.stderr)
elif data_dir.is_dir():
    print(f"\"{data_dir}\" already exists!")
else:
    data_dir.mkdir()
    print(f"\"{data_dir}\" directory created!")
    download_data(data_urls, data_dir)

"/Users/olivier/Documents/courses/mlops-zoomcamp/wandb_workshop/data" already exists!


In [30]:
# Fetch the raw files again when the `redownload_data` switch is turned on.
if redownload_data:
    download_data(data_urls=data_urls, data_dir=data_dir)

## Preprocessing the data

In [31]:
# Run the preprocess_data script
os.chdir(scripts_dir)
!python preprocess_data.py --wandb_project {WANDB_PROJECT_NAME} --wandb_entity {WANDB_USERNAME} --raw_data_path {data_dir} --dest_path {output_dir}
os.chdir(base_dir)

[34m[1mwandb[0m: Currently logged in as: [33mochapeau[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.15.3
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/Users/olivier/Documents/courses/mlops-zoomcamp/wandb_workshop/homework/wandb/run-20230605_163816-55gqdnds[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mscarlet-disco-3[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/ochapeau/mlops-zoomcamp-workshop[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/ochapeau/mlops-zoomcamp-workshop/runs/55gqdnds[0m
[34m[1mwandb[0m: Adding directory to artifact (/Users/olivier/Documents/courses/mlops-zoomcamp/wandb_workshop/output)... Done. 0.0s
[34m[1mwandb[0m: Waiting for W&B process to finish... [32m(success).[0m
[34m[1mwandb[0m: 🚀 View run [33mscarlet-disco-3[0m at: [34m[4mhttps://wandb.ai/ochapeau/mlops-zoom

## Size of the DictVectorizer

Screenshot of the **Artifacts** -> **Files** from the wandb project
![files tab from artifacts](images/q2.png)
The size of the saved `DictVectorizer` file (`dv.pkl`) is 153.7 KB

# Q3

## Training the model

In [32]:
# Artifact reference handed to the training and sweep scripts via --data_artifact,
# in the form <username>/<project>/NYC-Taxi:v0.
data_artifact = "{}/{}/NYC-Taxi:v0".format(WANDB_USERNAME, WANDB_PROJECT_NAME)

In [33]:
# Run the train script
os.chdir(scripts_dir)
!python train.py --wandb_project {WANDB_PROJECT_NAME} --wandb_entity {WANDB_USERNAME} --data_artifact {data_artifact}
os.chdir(base_dir)

[34m[1mwandb[0m: Currently logged in as: [33mochapeau[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.15.3
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/Users/olivier/Documents/courses/mlops-zoomcamp/wandb_workshop/homework/wandb/run-20230605_163947-idf0thr2[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33micy-thunder-4[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/ochapeau/mlops-zoomcamp-workshop[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/ochapeau/mlops-zoomcamp-workshop/runs/idf0thr2[0m
[34m[1mwandb[0m:   4 of 4 files downloaded.  
[34m[1mwandb[0m: Waiting for W&B process to finish... [32m(success).[0m
[34m[1mwandb[0m: 🚀 View run [33micy-thunder-4[0m at: [34m[4mhttps://wandb.ai/ochapeau/mlops-zoomcamp-workshop/runs/idf0thr2[0m
[34m[1mwandb[0m: Synced 5 W&B file(s), 0 media file(s),

## Value of the `max_depth` parameter

Screenshot of the **Overview** -> **Config** from the wandb run
![wandb run config](images/q3.png)
The value of the saved `max_depth` parameter is 10

# Q4

## Tuning the model hyperparameters

In [34]:
# Run the sweep script
os.chdir(scripts_dir)
!python sweep.py --wandb_project {WANDB_PROJECT_NAME} --wandb_entity {WANDB_USERNAME} --data_artifact {data_artifact}
os.chdir(base_dir)

Create sweep with ID: f1hcs5g6
Sweep URL: https://wandb.ai/ochapeau/mlops-zoomcamp-workshop/sweeps/f1hcs5g6
[34m[1mwandb[0m: Agent Starting Run: l20zrsn4 with config:
[34m[1mwandb[0m: 	max_depth: 14
[34m[1mwandb[0m: 	min_samples_leaf: 3
[34m[1mwandb[0m: 	min_samples_split: 3
[34m[1mwandb[0m: 	n_estimators: 45
[34m[1mwandb[0m: Currently logged in as: [33mochapeau[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.15.3
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/Users/olivier/Documents/courses/mlops-zoomcamp/wandb_workshop/homework/wandb/run-20230605_170418-l20zrsn4[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mdevoted-sweep-1[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/ochapeau/mlops-zoomcamp-workshop[0m
[34m[1mwandb[0m: 🧹 View sweep at [34m[4mhttps://wandb.ai/ochapeau/mlops-zoomcamp-workshop/sweeps/

## Looking at the sweep

Screenshot of the **Parameter importance panel with respect to MSE** from the wandb sweep
![parameter importance wrt MSE from wandb sweep](images/q4.png)
The most important parameter is `max_depth`

# Q5

## Link the best model to the model registry

Below are the screenshots of the best model in the registry
![best model in registry version](images/q5_1.png)
![best model in registry metadata](images/q5_2.png)
The screenshots show:
- Versioning: Version 0 (first screenshot)
- Metadata (second screenshot)
- Aliases: @latest, @v0 (first screenshot)
- Metric (MSE): ~2.4482 (second screenshot)
- Source run: devoted-sweep-1, listed under "Created By" (first screenshot)