# Exploration Notebook

Notebook for exploring tools used for ML monitoring (Evidently reports and dashboards, plus initial tests for Grafana).

| Date | User | Change Type | Remarks |  
| ---- | ---- | ----------- | ------- |
| 02/02/2026   | Martin | Created   | Notebook created to explore tools used for ML monitoring | 
| 03/02/2026   | Martin | Update   | Completed Evidently dashboards | 

# Content

* [Baseline Taxi Model w/ Evidently](#baseline-taxi-model-with-evidently)
* [Evidently Dashboard](#evidently-dashboard)
* [Testing for Grafana](#testing-for-grafana)

# Baseline Taxi Model with Evidently

`evidently` is a package that generates reports on data drift and model drift.

In [1]:
import requests
import datetime
import pandas as pd

from joblib import load, dump
from tqdm import tqdm

from evidently import Dataset, DataDefinition, Report
from evidently.metrics import *
from evidently.presets import *

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [26]:
# (file name, target directory) pairs for the monthly green-taxi trip files.
files = [
  ('green_tripdata_2025-01.parquet','../data/raw'),
  ('green_tripdata_2025-02.parquet', '../data/raw'),
  ('green_tripdata_2025-03.parquet', '../data/raw'),
]

print("Download files:")
for file, path in files:
  url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
  save_path = f"{path}/{file}"
  # Stream the body so the whole file is never held in memory; the context
  # manager releases the connection even if writing fails part-way.
  with requests.get(url, stream=True, timeout=60) as resp:
    # Stop before creating the file if the server answered with an error.
    resp.raise_for_status()
    # Content-Length may be absent; tqdm then shows an open-ended counter.
    total_bytes = int(resp.headers.get("Content-Length", 0)) or None
    with open(save_path, "wb") as handle, tqdm(
      desc=f"{file}",
      postfix=f"save to {save_path}",
      total=total_bytes,
      unit="B",
      unit_scale=True,
    ) as progress:
      # iter_content() yields 1-byte chunks unless chunk_size is given,
      # which makes the write loop needlessly slow.
      for chunk in resp.iter_content(chunk_size=1024 * 1024):
        handle.write(chunk)
        # Advance by bytes written so the bar matches the byte total.
        progress.update(len(chunk))



Download files:


green_tripdata_2025-03.parquet: 100%|██████████| 1253510/1253510 [00:03<00:00, 333777.30it/s, save to ../data/raw/green_tripdata_2025-03.parquet]


In [2]:
# January 2025 trips: load the raw parquet file and show summary statistics.
jan_path = '../data/raw/green_tripdata_2025-01.parquet'
jan_data = pd.read_parquet(jan_path)
jan_data.describe()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,cbd_congestion_fee
count,48326.0,48326,48326,46490.0,48326.0,48326.0,46490.0,48326.0,48326.0,48326.0,48326.0,48326.0,48326.0,0.0,48326.0,48326.0,46490.0,46483.0,46490.0,46490.0
mean,1.870215,2025-01-17 04:19:01.965753,2025-01-17 04:38:59.277448,1.184771,94.094545,142.409014,1.278103,21.53241,16.762466,0.932324,0.602776,2.481859,0.177461,,0.988346,22.634242,1.266423,1.038788,0.85306,0.05335
min,1.0,2024-12-25 23:13:15,2024-12-25 23:13:17,1.0,3.0,1.0,0.0,0.0,-113.0,-5.0,-0.5,-0.9,0.0,,-1.0,-114.0,1.0,1.0,-2.75,0.0
25%,2.0,2025-01-09 14:00:31.500000,2025-01-09 14:26:33.500000,1.0,74.0,74.0,1.0,1.1,9.3,0.0,0.5,0.0,0.0,,1.0,13.705,1.0,1.0,0.0,0.0
50%,2.0,2025-01-16 19:46:55.500000,2025-01-16 20:01:19.500000,1.0,75.0,140.0,1.0,1.74,13.5,0.0,0.5,2.07,0.0,,1.0,18.75,1.0,1.0,0.0,0.0
75%,2.0,2025-01-24 17:47:35.500000,2025-01-24 18:03:52,1.0,97.0,230.0,1.0,2.94,19.1,2.5,0.5,3.69,0.0,,1.0,26.465,2.0,1.0,2.75,0.0
max,2.0,2025-02-05 18:46:24,2025-02-05 19:11:47,99.0,265.0,265.0,9.0,84731.57,336.2,7.5,4.25,252.05,48.76,,1.0,371.4,5.0,2.0,2.75,0.75
std,0.33607,,,1.442502,54.968061,77.25155,0.937178,990.646907,13.308342,1.348587,0.357366,3.213612,1.192984,,0.130402,15.435061,0.471842,0.193092,1.2719,0.192788


In [3]:
# (rows, columns) of the January frame as loaded, before any filtering.
jan_data.shape

(48326, 21)

In [4]:
# Create target - trip duration in minutes.
# Subtracting the two datetime columns yields a timedelta column; the
# vectorised `.dt` accessor converts it to seconds in one pass instead of
# calling a Python lambda once per row.
jan_data["duration_min"] = (
  jan_data["lpep_dropoff_datetime"] - jan_data["lpep_pickup_datetime"]
).dt.total_seconds() / 60

In [5]:
# Filter out outliers: keep trips lasting 0-60 minutes (inclusive) with 1-8 passengers.
# Rows whose passenger_count is missing fail both comparisons and are removed too.
jan_data = jan_data[(jan_data.duration_min >= 0) & (jan_data.duration_min <= 60)]
jan_data = jan_data[(jan_data.passenger_count > 0) & (jan_data.passenger_count <= 8)]

# Drop the "ehail_fee" column.
# errors="ignore" keeps this cell re-runnable: jan_data is overwritten here,
# so on a second execution the column is already gone and a plain drop
# would raise KeyError.
jan_data = jan_data.drop(columns="ehail_fee", errors="ignore")

In [None]:
# Histogram of the duration_min target for the January data.
jan_data.duration_min.hist()

In [7]:
# Feb Data - same preparation steps as applied to the January frame.
feb_data = pd.read_parquet('../data/raw/green_tripdata_2025-02.parquet')

# Trip duration in minutes, computed with the vectorised `.dt` accessor
# rather than a per-row Python lambda.
feb_data["duration_min"] = (
  feb_data["lpep_dropoff_datetime"] - feb_data["lpep_pickup_datetime"]
).dt.total_seconds() / 60

# Keep trips lasting 0-60 minutes (inclusive) with 1-8 passengers.
feb_data = feb_data[(feb_data["duration_min"] >= 0) & (feb_data["duration_min"] <= 60)]
feb_data = feb_data[(feb_data["passenger_count"] > 0) & (feb_data["passenger_count"] <= 8)]

# Remove the ehail_fee column, mirroring the January preparation.
feb_data = feb_data.drop(columns="ehail_fee")

In [8]:
# Column roles: the regression label plus the numerical and categorical inputs.
target = "duration_min"

num_features = [
  "passenger_count",
  "trip_distance",
  "fare_amount",
  "total_amount",
]

cat_features = [
  "PULocationID",
  "DOLocationID",
]

## Baseline Model

Using `LinearRegression`

In [9]:
# Positional split of the filtered January data: the first 30,000 rows are
# used for training and the remainder for validation.
# .copy() gives each split its own frame, so adding a `prediction` column to
# it later does not raise pandas' SettingWithCopyWarning on a slice of jan_data.
train_data = jan_data.iloc[:30000].copy()
val_data = jan_data.iloc[30000:].copy()

In [10]:
# Fit the baseline regressor on the training split (numerical + categorical
# columns as inputs, `target` as the label).
X_train = train_data[num_features + cat_features]
y_train = train_data[target]

model = LinearRegression()
model.fit(X_train, y_train)

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [11]:
# Score both splits with the fitted model and store the predictions
# alongside the data in a `prediction` column.
feature_cols = num_features + cat_features

train_preds = model.predict(train_data[feature_cols])
val_preds = model.predict(val_data[feature_cols])

train_data['prediction'] = train_preds
val_data['prediction'] = val_preds

# Mean absolute error in minutes: training split first, then validation.
for split in (train_data, val_data):
  print(mean_absolute_error(split["duration_min"], split["prediction"]))

3.7376346650222163
3.636921395503821


In [16]:
# Persist the fitted model with joblib, then write the validation frame
# (including its prediction column) as the reference dataset.
with open('models/lin_reg.bin', 'wb') as model_file:
  dump(model, model_file)

val_data.to_parquet('../data/clean/reference.parquet')

## Creating evidently report

In [12]:
# Preview the first rows of the validation frame that is passed to Evidently.
val_data.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,tip_amount,tolls_amount,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,cbd_congestion_fee,duration_min,prediction
30768,2,2025-01-22 12:07:38,2025-01-22 12:12:20,N,1.0,74,42,1.0,0.47,6.5,...,1.6,0.0,1.0,9.6,1.0,1.0,0.0,0.0,4.7,7.254809
30769,2,2025-01-22 12:52:31,2025-01-22 13:06:00,N,1.0,75,43,1.0,2.03,14.9,...,3.83,0.0,1.0,22.98,1.0,1.0,2.75,0.0,13.483333,11.969174
30770,2,2025-01-22 12:04:37,2025-01-22 12:14:03,N,1.0,41,166,1.0,1.28,10.7,...,2.44,0.0,1.0,14.64,1.0,1.0,0.0,0.0,9.433333,10.196504
30771,1,2025-01-22 12:15:39,2025-01-22 12:19:47,N,1.0,7,7,1.0,0.5,5.8,...,0.0,0.0,1.0,7.3,1.0,1.0,0.0,0.0,4.133333,7.149974
30772,2,2025-01-22 12:08:19,2025-01-22 12:08:40,N,1.0,74,41,1.0,0.1,3.0,...,0.0,0.0,1.0,4.5,2.0,1.0,0.0,0.0,0.35,5.608404


In [13]:
# Shared column definition: which columns Evidently should treat as
# numerical and which as categorical.
schema = DataDefinition(numerical_columns=num_features, categorical_columns=cat_features)

# Wrap the January validation frame and the February frame with that definition.
eval_data_jan = Dataset.from_pandas(val_data, data_definition=schema)
eval_data_feb = Dataset.from_pandas(feb_data, data_definition=schema)

Data Summary Report

In [None]:
# Summary report on the January evaluation data only (no reference dataset).
report = Report([DataSummaryPreset()])

data_summary_eval = report.run(current_data=eval_data_jan, reference_data=None)
data_summary_eval

In [50]:
# The same summary results are also available as a JSON string.
data_summary_eval.json()

'{"metrics": [{"id": "00404ffe284d7862baa4095093452630", "metric_name": "RowCount()", "config": {"type": "evidently:metric_v2:RowCount"}, "value": 15346.0}, {"id": "91e498263b6502661774b15eb39ea154", "metric_name": "ColumnCount()", "config": {"type": "evidently:metric_v2:ColumnCount"}, "value": 22.0}, {"id": "31b8c78bad2b7108842265cd082d5abf", "metric_name": "ColumnCount(column_type=ColumnType.Numerical)", "config": {"type": "evidently:metric_v2:ColumnCount", "tests": [], "column_type": "num"}, "value": 18.0}, {"id": "bb154bb9a843cad0d72fa3dc4983394d", "metric_name": "ColumnCount(column_type=ColumnType.Categorical)", "config": {"type": "evidently:metric_v2:ColumnCount", "tests": [], "column_type": "cat"}, "value": 1.0}, {"id": "012cbfb269361272d4773e1a68559396", "metric_name": "ColumnCount(column_type=ColumnType.Datetime)", "config": {"type": "evidently:metric_v2:ColumnCount", "tests": [], "column_type": "datetime"}, "value": 2.0}, {"id": "3f5e595e26bc8e120aadf51967bb0355", "metric_nam

Data Drift Report

In [None]:
# Drift report: February is the current data under evaluation and the
# January validation set is the reference it is compared against.
report = Report([DataDriftPreset()])

drift_eval = report.run(current_data=eval_data_feb, reference_data=eval_data_jan)
drift_eval

# Evidently Dashboard

https://docs.evidentlyai.com/quickstart_ml

<u>Steps</u>

1. Create a workspace
2. Create a project
3. Create a report
4. Save the report to the workspace
5. Create the dashboard

In [24]:
from evidently.metrics import *

from evidently.ui.workspace import Workspace
from evidently.sdk.models import PanelMetric
from evidently.sdk.panels import DashboardPanelPlot

In [16]:
# Evidently workspace at the relative path "workspace"
# (presumably created on disk if it does not exist yet — TODO confirm).
ws = Workspace("workspace")

In [17]:
# Create the project in the workspace, set its description and save it.
# NOTE(review): re-running this cell presumably creates another project with
# the same name — confirm and reuse the existing project if so.
project = ws.create_project("NYC Taxi Data Quality Project")
project.description = "A description"
project.save()

In [20]:
# Data summary of the January evaluation set, run without a reference dataset.
regular_report = Report([DataSummaryPreset()])

jan_summary = regular_report.run(current_data=eval_data_jan, reference_data=None)

In [22]:
# Drift of the February data (current) measured against January (reference).
report = Report([DataDriftPreset()])

drift_summary = report.run(current_data=eval_data_feb, reference_data=eval_data_jan)

In [23]:
# Attach the drift run to the project, passing include_data=False.
# The upload of the January summary run is left disabled below.
# ws.add_run(project.id, jan_summary, include_data=False)
ws.add_run(project.id, drift_summary, include_data=False)

In [30]:
# Create the dashboard: two half-width panels on the "Data Drift" tab.

# Panel 1 - DriftedColumnsCount (value_type "share"), drawn as a line.
project.dashboard.add_panel(
  DashboardPanelPlot(
    title="Dataset column drift",
    subtitle="Jan and Feb drift",
    size="half",
    values=[
      PanelMetric(
        legend="Share",
        metric="DriftedColumnsCount",
        metric_labels={"value_type": "share"}
      )
    ],
    plot_params={"plot_type": "line"}
  ),
  tab="Data Drift"
)

# Panel 2 - ValueDrift score for a single column, drawn as bars.
# The title and subtitle name the column that is actually plotted
# (trip_distance); the earlier text described a prediction/"class" column
# that this panel does not show.
project.dashboard.add_panel(
  DashboardPanelPlot(
    title="Trip distance drift",
    subtitle="Drift in the trip_distance column",
    size="half",
    values=[
      PanelMetric(
        legend="Drift score",
        metric="ValueDrift",
        metric_labels={"column": "trip_distance"}
      ),
    ],
    plot_params={"plot_type": "bar"},
  ),
  tab="Data Drift",
)

In [29]:
# NOTE(review): presumably removes every panel from the project dashboard.
# This cell sits after the cell that adds the panels, so a top-to-bottom run
# would leave the dashboard empty — confirm whether it should run first or be removed.
project.dashboard.clear_dashboard()

# Testing for Grafana

In [None]:
# Reference frame saved earlier in this notebook, and the raw March file
# used as the current data.
ref_path = "../data/clean/reference.parquet"
curr_path = "../data/raw/green_tripdata_2025-03.parquet"

ref = pd.read_parquet(ref_path)
curr = pd.read_parquet(curr_path)

In [None]:
# Column definition shared by the datasets handed to Evidently.
schema = DataDefinition(
  numerical_columns=num_features,
  categorical_columns=cat_features
)

# NOTE(review): this cell referenced `df`, which is never defined in the
# notebook and raises NameError on a fresh kernel. `curr` (the March frame
# loaded in the previous cell) is assumed to be the intended input — confirm.
df_eval = Dataset.from_pandas(
  curr,
  data_definition=schema
)
df_eval

TypeError: DataDefinition.__init__() got an unexpected keyword argument 'prediction'

In [None]:
# Drift preset plus a dataset-level missing-value count.
report = Report([
  DataDriftPreset(),
  DatasetMissingValueCount()
])

# Evaluate the March frame (`curr`) against the saved reference frame (`ref`).
# The call previously passed only an undefined name `reference` and no
# current data. Both frames are wrapped with the shared `schema` right here,
# so the cell depends only on `ref`, `curr` and `schema`.
report.run(
  current_data=Dataset.from_pandas(curr, data_definition=schema),
  reference_data=Dataset.from_pandas(ref, data_definition=schema),
)

In [None]:
# Load the `watermark` IPython extension and print its default report
# (presumably timestamp, interpreter and machine details — TODO confirm).
%load_ext watermark
%watermark