# Homework 5

In [144]:
import requests
import datetime
import pandas as pd
from datetime import datetime

from evidently.ui.workspace import Workspace
from evidently import Report, Regression
from evidently.metrics import QuantileValue, MeanValue, ValueDrift
from evidently import Dataset
from evidently import DataDefinition
from evidently.presets import DataDriftPreset, DataSummaryPreset 

from evidently.sdk.models import PanelMetric
from evidently.sdk.panels import DashboardPanelPlot

from joblib import load, dump
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

## Question 1

In [116]:
!mkdir data

mkdir: data: File exists


In [117]:
files = [('green_tripdata_2022-01.parquet', './data'), 
         ('green_tripdata_2022-02.parquet', './data'),
         ('green_tripdata_2022-03.parquet', './data')]

print("Download files:")
for file, path in files:
    url=f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    resp=requests.get(url, stream=True)
    save_path=f"{path}/{file}"
    with open(save_path, "wb") as handle:
        for data in tqdm(resp.iter_content(),
                        desc=f"{file}",
                        postfix=f"save to {save_path}",
                        total=int(resp.headers["Content-Length"])):
            handle.write(data)

Download files:


green_tripdata_2022-01.parquet: 100%|█| 1254291/1254291 [00:02<00:00, 441408.54i
green_tripdata_2022-02.parquet: 100%|█| 1428262/1428262 [00:03<00:00, 455121.26i
green_tripdata_2022-03.parquet: 100%|█| 1615562/1615562 [00:03<00:00, 448566.35i


In [118]:
df = pd.read_parquet('data/green_tripdata_2022-03.parquet')

print(f'Number of rows: {len(df)}')

Number of rows: 78537


In [119]:

def transform(df):
    # create target
    df["duration_min"] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration_min = df.duration_min.apply(lambda td : float(td.total_seconds())/60)

    # filter out outliers
    df = df[(df.duration_min >= 0) & (df.duration_min <= 60)]
    df = df[(df.passenger_count > 0) & (df.passenger_count <= 8)]

    return df


In [120]:

def train(df, target, num_features, cat_features):

    train_data = df[:30000]
    val_data = df[30000:]

    model = LinearRegression()
    model.fit(train_data[num_features + cat_features], train_data[target])

    return model, train_data, val_data


def add_predictions(df, model, num_features, cat_features):

    preds = model.predict(df[num_features + cat_features])
    df['prediction'] = preds

    return df


def save_model(model):
    with open('models/lin_reg.bin', 'wb') as f_out:
        dump(model, f_out)


### Train on jan

In [121]:
 
df = pd.read_parquet('data/green_tripdata_2022-01.parquet')

target = "duration_min"
num_features = ["passenger_count", "trip_distance", "fare_amount", "total_amount"]
cat_features = ["PULocationID", "DOLocationID"]

df = transform(df)
model, train_data, val_data = train(df, target, num_features, cat_features)
train_data = add_predictions(train_data, model, num_features, cat_features)
val_data = add_predictions(val_data, model, num_features, cat_features)

save_model(model)

In [122]:
print(mean_absolute_error(train_data.duration_min, train_data.prediction))
print(mean_absolute_error(val_data.duration_min, val_data.prediction))

3.804665373785063
4.142064073688449


### Eval other months 

In [123]:

feb_df = pd.read_parquet('data/green_tripdata_2022-02.parquet')
feb_df = transform(feb_df)
feb_df = add_predictions(feb_df, model, num_features, cat_features)

march_df = pd.read_parquet('data/green_tripdata_2022-03.parquet')
march_df = transform(march_df)
march_df = add_predictions(march_df, model, num_features, cat_features)

## Question 2/3/4

In [147]:
ws = Workspace("workspace")
project = ws.create_project("MLOPS zoomcamp")
project.description = "Homework 5 - monitoring"
project.save()

In [148]:
schema = DataDefinition(
    numerical_columns=num_features + ["prediction"],
    categorical_columns=cat_features,
    regression=[Regression(target="duration_min", prediction="prediction")]
    )

train_dataset = Dataset.from_pandas(
    train_data,
    data_definition=schema
)

eval_dataset = Dataset.from_pandas(
    val_data,
    data_definition=schema
)

feb_dataset = Dataset.from_pandas(
    feb_df,
    data_definition=schema
)

march_dataset = Dataset.from_pandas(
    march_df,
    data_definition=schema
)

report = Report([
    DataDriftPreset(), 
    QuantileValue(column='fare_amount'),
    MeanValue(column='trip_distance'),
    ValueDrift(column="prediction")
])

eval_jan = report.run(reference_data=train_dataset, current_data=eval_dataset, timestamp=datetime(2023, 1, 31))

# February evaluation
report_feb = Report([
    DataDriftPreset(),
    QuantileValue(column='fare_amount'),
    MeanValue(column='trip_distance'),
    ValueDrift(column="prediction")
])
eval_feb = report_feb.run(reference_data=train_dataset, current_data=feb_dataset, timestamp=datetime(2023, 2, 28))

# March evaluation
report_mar = Report([
    DataDriftPreset(),
    QuantileValue(column='fare_amount'),
    MeanValue(column='trip_distance'),
    ValueDrift(column="prediction")
])
eval_mar = report_mar.run(reference_data=train_dataset, current_data=march_dataset, timestamp=datetime(2023, 3, 31))

ws.add_run(project.id, eval_jan, include_data=False)
ws.add_run(project.id, eval_feb, include_data=False)
ws.add_run(project.id, eval_mar, include_data=False)

project.dashboard.add_panel(
             DashboardPanelPlot(
                title="Dataset column drift",
                subtitle = "Share of drifted columns",
                size="half",
                values=[
                    PanelMetric(
                        legend="Share",
                        metric="DriftedColumnsCount",
                        metric_labels={"value_type": "share"} 
                    ),
                ],
                plot_params={"plot_type": "line"},
            ),
            tab="Data Drift",
        )
project.dashboard.add_panel(
             DashboardPanelPlot(
                title="Prediction drift",
                subtitle = """Drift in the prediction column ("prediction"), method: Jensen-Shannon distance""",
                size="half",
                values=[
                    PanelMetric(
                        legend="Drift score",
                        metric="ValueDrift",
                        metric_labels={"column": "prediction"} 
                    ),
                ],
                plot_params={"plot_type": "bar"},
            ),
            tab="Data Drift",
        )

project.dashboard.add_panel(
    DashboardPanelPlot(
        title="Fare Amount - 50th Percentile",
        subtitle="Median fare amount over time",
        size="half",
        values=[
            PanelMetric(
                legend="Median",
                metric="QuantileValue",
                metric_labels={
                    "column": "fare_amount",
                    "quantile": "0.5"
                }
            )
        ],
        plot_params={"plot_type": "line"},
    ),
    tab="Feature Stats",
)

project.dashboard.add_panel(
    DashboardPanelPlot(
        title="Trip Distance - Mean",
        subtitle="Average trip distance over time",
        size="half",
        values=[
            PanelMetric(
                legend="Mean",
                metric="MeanValue",
                metric_labels={
                    "column": "trip_distance"
                }
            )
        ],
        plot_params={"plot_type": "line"},
    ),
    tab="Feature Stats",
)
project.save()
