In [None]:
import io
import numpy as np
import os
import pandas as pd
from pathlib import Path
import requests
import zipfile

from datetime import datetime
from sklearn import datasets, ensemble

from evidently.dashboard import Dashboard
from evidently.pipeline.column_mapping import ColumnMapping
from evidently.dashboard.tabs import (
    DataDriftTab,
    NumTargetDriftTab,
)

from evidently.metrics import (
    RegressionQualityMetric,
    RegressionPredictedVsActualScatter,
    RegressionPredictedVsActualPlot,
    RegressionErrorPlot,
    RegressionAbsPercentageErrorPlot,
    RegressionErrorDistribution,
    RegressionErrorNormality,
    RegressionTopErrorMetric,
    RegressionErrorBiasTable,
    
    DatasetSummaryMetric,
    ColumnSummaryMetric,
    DatasetMissingValuesMetric,
    DatasetCorrelationsMetric
)
from evidently.report import Report

## Bicycle Demand Data

### Download and extract (unzip) data

This step automatically downloads the bike-sharing dataset from UCI. This version is slightly different from the dataset used in the Kaggle competition. If you want the example to be identical to the one in the Evidently blog post "How to break a model in 20 days", you can manually download the dataset from Kaggle instead: https://www.kaggle.com/c/bike-sharing-demand/data

Then add the following code to derive the month, hour and weekday columns from the datetime index:

raw_data['mnth'] = raw_data.index.map(lambda x : x.month)

raw_data['hr'] = raw_data.index.map(lambda x : x.hour)

raw_data['weekday'] = raw_data.index.map(lambda x : x.weekday() + 1)

In [None]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip -P ../data/

In [None]:
!unzip ../data/Bike-Sharing-Dataset.zip -d ../data/

### Load data

In [None]:
# Load the hourly records, parsing `dteday` as dates and using it as the index
# so that later cells can slice the frame by date strings.
raw_data = pd.read_csv(
    "../data/hour.csv",
    sep=',',
    header=0,
    index_col='dteday',
    parse_dates=['dteday'],
)

In [None]:
# Preview the first rows loaded from hour.csv.
raw_data.head()

In [None]:
# Count the distinct index values (one per calendar day) and convert to weeks.
distinct_days = raw_data.index.unique()
days = len(distinct_days)
weeks = days / 7  # fractional weeks; not rounded

print(f'days = {days}; weeks = {weeks}')

## Regression Model

### Config

In [None]:
# Reference period: used below to train the model and as the report baseline.
REF_MONTH_START = '2011-01-01'
REF_MONTH_END = '2011-01-28'

# Current period: scored by the trained model.
CUR_MONTH_START = '2011-01-29'
CUR_MONTH_END = '2011-02-28'

# Week of the current period that the reports are generated for.
# Alternative weeks (swap in to regenerate the reports for them):
# CUR_WEEK_START = '2011-01-29'
# CUR_WEEK_END = '2011-02-04'
# CUR_WEEK_START = '2011-02-05'
# CUR_WEEK_END = '2011-02-11'
CUR_WEEK_START = '2011-02-12'
CUR_WEEK_END = '2011-02-18'

# Column roles shared by the model and the column mapping.
target = 'cnt'
prediction = 'prediction'
numerical_features = ['temp', 'atemp', 'hum', 'windspeed', 'hr', 'weekday']
categorical_features = ['season', 'holiday', 'workingday']

# One output directory per reported week.
reports_dir = Path('../reports') / f'{CUR_WEEK_START}_{CUR_WEEK_END}'
# parents=True: also create `../reports` itself on a fresh checkout; without it
# mkdir raises FileNotFoundError when the parent directory is missing.
reports_dir.mkdir(parents=True, exist_ok=True)

### Model training

In [None]:
# Slice the reference and current periods by date label (both ends inclusive).
# .copy() gives each frame its own data: a later cell adds a prediction column,
# and assigning into a bare slice of raw_data raises SettingWithCopyWarning.
reference = raw_data.loc[REF_MONTH_START:REF_MONTH_END].copy()
current = raw_data.loc[CUR_MONTH_START:CUR_MONTH_END].copy()

In [None]:
# Show the (rows, columns) size of the reference frame.
reference.shape

In [None]:
# Preview the first rows of the reference frame.
reference.head()

In [None]:
# Random forest with a fixed seed so repeated runs build the same model.
regressor = ensemble.RandomForestRegressor(
    n_estimators=50,
    random_state=0,
)

In [None]:
# Train on the reference period: numerical columns first, then categorical.
training_columns = numerical_features + categorical_features
regressor.fit(reference[training_columns], reference[target])

In [None]:
# Score both periods with the same column order that was used for fitting.
model_inputs = numerical_features + categorical_features
ref_prediction = regressor.predict(reference[model_inputs])
current_prediction = regressor.predict(current[model_inputs])

In [None]:
# Store the model output under the column name from the config cell
# (`prediction`), the same name the column mapping is given, so the two
# cannot drift apart if the name is ever changed.
reference[prediction] = ref_prediction
current[prediction] = current_prediction

## Define columns mapping

In [None]:
# Tell Evidently which columns hold the target, the prediction and the features.
column_mapping = ColumnMapping(
    target=target,
    prediction=prediction,
    numerical_features=numerical_features,
    categorical_features=categorical_features,
)

## Model performance

In [None]:
# Sub-directory that collects the model performance reports.
regression_performance_report_dir = reports_dir / 'model_performance'
# parents=True keeps this cell working even if the parent report directory
# is missing (e.g. it was removed since the config cell ran).
regression_performance_report_dir.mkdir(parents=True, exist_ok=True)

### Quality metric

In [None]:
# Compare the selected current week against the reference period.
current_week_data = current.loc[CUR_WEEK_START:CUR_WEEK_END]

regression_quality_metric_report = Report(metrics=[RegressionQualityMetric()])
regression_quality_metric_report.run(
    column_mapping=column_mapping,
    reference_data=reference,
    current_data=current_week_data,
)

# Write the rendered report next to the other model performance reports.
regression_quality_metric_report_path = regression_performance_report_dir.joinpath('quality_metric.html')
regression_quality_metric_report.save_html(regression_quality_metric_report_path)

### Predicted vs actual

In [None]:
# Predicted-vs-actual views for the selected current week.
current_week_data = current.loc[CUR_WEEK_START:CUR_WEEK_END]
predicted_vs_actual_metrics = [
    RegressionPredictedVsActualScatter(),
    RegressionPredictedVsActualPlot(),
]

regression_predicted_vs_actual_report = Report(metrics=predicted_vs_actual_metrics)
regression_predicted_vs_actual_report.run(
    column_mapping=column_mapping,
    reference_data=reference,
    current_data=current_week_data,
)

# Write the rendered report next to the other model performance reports.
regression_predicted_vs_actual_report_path = regression_performance_report_dir.joinpath('predicted_vs_actual.html')
regression_predicted_vs_actual_report.save_html(regression_predicted_vs_actual_report_path)


### Errors

In [None]:
# Error-focused metrics for the selected current week; the list order is the
# order in which the metrics are passed to the report.
current_week_data = current.loc[CUR_WEEK_START:CUR_WEEK_END]
error_metrics = [
    RegressionErrorPlot(),
    RegressionAbsPercentageErrorPlot(),
    RegressionErrorDistribution(),
    RegressionErrorNormality(),
    RegressionTopErrorMetric(),
    RegressionErrorBiasTable(),
]

regression_errors_report = Report(metrics=error_metrics)
regression_errors_report.run(
    column_mapping=column_mapping,
    reference_data=reference,
    current_data=current_week_data,
)

# Write the rendered report next to the other model performance reports.
regression_errors_report_path = regression_performance_report_dir.joinpath('errors.html')
regression_errors_report.save_html(regression_errors_report_path)

## Target drift

In [None]:
# Numerical target drift: reference period vs. the selected current week.
current_week_data = current.loc[CUR_WEEK_START:CUR_WEEK_END]

target_drift_dashboard = Dashboard(tabs=[NumTargetDriftTab()])
target_drift_dashboard.calculate(
    reference,
    current_week_data,
    column_mapping=column_mapping,
)

In [None]:
# Render the target drift dashboard inline in the notebook.
target_drift_dashboard.show()

In [None]:
# Save the target drift dashboard as HTML in the week's report directory.
target_drift_report_path = reports_dir.joinpath('target_drift.html')
target_drift_dashboard.save(target_drift_report_path)

## Data drift

In [None]:
# Feature (data) drift: reference period vs. the selected current week.
current_week_data = current.loc[CUR_WEEK_START:CUR_WEEK_END]

data_drift_dashboard = Dashboard(tabs=[DataDriftTab()])
data_drift_dashboard.calculate(
    reference,
    current_week_data,
    column_mapping=column_mapping,
)

In [None]:
# Render the data drift dashboard inline in the notebook.
data_drift_dashboard.show()

In [None]:
# Save the data drift dashboard as HTML in the week's report directory.
data_drift_report_path = reports_dir.joinpath('data_drift.html')
data_drift_dashboard.save(data_drift_report_path)

## Data quality

In [None]:
# Sub-directory that collects the data quality reports.
data_quality_report_dir = reports_dir / 'data_quality'
# parents=True keeps this cell working even if the parent report directory
# is missing (e.g. it was removed since the config cell ran).
data_quality_report_dir.mkdir(parents=True, exist_ok=True)

### Data summary

In [None]:
# Dataset-level summary: reference period vs. the selected current week.
current_week_data = current.loc[CUR_WEEK_START:CUR_WEEK_END]

data_summary_report = Report(metrics=[DatasetSummaryMetric()])
data_summary_report.run(
    column_mapping=column_mapping,
    reference_data=reference,
    current_data=current_week_data,
)

# Write the rendered report into the data quality directory.
data_summary_report_path = data_quality_report_dir.joinpath('data_summary.html')
data_summary_report.save_html(data_summary_report_path)

### Column summary

In [None]:
# One column summary per numerical feature, in the order listed in the config.
current_week_data = current.loc[CUR_WEEK_START:CUR_WEEK_END]
column_summary_metrics = [
    ColumnSummaryMetric(column_name=feature_name)
    for feature_name in numerical_features
]

column_summary_report = Report(metrics=column_summary_metrics)
column_summary_report.run(
    column_mapping=column_mapping,
    reference_data=reference,
    current_data=current_week_data,
)

# Write the rendered report into the data quality directory.
column_summary_report_path = data_quality_report_dir.joinpath('column_summary.html')
column_summary_report.save_html(column_summary_report_path)

### Data correlation

In [None]:
# Dataset correlations: reference period vs. the selected current week.
current_week_data = current.loc[CUR_WEEK_START:CUR_WEEK_END]

data_correlation_report = Report(metrics=[DatasetCorrelationsMetric()])
data_correlation_report.run(
    column_mapping=column_mapping,
    reference_data=reference,
    current_data=current_week_data,
)

# Write the rendered report into the data quality directory.
data_correlation_report_path = data_quality_report_dir.joinpath('data_correlation.html')
data_correlation_report.save_html(data_correlation_report_path)

### Missing values

In [None]:
# Missing-value statistics: reference period vs. the selected current week.
current_week_data = current.loc[CUR_WEEK_START:CUR_WEEK_END]

missing_values_report = Report(metrics=[DatasetMissingValuesMetric()])
missing_values_report.run(
    column_mapping=column_mapping,
    reference_data=reference,
    current_data=current_week_data,
)

# Write the rendered report into the data quality directory.
missing_values_report_path = data_quality_report_dir.joinpath('missing_values.html')
missing_values_report.save_html(missing_values_report_path)