This notebook demonstrates the use of Evidently AI for analyzing and monitoring datasets and machine learning models. Specifically, it covers:
- Loading datasets with a defined schema
- Creating reports with various metrics and presets
- Analyzing data quality, data drift, and regression performance

In [None]:
import warnings
from pathlib import Path
from typing import Union

import pandas as pd
from evidently import Report, DataDefinition, Dataset
from sklearn.exceptions import UndefinedMetricWarning

# Install global "ignore" filters for these two warning categories.
warnings.simplefilter("ignore", category=UndefinedMetricWarning)
warnings.simplefilter("ignore", category=FutureWarning)

In [None]:
def load_dataset(file_path: Union[str, Path], schema: DataDefinition) -> Dataset:
    """Read a CSV file and wrap it in an Evidently ``Dataset``.

    Args:
        file_path: Location of the CSV file, given as a string or a
            ``pathlib.Path``.
        schema: Data definition forwarded to ``Dataset.from_pandas``.

    Returns:
        The ``Dataset`` built from the loaded DataFrame and ``schema``.

    Side effects:
        Prints the file name and the number of rows that were loaded.
    """
    # Normalise once so both str and Path callers are handled the same way.
    csv_path = Path(file_path)
    df = pd.read_csv(csv_path)
    print(f"Loaded dataset {csv_path.name} with {len(df)} rows")
    return Dataset.from_pandas(df, data_definition=schema)

## Load datasets

In [None]:
from evidently import Regression

# Column roles shared by the datasets loaded below: numerical features,
# categorical features, and the target/prediction pair for regression.
my_schema = DataDefinition(
    numerical_columns=[
        "engines",
        "passenger_capacity",
        "crew",
        "price",
        "company_rating",
    ],
    categorical_columns=[
        "d_check_complete",
        "moon_clearance_complete",
        "iata_approved",
    ],
    # "price" is the regression target; "prediction" is the prediction column.
    regression=[Regression(target="price", prediction="prediction")],
)

In [None]:
from pathlib import Path

# Data files are resolved relative to the current working directory.
base_dir = Path.cwd()

# Reference dataset.
file_path = base_dir.joinpath("data", "reference_table_and_target.csv")
ref = load_dataset(file_path, my_schema)

# Current dataset.
file_path = base_dir.joinpath("data", "current_table_and_target_20251004_125214.csv")
curr = load_dataset(file_path, my_schema)

## Example 1 - Report Metrics

In [None]:
from evidently.metrics import MeanValue, CategoryCount, MissingValueCount, RowCount

report = Report(
    [
        # Dataset-level.
        RowCount(),
        # Column-level.
        MeanValue(column="passenger_capacity"),
        # Column-level. Counts occurrences of the specified category or categories.
        CategoryCount(column="iata_approved", category=True),
        # Column-level.
        MissingValueCount(column="crew"),
    ]
)

# Evaluate the current data against the reference data; the result is the cell output.
report.run(current_data=curr, reference_data=ref)

## Example 2 - Report Presets

In [None]:
from evidently.presets import DatasetStats, ValueStats

report = Report(
    [
        # Small Preset, dataset-level.
        # Calculates descriptive dataset stats, including columns by type, rows,
        # missing values, empty columns, etc.
        DatasetStats(),
        # Small Preset, column-level.
        # Included Metrics: UniqueValueCount, MissingValueCount, MinValue, MaxValue,
        # MeanValue, StdValue, QuantileValue (0.25, 0.5, 0.75)
        ValueStats(column="price"),
    ]
)

# Evaluate the current data against the reference data; the result is the cell output.
report.run(current_data=curr, reference_data=ref)

## Example 3 - Dataset data quality

In [None]:
from evidently.metrics import EmptyRowsCount, DuplicatedRowCount, DatasetMissingValueCount

# Dataset-level data quality metrics: empty rows, duplicated rows, missing values.
report = Report(
    [
        EmptyRowsCount(),
        DuplicatedRowCount(),
        DatasetMissingValueCount(),
    ]
)

# Evaluate the current data against the reference data; the result is the cell output.
report.run(current_data=curr, reference_data=ref)

## Example 4 - Data Drift

In [None]:
from evidently.metrics import ValueDrift, DriftedColumnsCount
from evidently.presets import DataDriftPreset

# NOTE(review): this report contains two DataDriftPreset instances with different
# settings - confirm that combining them is intended, rather than showing two
# alternative configurations.
report = Report(
    [
        # Large Preset.
        # This will detect dataset drift if over 70% columns are drifting.
        DataDriftPreset(drift_share=0.7),
        # Same preset with the drift threshold for numerical columns set to 0.3
        # (presumably other column types keep their defaults - verify against the docs).
        DataDriftPreset(num_threshold=0.3),
        # Column-level.
        ValueDrift(column="price"),
        DriftedColumnsCount(),
    ]
)

# Evaluate the current data against the reference data; the result is the cell output.
report.run(current_data=curr, reference_data=ref)

## Example 5 - Regression Performance
The Report with RegressionPreset includes:
- Various metrics: Mean Absolute Error (MAE), Mean Squared Error (MSE), Root Mean Squared Error (RMSE), etc.
- Various visualizations: Actual vs Predicted Plot, Error Distribution, Error Normality, etc.

In [None]:
from evidently.presets import RegressionPreset

# Regression performance report built from the RegressionPreset alone.
# NOTE(review): presumably it reads the target/prediction columns declared in the
# data definition's `regression` entry - confirm against the Evidently docs.
report = Report(
    [
        RegressionPreset(),
    ]
)

# Keyword arguments, matching the other examples, so the current and reference
# datasets cannot be swapped by accident.
report.run(current_data=curr, reference_data=ref)