# Evidently Metrics 

In [None]:
try:
    import evidently
except:
    !pip install git+https://github.com/evidentlyai/evidently.git

In [1]:
import pandas as pd
import numpy as np

from sklearn import datasets
from sklearn import ensemble
from sklearn import model_selection

from evidently import ColumnMapping
from evidently.options import ColorOptions
from evidently.report import Report

from evidently.metrics import ColumnDriftMetric

from evidently.metrics import DataDriftTable
from evidently.metrics import DatasetDriftMetric
from evidently.metrics import ColumnDistributionMetric
from evidently.metrics import ColumnValuePlot
from evidently.metrics import ColumnQuantileMetric
from evidently.metrics import ColumnCorrelationsMetric
from evidently.metrics import ColumnValueListMetric
from evidently.metrics import ColumnValueRangeMetric
from evidently.metrics import DatasetCorrelationsMetric
from evidently.metrics import ColumnRegExpMetric
from evidently.metrics import ColumnSummaryMetric
from evidently.metrics import ColumnMissingValuesMetric
from evidently.metrics import DatasetSummaryMetric
from evidently.metrics import DatasetMissingValuesMetric
from evidently.metrics import ConflictTargetMetric
from evidently.metrics import ConflictPredictionMetric
from evidently.metrics import ClassificationQualityMetric
from evidently.metrics import ClassificationClassBalance
from evidently.metrics import ClassificationConfusionMatrix
from evidently.metrics import ClassificationQualityByClass
from evidently.metrics import ClassificationClassSeparationPlot
from evidently.metrics import ClassificationProbDistribution
from evidently.metrics import ClassificationRocCurve
from evidently.metrics import ClassificationPRCurve
from evidently.metrics import ClassificationPRTable
from evidently.metrics import ClassificationQualityByFeatureTable
from evidently.metrics import RegressionQualityMetric
from evidently.metrics import RegressionPredictedVsActualScatter
from evidently.metrics import RegressionPredictedVsActualPlot
from evidently.metrics import RegressionErrorPlot
from evidently.metrics import RegressionAbsPercentageErrorPlot
from evidently.metrics import RegressionErrorDistribution
from evidently.metrics import RegressionErrorNormality
from evidently.metrics import RegressionTopErrorMetric
from evidently.metrics import RegressionErrorBiasTable

## Prepare a Dataset

In [2]:
#Dataset for Data Quality and Integrity
adult_data = datasets.fetch_openml(name='adult', version=2, as_frame='auto')
adult = adult_data.frame

adult_ref = adult[~adult.education.isin(['Some-college', 'HS-grad', 'Bachelors'])]
adult_cur = adult[adult.education.isin(['Some-college', 'HS-grad', 'Bachelors'])]

adult_cur.iloc[:2000, 3:5] = np.nan

  warn(


In [3]:
from catboost import CatBoostClassifier

In [4]:
#Dataset for binary label and probabilistic classifcation
bcancer_data = datasets.load_breast_cancer(as_frame='auto')
bcancer = bcancer_data.frame

bcancer_ref = bcancer.sample(n=300, replace=False)
bcancer_cur = bcancer.sample(n=200, replace=False)

bcancer_label_ref = bcancer_ref.copy(deep=True)
bcancer_label_cur = bcancer_cur.copy(deep=True)

model = CatBoostClassifier(random_state=1, n_estimators=10)
model.fit(bcancer_ref[bcancer_data.feature_names.tolist()], bcancer_ref.target)

bcancer_ref['prediction'] = model.predict_proba(bcancer_ref[bcancer_data.feature_names.tolist()])[:, 1]
bcancer_cur['prediction'] = model.predict_proba(bcancer_cur[bcancer_data.feature_names.tolist()])[:, 1]

bcancer_label_ref['prediction'] = model.predict(bcancer_label_ref[bcancer_data.feature_names.tolist()])
bcancer_label_cur['prediction'] = model.predict(bcancer_label_cur[bcancer_data.feature_names.tolist()])

Learning rate set to 0.420402
0:	learn: 0.2952804	total: 51.6ms	remaining: 465ms
1:	learn: 0.1423469	total: 54.5ms	remaining: 218ms
2:	learn: 0.0809261	total: 58ms	remaining: 135ms
3:	learn: 0.0581116	total: 61.3ms	remaining: 91.9ms
4:	learn: 0.0567674	total: 65.9ms	remaining: 65.9ms
5:	learn: 0.0405328	total: 70.6ms	remaining: 47.1ms
6:	learn: 0.0375171	total: 74.6ms	remaining: 32ms
7:	learn: 0.0278968	total: 77ms	remaining: 19.2ms
8:	learn: 0.0254498	total: 79.7ms	remaining: 8.86ms
9:	learn: 0.0197259	total: 82.8ms	remaining: 0us


In [5]:
#Dataset for regression
housing_data = datasets.fetch_california_housing(as_frame='auto')
housing = housing_data.frame

housing.rename(columns={'MedHouseVal': 'target'}, inplace=True)
housing['prediction'] = housing_data['target'].values + np.random.normal(0, 3, housing.shape[0])

housing_ref = housing.sample(n=5000, replace=False)
housing_cur = housing.sample(n=5000, replace=False)

## How to run Reports?

### Data Drift Metrics

In [None]:
#dataset-level metrics
data_drift_dataset_report = Report(metrics=[
    DatasetDriftMetric(),
    DataDriftTable(),    
])

data_drift_dataset_report.run(reference_data=adult_ref, current_data=adult_cur)
data_drift_dataset_report

In [None]:
#report in a JSON format
data_drift_dataset_report.json()

In [None]:
#report as a python object
data_drift_dataset_report.as_dict()

In [None]:
#column-level metrics
data_drift_column_report = Report(metrics=[
    ColumnDriftMetric('age'),
    ColumnValuePlot('age'),  
])

data_drift_column_report.run(reference_data=adult_ref, current_data=adult_cur)
data_drift_column_report

### Data Quality Metrics

In [None]:
#dataset-level metrics
data_quality_dataset_report = Report(metrics=[
    DatasetCorrelationsMetric(),
    
])

data_quality_dataset_report.run(reference_data=adult_ref, current_data=adult_cur)
data_quality_dataset_report

In [None]:
#column-level metrics
data_quality_column_report = Report(metrics=[
    ColumnDistributionMetric(column_name="education"), 
    ColumnQuantileMetric(column_name="education-num", quantile=0.75), 
    ColumnCorrelationsMetric(column_name="education"),
    ColumnValueListMetric(column_name="relationship", values=["Husband", "Unmarried"]), 
    ColumnValueRangeMetric(column_name="age", left=10, right=20),
    
])

data_quality_column_report.run(reference_data=adult_ref, current_data=adult_cur)
data_quality_column_report

### Data Integrity Metrics

In [None]:
#dataset-level metrics
data_integrity_dataset_report = Report(metrics=[
    DatasetSummaryMetric(),
    DatasetMissingValuesMetric()
    
])

data_integrity_dataset_report.run(reference_data=adult_ref, current_data=adult_cur)
data_integrity_dataset_report

In [None]:
#column-level metrics
data_integrity_column_report = Report(metrics=[
    ColumnRegExpMetric(column_name="relationship", reg_exp=r".*child.*"),
    ColumnSummaryMetric(column_name="age"),
    ColumnMissingValuesMetric(column_name="education"),

    
])

data_integrity_column_report.run(reference_data=adult_ref, current_data=adult_cur)
data_integrity_column_report

### Classification Metrics

In [6]:
#label binary classification
classification_report = Report(metrics=[
    ClassificationQualityMetric(),
    ClassificationClassBalance(),
    ConflictTargetMetric(),
    ConflictPredictionMetric(),
    ClassificationConfusionMatrix(),
    ClassificationQualityByClass(),
    ClassificationQualityByFeatureTable(),
])

classification_report.run(reference_data=bcancer_label_ref, current_data=bcancer_label_cur)
# classification_report

In [None]:
classification_report.as_dict()

In [5]:
#probabilistic binary classification
classification_report = Report(metrics=[
    ClassificationQualityMetric(),
    ClassificationClassBalance(),
    ConflictTargetMetric(),
    ConflictPredictionMetric(),
    ClassificationConfusionMatrix(),
    ClassificationQualityByClass(),
    ClassificationClassSeparationPlot(),
    ClassificationProbDistribution(),
    ClassificationRocCurve(),
    ClassificationPRCurve(),
    ClassificationPRTable(),
    ClassificationQualityByFeatureTable(),

    
])

classification_report.run(reference_data=bcancer_ref, current_data=bcancer_cur)
# classification_report

In [6]:
from box import ConfigBox
report_data = ConfigBox(classification_report.as_dict())

In [7]:
test_data_result =  report_data.metrics[0].result.current
train_data_result =  report_data.metrics[0].result.reference

In [8]:
test_json = {f"test_{key}" : data for key , data in train_data_result.items()}


In [9]:
train_json = {f"train_{key}" : data for key , data in test_data_result.items()}

In [10]:
from CreditCard.utils import read_yaml_as_dict , write_yaml , write_json
from pathlib import Path

In [11]:
data_to_dump = {**test_json , **train_json}

In [12]:
type(data_to_dump)

dict

In [13]:
write_json(data=data_to_dump ,file_path= Path("./test.json"))

In [None]:
from box import ConfigBox
data = ConfigBox(classification_report.as_dict())
test_data_result = data.metrics[0].result.current
train_data_result = data.metrics[0].result.reference

In [None]:

train_data_result["roc_auc"] -  test_data_result['roc_auc']

In [15]:
config_data = read_yaml_as_dict(Path("/home/pk/project-python/creditcard/params.yaml"))

2023-03-29 07:01:15.309 | INFO     | CreditCard.utils.common:read_yaml_as_dict:58 - yaml file: /home/pk/project-python/creditcard/params.yaml loaded successfully


### Regreission Metrics

In [17]:
data_to_dump["test_accuracy"]

1.0

In [18]:
config_data["model_evaluation_config"]["base_accuracy"] =float(data_to_dump["test_accuracy"])

In [19]:
write_yaml(data=config_data , file_path=Path("/home/pk/project-python/creditcard/params.yaml"))

In [None]:
regression_report = Report(metrics=[
    RegressionQualityMetric(),
    RegressionPredictedVsActualScatter(),
    RegressionPredictedVsActualPlot(),
    RegressionErrorPlot(),
    RegressionAbsPercentageErrorPlot(),
    RegressionErrorDistribution(),
    RegressionErrorNormality(),
    RegressionTopErrorMetric(),
    RegressionErrorBiasTable(columns=['MedInc', 'AveRooms']),
    ConflictTargetMetric(),
    ConflictPredictionMetric(),
    
])

regression_report.run(reference_data=housing_ref, current_data=housing_cur)
regression_report

### How to set metric parameters?

In [None]:
#simple metric parameters
data_integrity_column_report = Report(metrics=[
    ColumnRegExpMetric(column_name="education", reg_exp=r".*-.*", top=5),
    ColumnRegExpMetric(column_name="relationship", reg_exp=r".*child.*")
])

data_integrity_column_report.run(reference_data=adult_ref, current_data=adult_cur)
data_integrity_column_report

In [None]:
#options
color_scheme = ColorOptions()
color_scheme.primary_color = "#5a86ad"
color_scheme.fill_color = "#fff4f2"
color_scheme.zero_line_color = "#016795"
color_scheme.current_data_color = "#c292a1" 
color_scheme.reference_data_color = "#017b92"

In [None]:
data_drift_column_report = Report(metrics=[
    ColumnDriftMetric('age'),
    ColumnDriftMetric('age', stattest='psi'),
],
                                  options=[color_scheme]
)

data_drift_column_report.run(reference_data=adult_ref, current_data=adult_cur)
data_drift_column_report