In [31]:
import os
import shutil
import pandas as pd
from tqdm import tqdm

import datetime
from evidently import ColumnMapping, metrics
from evidently.metrics import ColumnDriftMetric, ColumnSummaryMetric, DatasetDriftMetric, DatasetMissingValuesMetric
from evidently.metrics import RegressionQualityMetric, RegressionPredictedVsActualPlot, RegressionErrorPlot
from evidently.metrics import DataDriftTable, TextDescriptorsDriftMetric
from evidently.report import Report
from evidently.metric_preset import TextOverviewPreset, DataQualityPreset, DataDriftPreset
from evidently.test_suite import TestSuite
from evidently.ui.dashboards import CounterAgg, DashboardPanelCounter, DashboardPanelPlot, PanelValue, PlotType, ReportFilter
from evidently.ui.workspace import Workspace, WorkspaceBase

from evidently.test_preset import NoTargetPerformanceTestPreset
from evidently.test_preset import DataQualityTestPreset
from evidently.test_preset import DataStabilityTestPreset
from evidently.test_preset import DataDriftTestPreset
from evidently.test_preset import RegressionTestPreset
from evidently.test_preset import MulticlassClassificationTestPreset
from evidently.test_preset import BinaryClassificationTopKTestPreset
from evidently.test_preset import BinaryClassificationTestPreset

from evidently.tests import TestColumnDrift, TestMeanInNSigmas
from evidently.tests import TestValueRange, TestValueRMSE

from evidently.descriptors import TextLength, TriggerWordsPresence, OOV, NonLetterCharacterPercentage, WordCount

import nltk
# Fetch the NLTK resources used in this notebook.
# NOTE(review): presumably these back evidently's text descriptors (e.g. OOV) — confirm.
for nltk_package in ("words", "wordnet", "omw-1.4"):
    nltk.download(nltk_package)
# Kept as a bare last expression so the cell still shows the final download's return value.
nltk.download("vader_lexicon")

[nltk_data] Downloading package words to /home/maxou1909/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/maxou1909/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/maxou1909/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/maxou1909/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [20]:
def create_report(train, test, i: int, report_type, batch_size: int = 100):
    """Build and run an Evidently report comparing one batch of ``test`` to ``train``.

    Args:
        train: Reference DataFrame passed to the report as ``reference_data``.
        test: DataFrame from which the current batch is sliced by position.
        i: Zero-based batch index; rows ``[batch_size * i, batch_size * (i + 1))``
            of ``test`` are used as ``current_data``.
        report_type: Either ``"data_quality"`` or ``"data_drift"``.
        batch_size: Number of rows per batch. Defaults to 100.

    Returns:
        The ``Report`` instance after ``run`` has been called on it.

    Raises:
        ValueError: If ``report_type`` is not supported or ``batch_size`` is not
            positive.
    """
    # Validate first so a typo fails loudly instead of producing an empty report.
    supported_types = ("data_quality", "data_drift")
    if report_type not in supported_types:
        raise ValueError(
            f"Unsupported report_type {report_type!r}; expected one of {supported_types}"
        )
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size!r}")

    column_mapping = ColumnMapping(
        categorical_features=['experience', 'location'],
        text_features=['title', 'description'],
        target="target"
    )

    # Named report_metrics so the local does not shadow the imported
    # evidently `metrics` module.
    if report_type == "data_quality":
        report_metrics = [DataQualityPreset()]
    else:  # "data_drift"
        report_metrics = [
            DataDriftPreset(),
            TextDescriptorsDriftMetric(column_name="title"),
            TextDescriptorsDriftMetric(column_name="description"),
        ]

    report = Report(metrics=report_metrics)

    # Positional slice: batch i covers rows [start, stop) of `test`.
    start = batch_size * i
    stop = batch_size * (i + 1)
    report.run(reference_data=train,
               current_data=test.iloc[start:stop, :],
               column_mapping=column_mapping)
    return report

In [21]:
def create_test_suite(train, test, i: int, test_type, batch_size: int = 100):
    """Build and run an Evidently test suite on one batch of ``test`` against ``train``.

    Args:
        train: Reference DataFrame passed to the suite as ``reference_data``.
        test: DataFrame from which the current batch is sliced by position.
        i: Zero-based batch index; rows ``[batch_size * i, batch_size * (i + 1))``
            of ``test`` are used as ``current_data``.
        test_type: Either ``"data_quality"`` or ``"data_drift"``.
        batch_size: Number of rows per batch. Defaults to 100.

    Returns:
        The ``TestSuite`` instance after ``run`` has been called on it.

    Raises:
        ValueError: If ``test_type`` is not supported or ``batch_size`` is not
            positive.
    """
    # Validate first so a typo fails loudly instead of producing an empty suite.
    supported_types = ("data_quality", "data_drift")
    if test_type not in supported_types:
        raise ValueError(
            f"Unsupported test_type {test_type!r}; expected one of {supported_types}"
        )
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size!r}")

    column_mapping = ColumnMapping(
        categorical_features=['experience', 'location'],
        text_features=['title', 'description'],
        target="target"
    )

    if test_type == "data_quality":
        tests = [DataQualityTestPreset()]
    else:  # "data_drift"
        tests = [
            # Value-range checks on descriptors computed from the "title" text.
            TestValueRange(column_name=WordCount().for_column("title")),
            TestValueRange(column_name=OOV().for_column("title")),
            TestValueRange(column_name=TextLength().for_column("title")),

            # Same descriptor checks for the "description" text.
            TestValueRange(column_name=WordCount().for_column("description")),
            TestValueRange(column_name=OOV().for_column("description")),
            TestValueRange(column_name=TextLength().for_column("description")),

            # Drift checks on the categorical columns, then checks on the target.
            TestColumnDrift(column_name="experience"),
            TestColumnDrift(column_name="location"),
            TestValueRange(column_name="target"),
            TestMeanInNSigmas(column_name="target"),
        ]

    test_suite = TestSuite(tests=tests)

    # Positional slice: batch i covers rows [start, stop) of `test`.
    start = batch_size * i
    stop = batch_size * (i + 1)
    test_suite.run(reference_data=train,
                   current_data=test.iloc[start:stop, :],
                   column_mapping=column_mapping)
    return test_suite

## Project generation

In [4]:
# Read the archive once; both frames are positional slices of the same file.
raw_df = pd.read_csv("../../../data/train.zip")

# First 500 rows are the reference set.
df_train = raw_df.iloc[0:500].drop(columns="id")
# Rows 500-999 are shuffled and later consumed in batches; the fixed seed keeps
# the batches identical across Restart & Run All.
df_test = raw_df.iloc[500:1000].sample(frac=1, random_state=42).drop(columns="id")

In [23]:
# create_report(df_train, df_test, 0, "data_drift")

In [18]:
# create_test_suite(df_train, df_test, 0, "data_drift")

In [36]:
weeks = ["2023-07-24 2023-07-30", "2023-07-31 2023-08-06", "2023-08-07 2023-08-13", 
         "2023-08-14 2023-08-20", "2023-08-21 2023-08-27"]


# One output folder per week; week i is built from batch i of df_test.
for i, week in enumerate(tqdm(weeks)):
    # Recreate this week's folder from scratch. The existence check must target
    # the folder being removed (the original checked weeks[0] on every iteration).
    if os.path.exists(week):
        shutil.rmtree(week)
    os.makedirs(week)
    for report_type in ["data_quality", "data_drift"]:
        output_dir = os.path.join(week, report_type)
        os.makedirs(output_dir)

        report = create_report(df_train, df_test, i, report_type)
        report.save_html(os.path.join(output_dir, f"{report_type}_report.html"))

        test = create_test_suite(df_train, df_test, i, report_type)
        test.save_html(os.path.join(output_dir, f"{report_type}_test.html"))


divide by zero encountered in divide


divide by zero encountered in divide


divide by zero encountered in divide


divide by zero encountered in divide


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


divide by zero encountered in divide


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


divide by zero encountered in divide


divide by zero encountered in divide


divide by zero encountered in divide


divide by zero encountered in divide


divide by zero encountered in divide


divide by zero encountered in divide


divide by zero encountered in divide


divide by zero encountered in divide


divide by zero encountered in divide


divide by zero encountered in divide

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:49<00:00,  9.96s/it]
