In [2]:
"""
This notebook analyzes dataset quality by validating the preprocessed cleaned_data.csv with Great Expectations.

"""

import os

import great_expectations as gx
import numpy as np
import pandas as pd
# Entry point for interacting with the GX environment: request the
# file-mode data context and keep it in `context` for the cells below.
context = gx.get_context(mode="file")

In [3]:
# Register (or refresh) the pandas data source that will receive the processed data.
datasource = context.data_sources.add_or_update_pandas(name="clean_dataset")

# Declare a dataframe asset on that source, then a batch definition that
# hands the whole dataframe to the validation in one batch.
data_asset = datasource.add_dataframe_asset(name="processed_data")
batch_definition = data_asset.add_batch_definition_whole_dataframe("data")

In [4]:
# Create the "data_validation" suite and register it with the context.
# `add_or_update` is used instead of `add` because the context is file-backed:
# a plain `add` raises once the suite already exists from a previous run,
# which breaks Restart & Run All. This mirrors `add_or_update_pandas` above.
expectations_suite = context.suites.add_or_update(
    gx.ExpectationSuite("data_validation")
)

# Display the registered (still empty) suite as the cell output.
expectations_suite

{
  "name": "data_validation",
  "id": "8b326039-acb4-4399-8d1a-f48905ea5b40",
  "expectations": [],
  "meta": {
    "great_expectations_version": "1.1.3"
  },
  "notes": null
}

In [5]:
# Regular expressions for content that must not appear in the cleaned text.
url_pattern = r"https?://\S+|www\.\S+"  # URLs
number_pattern = r"\d"  # any digit
emoticon_pattern = r"[:;=8][\-o\*\']?[)\](DPp/\\]"  # common emoticons

# --- Label column: must be binary and always present -----------------------
expectations_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeInSet(column="positive", value_set=[0, 1])
)
expectations_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="positive")
)

# --- Text column: no URLs, digits or emoticons in any row ------------------
# The three checks differ only in the regex, so they are added in a loop
# (same order as before: URLs, numbers, emoticons).
for forbidden_regex in (url_pattern, number_pattern, emoticon_pattern):
    expectations_suite.add_expectation(
        gx.expectations.ExpectColumnValuesToNotMatchRegex(
            column="cleaned_text",
            regex=forbidden_regex,
            mostly=1.0,  # 100% of the values have to pass
        )
    )

# --- Text column: must always be present -----------------------------------
# Kept as the last expression so the cell still displays this expectation.
expectations_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="cleaned_text")
)


ExpectColumnValuesToNotBeNull(id='5ffad5ca-00ed-4e10-8775-c1330ffe7fcd', meta=None, notes=None, result_format=<ResultFormat.BASIC: 'BASIC'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, row_condition=None, condition_parser=None, column='cleaned_text', mostly=1.0)

In [6]:
# Bind the expectation suite to the batch definition so the suite can be run
# against whatever dataframe is supplied at checkpoint time.
validator = gx.ValidationDefinition(
    data=batch_definition, suite=expectations_suite, name="data_validator"
)

# `add_or_update` instead of `add`: the context is file-backed, so a plain
# `add` raises when "data_validator" already exists from a previous run.
# Left as the last expression so the stored definition is displayed.
context.validation_definitions.add_or_update(validator)

ValidationDefinition(name='data_validator', data=BatchDefinition(id=UUID('61f583a8-2ea0-4e6f-b487-4c7b0c1b7a94'), name='data', partitioner=None), suite={
  "name": "data_validation",
  "id": "8b326039-acb4-4399-8d1a-f48905ea5b40",
  "expectations": [
    {
      "type": "expect_column_values_to_be_in_set",
      "kwargs": {
        "column": "positive",
        "value_set": [
          0,
          1
        ]
      },
      "meta": {},
      "id": "40bcfa3c-56c9-4926-9870-cf7631b8a5e2"
    },
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "positive"
      },
      "meta": {},
      "id": "b26e51e6-4269-4413-b8cd-4be524a4c7ff"
    },
    {
      "type": "expect_column_values_to_not_match_regex",
      "kwargs": {
        "column": "cleaned_text",
        "regex": "https?://\\S+|www\\.\\S+"
      },
      "meta": {},
      "id": "3f1c9e76-fb2b-4496-b037-48a7e5ff13e7"
    },
    {
      "type": "expect_column_values_to_not_match_regex",
   

In [7]:
# Actions executed after each checkpoint run: refresh the Data Docs.
action_list = [
    gx.checkpoint.UpdateDataDocsAction(name="update_data_docs"),
]

# The checkpoint bundles the validation definition with its actions and
# requests SUMMARY-level results.
checkpoint = gx.Checkpoint(
    name="data_checkpoint",
    validation_definitions=[validator],
    actions=action_list,
    result_format={"result_format": "SUMMARY"},
)

# `add_or_update` instead of `add`: the context is file-backed, so a plain
# `add` raises when "data_checkpoint" already exists from a previous run.
# Left as the last expression so the stored checkpoint is displayed.
context.checkpoints.add_or_update(checkpoint)

Checkpoint(name='data_checkpoint', validation_definitions=[ValidationDefinition(name='data_validator', data=BatchDefinition(id=UUID('61f583a8-2ea0-4e6f-b487-4c7b0c1b7a94'), name='data', partitioner=None), suite={
  "name": "data_validation",
  "id": "8b326039-acb4-4399-8d1a-f48905ea5b40",
  "expectations": [
    {
      "type": "expect_column_values_to_be_in_set",
      "kwargs": {
        "column": "positive",
        "value_set": [
          0,
          1
        ]
      },
      "meta": {},
      "id": "40bcfa3c-56c9-4926-9870-cf7631b8a5e2"
    },
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "positive"
      },
      "meta": {},
      "id": "b26e51e6-4269-4413-b8cd-4be524a4c7ff"
    },
    {
      "type": "expect_column_values_to_not_match_regex",
      "kwargs": {
        "column": "cleaned_text",
        "regex": "https?://\\S+|www\\.\\S+"
      },
      "meta": {},
      "id": "3f1c9e76-fb2b-4496-b037-48a7e5ff13e7"
    },
    {
 

In [8]:
# Location of the cleaned dataset. It can be overridden with the
# CLEANED_DATA_PATH environment variable so the notebook is not tied to a
# single machine; when the variable is unset or empty, the original local
# path is used, so existing behavior is unchanged.
CLEANED_DATA_PATH = (
    os.environ.get("CLEANED_DATA_PATH")
    or '/Users/dinara/MLOPS_Team4/data/processed/cleaned_data.csv'
)

data_to_test = pd.read_csv(CLEANED_DATA_PATH)
X = data_to_test["cleaned_text"]
y = data_to_test["positive"]

# Only the text and label columns are handed to the checkpoint.
dataframe = pd.concat([X, y], axis=1)

# Fetch a fresh file-mode context and look the checkpoint up by name rather
# than reusing the in-memory objects created in the cells above.
context = gx.get_context(mode="file")
checkpoint = context.checkpoints.get("data_checkpoint")

# The dataframe is supplied at run time through the batch parameters.
# The run result is the last expression, so it is displayed as cell output.
batch_parameters = {"dataframe": dataframe}
checkpoint.run(batch_parameters=batch_parameters)

  self.comm = Comm(**args)


Calculating Metrics:   0%|          | 0/38 [00:00<?, ?it/s]

CheckpointResult(run_id={"run_name": null, "run_time": "2024-10-28T18:57:39.711522+01:00"}, run_results={ValidationResultIdentifier::data_validation/__none__/20241028T175739.711522Z/clean_dataset-processed_data: {
  "success": false,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "type": "expect_column_values_to_be_in_set",
        "kwargs": {
          "batch_id": "clean_dataset-processed_data",
          "column": "positive",
          "value_set": [
            0,
            1
          ]
        },
        "meta": {},
        "id": "40bcfa3c-56c9-4926-9870-cf7631b8a5e2"
      },
      "result": {
        "element_count": 1600000,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "partial_unexpected_list": [],
        "missing_count": 0,
        "missing_percent": 0.0,
        "unexpected_percent_total": 0.0,
        "unexpected_percent_nonmissing": 0.0,
        "partial_unexpected_counts": [],
        "partial_unexpected_i