In [2]:
# Activity 4: Data Quality Automation Tools

# Task A: Using Great Expectations

# 19. Setting Up Expectations:
# - Install Great Expectations and set up a basic expectation suite.
# - Validate a dataset and list unmet expectations.






# 20. Testing for Expectation:
# - Create expectations such as “column values must fall within a certain range.”






# 21. Generating Data Docs:
# - Automatically generate data quality documentation.




import pandas as pd
import great_expectations as ge
from great_expectations.core import ExpectationConfiguration
from great_expectations.exceptions import DataContextError

# Sample DataFrame for demonstration
data = {
    'EmployeeID': [1, 2, 3, 4, 5],
    'Age': [25, 30, 35, 40, 45],
    'Salary': [50000, 60000, 70000, 80000, 90000],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva']
}

df = pd.DataFrame(data)

# 19. Setting Up Expectations
# Create a Great Expectations DataFrame (GEDataFrame)
ge_df = ge.from_pandas(df)

# Create a new expectation suite for our data
context = ge.data_context.DataContext.create(project_config="my_project")

# Check if the expectation suite exists and create it if not
try:
    context.create_expectation_suite(expectation_suite_name="my_suite")
except DataContextError:
    pass  # Expectation suite already exists

# Add expectations
ge_df.expect_column_values_to_be_in_set("Name", ["Alice", "Bob", "Charlie", "David", "Eva", "John"])
ge_df.expect_column_values_to_be_in_set("Age", [25, 30, 35, 40, 45])

# 20. Testing for Expectation (Range-based Validation)
# Example: Expect 'Age' to be between 20 and 50
ge_df.expect_column_values_to_be_between("Age", min_value=20, max_value=50)

# Validate the dataset
results = ge_df.validate()

# List unmet expectations
print("Unmet expectations:")
for result in results['results']:
    if not result['success']:
        print(result['expectation_config']['expectation_type'])

# 21. Generating Data Docs
# Automatically generate Data Docs to visualize the data quality results

# This generates a Data Docs site that includes visualizations for our expectations and validation results
context.build_data_docs()

print("\nData Docs have been generated successfully!")

# If you want to preview the generated docs locally, you can open them via the following command:
# context.open_data_docs()



ImportError: cannot import name 'ExpectationConfiguration' from 'great_expectations.core' (/home/vscode/.local/lib/python3.10/site-packages/great_expectations/core/__init__.py)

In [None]:
# Task B: Using DQ Labs

# 22. Tool Setup and Configuration:
# - Download and configure DQ Labs on your local environment.
# - Create a new data quality project.








# 23. Data Analysis Automation:
# - Apply DQ Labs for automating data profiling and quality checks.







# 24. Quality Rule Creation:
# - Create quality rules for detecting and handling duplicates or enforcing standards.








