=================================================

Great Expectations ecommerce order analysis

Nama  : Muhammad iqbal      

Pada bagian ini akan dilakukan validasi data untuk melihat kualitas data dengan menggunakan Great Expectations.
Adapun dataset yang dipakai adalah dataset mengenai performa pengiriman jasa pesan antar di platform luar negeri.

=================================================

# Import library

Dilakukan import library yang dibutuhkan untuk Great Expectations

In [None]:
# Install the pinned library versions (uncomment the line below to run it)

#!pip install -q "great-expectations==0.18.19" "numpy==1.24.3"

In [1]:
#Import library

import great_expectations as gx
from great_expectations.data_context import FileDataContext
import os

In [2]:
# Check the installed Great Expectations version

gx.__version__

'0.18.19'

# Create data context

Membuat data context sebagai titik masuk (entry point) utama untuk Great Expectations. Di sini saya akan membuatnya di folder baru bernama `great_expectations`

In [3]:
# Directory that will hold the Great Expectations project files
project_root_dir = "./great_expectations"
os.makedirs(project_root_dir, exist_ok=True)

# Create a file-backed data context rooted at that directory, then list the
# datasources currently registered in it
context = FileDataContext.create(project_root_dir=project_root_dir)
context.list_datasources()

[]

# Connect to datasource

Menghubungkan dataset ke great expectations sebagai datasource

In [4]:
# Name of the datasource; it must be unique among the context's datasources.
datasource_name = 'csv-data-delivery-iqbal'
# add_or_update_pandas (rather than add_pandas) keeps this cell re-runnable:
# the file-backed context persists the datasource, so running add_pandas a
# second time would be rejected because the name is already registered.
datasource = context.sources.add_or_update_pandas(datasource_name)

# Register the CSV file as a data asset of that datasource
asset_name = 'delivery-online-iqbal'
path_to_data = './P2M3_muhammad_iqbal_data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request that the validator uses to fetch the asset's data
batch_request = asset.build_batch_request()

# Create expectation suite

In [5]:
# Create (or update, if it already exists) the expectation suite
expectation_suite_name = 'expectation-delivery-dataset-iqbal'
context.add_or_update_expectation_suite(
    expectation_suite_name=expectation_suite_name
)

# Bind the batch request to that suite through a validator
validator = context.get_validator(
    expectation_suite_name=expectation_suite_name,
    batch_request=batch_request,
)

# Preview the first rows of the batch through the validator
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,order_id,customer_id,platform,order_date_&_time,delivery_time_minutes,product_category,order_value_inr,customer_feedback,service_rating,delivery_delay,refund_requested
0,ORD000001,CUST2824,JioMart,1900-01-01 00:19:29.500,30,Fruits & Vegetables,382,"Fast delivery, great service!",5,No,No
1,ORD000002,CUST1409,Blinkit,1900-01-01 00:54:29.500,16,Dairy,279,Quick and reliable!,5,No,No
2,ORD000003,CUST5506,JioMart,1900-01-01 00:21:29.500,25,Beverages,599,Items missing from order.,2,No,Yes
3,ORD000004,CUST5012,JioMart,1900-01-01 00:19:29.500,42,Beverages,946,Items missing from order.,2,Yes,Yes
4,ORD000005,CUST4657,Blinkit,1900-01-01 00:49:29.500,30,Beverages,334,"Fast delivery, great service!",5,No,No


# Expectations

Pada bagian ini akan dilakukan validasi data dengan beberapa expectations, di antaranya:

- to be unique
- to be between min_value and max_value
- to be in set
- to be in type list
- row count to be between min_value and max_value
- mean value to be between min_value and max_value
- regex match

In [6]:
# Expectation 1: every value in column `order_id` must be unique

validator.expect_column_values_to_be_unique(column='order_id')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "result": {
    "element_count": 100000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {}
}

In [7]:
# Expectation 2: every value in column `service_rating` must lie between
# 0 and 5

validator.expect_column_values_to_be_between(
    column='service_rating',
    min_value=0,
    max_value=5,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "result": {
    "element_count": 100000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {}
}

In [8]:
# Expectation 3: column `delivery_delay` may only contain "Yes" or "No"

validator.expect_column_values_to_be_in_set(
    column='delivery_delay',
    value_set=["Yes", "No"],
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "result": {
    "element_count": 100000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {}
}

In [9]:
# Expectation 4: column `order_value_inr` must have type int64 or float

validator.expect_column_values_to_be_in_type_list(
    column='order_value_inr',
    type_list=['int64', 'float'],
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "result": {
    "observed_value": "int64"
  },
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {}
}

In [10]:
# Expectation 5: the table must contain between 50000 and 100000 rows

validator.expect_table_row_count_to_be_between(
    min_value=50000,
    max_value=100000,
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "result": {
    "observed_value": 100000
  },
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {}
}

In [11]:
# Expectation 6: the mean of column `delivery_time_minutes` must lie
# between 20 and 30

validator.expect_column_mean_to_be_between(
    column='delivery_time_minutes',
    min_value=20,
    max_value=30,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "result": {
    "observed_value": 29.53614
  },
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {}
}

In [12]:
# Expectation 7: column `customer_id` must be `CUST` followed only by digits.
# The pattern ends with `+$` so that a value with trailing non-digit
# characters (e.g. "CUST12ab") is reported as unexpected instead of passing
# on its first digit alone.

validator.expect_column_values_to_match_regex('customer_id', r'^CUST[0-9]+$')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "result": {
    "element_count": 100000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {}
}

In [13]:
# Save the validator's expectations into the expectation suite;
# discard_failed_expectations=False keeps expectations that did not pass too

validator.save_expectation_suite(discard_failed_expectations=False)

In [14]:
# Build the Data Docs site for this data context

context.build_data_docs()

{'local_site': 'file://d:\\FTDS hacktiv8\\Phase 2\\Challenge and assigment\\project-m3\\great_expectations\\gx\\uncommitted/data_docs/local_site/index.html'}