## Setup

In [1]:
import os
import yaml
import json
from pprint import pprint

import pandas as pd

import great_expectations as gx



In [2]:
# Globals
DATA_DIR = "app/data"
ANIME_DATA = os.path.join(DATA_DIR, "anime.csv")
RATING_DATA = os.path.join(DATA_DIR, "rating.csv")

In [3]:
def load_csv(path: str) -> pd.DataFrame:
    """
    Load a CSV file as a pandas DataFrame.

    Args:
    - path (str): The path to the CSV file.

    Returns:
    - pd.DataFrame: A pandas DataFrame containing the CSV data.

    Raises:
    - FileNotFoundError: If the file is not found at the specified path.

    """
    try:
        return pd.read_csv(path)
    except FileNotFoundError:
        print("File not found")

## Great Expectations configuration

In [4]:
# Set up
context = gx.get_context()

In [5]:
# Generate a Great Expectations configuration file by running:
! great_expectations init

Using v3 (Batch Request) API
[36m
  ___              _     ___                  _        _   _
 / __|_ _ ___ __ _| |_  | __|_ ___ __  ___ __| |_ __ _| |_(_)___ _ _  ___
| (_ | '_/ -_) _` |  _| | _|\ \ / '_ \/ -_) _|  _/ _` |  _| / _ \ ' \(_-<
 \___|_| \___\__,_|\__| |___/_\_\ .__/\___\__|\__\__,_|\__|_\___/_||_/__/
                                |_|
             ~ Always know what to expect from your data ~
[0m
This looks like an existing project that [32mappears complete![0m You are [32mready to roll.[0m

[0m

## Anime expectation

In [6]:
anime_data = load_csv(ANIME_DATA)

In [7]:
anime_data.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [8]:
anime_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [9]:
anime_data.rating.describe()

count    12064.000000
mean         6.473902
std          1.026746
min          1.670000
25%          5.880000
50%          6.570000
75%          7.180000
max         10.000000
Name: rating, dtype: float64

In [29]:
# Connect to data
anime_validator = context.sources.pandas_default.read_csv(
    ANIME_DATA
)

In [30]:
anime_validator.expect_column_values_to_be_between("rating", auto=True)




Generating Expectations:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "expectation_config": {
    "kwargs": {
      "column": "rating",
      "min_value": 1.67,
      "strict_max": false,
      "mostly": 1.0,
      "strict_min": false,
      "max_value": 10.0
    },
    "expectation_type": "expect_column_values_to_be_between",
    "meta": {
      "auto_generated_at": "20230330T162851.854831Z",
      "great_expectations_version": "0.16.3"
    }
  },
  "meta": {},
  "result": {
    "element_count": 12294,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 230,
    "missing_percent": 1.8708312998210508,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  }
}

In [12]:
# Create Expectations
anime_checkpoint = gx.checkpoint.SimpleCheckpoint( 
    name="anime_checkpoint",
    data_context=context,
    validator=anime_validator,
)

In [13]:
anime_checkpoint_result = anime_checkpoint.run()

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

In [14]:
# View results
anime_validation_result_identifier = anime_checkpoint_result.list_validation_result_identifiers()[0]
context.open_data_docs(resource_identifier=anime_validation_result_identifier)

## Rating expectation

In [16]:
rating_data = load_csv(ANIME_DATA)

In [17]:
rating_data.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [18]:
rating_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [19]:
rating_data.rating.describe()

count    12064.000000
mean         6.473902
std          1.026746
min          1.670000
25%          5.880000
50%          6.570000
75%          7.180000
max         10.000000
Name: rating, dtype: float64

In [20]:
# Connect to data
rating_validator = context.sources.pandas_default.read_csv(
    RATING_DATA
)

In [21]:
rating_validator.expect_column_values_to_be_between("rating", auto=True)




Generating Expectations:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "expectation_config": {
    "kwargs": {
      "column": "rating",
      "min_value": -1,
      "strict_max": false,
      "mostly": 1.0,
      "strict_min": false,
      "max_value": 10
    },
    "expectation_type": "expect_column_values_to_be_between",
    "meta": {
      "auto_generated_at": "20230330T162222.196090Z",
      "great_expectations_version": "0.16.3"
    }
  },
  "meta": {},
  "result": {
    "element_count": 7813737,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  }
}

In [22]:
# Create Expectations
rating_checkpoint = gx.checkpoint.SimpleCheckpoint( 
    name="rating_checkpoint",
    data_context=context,
    validator=rating_validator,
)

In [23]:
# View results
rating_validation_result_identifier = anime_checkpoint_result.list_validation_result_identifiers()[0]
context.open_data_docs(resource_identifier=rating_validation_result_identifier)

## Great Expectations configuration

In [None]:
# Generate a Great Expectations configuration file by running:
# ! great_expectations init

In [None]:
import pickle
with open("./anime.pkl", "wb") as f:
    pickle.dump(anime_data, f)


In [None]:
with open("./anime.pkl", "rb") as f:
        # Using pickle to load the model object from the binary file
        anime_data = pickle.load(f)

In [None]:
context = ge.data_context.DataContext()

In [None]:
# my_expectations = context.create_expectation_suite("anime_expectation")

In [None]:
print(context.list_expectation_suite_names())
print([datasource["name"] for datasource in context.list_datasources()])
print(context.list_checkpoints())

In [None]:
with open("great_expectations/great_expectations.yml", "r") as stream:
    try:
        ge_config = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)

pprint(ge_config["datasources"], indent=0)

## Anime expectation

In [None]:
with open("great_expectations/checkpoints/anime-checkpoint.yml", "r") as stream:
    try:
        anime_chkp_config = yaml.safe_load(stream)
        pprint(anime_chkp_config)
    except yaml.YAMLError as exc:
        print(exc)

In [None]:
!great_expectations check-config

In [None]:
context.open_data_docs()