# GREAT-EXPECTATIONS - notebook exploration

* Tutorial Source: http://webcache.googleusercontent.com/search?q=cache:https://towardsdatascience.com/getting-started-with-great-expectations-a-guide-to-data-validation-in-python-95a8ffc2b747
* https://legacy.docs.greatexpectations.io/en/latest/guides/tutorials/explore_expectations_in_a_notebook.html

**ROTEIRO**

1. Criação do `PandasDataset` (via `gx.read_csv`)
2. Expectations
    * Match column names
    * Match column data types
    * Not null

In [2]:
import pandas
import great_expectations as gx

In [3]:
# Load the test CSV through Great Expectations so the resulting frame
# exposes the expect_* validation methods used in the cells below.
# The path uses forward slashes: they resolve on Windows, Linux and macOS,
# whereas a backslash separator is only understood on Windows.
test_df = gx.read_csv('data/test.csv')
test_df.head()

Unnamed: 0,beds,baths,size,size_units,lot_size,lot_size_units,zip_code,price
0,3,3.0,2850.0,sqft,4200.0,sqft,98119,1175000.0
1,4,5.0,3040.0,sqft,5002.0,sqft,98106,1057500.0
2,3,1.0,1290.0,sqft,6048.0,sqft,98125,799000.0
3,3,2.0,2360.0,sqft,0.28,acre,98188,565000.0
4,3,3.5,1942.0,sqft,1603.0,sqft,98107,1187000.0


# Expectations

In [4]:
# Column names as defined by the template
l_train_column_names = [
    'id',
    'neighbourhood',
    'room_type',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'last_review',
    'availability_365',
]

# Compare the dataset's columns against the template set
test_df.expect_table_columns_to_match_set(column_set=l_train_column_names)

{
  "success": false,
  "result": {
    "observed_value": [
      "baths",
      "beds",
      "lot_size",
      "lot_size_units",
      "price",
      "size",
      "size_units",
      "zip_code"
    ],
    "details": {
      "mismatched": {
        "unexpected": [
          "baths",
          "beds",
          "lot_size",
          "lot_size_units",
          "size",
          "size_units",
          "zip_code"
        ],
        "missing": [
          "availability_365",
          "id",
          "last_review",
          "minimum_nights",
          "neighbourhood",
          "number_of_reviews",
          "room_type"
        ]
      }
    }
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [5]:
# The price column must not contain any null values
test_df.expect_column_values_to_not_be_null(column='price')

{
  "success": true,
  "result": {
    "element_count": 505,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [6]:
# Summarize the dataset: per-column non-null counts and dtypes
test_df.info()

<class 'great_expectations.dataset.pandas_dataset.PandasDataset'>
RangeIndex: 505 entries, 0 to 504
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   beds            505 non-null    int64  
 1   baths           505 non-null    float64
 2   size            505 non-null    float64
 3   size_units      505 non-null    object 
 4   lot_size        428 non-null    float64
 5   lot_size_units  428 non-null    object 
 6   zip_code        505 non-null    int64  
 7   price           505 non-null    float64
dtypes: float64(4), int64(2), object(2)
memory usage: 31.7+ KB


In [7]:
# Expectation check that fails: lot_size contains null values
test_df.expect_column_values_to_not_be_null(column='lot_size')

{
  "success": false,
  "result": {
    "element_count": 505,
    "unexpected_count": 77,
    "unexpected_percent": 15.247524752475247,
    "unexpected_percent_total": 15.247524752475247,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [10]:
# Type check on the baths column, run with two candidate type lists.
# Only the last expression of a cell is rendered automatically, so the first
# result is passed to display() explicitly; otherwise it would be computed
# and silently discarded. display() is provided by the IPython kernel.
display(test_df.expect_column_values_to_be_in_type_list('baths', ['int']))

# Same check, now also accepting float; rendered as the cell's output value.
test_df.expect_column_values_to_be_in_type_list('baths', ['int', 'float'])

{
  "success": true,
  "result": {
    "observed_value": "float64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [22]:
# Maximum value of the beds column must lie within the interval [3, 7]
test_df.expect_column_max_to_be_between(
    column='beds',
    min_value=3,
    max_value=7)

{
  "success": false,
  "result": {
    "observed_value": 9,
    "element_count": 505,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## DATA DOCS

In [13]:
# Inspect the concrete class of the object returned by gx.read_csv
type(test_df)

great_expectations.dataset.pandas_dataset.PandasDataset

In [23]:
# Retrieve the expectation suite held by test_df
# NOTE(review): the recorded output lists only the expectations that
# succeeded above; failed ones appear to be left out by default - confirm.
test_df.get_expectation_suite()

{
  "expectation_suite_name": "default",
  "ge_cloud_id": null,
  "expectations": [
    {
      "expectation_type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "price"
      },
      "meta": {}
    },
    {
      "expectation_type": "expect_column_values_to_be_in_type_list",
      "kwargs": {
        "column": "baths",
        "type_list": [
          "int",
          "float"
        ]
      },
      "meta": {}
    }
  ],
  "data_asset_type": "Dataset",
  "meta": {
    "great_expectations_version": "0.17.23"
  }
}

In [24]:
# Same suite, converted to a plain Python dict via to_json_dict()
test_df.get_expectation_suite().to_json_dict()

{'expectation_suite_name': 'default',
 'ge_cloud_id': None,
 'expectations': [{'expectation_type': 'expect_column_values_to_not_be_null',
   'kwargs': {'column': 'price'},
   'meta': {}},
  {'expectation_type': 'expect_column_values_to_be_in_type_list',
   'kwargs': {'column': 'baths', 'type_list': ['int', 'float']},
   'meta': {}}],
 'data_asset_type': 'Dataset',
 'meta': {'great_expectations_version': '0.17.23'}}