In [4]:
from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import udf, avg, col,lit,call_udf,countDistinct,sproc,udf
from snowflake.snowpark.types import IntegerType, FloatType, StringType, BooleanType
import pandas as pd
from configs.config import snowflake_conn_prop_local as snowflake_conn_prop
import sys
import json
import platform
import os,requests
from pathlib import Path
import glob
from src.DataValidationContext import *


from snowflake.snowpark import version
# Show which Snowpark client version this notebook runs against.
print(version.VERSION)

# Open the Snowflake session from the locally configured connection properties.
session = Session.builder.configs(snowflake_conn_prop).create()

# Sanity check: report the warehouse, database and schema the session is using.
current_context = session.sql('select current_warehouse(), current_database(), current_schema()').collect()
print(current_context)

(0, 7, 0)
[Row(CURRENT_WAREHOUSE()='CLUSTER1', CURRENT_DATABASE()='NYCTAXI', CURRENT_SCHEMA()='TAXI')]


### Initialize

- This has been tested with Great Expectations 0.15.14. Version 0.15.17 had an issue: it tries to look for the `ipywidgets` package, which is available in the Snowflake Anaconda channel but is not listed in `information_schema.packages`.


In [5]:

# Local working directories (relative to the project root): one for the
# downloaded archives and one for their extracted contents.
PROJECT_HOME_DIR = '.'
LOCAL_TEMP_DIR = os.path.join(PROJECT_HOME_DIR, 'temp') 
LOCAL_LIB_DIR = os.path.join(LOCAL_TEMP_DIR, 'libs')
LOCAL_TARFile_DIR = os.path.join(LOCAL_TEMP_DIR, 'tarfiles')
# Archives to download. Every active entry is opened with `tarfile`, so it must
# be a tar archive.
LIB_URLS = [
    # Disabled alternative (0.15.17 wheel):
    #'https://files.pythonhosted.org/packages/9f/57/1539d783553f3d67cea1b55d7fe494373c5c0c9af689d4c0e0c2d3197739/great_expectations-0.15.17-py3-none-any.whl'
    'https://files.pythonhosted.org/packages/8e/9d/cecb12289f7967b15facf550a0bbb9c1e910968c3a61b91fd8cdb80aeb3c/great_expectations-0.15.14.tar.gz'
    
]
# Seconds to wait for the server (connect / between bytes) before giving up,
# so a stalled connection cannot hang the notebook forever.
DOWNLOAD_TIMEOUT_SECONDS = 60

import tarfile

for lib_url in LIB_URLS:
    # The archive file name is the last path segment of the URL.
    target_file = lib_url.split('/')[-1]
    
    local_lib_fl = f'{LOCAL_TARFile_DIR}/{target_file}'
    print(local_lib_fl)

    # Create the local directories for the archive and for its extraction.
    Path(LOCAL_TARFile_DIR).mkdir(parents=True, exist_ok=True)
    print(f'Create local dir: {LOCAL_TARFile_DIR}')

    Path(LOCAL_LIB_DIR).mkdir(parents=True, exist_ok=True)
    print(f'Create local dir: {LOCAL_LIB_DIR}')

    print(f'Downloading library from PyPI to {LOCAL_TARFile_DIR} ...')
    # Fetch first and fail fast on HTTP errors; the file is only written after
    # a successful response, so an error page is never saved as the archive.
    response = requests.get(lib_url, timeout=DOWNLOAD_TIMEOUT_SECONDS)
    response.raise_for_status()
    with open(local_lib_fl, "wb") as f:
        f.write(response.content)

    # Extract every downloaded archive (not just the last one) under
    # LOCAL_LIB_DIR/ge; the context manager closes the archive even if
    # extraction fails.
    # NOTE: the messages below print LOCAL_TARFile_DIR although the files are
    # unpacked under LOCAL_LIB_DIR/ge; the text is kept unchanged so it matches
    # the recorded cell output.
    with tarfile.open(local_lib_fl) as archive:
        print(f'Started Extracting GE tar file to {LOCAL_TARFile_DIR} ...')
        archive.extractall(f'{LOCAL_LIB_DIR}/ge')
    print(f'Done extracting GE tar file to {LOCAL_TARFile_DIR} ...')
                                    

./temp/tarfiles/great_expectations-0.15.14.tar.gz
Create local dir: ./temp/tarfiles
Create local dir: ./temp/libs
Downloading library from PyPI to ./temp/tarfiles ...
Started Extracting GE tar file to ./temp/tarfiles ...
Done extracting GE tar file to ./temp/tarfiles ...


In [6]:
# Locate the `great_expectations` package folder inside the extracted archive.

import glob

# Sort the matches so the choice is deterministic when more than one extracted
# version is present (the lexicographically last directory is used).
ge_extracted_dirs = sorted(glob.glob('./temp/libs/ge/great_expectations*'))
if not ge_extracted_dirs:
    # Fail here with a clear message instead of handing an empty import path
    # to the Snowpark session later on.
    raise FileNotFoundError(
        "No extracted great_expectations directory found under ./temp/libs/ge; "
        "run the download/extract cell first."
    )
ge_import_path = ge_extracted_dirs[-1] + '/great_expectations'
print(ge_import_path)


./temp/libs/ge/great_expectations-0.15.14/great_expectations


### Creating Python Stored Procedure

In [21]:
from configs.config import snowflake_conn_prop_local as snowflake_conn_prop
from src.DataValidationContext import GEDataValidationContext
from src.BatchRequest import getBatchRequest 
from src.Expectations import  createExpectationSuite, createExpectations
from src.RunLoadExpectations import runExpectaionValidation,loadValidationToDB

import json
import os


# (Re)create the stage referenced by the stored procedure's stage_location.
session.sql("create or replace stage phani_greatexpectation").collect()

# Reset the session's package list, then register the packages the stored
# procedure needs.
session.clear_packages()
session.add_packages(
    'pandas',
    'pycryptodomex',
    'boto3',
    'tzlocal',
    'tqdm',
    'requests',
    'ruamel.yaml',
    'ipython',
    'jsonpatch',
    'mistune',
    'jinja2',
    'jsonschema',
    'scipy',
    'altair',
    'Click',
    'colorama',
    'cryptography',
    'snowflake-snowpark-python',
    'sqlalchemy',
    'chardet',
    'asn1crypto',
)

# Reset the session's imports, then add the extracted great_expectations
# source folder followed by the project's own `src` and `configs` folders.
session.clear_imports()
for local_import in (ge_import_path, 'src', 'configs'):
    session.add_import(local_import)

@sproc(session=session,name="usp_generateGEValidationResults", replace=True, return_type=StringType(),input_types=[StringType(),StringType(),StringType(),StringType()], is_permanent=True, stage_location='@phani_greatexpectation/ge_AllLibs')
def generateGEValidationResults(session: Session,datasourcename:str,expecationsuitename:str,checkpointname:str,sftablename: str) -> str:
    """Run a Great Expectations validation over a sample of TAXI_TRIPS_MAT_VIEW.

    Registered as the permanent stored procedure
    ``usp_generateGEValidationResults``.

    Args:
        session: Snowpark session used to query the source view and passed on
            to ``loadValidationToDB``.
        datasourcename: Name handed to ``GEDataValidationContext`` and to the
            batch-request / validation helpers.
        expecationsuitename: Name of the expectation suite to create and run.
        checkpointname: Name passed to ``runExpectaionValidation`` as the
            checkpoint name.
        sftablename: Table name passed to ``loadValidationToDB`` for the
            validation result.

    Returns:
        The string ``'SUCCESS'`` once all steps have completed; any exception
        raised by a step propagates to the caller.
    """
    # Build the GE context inside the procedure.
    ge = GEDataValidationContext(datasourcename)
    context = ge.getContext()

    # Pull the first 2000 rows of the view into a pandas DataFrame.
    pd_df = session.sql("select top 2000 * from TAXI_TRIPS_MAT_VIEW").to_pandas()

    # Batch request used both when creating the expectations and when
    # running the validation.
    local_batch_request = getBatchRequest(context, datasourcename, pd_df)

    # Create the expectation suite, then the expectations themselves.
    createExpectationSuite(context, expecationsuitename)
    createExpectations(context, expecationsuitename, local_batch_request, pd_df)

    # Run the validation with the caller-supplied checkpoint name. The
    # previous code passed the literal string "checkpointname" here, which
    # ignored the argument.
    res = runExpectaionValidation(context, checkpointname, local_batch_request, expecationsuitename, datasourcename)

    # Load the validation result into the Snowflake table (per the original
    # note, the helper appends to the table — confirm in src.RunLoadExpectations).
    loadValidationToDB(session, res, sftablename)

    return 'SUCCESS'

    

In [19]:
# Invoke the stored procedure; the collected result is shown as the cell output.
call_statement = "call usp_generateGEValidationResults('PandasDataSource','TaxiExpecatation','TaxiCheckpoint','TaxiGEValidation')"
session.sql(call_statement).collect()

[Row(USP_GENERATEGEVALIDATIONRESULTS='SUCCESS')]