# `snopy` - data ingestion from an external stage (AWS)

## Imports

In [1]:
import os
import pandas as pd

from snopy import snopy_connect
from snowflake.connector.errors import ProgrammingError

## Credentials

In [2]:
# Snowflake credentials come from environment variables so that no secret is
# stored in the notebook; a missing variable raises KeyError.
sf_username = os.environ['SNOWFLAKE_USER']
sf_password = os.environ['SNOWFLAKE_PASSWORD']
sf_account = os.environ['SNOWFLAKE_ACCOUNT']

## Connecting to your Snowflake account

In [3]:
# `sc` is the connection object every later cell goes through.
sc = snopy_connect(username=sf_username, password=sf_password, account=sf_account)

## Environment setup

In [4]:
# Session context used by the rest of the notebook.
role_name, warehouse_name = 'ACCOUNTADMIN', 'COMPUTE_WH'
database_name, schema_name = 'SNOPY', 'INGESTION_SCHEMA'

sc.role.use(role_name, silent=True)
sc.warehouse.use(warehouse_name, silent=True)

# NOTE(review): `or_replace=True` presumably issues CREATE OR REPLACE, which
# would discard an existing database/schema of the same name -- confirm before
# running against a shared account.
sc.database.create(database_name, or_replace=True, silent=True)
sc.database.use(database_name, silent=True)
sc.schema.create(schema_name, or_replace=True, silent=True)
sc.schema.use(schema_name, silent=True)

In [5]:
# Show the active role, database, schema and warehouse as a sanity check
# of the setup cell above.
sc.get_environment()

{'role': 'ACCOUNTADMIN',
 'database': 'SNOPY',
 'schema': 'INGESTION_SCHEMA',
 'warehouse': 'COMPUTE_WH'}

## Creating file format

In [6]:
# Register a named JSON file format. The returned dict (results, description
# and the executed statement) is kept so it can be displayed as the cell output.
results_ff = sc.file_format.create('my_json_format', 'JSON', or_replace=True)
results_ff

{'results': [('File format MY_JSON_FORMAT successfully created.',)],
 'description': [ResultMetadata(name='status', type_code=2, display_size=None, internal_size=16777216, precision=None, scale=None, is_nullable=True)],
 'statement': 'CREATE OR REPLACE FILE FORMAT my_json_format TYPE = JSON'}

## Creating external stage for public data

In [7]:
# The URL is placed verbatim into the generated CREATE STAGE statement, so the
# value carries its own single quotes.
public_stage_url = "'s3://amazon-reviews-ml/json/dev/dataset_en_dev.json'"

sc.stage.create(
    stage_name='external_aws_stage_public',
    or_replace=True,
    file_format_name='my_json_format',
    url=public_stage_url,
)

{'results': [('Stage area EXTERNAL_AWS_STAGE_PUBLIC successfully created.',)],
 'description': [ResultMetadata(name='status', type_code=2, display_size=None, internal_size=16777216, precision=None, scale=None, is_nullable=True)],
 'statement': "CREATE OR REPLACE STAGE external_aws_stage_public FILE_FORMAT = my_json_format url = 's3://amazon-reviews-ml/json/dev/dataset_en_dev.json'"}

## Creating a table

In [8]:
# Target table for the public reviews dataset: one typed column per JSON key
# extracted during the load.
query_create_table = """
    CREATE OR REPLACE TABLE reviews_dev_public (
      language VARCHAR
      , product_category VARCHAR
      , product_id VARCHAR
      , review_body VARCHAR
      , review_id VARCHAR
      , review_title VARCHAR
      , reviewer_id VARCHAR
      , stars INT
    );
"""

# Run as raw SQL for now -- a dedicated Table API is planned for snopy.
create_table_response = sc.execute(query_create_table)
create_table_response['results']

[('Table REVIEWS_DEV_PUBLIC successfully created.',)]

## Loading data from public external stage to a table

Because the data is in JSON format, a transformation is required during loading; otherwise each record would end up in a single VARIANT column (for more, [see the documentation](https://docs.snowflake.com/en/sql-reference/data-types-semistructured.html)).

In [9]:
# SELECT applied to the staged files while loading. `$1` is the first (and,
# for JSON, only) column of each staged record; `$1:<key>::<type>` extracts the
# given key from it and casts the value to the target column type.
transformation_statement = """
    SELECT
        $1:language::varchar AS language
        , $1:product_category::varchar AS product_category
        , $1:product_id::varchar AS product_id
        , $1:review_body::varchar AS review_body
        , $1:review_id::varchar AS review_id
        , $1:review_title::varchar AS review_title
        , $1:reviewer_id::varchar AS reviewer_id
        , $1:stars::int AS stars
    FROM @external_aws_stage_public
"""

# The transformation query is passed as the COPY source in place of a bare
# stage reference.
sc.copy_into(
    table_name='reviews_dev_public',
    source_stage=transformation_statement,
    silent=True
)

## Data querying straight to Pandas DataFrame

In [10]:
# Pull the freshly loaded table into a DataFrame and preview the first rows.
reviews_query = 'SELECT * FROM reviews_dev_public'
reviews_dev_public = sc.query_pd(reviews_query)
reviews_dev_public.head()

Unnamed: 0,LANGUAGE,PRODUCT_CATEGORY,PRODUCT_ID,REVIEW_BODY,REVIEW_ID,REVIEW_TITLE,REVIEWER_ID,STARS
0,en,baby_product,product_en_0878845,Pathetic design of the caps. Very impractical ...,en_0968227,Not worth the price and very bad cap design,reviewer_en_0987470,1
1,en,shoes,product_en_0004522,"Shoes were purchased on March 6, 2019. My wife...",en_0830781,Garbage!,reviewer_en_0731158,1
2,en,office_product,product_en_0060687,It's taken me 1 whole year to set this thing u...,en_0277954,I do not recommend this printer,reviewer_en_0793876,1
3,en,office_product,product_en_0311791,Each cartridge printed once. Both dried up in ...,en_0316499,Don't purchase these refurbished cartridges!,reviewer_en_0837288,1
4,en,baby_product,product_en_0472877,No light hard to see,en_0320665,Not worth,reviewer_en_0878169,1


---

## Creating storage integration for AWS S3 bucket connection with Snowflake

For more about connecting your private cloud storage with Snowflake, see the documentation pages below:

- [Loading data to Snowflake from AWS S3](https://docs.snowflake.com/en/user-guide/data-load-s3.html)
- [Loading data to Snowflake from Google Cloud Storage](https://docs.snowflake.com/en/user-guide/data-load-gcs.html)
- [Loading data to Snowflake from Azure](https://docs.snowflake.com/en/user-guide/data-load-azure.html)
- [External stage create command - necessary parameters](https://docs.snowflake.com/en/sql-reference/sql/create-stage.html#external-stage-parameters-externalstageparams)

Bear in mind that, by default, **only users with the ACCOUNTADMIN role** (or a role that has been granted the global `CREATE INTEGRATION` privilege) can create storage integration objects.

The cells below follow `Option 1` from the [list of available approaches](https://docs.snowflake.com/en/user-guide/data-load-s3-config.html). Snowflake highly recommends this option, so we follow that advice here.

Some of the parameter values shown are placeholders for security reasons; the cells were originally executed with real values. If you follow the instructions in the Snowflake documentation, you shouldn't have any issues replacing them with your own.

In [None]:
# Placeholder bucket and role ARN -- replace with your own values. The ARN
# value carries its own single quotes.
allowed_locations = ['s3://snopy-snowflake-data/']
aws_role_arn = "'arn:aws:iam::112233445566:role/SnopyDummyRole'"

sc.storage_integration.create(
    storage_integration_name='storage_integration_aws_snopy_sf_data',
    storage_provider='S3',
    storage_allowed_locations=allowed_locations,
    STORAGE_AWS_ROLE_ARN=aws_role_arn,
    or_replace=True,
    silent=True,
)

## Setting up proper IAM access

Policy code is taken directly from [Snowflake documentation](https://docs.snowflake.com/en/user-guide/data-load-s3-config-storage-integration.html).

In [None]:
# Tip: uncomment the next line to stop pandas truncating long values, which
# makes them easier to copy-paste.
# pd.set_option('display.max_colwidth', None)
integration_description = sc.query_pd('DESC INTEGRATION storage_integration_aws_snopy_sf_data;')

# Keep only the first and third columns; drop `.head(2)` to see all rows,
# including the values needed for the IAM trust policy.
integration_description.iloc[:, [0, 2]].head(2)

Using the values returned by the storage integration description (in particular `STORAGE_AWS_IAM_USER_ARN` and `STORAGE_AWS_EXTERNAL_ID`), go to the AWS console and update your IAM role's trust policy.

When this is done, you can proceed forward.

## Creating external stage for private data

In [None]:
# Stage over the private bucket; access goes through the storage integration
# rather than through credentials embedded in the stage definition.
private_stage_url = "'s3://snopy-snowflake-data/'"

sc.stage.create(
    stage_name='external_aws_stage_private',
    STORAGE_INTEGRATION='storage_integration_aws_snopy_sf_data',
    url=private_stage_url,
    file_format_name='my_json_format',
    or_replace=True,
)

## Listing external stage objects

In [None]:
# List the files visible through the private stage; a quick way to verify that
# the storage integration and IAM trust policy are set up correctly.
sc.query_pd('LIST @external_aws_stage_private')

## Creating a table

In [None]:
# Target table for the data held in the private stage.
query_create_table = """
    CREATE OR REPLACE TABLE noaa_climatology_ext (
        id VARCHAR
        , date DATETIME
        , element VARCHAR
        , data_value INT
        , m_flag VARCHAR
        , q_flag VARCHAR
        , s_flag VARCHAR
        , obstime TIMESTAMP
    );
"""

# Run as raw SQL and show only the status rows from the response.
create_table_response = sc.execute(query_create_table)
create_table_response['results']

## Loading data from private external stage to a table

In [None]:
# Load the staged files straight into the table, with no transformation query.
# NOTE(review): the stage was created with the JSON file format
# `my_json_format`, and this notebook states earlier that JSON loads need a
# transformation to avoid landing in a single VARIANT column. This plain COPY
# targets an eight-column table -- confirm the format of the staged files, or
# pass a SELECT transformation as done for the public-stage load.
sc.copy_into(
    table_name='noaa_climatology_ext',
    source_stage='@external_aws_stage_private',
    silent=True
)

## Querying data straight to Pandas DataFrame

In [None]:
# Cap the displayed column width at 100 characters for the preview below.
pd.set_option('display.max_colwidth', 100)

# Store the result under a name that matches the queried table. The previous
# name (`reviews_dev_public`) was a copy-paste leftover that mislabelled this
# data and overwrote the reviews DataFrame loaded earlier in the notebook.
noaa_climatology_ext = sc.query_pd('SELECT * FROM noaa_climatology_ext')
noaa_climatology_ext.head()

## Dropping storage integration

In [None]:
# Clean-up: remove the storage integration created earlier in this notebook.
sc.storage_integration.drop('storage_integration_aws_snopy_sf_data')

## Closing the connection

In [None]:
# Close the connection once all work is done; `sc` should not be used after this.
sc.close_connection()