# `snopy` - csv data ingestion from a local stage

## Imports

In [1]:
import os
from snopy import snopy_connect
from snowflake.connector.errors import ProgrammingError

## Credentials

In [2]:
sf_username=os.environ['SNOWFLAKE_USER']
sf_password=os.environ['SNOWFLAKE_PASSWORD']
sf_account=os.environ['SNOWFLAKE_ACCOUNT']

## Connecting to your Snowflake account

In [3]:
sc = snopy_connect(
    username=sf_username,
    password=sf_password,
    account=sf_account
)

## Environment setup

In [4]:
sc.database.get_current() is None

True

In [5]:
sc.warehouse.use('COMPUTE_WH')

{'results': [('Statement executed successfully.',)],
 'description': [ResultMetadata(name='status', type_code=2, display_size=None, internal_size=16777216, precision=None, scale=None, is_nullable=True)],
 'statement': 'USE WAREHOUSE COMPUTE_WH'}

In [6]:
sc.role.use('ACCOUNTADMIN', silent=True)

In [7]:
sc.database.create('SNOPY', or_replace=True, silent=True)
sc.database.use('SNOPY', silent=True)
sc.schema.create('INGESTION_SCHEMA', or_replace=True, silent=True)
sc.schema.use('INGESTION_SCHEMA', silent=True)

In [8]:
sc.get_environment()

{'role': 'ACCOUNTADMIN',
 'database': 'SNOPY',
 'schema': 'INGESTION_SCHEMA',
 'warehouse': 'COMPUTE_WH'}

## Creating file format

In [9]:
results_ff = sc.file_format.create(
    'my_csv_format', 
    'CSV', 
    or_replace=True,
    FIELD_DELIMITER="','",
    SKIP_HEADER=1
)

results_ff

{'results': [('File format MY_CSV_FORMAT successfully created.',)],
 'description': [ResultMetadata(name='status', type_code=2, display_size=None, internal_size=16777216, precision=None, scale=None, is_nullable=True)],
 'statement': "CREATE OR REPLACE FILE FORMAT my_csv_format TYPE = CSV FIELD_DELIMITER = ',' SKIP_HEADER = 1"}

## Creating file format with wrong parameter (not applicable for JSON file format, see [docs](https://docs.snowflake.com/en/sql-reference/sql/create-file-format.html))

In [10]:
try:
    sc.file_format.create('my_invalid_csv_format', 'CSV', or_replace=True, ENABLE_OCTAL = "TRUE")
except ProgrammingError as pe:
    print('Something went wrong!')
    print(pe)

Something went wrong!
002135 (42601): SQL compilation error:
Option ENABLE_OCTAL is not valid for file format type CSV.


## Dropping file format

In [11]:
# sc.file_format.drop('my_csv_format', if_exists=True)

## Creating internal stage

In [12]:
sc.stage.create(
    stage_name='my_internal_stage', 
    or_replace=True,
    file_format_name='my_csv_format'
)

{'results': [('Stage area MY_INTERNAL_STAGE successfully created.',)],
 'description': [ResultMetadata(name='status', type_code=2, display_size=None, internal_size=16777216, precision=None, scale=None, is_nullable=True)],
 'statement': 'CREATE OR REPLACE STAGE my_internal_stage FILE_FORMAT = my_csv_format'}

## Putting data on the stage

In [13]:
sc.stage.put(
    filepath='file://data/noaa-ghcn-pds_2021.csv',
    internal_stage_name='@my_internal_stage',
    overwrite=True,
    silent=True
)

## Showing files inside stage

In [14]:
result_list_stage = sc.stage.list('@my_internal_stage')

print(result_list_stage['results'])

[('my_internal_stage/noaa-ghcn-pds_2021.csv.gz', 272, 'b46021717b119761e61ee3d1eb708d50', 'Thu, 15 Sep 2022 15:34:30 GMT')]


## Creating a table

It's possible to do something like `CREATE TABLE AS SELECT ... FROM @stage`, although it's highly not recommended! With that approach we don't keep file loading history (which Snowflake is capable to do and make a lot of use from).

In [15]:
query_create_table = """
    CREATE OR REPLACE TABLE noaa_climatology (
        id VARCHAR
        , date DATETIME
        , element VARCHAR
        , data_value INT
        , m_flag VARCHAR
        , q_flag VARCHAR
        , s_flag VARCHAR
        , obstime TIMESTAMP
    );
"""

# No worries, Table API will be available soon too!
sc.execute(query_create_table)['results']

[('Table NOAA_CLIMATOLOGY successfully created.',)]

## Loading data from internal stage to a table

In [16]:
sc.copy_into(
    table_name='noaa_climatology',
    source_stage='@my_internal_stage',
    silent=True
)

## Running the same command again 

No harm as `COPY INTO` tracks which files have been loaded already

In [17]:
sc.copy_into(
    table_name='noaa_climatology',
    source_stage='@my_internal_stage',
    silent=True
)

## Data querying straight to Pandas DataFrame

In [18]:
data_noaa = sc.query_pd('SELECT * FROM NOAA_CLIMATOLOGY')
data_noaa.head()

Unnamed: 0,ID,DATE,ELEMENT,DATA_VALUE,M_FLAG,Q_FLAG,S_FLAG,OBSTIME
0,AE000041196,1970-08-22 21:55:01,TMAX,278,,,S,
1,AE000041196,1970-08-22 21:55:01,PRCP,0,D,,S,
2,AE000041196,1970-08-22 21:55:01,TAVG,214,H,,S,
3,AEM00041194,1970-08-22 21:55:01,TMAX,266,,,S,
4,AEM00041194,1970-08-22 21:55:01,TMIN,178,,,S,


## Closing the connection

In [19]:
sc.close_connection()