# Introduction to Snowpark for Python

A simple demo of how Snowpark for Python can be used.

You need to load the Campaign spend data (see the 00_load_demo_data notebook) and have a creds.json file with your credentials.

In [None]:
# Snowpark
import snowflake.snowpark as S
from snowflake.snowpark import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
from snowflake.snowpark import Window

# Print the version of the installed Snowpark library (read from the module's __version__ attribute)
print(f"Using Snowpark: {S.__version__}")

In [None]:
import json

# Import from the public IPython.display module and bring in display explicitly,
# so the call below does not depend on a name injected by the notebook environment.
from IPython.display import HTML, display

# Make sure we do not get line breaks when doing show on wide dataframes
display(HTML("<style>pre { white-space: pre !important; }</style>"))

import seaborn as sns
import matplotlib.pyplot as plt
import sqlparse

Create a connection to Snowflake. Snowpark supports the following authentication methods:
* Username and password
* externalbrowser (Okta, ADFS, or any other SAML 2.0-compliant identity provider (IdP))
* oauth
* Key pair

This example is using a JSON file with the following structure
```
{
    "account":"MY SNOWFLAKE ACCOUNT",
    "user": "MY USER",
    "password":"MY PASSWORD",
    "role":"MY ROLE",
    "warehouse":"MY WH",
    "database":"MY DB",
    "schema":"MY SCHEMA"
}

```

In [None]:
# Read the connection parameters from the JSON credentials file in the parent directory.
# JSON text is UTF-8, so pin the encoding instead of relying on the platform default.
with open('../creds.json', encoding='utf-8') as f:
    connection_parameters = json.load(f)

In [None]:
# Configure a session builder with the loaded connection parameters and open the session
session_builder = Session.builder.configs(connection_parameters)
session = session_builder.create()

# Turn on the SQL simplifier for this session
session.sql_simplifier_enabled = True

In [None]:
# Capture the schema, role and warehouse of the freshly created session;
# the schema and warehouse are used again later to restore the context.
connect_schema = session.get_fully_qualified_current_schema()
print(f"Current schema: {connect_schema}")

connect_role = session.get_current_role()
print(f"Current role: {connect_role}")

connect_wh = session.get_current_warehouse()
print(f"Current warehouse: {connect_wh}")

The parameters provided when creating a session set the context, i.e. database, schema, virtual warehouse and role. The context can be changed using the **use_** functions on the session object.

Changing the active schema to INFORMATION_SCHEMA

In [None]:
# Switch the session's active schema to INFORMATION_SCHEMA
session.use_schema("INFORMATION_SCHEMA")
# Last expression of the cell: displays the schema that is now current
session.get_fully_qualified_current_schema()

We can write SQL using the **sql** function on the session object. If we want the SQL to execute on Snowflake we need to use an action method like **show()** or **collect()**.

In [None]:
# Run SHOW WAREHOUSES through the session; show() is the action that executes it and prints the result
session.sql("SHOW WAREHOUSES").show()

In [None]:
# NOTE: the string below is a placeholder, not a real warehouse name.
# Replace it with one of the names listed by SHOW WAREHOUSES before running this cell.
session.use_warehouse("< A NAME FROM THE OUTPUT OF ABOVE >")
# Last expression of the cell: displays the warehouse that is now current
session.get_current_warehouse()

Set schema and WH to the ones we defined in our connection

In [None]:
# Restore the warehouse and schema that were captured right after connecting.
# The captured values are passed directly; wrapping them in an f-string added nothing.
session.use_warehouse(connect_wh)
session.use_schema(connect_schema)
print(f"Current schema: {session.get_fully_qualified_current_schema()}, current warehouse:  {session.get_current_warehouse()}")

Define a Snowpark DataFrame based on an existing table/view in Snowflake

In [None]:
# Define a Snowpark DataFrame over the CAMPAIGN_SPEND table (referenced by its unqualified name)
df_campaign_spend = session.table("CAMPAIGN_SPEND")

In [None]:
# Display the SQL that Snowpark has generated for this DataFrame
df_campaign_spend.queries

In [None]:
# show() is an action: it runs the DataFrame's SQL and prints the returned rows
df_campaign_spend.show()

Aggregating a DataFrame

In [None]:
# Total cost per year and channel, sorted by year.
# The grouping expression is not aliased, so the sort refers to the column as "YEAR(DATE)".
df_spend_yearly = (
    df_campaign_spend
    .group_by(F.year("DATE"), "CHANNEL")
    .sum("TOTAL_COST")
    .sort("YEAR(DATE)")
)


Using the **sqlparse** library to generate a nicer print of the DataFrame SQL

In [None]:
# Take the first generated statement and pretty-print it with sqlparse
spend_yearly_sql = df_spend_yearly.queries['queries'][0]
print(sqlparse.format(spend_yearly_sql, reindent=True))

In [None]:
# Execute the aggregation and print the result
df_spend_yearly.show()

If we would like to plot the data we need to use a third-party library, like seaborn, that supports a Pandas DataFrame as input.
A Pandas DataFrame can be created with Snowpark using the **to_pandas** function.

In [None]:
# Pull the aggregated result into a local pandas DataFrame so seaborn can plot it
pd_data = df_spend_yearly.to_pandas()

fig, ax = plt.subplots(figsize=(16, 5))
# One bar per year and channel; the column names are the unaliased ones produced by the aggregation
sns.barplot(
    data=pd_data,
    x="YEAR(DATE)", y="SUM(TOTAL_COST)", hue="CHANNEL", ax=ax
)
# Label the figure so it can be read on its own
ax.set(title="Total campaign cost per year and channel", xlabel="Year", ylabel="Total cost")
plt.show()

Of course, multiple aggregations can be done at the same time

In [None]:
# Sum and average of TOTAL_COST, computed together per year and channel
yearly_channel_metrics = [
    F.sum("TOTAL_COST").as_("TOTAL_COST"),
    F.avg("TOTAL_COST").as_("AVG_COST"),
]
df_campaign_spend.group_by(F.year("DATE"), "CHANNEL").agg(yearly_channel_metrics).show()

In [None]:
# Same aggregation as before, keeping only the groups whose average cost is above 840.7
(
    df_campaign_spend
    .group_by(F.year("DATE"), "CHANNEL")
    .agg([
        F.sum("TOTAL_COST").as_("TOTAL_COST"),
        F.avg("TOTAL_COST").as_("AVG_COST"),
    ])
    .filter(F.col("AVG_COST") > 840.7)
    .show()
)


To summarise a column for the whole table

In [None]:
# Aggregate without group_by: a single TOTAL_COST sum over the whole table
df_campaign_spend.select(F.sum("TOTAL_COST").as_("TOTAL_COST")).show()

By using Window functions we can calculate things like a running sum

In [None]:
# Window ordered by DATE whose frame runs from the first row up to and including the current row
running_window = (
    Window.order_by(F.col("DATE"))
    .rows_between(Window.UNBOUNDED_PRECEDING, Window.CURRENT_ROW)
)

# Sum of TOTAL_COST over that frame gives the running total
running_sum = F.sum("TOTAL_COST").over(running_window)
df_cs_running = df_campaign_spend.with_column("RUNNING_SUM", running_sum)
df_cs_running.show()

In [None]:
# Display the SQL generated for the running-sum DataFrame
df_cs_running.queries

Or using lag to get values from rows before the current

In [None]:
# lag(..., 1) picks TOTAL_COST from the row directly before the current one in DATE order
date_ordered = Window.order_by(F.col("DATE"))
previous_cost = F.lag(F.col("TOTAL_COST"), 1).over(date_ordered)
df_campaign_spend.with_column("PREVIOUS_MONTH_TOTAL_COST", previous_cost).show()

Cross database joins work the same as with SQL in Snowflake, even with data from the Snowflake Marketplace!

Before running below you need to go to Snowsight->Marketplace
* Search for "Global Weather & Climate Data for BI"
* Click on get
* Click options
* Set Database name: WEATHER and select at least the PUBLIC role
* Click GET and Done

In [None]:
# Daily weather history from the WEATHER database, restricted to postal code 30170
weather_history = session.table("WEATHER.STANDARD_TILE.HISTORY_DAY")
df_weather = weather_history.filter(F.col("POSTAL_CODE") == "30170")
df_weather.show()

Add a new column that converts AVG_TEMPERATURE_AIR_2M_F to Celsius

In [None]:
# Fahrenheit to Celsius: (F - 32) * 5 / 9, rounded to two decimals
fahrenheit = F.col("AVG_TEMPERATURE_AIR_2M_F")
celsius = F.round((fahrenheit - F.lit(32)) * F.lit(5) / F.lit(9), 2)
df_weather_new_col = df_weather.with_column("AVG_TEMPERATURE_AIR_2M_C", celsius)
df_weather_new_col.show()

Aggregate by day, in some cases there might be multiple reads per day

In [None]:
# One row per DATE_VALID_STD carrying the average Celsius temperature of that day
daily_avg_temperature = F.avg(F.col("AVG_TEMPERATURE_AIR_2M_C")).as_("AVG_TEMPERATURE")
df_weather_day = df_weather_new_col.group_by(F.col("DATE_VALID_STD")).agg(daily_avg_temperature)
df_weather_day.show()

Join aggregated weather data with campaign spend data

In [None]:
# Match each campaign spend row with the weather aggregate of the same day
same_day = F.col("DATE") == F.col("DATE_VALID_STD")
df_campaign_weather = df_campaign_spend.join(df_weather_day, same_day)
df_campaign_weather.show()

In [None]:
# Pretty-print the first SQL statement generated for the joined DataFrame
campaign_weather_sql = df_campaign_weather.queries['queries'][0]
print(sqlparse.format(campaign_weather_sql, reindent=True))

We can add new columns to a DataFrame that return different values based on conditions

In [None]:
# iff(condition, a, b): "Brrrr!" when the average temperature is below zero, otherwise "Not really"
cold_label = F.iff(F.col("AVG_TEMPERATURE") < 0, "Brrrr!", "Not really")
df_new_col = df_campaign_weather.with_column("IS_IT_COLD", cold_label)
df_new_col.show()

Everything we have done so far is only logical changes, represented by a SQL statement

In [None]:
# Pretty-print the first SQL statement that represents all the transformations so far
new_col_sql = df_new_col.queries['queries'][0]
print(sqlparse.format(new_col_sql, reindent=True))

We can do more complex conditions using **when**

In [None]:
# Whole months between a campaign/channel's last day and today
months_since_last_day = F.datediff('month', F.col("LAST_DAY"), F.current_date())

# Bucket the age of the last day: up to 5 months -> L5, 6 to 10 months -> L10, anything else -> LB
last_period = (
    F.when(months_since_last_day <= 5, 'L5')
    .when((months_since_last_day > 5) & (months_since_last_day <= 10), 'L10')
    .otherwise("LB")
)

# Latest DATE per campaign and channel, with the bucket added as LAST_PERIOD
df_check = (
    df_new_col
    .group_by("CAMPAIGN", "CHANNEL")
    .agg(F.max("DATE").as_("LAST_DAY"))
    .with_column("LAST_PERIOD", last_period)
)

df_check.filter(F.col("CAMPAIGN") == F.lit('spring_break')).show()

Pulling back the value of a specific column to do local checks

In [None]:
# Fetch LAST_PERIOD for one specific campaign/channel combination
check_rows = (
    df_check
    .filter((F.col("CAMPAIGN") == F.lit('spring_break')) & (F.col("CHANNEL") == F.lit('social_media')))
    .select(F.col("LAST_PERIOD"))
    .collect()
)

# collect() gives back an empty list when nothing matches, so guard before indexing
check_val = check_rows[0][0] if check_rows else None

if check_val == 'L5':
    print(f'ok, we got {check_val}')
elif check_val == 'L10':
    print(f'Not so ok, we got {check_val}')
elif check_val == 'LB':
    print(f'This is bad, we got {check_val}')
else:
    # No matching row, or a value outside the three known buckets
    print(f'No known LAST_PERIOD found, we got {check_val}')

If we want to save the data we can use **save_as_table**, if we just want to apply the logic on data using SQL we can use **create_or_replace_view**

In [None]:
# Persist the DataFrame as table my_demo_table, using the writer's "overwrite" mode
df_new_col.write.mode("overwrite").save_as_table("my_demo_table")

In [None]:
# Define a DataFrame over the table that was just saved and print its rows
df_new_tbl = session.table("my_demo_table")
df_new_tbl.show()

In [None]:
# Display the first SQL statement generated for the table-backed DataFrame
df_new_tbl.queries['queries'][0]

Tables can be updated using Snowpark, with or without a condition (condition can be based on another dataframe)

In [None]:
# Set IS_IT_COLD to "Too hot!" on the rows where AVG_TEMPERATURE is above 25
df_new_tbl.update({"IS_IT_COLD": "Too hot!"}, df_new_tbl["AVG_TEMPERATURE"] > 25)

In [None]:
# Show rows above 24.5 degrees: this includes the updated rows (> 25) and those just below the threshold
df_new_tbl.filter(df_new_tbl["AVG_TEMPERATURE"] > 24.5).show()

In [None]:
# Display the SQL generated for the table-backed DataFrame
df_new_tbl.queries

Rows can also be deleted, with and without conditions (condition can be based on another dataframe)

In [None]:
# Row count before the delete in the next cell
df_new_tbl.count()

In [None]:
# Delete the rows whose DATE is earlier than 2013-12-01
df_new_tbl.delete(df_new_tbl["DATE"] < "2013-12-01")

In [None]:
# Row count after the delete, for comparison with the count taken before it
df_new_tbl.count()

Wrap some of the logic up in a Stored Procedure in Snowflake

In [None]:

def my_py_pipline(snf_session: Session, input_tbl:str, output_tbl:str) -> str:
    """Join input_tbl with daily weather data and save the result as output_tbl.

    The weather data is read from WEATHER.STANDARD_TILE.HISTORY_DAY for postal
    code 30170, converted to Celsius and averaged per day. It is joined to the
    input table on DATE == DATE_VALID_STD, and an IS_IT_COLD label is added.

    Args:
        snf_session: Snowpark session used to read and write the tables.
        input_tbl: Name of the table to enrich.
        output_tbl: Name of the table to write; it is saved in "overwrite" mode.

    Returns:
        A message naming the created table and the table it was built from.
    """
    # Weather readings for the selected postal code, with a Celsius column added
    weather = snf_session.table("WEATHER.STANDARD_TILE.HISTORY_DAY").filter(F.col("POSTAL_CODE") == "30170")
    temp_f = F.col("AVG_TEMPERATURE_AIR_2M_F")
    weather_celsius = weather.with_column(
        "AVG_TEMPERATURE_AIR_2M_C",
        F.round((temp_f - F.lit(32)) * F.lit(5) / F.lit(9), 2),
    )

    # Collapse the weather data to one averaged reading per day
    daily_weather = weather_celsius.group_by(F.col("DATE_VALID_STD")).agg(
        F.avg(F.col("AVG_TEMPERATURE_AIR_2M_C")).as_("AVG_TEMPERATURE")
    )

    # Attach the daily weather to the rows of the input table
    enriched = snf_session.table(input_tbl).join(daily_weather, F.col("DATE") == F.col("DATE_VALID_STD"))

    # IS_IT_COLD label, including the additional "Too hot!" check
    is_it_cold = (
        F.when(F.col("AVG_TEMPERATURE") < 0, "Brrrr!")
        .when(F.col("AVG_TEMPERATURE") > 25, "Too hot!")
        .otherwise("Not really")
    )

    # Save the result to output_tbl using the writer's "overwrite" mode
    enriched.with_column("IS_IT_COLD", is_it_cold).write.mode("overwrite").save_as_table(output_tbl)

    return f"Created {output_tbl} using data from {input_tbl}"


Since we are creating a permanent Stored Procedure from the Snowpark API, we need a stage for storing the generated bytecode

In [None]:
# Name of the internal staging area used for uploading the source file
stage_name = "ST_DEMO_101"

# Create the stage, or replace it if it already exists
create_stage_sql = f"CREATE or replace STAGE {stage_name}"
session.sql(create_stage_sql).collect()

Create the Stored Procedure in Snowflake

In [None]:
# Start from a clean set of imports and packages, then add the Snowpark package
session.clear_imports()
session.clear_packages()
session.add_packages('snowflake-snowpark-python')

# Register my_py_pipline as the permanent stored procedure sp_py_pipeline,
# replacing any existing one and using the stage for its files
sp_py_pipeline = F.sproc(
    func=my_py_pipline,
    name="sp_py_pipeline",
    is_permanent=True,
    replace=True,
    stage_location=stage_name,
    session=session,
)

In [None]:
# Call the stored procedure through the handle returned by F.sproc:
# input table CAMPAIGN_SPEND, output table CAMPAIGN_SPEND_TRANSFORMED
sp_py_pipeline(session, 'CAMPAIGN_SPEND', 'CAMPAIGN_SPEND_TRANSFORMED')

In [None]:
# Print rows of the table written by the stored procedure
session.table("CAMPAIGN_SPEND_TRANSFORMED").show()

### Loading data from files into Snowflake using Snowpark

Files can be loaded into tables in Snowflake using Snowpark in primarily two ways
* Load the file into a Pandas Dataframe and use **write_pandas** to load it to a table
* Upload the file to a stage (external & internal) and load it to Snowflake

Loading a file using a stage.

In [None]:
# Local folder that holds the demo data files
data_path = "../data/"

# Upload the source file to the stage, compressing it and overwriting any existing copy
fraud_csv_path = f"{data_path}fraud_transactions.csv"
stage_location = f"@{stage_name}"
putResult = session.file.put(fraud_csv_path, stage_location, auto_compress=True, overwrite=True)

# Last expression of the cell: displays the result of the upload
putResult

In order to load a CSV file a schema needs to be defined

In [None]:
# An explicit schema is used when reading CSV files; for other file types it is not needed.
# Column names and types, in file order.
fraud_columns = [
    ("TRANSACTION_ID", T.IntegerType()),
    ("TX_DATETIME", T.TimestampType()),
    ("CUSTOMER_ID", T.IntegerType()),
    ("TERMINAL_ID", T.IntegerType()),
    ("TX_AMOUNT", T.FloatType()),
    ("TX_TIME_SECONDS", T.IntegerType()),
    ("TX_TIME_DAYS", T.IntegerType()),
    ("TX_FRAUD", T.IntegerType()),
    ("TX_FRAUD_SCENARIO", T.IntegerType()),
]

dfCustTrxFraudSchema = T.StructType(
    [T.StructField(column_name, column_type) for column_name, column_type in fraud_columns]
)

Using the reader to create a dataframe that reads the file on stage using the above schema

In [None]:
# Create a reader bound to the explicit schema
dfReader = session.read.schema(dfCustTrxFraudSchema)

# Point the reader at the compressed CSV on the stage, with comma as field delimiter
staged_csv = f"@{stage_name}/fraud_transactions.csv.gz"
dfCustTrxFraudRd = dfReader.option("field_delimiter", ",").csv(staged_csv)

In [None]:
# Execute the read from the stage and print the rows
dfCustTrxFraudRd.show()

In [None]:
# Pretty-print every SQL statement Snowpark generated for reading the staged file
for statement in dfCustTrxFraudRd.queries['queries']:
    formatted = sqlparse.format(statement, reindent=True)
    print(formatted)

To save the data into a table **copy_into_table** or **save_as_table** can be used.

In [None]:
# Drop the target table if it exists, so the copy below starts without it
session.sql("DROP TABLE IF EXISTS copied_into_table").collect()
# Load the staged file into copied_into_table and keep the result returned by the copy
copied_into_result = dfCustTrxFraudRd.copy_into_table("copied_into_table")

In [None]:
# Display what copy_into_table returned
copied_into_result

In [None]:
# Print rows of the table that was just loaded
session.table("copied_into_table").show()

When loading JSON data we do not need a schema since we can load it as-is into a Variant column.

In [None]:
# Upload the JSON file to the stage, compressing it and overwriting any existing copy
session.file.put(f"{data_path}nutrition_tweets.json", f"@{stage_name}", auto_compress=True, overwrite=True)

In [None]:
# The file was uploaded with auto_compress=True, so reference the compressed file
# explicitly (as is done for the CSV file) instead of relying on the name matching as a prefix.
df_json_stage = session.read.json(f"@{stage_name}/nutrition_tweets.json.gz")
df_json_stage.show(1)

We need to create the table first in order to use copy_into_table (for CSV the table is created automatically if needed)

In [None]:
# Target table with a single VARIANT column to hold the raw JSON
session.sql("CREATE OR REPLACE TABLE json_table (RAW VARIANT)").collect()

# Copy the staged JSON into the RAW column, passing STRIP_OUTER_ARRAY as a format option
json_copy_options = {"STRIP_OUTER_ARRAY": True}
df_json_stage.copy_into_table("json_table", target_columns=["RAW"], format_type_options=json_copy_options)

In [None]:
# Define a DataFrame over the loaded JSON table and print one row
df_json = session.table("json_table")
df_json.show(1)

To select the id and hashtags

In [None]:
# Navigate into the RAW column: the user id as text and the hashtags element as-is
user_id = F.to_varchar(df_json["RAW"]["user"]["id"]).as_("USER_ID")
hashtags = (df_json["RAW"]["entities"]["hashtags"]).as_("HASHTAG_TEXT")
df_json.select(user_id, hashtags).show(3)

To return one row for each hashtag

In [None]:
# Apply the flatten table function to the hashtags element of RAW
flattened_hashtags = df_json.join_table_function("flatten", df_json["RAW"]["entities"]["hashtags"])

# Keep the user id and the text field of each flattened VALUE
df_parsed_json = flattened_hashtags.select(
    F.to_varchar(df_json["RAW"]["user"]["id"]).as_("USER_ID"),
    F.to_varchar(F.col("VALUE")["text"]).as_("HASHTAG_TEXT"),
)

In [None]:
# Execute the flatten query and print the rows
df_parsed_json.show()

In [None]:
# Close the Snowpark session now that the demo is finished
session.close()