In [None]:
%pip install snowflake-connector-python
%pip install snowflake-snowpark-python
%pip install python-dotenv

In [None]:
import os
from dotenv import load_dotenv
from snowflake.snowpark import Session
import snowflake.snowpark.functions as f
from snowflake.snowpark.window import Window
from datetime import date
from snowflake.snowpark.types import ArrayType, VariantType 

In [None]:
# Read key/value pairs from a local .env file (when present) into the process
# environment, so the SNOWFLAKE_* settings can be fetched with os.getenv later.
load_dotenv()

### Creating the Snowpark session

In [None]:

def snowpark_session_create():
    """Build a Snowpark ``Session`` from ``SNOWFLAKE_*`` environment variables.

    Required variables: SNOWFLAKE_ACCOUNT, SNOWFLAKE_USER, SNOWFLAKE_PASSWORD.
    Optional variables: SNOWFLAKE_ROLE, SNOWFLAKE_WAREHOUSE, SNOWFLAKE_DATABASE,
    SNOWFLAKE_SCHEMA. Optional settings that are unset or empty are left out of
    the connection config instead of being passed along as ``None``.

    Returns:
        The newly created ``snowflake.snowpark.Session``.

    Raises:
        ValueError: if any required variable is unset or empty. Raised before
            any connection attempt so the message names the missing settings.
    """
    required = {
        "account": "SNOWFLAKE_ACCOUNT",
        "user": "SNOWFLAKE_USER",
        "password": "SNOWFLAKE_PASSWORD",
    }
    optional = {
        "role": "SNOWFLAKE_ROLE",
        "warehouse": "SNOWFLAKE_WAREHOUSE",
        "database": "SNOWFLAKE_DATABASE",
        "schema": "SNOWFLAKE_SCHEMA",
    }

    # Fail fast with a readable message rather than an opaque connector error.
    missing = [env_name for env_name in required.values() if not os.getenv(env_name)]
    if missing:
        raise ValueError(
            "Missing required environment variable(s): " + ", ".join(missing)
        )

    connection_params = {key: os.getenv(env_name) for key, env_name in required.items()}
    for key, env_name in optional.items():
        value = os.getenv(env_name)
        if value:
            connection_params[key] = value

    session = Session.builder.configs(connection_params).create()
    return session

In [None]:
# Open the connection once; the later cells all go through this session object.
demo_session = snowpark_session_create()

In [None]:
# Reference the whole staging table (no LIMIT clause): later cells filter and
# aggregate this DataFrame, so it must not be truncated at load time.
# show() only prints the first 10 rows by default, which keeps the preview small.
df = demo_session.table("DEMO.RAW.STG_CITIES")
df.show()


In [None]:
# Fetch a small sample of Row objects to the client and print them. The limit
# keeps collect() from pulling every row of the DataFrame into local memory.
for row in df.limit(10).collect():
    print(row)

### Point the session to the Snowflake demo database, schema, and table


In [None]:
# Set the session's current database and schema so that unqualified table
# names such as "stg_cities" can be resolved in later cells.
demo_session.use_database("demo")
demo_session.use_schema("raw")
# NOTE(review): the returned table reference is not assigned to anything, so
# this line only produces the cell's output; later code calls
# demo_session.table("stg_cities") again when it needs the data.
demo_session.table("stg_cities")

In [None]:
# Replace DATE with its to_date() conversion before any date-based filtering.
df = df.with_column(
    "DATE",
    f.to_date(f.col("DATE")),
)

In [None]:
# Print the total number of rows in df, then preview its first rows.
print(df.count())
df.show()


In [None]:
# Rename the short source columns to descriptive names, keep only rows dated
# 1 June 2020 through 31 August 2020, and order the result by DATE ascending.
readable_names = {
    "TAVG": "TEMP_AVG",
    "TMAX": "TEMP_MAX",
    "TMIN": "TEMP_MIN",
    "PRCP": "TOTAL_PRECIPITATION",
    "WSPD": "AVG_WIND_SPEED",
    "WPGT": "WIND_PEAK_GUST",
    "PRES": "SEA_LEVEL_AIR_PRESSURE",
}
summer_start, summer_end = date(2020, 6, 1), date(2020, 8, 31)

df = df.select(
    "CITY",
    "DATE",
    *[f.col(source).alias(target) for source, target in readable_names.items()],
)
df = df.filter(f.col("DATE").between(summer_start, summer_end))
df = df.sort(f.col("DATE").asc())




In [None]:
# Preview up to 100 rows of df.
df.show(100)

In [None]:
# Fill missing values with 0 in the precipitation, wind and pressure columns,
# then preview up to 100 rows of the result.
zero_fill_columns = [
    "TOTAL_PRECIPITATION",
    "AVG_WIND_SPEED",
    "WIND_PEAK_GUST",
    "SEA_LEVEL_AIR_PRESSURE",
]
df = df.na.fill(dict.fromkeys(zero_fill_columns, 0))
df.show(100)


In [None]:
# Preview up to 100 rows of df again (df is unchanged since the previous preview).
df.show(100)

In [None]:
# Per CITY / MONTH name / YEAR: average of TEMP_AVG, maximum of TEMP_MAX and
# minimum of TEMP_MIN. Sorted by city, then by month name in descending order.
df_count2020 = (
    df.with_column("MONTH", f.monthname(f.col("DATE")))
    .with_column("YEAR", f.year(f.col("DATE")))
    .group_by("CITY", "MONTH", "YEAR")
    .agg(
        f.avg("TEMP_AVG").alias("MONTHLY_AVG_TEMP"),
        f.max("TEMP_MAX").alias("WARMEST_SUMMER_MONTH"),
        f.min("TEMP_MIN").alias("COLDEST_SUMMER_MONTH"),
    )
    .sort(f.col("CITY"), f.col("MONTH").desc())
)

In [None]:
# Summer-2023 slice of the staging table, prepared like the 2020 frame: DATE is
# converted with to_date, the short source columns are renamed, and rows are
# restricted to 1 June 2023 through 31 August 2023, ordered by DATE ascending.
df2 = demo_session.table("stg_cities")
df2 = df2.withColumn("DATE", f.to_date(f.col("DATE")))
df2 = (
    df2.select(
        "CITY",
        "DATE",
        f.col("TAVG").alias("TEMP_AVG"),
        f.col("TMAX").alias("TEMP_MAX"),
        f.col("TMIN").alias("TEMP_MIN"),
        f.col("PRCP").alias("TOTAL_PRECIPITATION"),
        f.col("WSPD").alias("AVG_WIND_SPEED"),
        f.col("WPGT").alias("WIND_PEAK_GUST"),
        f.col("PRES").alias("SEA_LEVEL_AIR_PRESSURE")
    )
    .filter(f.col("DATE").between(date(2023, 6, 1), date(2023, 8, 31)))
    .sort(f.col("DATE").asc())
)
# Fill and preview the 2023 frame itself (df2). Operating on df here would
# leave the 2023 nulls untouched and display the 2020 data instead.
df2 = df2.na.fill({"TOTAL_PRECIPITATION": 0, "AVG_WIND_SPEED": 0, "WIND_PEAK_GUST": 0, "SEA_LEVEL_AIR_PRESSURE": 0})
df2.show(100)

In [None]:
# Per CITY / MONTH name / YEAR for the 2023 frame: average of TEMP_AVG, maximum
# of TEMP_MAX and minimum of TEMP_MIN. Sorted by city, then month name descending.
df_count2023 = (
    df2.with_column("MONTH", f.monthname(f.col("DATE")))
    .with_column("YEAR", f.year(f.col("DATE")))
    .group_by("CITY", "MONTH", "YEAR")
    .agg(
        f.avg("TEMP_AVG").alias("MONTHLY_AVG_TEMP"),
        f.max("TEMP_MAX").alias("WARMEST_SUMMER_MONTH"),
        f.min("TEMP_MIN").alias("COLDEST_SUMMER_MONTH"),
    )
    .sort(f.col("CITY"), f.col("MONTH").desc())
)

In [None]:
# Preview up to 100 rows of the 2023 monthly aggregate.
df_count2023.show(100)

### Join tables for comparison

In [None]:
# Pair the 2020 and 2023 monthly aggregates on matching city and month name,
# keeping each year's maximum-temperature column side by side.
same_city_and_month = (df_count2020["CITY"] == df_count2023["CITY"]) & (
    df_count2020["MONTH"] == df_count2023["MONTH"]
)
joined_df = (
    df_count2020.join(df_count2023, same_city_and_month)
    .select(
        df_count2020["CITY"].alias("CITY_2020"),
        df_count2020["MONTH"].alias("MONTH_2020"),
        df_count2020["WARMEST_SUMMER_MONTH"].alias("WARMEST_SUMMER_MONTH_2020"),
        df_count2023["CITY"].alias("CITY_2023"),
        df_count2023["MONTH"].alias("MONTH_2023"),
        df_count2023["WARMEST_SUMMER_MONTH"].alias("WARMEST_SUMMER_MONTH_2023"),
    )
    .sort(f.col("CITY_2020"), f.col("MONTH_2020").desc())
)

In [None]:
# Preview up to 100 rows of the 2020-vs-2023 monthly comparison.
joined_df.show(100)

In [None]:
# Highest TEMP_MAX value per city in df, sorted by city. The aggregation reads
# TEMP_MAX directly, so no extra helper column is added beforehand.
# NOTE(review): this reuses the name df_count2020 and replaces the monthly
# aggregate built earlier; re-running the monthly join cell after this one
# would no longer find its MONTH / WARMEST_SUMMER_MONTH columns.
df_count2020 = (
    df.group_by("CITY")
    .agg(f.max("TEMP_MAX").alias("HIGHEST_SUMMER_TEMP_2020"))
    .sort(f.col("CITY"))
)

In [None]:
# Preview up to 100 rows of the per-city 2020 maximum.
df_count2020.show(100)

In [None]:
# Highest TEMP_MAX value per city in df2, sorted by city. The aggregation reads
# TEMP_MAX directly, so no extra helper column is added beforehand.
# NOTE(review): this reuses the name df_count2023 and replaces the monthly
# aggregate built earlier; re-running the monthly join cell after this one
# would no longer find its MONTH / WARMEST_SUMMER_MONTH columns.
df_count2023 = (
    df2.group_by("CITY")
    .agg(f.max("TEMP_MAX").alias("HIGHEST_SUMMER_TEMP_2023"))
    .sort(f.col("CITY"))
)

In [None]:
# Preview the first rows of the per-city 2023 maximum (default row count).
df_count2023.show()

In [None]:
# Match the per-city 2020 and 2023 maxima on city name and keep both years'
# values next to each other, ordered by the 2020 city column.
same_city = df_count2020["CITY"] == df_count2023["CITY"]
join_highest_summer = (
    df_count2020.join(df_count2023, same_city)
    .select(
        df_count2020["CITY"].alias("CITY_2020"),
        df_count2020["HIGHEST_SUMMER_TEMP_2020"].alias("HIGHEST_SUMMER_TEMP_2020"),
        df_count2023["CITY"].alias("CITY_2023"),
        df_count2023["HIGHEST_SUMMER_TEMP_2023"].alias("HIGHEST_SUMMER_TEMP_2023"),
    )
    .sort(f.col("CITY_2020"))
)

In [None]:
# Preview up to 100 rows of the per-city 2020-vs-2023 comparison.
join_highest_summer.show(100)

### Snowpark functions

In [None]:
# Add two derived columns:
#   * an array holding the 2020 and 2023 highest temperatures, each cast to int
#   * a text comment that ends with the current Snowflake user name
peak_temps_as_array = f.array_construct(
    f.col("HIGHEST_SUMMER_TEMP_2020").cast("int"),
    f.col("HIGHEST_SUMMER_TEMP_2023").cast("int"),
)
author_note = f.concat(f.lit("Code was created by "), f.current_user())

join_highest_summer = join_highest_summer.with_column(
    "2020vs2023_highest_summer_temp", peak_temps_as_array
).with_column("COMMENT", author_note)

In [None]:
# Preview the comparison including the new array and comment columns.
join_highest_summer.show()

In [None]:
# Display the comparison without the two scalar temperature columns. The
# result of drop() is not assigned, so join_highest_summer keeps every column.
columns_to_hide = ("HIGHEST_SUMMER_TEMP_2020", "HIGHEST_SUMMER_TEMP_2023")
join_highest_summer.drop(*columns_to_hide).show()


### Close the session

In [None]:
# Close the Snowpark session now that the analysis is finished.
demo_session.close()