## Imports

In [None]:
# Snowflake connector
from snowflake import connector
#from snowflake.ml.utils import connection_params

# Snowpark for Python
from snowflake.snowpark.session import Session
from snowflake.snowpark.types import Variant
from snowflake.snowpark.version import VERSION
from snowflake.snowpark import functions as F
from snowflake.snowpark.types import *

# Snowpark ML
from snowflake.ml.modeling.compose import ColumnTransformer
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.preprocessing import StandardScaler, OrdinalEncoder
from snowflake.ml.modeling.impute import SimpleImputer
from snowflake.ml.modeling.model_selection import GridSearchCV
from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml import version
mlversion = version.VERSION
from snowflake.ml.registry import Registry

#Feature Store
from snowflake.ml.feature_store import FeatureStore, CreationMode, Entity, FeatureView

# Misc
import pandas as pd
import json
import logging
# Raise the threshold of the "snowflake.snowpark.session" logger to ERROR so
# that lower-severity records from that logger are not emitted.
logger = logging.getLogger("snowflake.snowpark.session")
logger.setLevel(logging.ERROR)

import sys
# Print the interpreter version so the runtime in use is visible in the output.
print(sys.version)  # Last run used Python 3.11

## Creating Date feature store

In [None]:

from snowflake.ml.feature_store import FeatureStore, CreationMode, Entity, FeatureView

from snowflake.snowpark.functions import col, dayofmonth, dayofweek, month, weekofyear, quarter, year, last_day, when, lit
import snowflake.snowpark as snowpark
from snowflake.snowpark import Session
# get_active_session() is called below; it was previously never imported,
# which raised NameError as soon as the cell ran.
from snowflake.snowpark.context import get_active_session

# Reuse the session that is already active in this notebook.
session = get_active_session()


# Open the feature store, creating it if it does not exist yet
# (CreationMode.CREATE_IF_NOT_EXIST).
fs = FeatureStore(
    session=session,
    database="RETAIL_DEMO",
    name="FEATURE_STORE_MLDEMO",
    default_warehouse="", # edit
    creation_mode=CreationMode.CREATE_IF_NOT_EXIST,
)

# Entity keyed on the calendar date; feature views attached to it join on DATE.
entity = Entity(
    name="Date_features",
    join_keys=["DATE"],
)
fs.register_entity(entity)

# Show the entities
fs.list_entities().show()

# Build one row per calendar day together with its date-part features.
# session.range(2192) yields ids 0..2191; adding 1 gives day offsets 1..2192,
# so DATE runs from 2020-01-02 through 2026-01-01.
# NOTE(review): 2192 is exactly the number of days in 2020-2025, so the "+ 1"
# may be an unintended one-day shift -- confirm the intended range.
df = session.range(2192).select(
    (col("id") + lit(1)).cast("int").alias("SEQ4")
).select(
    (lit("2020-01-01").cast("date") + col("SEQ4")).alias("DATE")
).select(
    col("DATE"),
    dayofmonth(col("DATE")).alias("day_of_month"),
    dayofweek(col("DATE")).alias("day_of_week"),
    month(col("DATE")).alias("month_number"),
    weekofyear(col("DATE")).alias("week_of_year"),
    quarter(col("DATE")).alias("quarter_number"),
    year(col("DATE")).alias("year_number"),
    # 1 when dayofweek is 6 or 0, else 0.
    # Assumes the 0 = Sunday ... 6 = Saturday numbering -- TODO confirm the
    # session's WEEK_START setting.
    when((dayofweek(col("DATE")) == 6) | (dayofweek(col("DATE")) == 0), 1).otherwise(0).alias("IS_WEEKEND"),
    # 1 when DATE equals last_day(DATE), else 0.
    when(col("DATE") == last_day(col("DATE")), 1).otherwise(0).alias("is_last_day")
)

# Fixed: the original read `date_fv = = FeatureView(`, which is a syntax error.
# NOTE(review): the feature view is only constructed here; nothing in this
# cell registers it with `fs`.
date_fv = FeatureView(
    name="DateFeatures",
    entities=[entity],
    feature_df=df,
    timestamp_col="DATE",
)

## Feature Engineering

In [None]:
# `sum` and `listagg` are deliberately not imported by bare name: a bare `sum`
# shadows the Python builtin for the rest of the notebook, and the aggregation
# below already goes through the `F` alias (F.sum / F.listagg).
from snowflake.snowpark.functions import col, to_timestamp, dayofweek, month, lag
from snowflake.snowpark import Window

# NOTE(review): this cell reads DATE, DAYTYPE and RIDES from `df`; the cell
# that loads a DataFrame with those columns is not visible here -- confirm
# `df` is the ridership data at this point.

# Parse DATE using the MM/DD/YYYY format into a timestamp.
df = df.with_column('DATE', to_timestamp(col('DATE'), 'MM/DD/YYYY'))

# Add a new column for the day of the week
# The day of week is represented as an integer, with 0 = Sunday, 1 = Monday, ..., 6 = Saturday
df = df.with_column('DAY_OF_WEEK', dayofweek(col('DATE')))


# Add a new column for the month
df = df.with_column('MONTH', month(col('DATE')))

# Group by DATE, DAY_OF_WEEK, and MONTH, then aggregate: the distinct DAYTYPE
# values are concatenated and RIDES is summed into TOTAL_RIDERS.
total_riders = df.group_by('DATE', 'DAY_OF_WEEK', 'MONTH').agg(
    F.listagg('DAYTYPE', is_distinct=True).alias('DAYTYPE'),
    F.sum('RIDES').alias('TOTAL_RIDERS')
).order_by('DATE')

# --- Adding lags ---
# Window over all rows, ordered by DATE (no partitioning).
window_spec = Window.order_by('DATE')

# Add a lagged column for total ridership of the previous row in DATE order
total_riders = total_riders.with_column('PREV_DAY_RIDERS', lag(col('TOTAL_RIDERS'), 1).over(window_spec))

# Show the resulting dataframe.
print(total_riders.count())
# DataFrame.show() prints the rows itself and returns None, so it must not be
# wrapped in print() (that emitted a stray "None" after the table).
total_riders.show()

## Create an aggregated view of the data as a feature view in the feature store

In [None]:
# Describe the aggregated ridership features and register them in a single
# step: the FeatureView built from `total_riders` is handed straight to the
# feature store as version "1", replacing any existing registration of that
# version (overwrite=True). `agg_fv` is bound to what the registration returns.
agg_fv = fs.register_feature_view(
    FeatureView(
        name="AggBusData",
        entities=[entity],
        feature_df=total_riders,
        timestamp_col="DATE",
    ),
    version="1",
    overwrite=True,
)

# List the feature views known to the feature store as a pandas DataFrame
# (left as the cell's final expression so the notebook renders it).
fs.list_feature_views().to_pandas()

## Create train and test sets

In [None]:
# Build the spine: one row per calendar day from 2013-01-01 through
# 2019-12-31, with DATE formatted as an MM/DD/YYYY string.
# (The previous comment said "2017 and 2019", which did not match the code.)
date_range = pd.date_range(start='01/01/2013', end='12/31/2019')
date_column = date_range.strftime('%m/%d/%Y')
df = pd.DataFrame(date_column, columns=['DATE'])
spine_df = session.create_dataframe(df)

# Attach the aggregated ridership features to every spine row.
# NOTE(review): the spine's DATE is a string here -- confirm it joins as
# intended against the feature view's DATE column.
training_set = fs.generate_training_set(
    spine_df=spine_df,
    features=[agg_fv])

# Split into the `train` / `test` DataFrames consumed by the model-training
# cell; they were previously never defined, so `xg_model.fit(train)` raised
# NameError. The seed keeps the split reproducible between runs.
# NOTE(review): the 80/20 weights and the seed are a reviewer choice -- adjust
# as needed. Assumes generate_training_set returns a Snowpark DataFrame.
train, test = training_set.random_split(weights=[0.8, 0.2], seed=42)


## Model training

In [None]:
 ## Distributed Preprocessing - 25X to 50X faster
# mean_absolute_error is used for scoring at the end of this cell; it was
# previously never imported, which raised NameError.
from snowflake.ml.modeling.metrics import mean_absolute_error

# Numeric inputs are standardised.
# NOTE(review): MINIMUM_TEMPERATURE, MAXIMUM_TEMPERATURE and PRECIPITATION are
# not produced by any cell visible here -- confirm they are present in `train`.
numeric_features = ['DAY_OF_WEEK','MONTH','PREV_DAY_RIDERS','MINIMUM_TEMPERATURE','MAXIMUM_TEMPERATURE','PRECIPITATION']
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical inputs: fill missing values with the most frequent value, then
# map categories to integers; unseen categories are encoded as -99999.
# The step is named 'onehot' although it is an OrdinalEncoder; the name is
# kept unchanged so existing parameter paths keep working.
categorical_cols = ['DAYTYPE']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-99999))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_cols)
        ])

# Full pipeline: preprocessing followed by an XGBoost regressor.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),('model', XGBRegressor())])

## Distributed HyperParameter Optimization
# Grid over the 'model' step's max_depth and learning_rate (2 x 2 candidates).
hyper_param = dict(
        model__max_depth=[2,4],
        model__learning_rate=[0.1,0.3],
    )

xg_model = GridSearchCV(
    estimator=pipeline,
    param_grid=hyper_param,
    #cv=5,
    input_cols=numeric_features + categorical_cols,
    label_cols=['TOTAL_RIDERS'],
    output_cols=["TOTAL_RIDERS_FORECAST"],
)

# Fit and Score
xg_model.fit(train)
##Takes 25 seconds

testpreds = xg_model.predict(test)
# The metric computed is mean absolute error, so the label is 'MAE:'
# (it previously read 'MSE:', which mislabelled the number).
print('MAE:', mean_absolute_error(df=testpreds, y_true_col_names='TOTAL_RIDERS', y_pred_col_names='"TOTAL_RIDERS_FORECAST"'))
