<h1 style="text-align:center;font-weight: bold;"><font color = "0077A7">FORMULAS</font></h1><br>


First, import all the needed dependencies and packages.

In [1]:
import pandas as pd

from atscale.client import Client
from atscale.data_model import DataModel
from atscale.project import Project
from atscale.connection import Connection
from atscale.eda.feature_engineering import *
from atscale.base.enums import Aggs, PandasTableExistsActionType
from atscale.db.connections.snowflake import Snowflake

from sklearn.linear_model import LinearRegression
import joblib
import os
import sys
import snowflake.snowpark as snowpark

import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T

from snowflake.snowpark import Session
from snowflake.snowpark.functions import udf, udtf, table_function, call_udf
from snowflake.snowpark.types import FloatType, PandasDataFrameType

from typing import Iterable, Tuple
import json

Connect to AtScale

In [None]:
# Load the connection settings (AtScale and Snowflake credentials) from JSON.
with open("/permissions") as permissions_file:
    permissions = json.load(permissions_file)

# Load the second JSON config file; its parsed content is kept in `packages_version`.
with open("/requirements") as requirements_file:
    packages_version = json.load(requirements_file)

In [2]:
# Build the AtScale client from the values read out of the permissions file.
client = Client(
    server=permissions["atscale_server"],
    organization=permissions["atscale_organization"],
    username=permissions["atscale_username"],
    password=permissions["atscale_password"],
)

In [None]:
# Open the connection for the AtScale client built above.
client.connect()

In [None]:
# Choose the AtScale project to work with.
# NOTE(review): select_project() is called without arguments and presumably prompts interactively — confirm.
project = client.select_project()

In [None]:
# Choose a data model from the selected project.
# NOTE(review): select_data_model() is called without arguments and presumably prompts interactively — confirm.
data_model = project.select_data_model()

Connect to Snowpark

In [6]:
# Snowflake connection settings, all taken from the permissions file.
connection_parameters = dict(
    account=permissions["snowflake_account"],
    user=permissions["snowflake_username"],
    password=permissions["snowflake_password"],
    role=permissions["snowflake_role"],
    warehouse=permissions["snowflake_warehouse"],
    database=permissions["snowflake_database"],
    schema=permissions["snowflake_schema"],
)

# Create the Snowpark session that every example below runs against.
session_builder = Session.builder.configs(connection_parameters)
session = session_builder.create()

In [7]:
# Create the EASYSTAGE stage that the examples below pass as `stage_location`;
# "if not exists" makes this safe to re-run.
session.sql('create stage if not exists EASYSTAGE').collect()

[Row(status='EASYSTAGE already exists, statement succeeded.')]

# Making a Standard UDF / Stored Procedure

<p>There are two ways to do this:</p>
<ol><li>using the <font color = "purple">@udf</font> decorator to both define and register the function</li>
    <li>defining the function first and then registering it afterwards</li>
</ol>

In [None]:
##################################################### UDF Formula #1 ####################################################

# @udf(name = <function_name (doesn't need to match)>, session = <session_variable>, replace = True, stage_location = <stage_name>)
# def <function_name>(<input_name>: <input_type>) -> <output_type>:
    
#     <insert function here>
    
#     return <variable with output_type>

In [None]:
##################################################### UDF Formula #2 ####################################################

# def <function_name>(<input_name>: <input_type>) -> <output_type>:
    
#     <insert function here>
    
#     return <variable with output_type>

In [None]:
                                ### ------- Register the UDF Formula ------- ###
    
# <name of function> = <session_variable>.udf.register(
#     func = <name_of_function>,
#     return_type = <snowpark_type_returned>,
#     input_types = <snowpark_type_input>,
#     is_permanent = True,
#     name = <udf_name_to_be_called_by>,
#     replace = True,
#     stage_location = <@stage_name>,
#     imports=[<@stage_name/file_to_import>],
#     packages=['<package>==<version>'
#             ,'<package>==<version>'
#             ,'<package>==<version>'],
#     session=<session_variable>)

### Call the UDF

In [None]:
#<session_name>.sql('''SELECT <UDF_name>(<inputs>)''' ).show()

<h1 style="text-align:left;font-weight: bold;"><font color = "green">Making a Vectorized UDF</font></h1>

<p>Benefits:</p>
<ul><li>Can be more efficient</li>
    <li>Works on Batches of Rows</li>
    </ul>
    <p>Drawbacks:</p>
<ul><li>Only uses Pandas Series/Dataframes</li> </ul>
    


In [None]:
###Define same way as method #2 for UDFs, but with Pandas Series###

# def <function_name>(<input_name>: pd.Series) -> pd.Series:
    
#     <insert function here>
    
#     return <variable with output_type>

In [None]:
                                ### ------- Register the UDF Formula ------- ###
    
# <name of function> = <session_variable>.udf.register(
#     func = <name_of_function>,
#     return_type = <snowpark_type_returned>,
#     input_types = <snowpark_type_input>,
#     is_permanent = True,
#     name = <udf_name_to_be_called_by>,
#     replace = True,
#     stage_location = <@stage_name>,
#     imports=[<@stage_name/file_to_import>],
#     packages=['<package>==<version>'
#             ,'<package>==<version>'
#             ,'<package>==<version>'],
#     session=<session_variable>)

### Call the Vectorized UDF

In [None]:
#<dataframe>.select(<registered_udf>(<dataframe>.col("<column>"))).to_df(<desired_result_col_name>).collect()

<h1 style="text-align:left;font-weight: bold;"><font color = "purple">Making a UDTF</font></h1>

<p>A UDTF allows you to return a table for each input. This becomes powerful when doing multiple operations on a column to make a set of column results.
    <br>
   The creation of UDTFs, however, differs in some ways from the creation of the other two UDFs. To make and register a UDTF you will need to do the following </p>
   <ol>
    <li>Write an <font color = "purple">@udtf</font> decorator (same format as UDF)</li>
    <li>Create class to hold your methods</li>
    <li>Define an <b><font color = "blue">__init__</font></b> method (optional) </li>
    <li>Define a <b><font color = "blue">process</font></b> method to do the work of the UDTF</li>
    <li>Define an <b><font color = "blue">end_partition</font></b> method to run any per-partition logic after <b>process</b> has handled every row (optional)</li>
   </ol>

In [None]:
##################################################### UDTF Formula #####################################################

# @udtf(name = <function_name (doesn't need to match)>, input_types = <iterable_snowpark_var>, output_schema = <snowpark_var>, session = <session_variable>, replace = True, stage_location = <stage_name>)
# class <class_name>:
#     def __init__(self, <vars>):
#         <self.vars = vars>
        
#     def process(self, <some_vars>):
#         #do main functionality here
#         yield (<result_var1>, <...result_var2>)
    
#     def end_partition(self, <some_vars>):
#         yield (<self.var1>, <...self.vars2>)
    

### Call the UDTF

In [None]:
# session.sql('''select TOP <num_rows> *
# from <table_name> as <var>,
#     table(
#       <udtf_name>(<var>.<col>) OVER (PARTITION BY <partition_expr> ORDER BY <col>))''' ).to_pandas()

<br><br><br><br><br><br>
<h1 style="text-align:center;font-weight: bold;"><font color = "44B5E4">EXAMPLES</font></h1><br>


## UDF

<h6><font color='#009ABE'>Method #1</font></h6>

<h5>Simple Modulus UDF</h5>

In [None]:
@udf(name="modulus", session=session, replace=True, stage_location="EASYSTAGE")
def modulus(pair: tuple) -> int:
    """Return the remainder of ``pair[0]`` divided by ``pair[1]``.

    Args:
        pair: Two-element sequence ``(dividend, divisor)``.

    Returns:
        ``dividend % divisor``.
    """
    dividend, divisor = pair[0], pair[1]
    return dividend % divisor

In [None]:
# Call the UDF registered as "modulus" from SQL, passing the array [6, 3] as its single argument.
session.sql('''SELECT modulus([6,3])''').show()

<h5>Modulus of column UDF</h5>

In [None]:
#Define/Register
@udf(name="mod_col", session=session, replace=True, stage_location="EASYSTAGE")
def mod_col(col: list, div: int) -> list:
    """Reduce every element of ``col`` modulo ``div``.

    Args:
        col: Values to reduce.
        div: Divisor applied to each value.

    Returns:
        A new list holding ``value % div`` for each value of ``col``, in order.
    """
    remainders = []
    for value in col:
        remainders.append(value % div)
    return remainders

In [None]:
# Small two-column Snowpark DataFrame (columns "a" and "b", rows (1, 2) and (3, 4)) used by the calls below.
df = session.create_dataframe([[1,2],[3,4]]).to_df("a","b")

In [None]:
#Call
# A UDF call only accepts Columns (or column names), so the constant divisor is wrapped in F.lit().
# mod_col declares its first parameter as a list (array), so each value of "a" is wrapped in a
# one-element array; each result row is therefore a one-element array as well.
df.select(mod_col(F.array_construct(df.col("a")), F.lit(2))).to_df("results").collect()

<h6><font color='#009ABE'>Method #2</font></h6>

In [None]:
#Define
def random_names() -> str:
    """Return one name picked at random from a fixed list of six names."""
    import random

    names = ["Betty", "Matt", "Charles", "Owen", "Julia", "Judy"]
    # randint is inclusive on both ends, hence the upper bound of len - 1.
    return names[random.randint(0, len(names) - 1)]

In [None]:
#Register
# The UDF object is bound to its own name so the Python function `random_names` is not
# overwritten; otherwise re-running this cell would pass a UDF object as `func`.
# No `session` argument is passed: the registration already goes through `session.udf`.
random_names_udf = session.udf.register(
    func=random_names,
    return_type=T.StringType(),
    is_permanent=True,   # keep the UDF available after this session ends
    name="rand_names",   # name used to call the UDF from SQL
    replace=True,
    stage_location="@EASYSTAGE",
)

In [None]:
#Call
# Invoke the UDF from SQL by the name it was registered under ("rand_names").
session.sql('''SELECT RAND_NAMES()''' ).show()

## Vectorized UDF

In [None]:
#Define
def add_cols(series1: pd.Series, series2: pd.Series) -> pd.Series:
    """Add two inputs element-wise and return the result.

    Args:
        series1: Left-hand operand.
        series2: Right-hand operand.

    Returns:
        The result of ``series1 + series2``.
    """
    total = series1 + series2
    return total

In [None]:
#Register
# add_cols takes and returns pandas Series, so it is registered as a vectorized UDF:
# the return value and each of its two parameters are declared as PandasSeriesType.
# The types are referenced through `T` (snowflake.snowpark.types), which is imported at the top.
add_cols_udf = session.udf.register(
    func=add_cols,
    return_type=T.PandasSeriesType(T.IntegerType()),
    input_types=[
        T.PandasSeriesType(T.IntegerType()),  # series1
        T.PandasSeriesType(T.IntegerType()),  # series2
    ],
    is_permanent=True,
    replace=True,
    name="add_cols",
    stage_location="@EASYSTAGE",
)

In [None]:
#Call
# Call the registered UDF object (add_cols_udf), not the local Python function add_cols:
# calling add_cols here would just add the two Columns and never use the UDF.
df.select(add_cols_udf(df.col("a"), df.col("b"))).to_df("results").collect()

## UDTF

In [8]:
#Define/Register
@udtf(
    name="date_splitter",
    input_types=[T.DateType()],
    output_schema=T.StructType([
        T.StructField("years", T.StringType()),
        T.StructField("months", T.StringType()),
        T.StructField("days", T.StringType()),
    ]),
    replace=True,
    stage_location="@EASYSTAGE",
    session=session,
)
class split_date:
    """UDTF handler that splits a date into ``(years, months, days)`` strings.

    The parts are sliced out of ``str(date)``, so they keep their zero padding
    (for example ``"01"``). The most recently produced parts are kept on the
    instance and emitted once more by ``end_partition``.
    """

    def __init__(self):
        # None rather than 0: the output columns are strings, and None is
        # emitted as-is if end_partition runs before any row was processed.
        self.years = None
        self.months = None
        self.days = None

    def process(self, date) -> Iterable[Tuple[str, str, str]]:
        """Yield one ``(years, months, days)`` row for ``date``.

        A missing date (None) yields ``(None, None, None)`` instead of slicing
        the text "None".
        """
        if date is None:
            self.years = self.months = self.days = None
        else:
            text = str(date)  # expected layout: YYYY-MM-DD
            self.years = text[0:4]
            self.months = text[5:7]
            self.days = text[8:]
        yield (self.years, self.months, self.days)

    def end_partition(self) -> Iterable[Tuple[str, str, str]]:
        """Yield the parts of the last processed date once more at the end of the partition."""
        yield (self.years, self.months, self.days)

In [10]:
#Call
# Join each row of WALMART_SNOWPARK with the rows DATE_SPLITTER yields for its date column
# and pull the whole result into a pandas DataFrame.
# NOTE(review): PARTITION BY 10 is a constant, so presumably all rows land in a single partition — confirm.
session.sql('''select *
from WALMART_SNOWPARK as t,
    table(
      DATE_SPLITTER(t.date) OVER (PARTITION BY 10 ORDER BY date))''' ).to_pandas()

Unnamed: 0,DATE,DAY_OF_WEEK,MAX_SALES,MAX_UNITS_SOLD,TOTAL_ITEMS,TOTAL_SALES,AVERAGE_SALES,AVERAGE_UNITS_SOLD,ITEM,STATE,...,AVERAGE_SALES_6_MONTH_LAG_SHIFT_1,AVERAGE_SALES_12_MONTH_SUM_SHIFT_1,AVERAGE_SALES_12_MONTH_AVG_SHIFT_1,AVERAGE_SALES_12_MONTH_STDDEV_SHIFT_1,AVERAGE_SALES_12_MONTH_MIN_SHIFT_1,AVERAGE_SALES_12_MONTH_MAX_SHIFT_1,AVERAGE_SALES_12_MONTH_LAG_SHIFT_1,YEARS,MONTHS,DAYS
0,2015-01-01,6.0,4.97,0.0,1.0,4.97,4.97,0,HOUSEHOLD_2_176,WI,...,0.46,19.06,1.588333,1.171369,0.23,3.50,1.96,2015,01,01
1,2015-01-01,6.0,1.96,9.0,1.0,1.96,1.96,9,FOODS_1_004,TX,...,0.48,22.96,1.913333,1.510312,0.23,4.97,0.94,2015,01,01
2,2015-01-01,6.0,4.97,5.0,1.0,4.97,4.97,5,HOUSEHOLD_2_176,TX,...,0.46,18.33,1.527500,1.073161,0.23,3.50,1.96,2015,01,01
3,2015-01-01,6.0,4.97,0.0,1.0,4.97,4.97,0,HOUSEHOLD_2_176,CA,...,0.46,18.93,1.577500,1.174069,0.23,3.50,1.96,2015,01,01
4,2015-01-01,6.0,1.96,2.0,1.0,1.96,1.96,2,FOODS_1_004,CA,...,0.48,23.98,1.998333,1.570187,0.23,4.97,0.98,2015,01,01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67075,2016-04-24,2.0,1.97,1.0,1.0,1.97,1.97,1,HOUSEHOLD_2_142,CA,...,1.58,19.98,1.665000,1.278988,0.23,3.97,4.97,2016,04,24
67076,2016-04-24,2.0,4.97,5.0,1.0,4.97,4.97,5,HOUSEHOLD_2_176,CA,...,0.48,19.99,1.665833,1.279200,0.23,3.97,1.96,2016,04,24
67077,2016-04-24,2.0,0.48,1.0,1.0,0.48,0.48,1,HOBBIES_1_178,TX,...,4.97,24.90,2.075000,1.401898,0.23,4.97,0.48,2016,04,24
67078,2016-04-24,2.0,0.48,0.0,1.0,0.48,0.48,0,HOBBIES_1_268,TX,...,1.96,24.50,2.041667,1.437206,0.23,4.97,0.88,2016,04,24
