## snowparkML test

### First set up the Snowflake environment
We have a trial account set up.

In [1]:
import os
from snowflake.snowpark import Session
from snowflake.snowpark import functions as F
from connection_config import connection_parameters

import pandas as pd

#### Current Environment Details
def current_snowflake_env(snowpark_session=None):
    """Print the current user, role, database, schema, warehouse and Snowflake version.

    Args:
        snowpark_session: Object exposing ``sql(query).collect()`` that is used
            to run the lookup query. When omitted, the notebook-level
            ``session`` variable is used, so existing zero-argument calls keep
            working.

    Returns:
        None. The six values are written to standard output.

    Raises:
        NameError: If no session is passed and the notebook-level ``session``
            has not been created yet.
    """
    # The global is only looked up when no explicit session was supplied.
    active_session = session if snowpark_session is None else snowpark_session
    env_row = active_session.sql(
        'select current_user(), current_role(), current_database(), current_schema(), current_version(), current_warehouse()'
    ).collect()[0]
    # The query returns the version (index 4) before the warehouse (index 5),
    # but the report prints the warehouse first.
    print('User                     : {}'.format(env_row[0]))
    print('Role                     : {}'.format(env_row[1]))
    print('Database                 : {}'.format(env_row[2]))
    print('Schema                   : {}'.format(env_row[3]))
    print('Warehouse                : {}'.format(env_row[5]))
    print('Snowflake version        : {}'.format(env_row[4]))

#### Set up a connection with Snowflake
# connection_parameters is imported from the local connection_config module above.
# NOTE(review): presumably that module holds the account credentials - confirm it is
# kept out of version control before sharing this notebook.
session = Session.builder.configs(connection_parameters).create()

In [2]:
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.preprocessing import OneHotEncoder

In [3]:
# Sanity check: print the user, role, database, schema, warehouse and version of this session.
current_snowflake_env()

User                     : SNOWFLAKETRIALUSER
Role                     : ACCOUNTADMIN
Database                 : SNOWFLAKE_SAMPLE_DATA
Schema                   : TPCH_SF10
Warehouse                : COMPUTE_WH
Snowflake version        : 7.34.1


### Get some car data

In [4]:
# NOTE(review): the previews further down show the euro sign in `prijs` as garbled
# characters, which suggests the file may not really be ISO-8859-1 - confirm the
# source encoding before relying on that column.
car_prices = pd.read_csv("https://raw.githubusercontent.com/longhowlam/snowpark_cars_model/master/autos_tekoop.zip", encoding = "ISO-8859-1")
### extract number from vermogen column
# Raw string so that \d reaches the regex engine as written (a plain '\d' literal is an
# invalid escape sequence). expand=False returns the single capture group as a Series,
# with a missing value where `vermogen` has no digits.
car_prices['power'] = car_prices['vermogen'].str.extract(r'(\d+)', expand=False)
car_prices.shape

(231000, 14)

In [4]:
# Preview the first rows, including the derived `power` column.
car_prices.head()

Unnamed: 0,bouwjaar,km_stand,brandstof,motorinhoud,vermogen,transmissie,type,kleur,deur,prijs,merk,model,vraagprijs,power
0,2018,54700,Elektrisch,,245kW,Automaat,Hatchback,Rood,5-deurs,â¬ 54.999,Tesla,Model,54999,245.0
1,2017,56266,Elektrisch,,,Automaat,Hatchback,Wit,5-deurs,â¬ 22.949,Volkswagen,e-Golf,22949,
2,2021,1498,Elektrisch,,,Automaat,SUV / Terreinwagen,Groen,5-deurs,â¬ 38.745,Opel,Mokka,38745,
3,2019,26805,Elektrisch,,150kW,Automaat,SUV / Terreinwagen,Wit,5-deurs,â¬ 37.900,Hyundai,Kona,37900,150.0
4,2020,2785,Elektrisch,,100kW,Automaat,Hatchback,Grijs,5-deurs,â¬ 20.495,Renault,Zoe,20495,100.0


### Set up a new database in Snowflake and use its public schema

In [9]:
# CREATE OR REPLACE drops any existing CARS_DATA database before recreating it, so
# re-running this cell starts from an empty database. The second statement makes the
# new database's PUBLIC schema the current schema of the session.
session.sql(query="CREATE OR REPLACE database cars_data").collect()
session.sql(query="USE SCHEMA cars_data.public").collect()

[Row(status='Statement executed successfully.')]

### Upload the cars data to that Snowflake database

In [6]:
# quote_identifiers: by default, identifiers (database, schema, table and column names
# taken from DataFrame.columns) are quoted. With quote_identifiers=False they are passed
# to Snowflake unquoted, so Snowflake coerces them to uppercase.
# auto_create_table=True lets Snowpark create CAR_PRICES from the DataFrame, and
# overwrite=True replaces any existing table contents instead of appending.

session.write_pandas(car_prices, "CAR_PRICES", auto_create_table = True, quote_identifiers = False, overwrite = True)


<snowflake.snowpark.table.Table at 0x22e147299a0>

### Now pretend we start from here
We have a snowflake table

In [10]:
# Make CARS_DATA.PUBLIC the current schema for the statements that follow.
session.sql('USE SCHEMA cars_data.public').collect()

[Row(status='Statement executed successfully.')]

In [13]:
# Reference the uploaded table by its fully qualified name; the cells below build on
# this Snowpark table object.
cars_sf = session.table('CARS_DATA.PUBLIC.CAR_PRICES')

In [14]:
# Print a preview of the table's rows.
cars_sf.show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"BOUWJAAR"  |"KM_STAND"  |"BRANDSTOF"  |"MOTORINHOUD"  |"VERMOGEN"  |"TRANSMISSIE"  |"TYPE"               |"KLEUR"  |"DEUR"    |"PRIJS"     |"MERK"      |"MODEL"  |"VRAAGPRIJS"  |"POWER"  |"MOTOR_INHOUD"  |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|2018        |54700       |Elektrisch   |NULL           | 245kW      |Automaat       |Hatchback            | Rood    | 5-deurs  |â¬ 54.999  |Tesla       |Model    |54999         |245.0    |NULL            |
|2017        |56266       |Elektrisch   |NULL           |NULL        |Automaat       | Hatchback           |Wit      | 5-deurs  |â¬ 22.949  |Volkswagen  |e-Golf   |229

In [15]:
# Add two derived columns to the table. The names are passed unquoted, and the encoded
# output further down lists them as AGE and N_DOORS.
#   age     : REFERENCE_YEAR minus BOUWJAAR
#   N_doors : the first two characters of DEUR
REFERENCE_YEAR = 2023
age_expr = REFERENCE_YEAR - cars_sf['BOUWJAAR']
door_count_expr = cars_sf["DEUR"].substring(1,2)
cars_sf = cars_sf.with_column('age', age_expr).with_column('N_doors', door_count_expr)

In [16]:
# Narrow the table down to the rows used for modelling. Each predicate is applied as its
# own filter step, in the order listed.
row_filters = [
    F.col("MERK").in_(F.lit("Opel"), F.lit("Volvo"), F.lit("BMW")),          # three brands only
    F.col("KM_STAND") <= 500000,                                             # cap on KM_STAND
    F.col("AGE") <= 20,                                                      # cap on the derived AGE column
    F.col("TRANSMISSIE").in_(F.lit("Handgeschakeld"), F.lit("Automaat")),    # two transmission values
    F.col("VRAAGPRIJS") <= 100000,                                           # cap on VRAAGPRIJS
    F.col("BRANDSTOF").in_(F.lit("Benzine"), F.lit("Diesel")),               # two fuel values
]

cars_clean = cars_sf
for row_filter in row_filters:
    cars_clean = cars_clean.filter(row_filter)

In [17]:
### split into train and test
# 90% / 10% random split of the cleaned rows; a fixed seed is passed so the split can be repeated.
df_train, df_test = cars_clean.random_split(weights=[0.9, 0.1], seed=0) 

In [18]:
######### define pipeline
### categorical features to encode; each output column prefix is the input name plus "_OE"
cat_cols = ["MERK","BRANDSTOF", "TRANSMISSIE"]
cat_cols_oe = [f"{col_name}_OE" for col_name in cat_cols]

# test pipe: a single one-hot-encoding step that drops the original categorical columns
one_hot_step = OneHotEncoder(
    input_cols=cat_cols,
    output_cols=cat_cols_oe,
    drop_input_cols=True,
)
pipe_test = Pipeline(steps=[("ohe", one_hot_step)])
testdf = pipe_test.fit_transform(df_train)


  success, nchunks, nrows, ci_output = write_pandas(


In [19]:
# Inspect the encoded frame: one indicator column per category value, followed by the remaining columns.
testdf.show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"MERK_OE_BMW"  |"MERK_OE_Opel"  |"MERK_OE_Volvo"  |"BRANDSTOF_OE_Benzine"  |"BRANDSTOF_OE_Diesel"  |"TRANSMISSIE_OE_Automaat"  |"TRANSMISSIE_OE_Handgeschakeld"  |"BOUWJAAR"  |"KM_STAND"  |"MOTORINHOUD"  |"VERMOGEN"  |"TYPE"               |"KLEUR"  |"DEUR"    |"PRIJS"     |"MODEL"  |"VRAAGPRIJS"  |"POWER"  |"MOTOR_INHOUD"  |"AGE"  |"N_DOORS"  |
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [20]:
from snowflake.ml.modeling.tree import DecisionTreeRegressor

# Configure (but do not fit yet) a decision-tree regressor that predicts VRAAGPRIJS
# from BOUWJAAR and KM_STAND, with max_depth set to 5.
# NOTE(review): no output_cols are given, so predictions will use the library's default
# output column name - confirm that is what downstream code expects.
model = DecisionTreeRegressor(
    input_cols = ["BOUWJAAR", "KM_STAND"],
    label_cols = "VRAAGPRIJS",
    max_depth = 5
)

In [None]:
# Fit on the training split only. df_train carries the cleaning filters applied in
# cars_clean and leaves df_test unseen for evaluation; fitting on cars_sf would bypass
# both the filters and the train/test split.
model.fit(df_train)