# Snowpark Transformation

In [3]:
# Snowpark for Python
from snowflake.snowpark.session import Session
from snowflake.snowpark.types import StringType, ArrayType
from snowflake.snowpark import Window
from snowflake.snowpark.functions import *  ## for @UDF
from snowflake.snowpark.version import VERSION

# Misc
import json
import pandas as pd
from urllib import request
import requests
from bs4 import BeautifulSoup
import re
import os
from datetime import date

## Create Snowflake connection

In [4]:
## Connection information
# CAS_CREDENTIALS must point at the directory that holds connection.json.
path = os.environ.get('CAS_CREDENTIALS')
if path is None:
    raise EnvironmentError('CAS_CREDENTIALS environment variable is not set')

# os.path.join keeps the path portable (the separator is no longer hard-coded),
# and the context manager closes the credentials file once it has been parsed.
with open(os.path.join(path, 'connection.json')) as connection_file:
    connection_parameters = json.load(connection_file)

# Create Snowflake Session object
session = Session.builder.configs(connection_parameters).create()

## Version info
# Single-row result; column 0 is the current user, column 1 the server version.
snowflake_environment = session.sql('select current_user(), current_version()').collect()
snowpark_version = VERSION

# Current Environment Details
print('Snowflake version           : {}'.format(snowflake_environment[0][1]))
print('Snowpark for Python version : {}.{}.{}'.format(snowpark_version[0],snowpark_version[1],snowpark_version[2]))

Snowflake version           : 7.28.0
Snowpark for Python version : 1.4.0


## Create demo database

In [None]:
# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE DATABASE raises
# once MYDB already exists (e.g. on a second "Run All").
session.sql('CREATE DATABASE IF NOT EXISTS MYDB').collect()

In [12]:
# Make MYDB the session's active database for the statements that follow.
session.use_database('MYDB')

In [13]:
# Sanity check: display the database the session is currently using.
session.get_current_database()

'"MYDB"'

In [14]:
# Sanity check: display the schema the session is currently using.
session.get_current_schema()

'"PUBLIC"'

## Create demo table

### Create pandas DataFrame

In [15]:
# Column-oriented record set: one id and one Wikipedia URL per row; the
# 'html' column is filled in by the download loop below.
data = {
    'id' : [1,2,3,4,5],
    'website' : [
        'https://en.wikipedia.org/wiki/Snowflake_Inc.',
        'https://en.wikipedia.org/wiki/Amazon_Web_Services',
        'https://en.wikipedia.org/wiki/Google_Cloud_Platform',
        'https://en.wikipedia.org/wiki/Microsoft_Azure',
        'https://en.wikipedia.org/wiki/Apache_Spark'
    ],
    'html' : []
}

# Download each page's raw HTML, in the same order as data['website'].
for item in data['website']:
    # Without a timeout requests can block forever on a stalled connection.
    response = requests.get(item, timeout=30)
    # Fail loudly on 4xx/5xx instead of storing an error page as if it were
    # the article's HTML.
    response.raise_for_status()
    data['html'].append(response.text)

pdDataFrame = pd.DataFrame.from_dict(data)

pdDataFrame.head()

Unnamed: 0,id,website,html
0,1,https://en.wikipedia.org/wiki/Snowflake_Inc.,"<!DOCTYPE html>\n<html class=""client-nojs vect..."
1,2,https://en.wikipedia.org/wiki/Amazon_Web_Services,"<!DOCTYPE html>\n<html class=""client-nojs vect..."
2,3,https://en.wikipedia.org/wiki/Google_Cloud_Pla...,"<!DOCTYPE html>\n<html class=""client-nojs vect..."
3,4,https://en.wikipedia.org/wiki/Microsoft_Azure,"<!DOCTYPE html>\n<html class=""client-nojs vect..."
4,5,https://en.wikipedia.org/wiki/Apache_Spark,"<!DOCTYPE html>\n<html class=""client-nojs vect..."


### Create Snowflake data

In [19]:
## Create table structure
# DDL for the demo table: one row per Wikipedia page, with columns matching
# the pandas DataFrame (id, website, html). CREATE OR REPLACE replaces any
# existing table of the same name, so the cell can be re-run.
query = '''
    CREATE OR REPLACE TABLE wikipedia_pages (
        ID int,
        WEBSITE string,
        html string
    )
'''

# .collect() executes the statement and returns its status row(s).
session.sql(query).collect()

[Row(status='Table WIKIPEDIA_PAGES successfully created.')]

In [21]:
## Upload the pandas DataFrame into MYDB.PUBLIC.WIKIPEDIA_PAGES.
## write_pandas returns a Snowpark DataFrame over the target table.
sp_df = session.write_pandas(
    df=pdDataFrame,
    table_name='WIKIPEDIA_PAGES',
    database='MYDB',
    schema='PUBLIC',
    # Write identifiers (including column names) without surrounding quotes.
    quote_identifiers=False,
    # Replace whatever rows the table already holds.
    overwrite=True,
)

In [None]:
# Print one row of the uploaded table to confirm the write succeeded.
sp_df.show(1)

## Create UDFs

### Test UDF locally

In [25]:
## Preview dataframe
# Show only the first 500 characters of the first page's raw HTML; the full
# document is far too long to print.
print(pdDataFrame.loc[0,'html'][:500])

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled" lang="en" dir="ltr">
<head>
<meta charset="UTF-8">
<title>Snowflake Inc. - Wikipedia</tit


In [26]:
## Local function to test
def clean_html_dev(html):
    """Strip the markup from an HTML document and return its visible text.

    Args:
        html: Raw HTML as a string, or None.

    Returns:
        The text content extracted by BeautifulSoup (lxml parser) with every
        run of consecutive newlines collapsed to a single newline, or None
        when ``html`` is None.
    """
    # Missing input: return None instead of crashing inside the parser.
    if html is None:
        return None
    clean = BeautifulSoup(html, 'lxml').text
    # Tag removal leaves long runs of blank lines; squeeze each run to one.
    return re.sub(r'\n+', '\n', clean)

In [27]:
# Run the local cleaner on the first page and preview the first 500
# characters of the extracted text.
print(clean_html_dev(pdDataFrame.loc[0,'html'])[:500])


Snowflake Inc. - Wikipedia
Jump to content
Main menu
Main menu
move to sidebar
hide
		Navigation
	
Main pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate
		Contribute
	
HelpLearn to editCommunity portalRecent changesUpload file
Languages
Language links are at the top of the page across from the title.
Search
Search
Create accountLog in
Personal tools
 Create account Log in
		Pages for logged out editors learn more
ContributionsTalk
Contents
move to sidebar
hide
(Top)
1Hist


### Development UDF for Snowflake

In [28]:
@udf(return_type = StringType(),
     input_types = [StringType()],
     name = 'clean_html',
     packages = ['BeautifulSoup4', 'lxml'],
     replace = True
    )
def clean_html_dev(html):
    """Snowflake UDF ``clean_html``: strip markup from an HTML string.

    Registered with a single string input and a string return type; any
    existing UDF of the same name is replaced (``replace = True``).

    Args:
        html: Raw HTML as a string, or None for a SQL NULL value.

    Returns:
        The text content extracted by BeautifulSoup (lxml parser) with every
        run of consecutive newlines collapsed to a single newline, or None
        when ``html`` is None.
    """
    # A NULL row reaches the handler as None; pass it through as NULL rather
    # than letting the parser raise and fail the whole query.
    if html is None:
        return None
    clean = BeautifulSoup(html, 'lxml').text
    # Tag removal leaves long runs of blank lines; squeeze each run to one.
    return re.sub(r'\n+', '\n', clean)

In [30]:
# Inspect the SQL that backs the Snowpark DataFrame.
sp_df.queries

{'queries': ['SELECT  *  FROM (MYDB.PUBLIC.WIKIPEDIA_PAGES)'],
 'post_actions': []}

In [33]:
# Build the transformed frame from the stored table rather than from sp_df
# itself: the previous self-referential form dropped 'html' from sp_df, so
# running the cell a second time failed because the column no longer existed.
sp_df = (session
         .table('MYDB.PUBLIC.WIKIPEDIA_PAGES')
         # Run the registered UDF on the raw HTML column ...
         .withColumn('website_text',
                     clean_html_dev('html'))
         # ... and drop the bulky source column once the text is extracted.
         .drop('html')
        )

sp_df.show(1)

------------------------------------------------------------------------------------------------------------
|"ID"  |"WEBSITE"                                     |"WEBSITE_TEXT"                                      |
------------------------------------------------------------------------------------------------------------
|1     |https://en.wikipedia.org/wiki/Snowflake_Inc.  |                                                    |
|      |                                              |Snowflake Inc. - Wikipedia                          |
|      |                                              |Jump to content                                     |
|      |                                              |Main menu                                           |
|      |                                              |Main menu                                           |
|      |                                              |move to sidebar                                     |
|      |           

## JSON Data

In [49]:
# Close the Snowflake session created at the top of the notebook.
session.close()