# Imports

In [9]:
import warnings
warnings.filterwarnings("ignore")

# Load variables
import os
from dotenv import load_dotenv
load_dotenv()

# Snowpark Imports
from snowflake.snowpark.session import Session
import snowflake.snowpark.functions as F
from snowflake.snowpark.functions import pandas_udf
import snowflake.snowpark.types as T

# Other
import pandas as pd
from cachetools import cached

# Connect to Snowflake

In [10]:
# Every connection option is read from the environment variable named
# SF_<OPTION> (populated from the .env file by load_dotenv() in the imports cell).
connection_options = ("ACCOUNT", "USER", "ROLE", "PASSWORD", "DATABASE", "SCHEMA", "WAREHOUSE")
snowflake_connection_cfg = {
    option: os.getenv(f"SF_{option}") for option in connection_options
}

# Open the Snowpark session used by every cell below
session = Session.builder.configs(snowflake_connection_cfg).create()

## Create some Test Data
The goal is to retrieve the sentences related to AI for a given search text. The unrelated sentences should end up with a larger vector distance to that search text.

In [11]:
# Sentences that are about AI -- the ones the search is expected to rank first.
ai_sentences = [
    "AI refers to the simulation of human intelligence in machines that are programmed to think like humans and mimic their actions.",
    "The potential for AI to improve healthcare is vast, with applications ranging from diagnostic tools to personalized medicine.",
    "AI technologies, such as machine learning and deep learning, are increasingly integral to advancements in sectors like finance, transportation, and security.",
    "Ethical considerations are crucial in the development and deployment of AI, ensuring technologies are used responsibly and do not perpetuate biases.",
    "As AI continues to evolve, the need for skilled professionals to design, manage, and oversee these systems is growing exponentially.",
]

# Sentences on politics, sports and business that act as distractors.
mixed_topics_sentences = [
    "Political campaigns increasingly leverage data analytics to target voters and optimize their messages.",
    "Major sporting events, such as the Olympics and the FIFA World Cup, significantly boost the economy of the host country through tourism and infrastructure improvements.",
    "In business, strategic mergers and acquisitions can reshape industries, creating new market leaders and altering competitive dynamics.",
    "The intersection of sports and politics often manifests when athletes take public stands on political issues, influencing public opinion and policy.",
    "Technological innovations in business, like blockchain and AI, are becoming pivotal in enhancing transparency and efficiency in financial transactions and governance.",
]

# Query text that every stored sentence is compared against later on.
search_text = (
    "AI is transforming industries by automating complex processes, "
    "enhancing decision-making with predictive analytics, "
    "and personalizing user experiences at an unprecedented scale."
)

# One single-column Snowpark DataFrame holding both groups, AI sentences first.
all_sentences = [*ai_sentences, *mixed_topics_sentences]
df = session.create_dataframe(all_sentences, schema=["TEXT"])
df.show()

------------------------------------------------------
|"TEXT"                                              |
------------------------------------------------------
|AI refers to the simulation of human intelligen...  |
|The potential for AI to improve healthcare is v...  |
|AI technologies, such as machine learning and d...  |
|Ethical considerations are crucial in the devel...  |
|As AI continues to evolve, the need for skilled...  |
|Political campaigns increasingly leverage data ...  |
|Major sporting events, such as the Olympics and...  |
|In business, strategic mergers and acquisitions...  |
|The intersection of sports and politics often m...  |
|Technological innovations in business, like blo...  |
------------------------------------------------------



# Deploy the Arctic XS Model

In [12]:
# Create a stage to host our functions.
# IF NOT EXISTS is used instead of CREATE OR REPLACE so that re-running this
# cell does not drop the stage together with every file already uploaded to
# it (e.g. the code of other permanent UDFs registered with
# stage_location='@FUNCTIONS'). This also matches how HF_MODELS is created.
session.sql('CREATE STAGE IF NOT EXISTS FUNCTIONS').collect()

# Create a stage for Hugging Face models
session.sql('CREATE STAGE IF NOT EXISTS HF_MODELS').collect()

[Row(status='HF_MODELS already exists, statement succeeded.')]

In [13]:
import joblib

# Point the Hugging Face cache at /tmp. This has to happen before transformers
# is imported, which is why that import sits below instead of in the imports cell.
os.environ["HF_HOME"] = '/tmp'

from transformers import AutoModel, AutoTokenizer

# Single source of truth for the model id and the local artifact paths.
ARCTIC_MODEL_ID = "Snowflake/snowflake-arctic-embed-xs"
TOKENIZER_PATH = '/tmp/arctic_tokenizer_xs.joblib'
MODEL_PATH = '/tmp/arctic_model_xs.joblib'

# Get the Arctic model from Hugging Face
tokenizer = AutoTokenizer.from_pretrained(ARCTIC_MODEL_ID)
model = AutoModel.from_pretrained(ARCTIC_MODEL_ID, add_pooling_layer=False)

# Serialize tokenizer and model with joblib so the UDF can load them from its imports
joblib.dump(tokenizer, TOKENIZER_PATH)
joblib.dump(model, MODEL_PATH)

# Upload to Snowflake.
# NOTE: put() is called without overwrite=True, so a file that already exists
# on the stage is left untouched (reported as status='SKIPPED'). Remove the
# staged file or pass overwrite=True after changing the model.
session.file.put(TOKENIZER_PATH, stage_location='@HF_MODELS/', auto_compress=False)
session.file.put(MODEL_PATH, stage_location='@HF_MODELS/', auto_compress=False)

[PutResult(source='arctic_model_xs.joblib', target='arctic_model_xs.joblib', source_size=90321645, target_size=0, source_compression='NONE', target_compression='NONE', status='SKIPPED', message='')]

In [14]:
# Function to load Models (and cache them)
@cached(cache={})
def load_model_opt(import_dir) -> tuple:
    """Load the staged Arctic XS tokenizer and model from ``import_dir``.

    The result is memoized per ``import_dir`` by ``cachetools.cached``, so the
    joblib files are deserialized only on the first call with a given directory.

    Args:
        import_dir: Directory that contains ``arctic_tokenizer_xs.joblib`` and
            ``arctic_model_xs.joblib``. A trailing path separator is optional.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    import os
    import joblib
    # os.path.join works whether or not import_dir ends with a separator,
    # unlike plain string concatenation.
    # SECURITY: joblib.load unpickles arbitrary objects; only load artifacts
    # that were produced and staged by a trusted source.
    tokenizer = joblib.load(os.path.join(import_dir, 'arctic_tokenizer_xs.joblib'))
    model = joblib.load(os.path.join(import_dir, 'arctic_model_xs.joblib'))
    return tokenizer, model
    
# Actual Embedding-Function that will be registered in Snowflake
def embed_arctic_xs_opt(text: pd.Series) -> pd.Series:
    """Embed a batch of texts with the staged Arctic XS model.

    Args:
        text: Batch of input strings, one per row.

    Returns:
        A Series with one embedding (a list of floats) per input row, in the
        same order as ``text``.
    """
    import sys
    import torch
    IMPORT_DIRECTORY_NAME = "snowflake_import_directory"
    # Directory holding the files listed in the UDF's `imports`.
    import_dir = sys._xoptions[IMPORT_DIRECTORY_NAME]
    tokenizer, model = load_model_opt(import_dir)
    encoded = tokenizer(text.tolist(), padding=True, truncation=True, return_tensors='pt', max_length=512)
    # Inference only: disabling autograd avoids building a gradient graph
    # that would never be used.
    with torch.no_grad():
        # First model output, first token position of every sequence
        # (presumably the [CLS] token for this tokenizer -- confirm against the model card).
        embeddings = model(**encoded)[0][:, 0]
    # Return a Series, as the signature declares, instead of a bare list.
    return pd.Series(embeddings.tolist())

# Register Function as Vectorized Pandas UDF
# Staged artifacts that are made available to the UDF through its import directory.
staged_model_files = [
    '@hf_models/arctic_model_xs.joblib',
    '@hf_models/arctic_tokenizer_xs.joblib',
]
# Packages requested for the UDF's server-side environment.
udf_packages = ['pytorch', 'transformers', 'cachetools', 'joblib']

# NOTE: after this assignment the name `embed_arctic_xs_opt` refers to the
# registered UDF object, no longer to the plain Python function defined above.
embed_arctic_xs_opt = pandas_udf(
    func=embed_arctic_xs_opt,
    name="EMBED_ARCTIC_XS_OPT",
    input_types=[T.StringType()],
    return_type=T.ArrayType(),
    packages=udf_packages,
    imports=staged_model_files,
    external_access_integrations=['HF_INT'],
    stage_location='@FUNCTIONS',
    is_permanent=True,
    replace=True,
    session=session,
)

Package 'pytorch' is not installed in the local environment. Your UDF might not work when the package is installed on the server but not on your local environment.
The version of package 'cachetools' in the local environment is 5.3.3, which does not fit the criteria for the requirement 'cachetools'. Your UDF might not work when the package version is different between the server and your local environment.
The version of package 'joblib' in the local environment is 1.4.0, which does not fit the criteria for the requirement 'joblib'. Your UDF might not work when the package version is different between the server and your local environment.


In [15]:
# Run Embedding Model: embed every TEXT row and cast the resulting array
# to a 384-dimensional float vector column.
embedding_vector_type = T.VectorType(float, 384)
embedding_column = embed_arctic_xs_opt('TEXT').cast(embedding_vector_type)
embedded_df_xs = df.with_column('EMBEDDING_XS', embedding_column)

# Persist the result, replacing any previous EMBEDDINGS_XS table
embedded_df_xs.write.save_as_table('EMBEDDINGS_XS', mode='overwrite')

# Outputs: read the stored table back and preview it
embedded_df_xs = session.table('EMBEDDINGS_XS')
embedded_df_xs.show()

-----------------------------------------------------------------------------------------------------------
|"TEXT"                                              |"EMBEDDING_XS"                                      |
-----------------------------------------------------------------------------------------------------------
|Technological innovations in business, like blo...  |[0.1598115861415863, 0.6208478808403015, -0.335...  |
|Ethical considerations are crucial in the devel...  |[-0.29632332921028137, 0.563687801361084, -0.23...  |
|In business, strategic mergers and acquisitions...  |[0.21651607751846313, 0.7585282325744629, 0.070...  |
|The potential for AI to improve healthcare is v...  |[-0.0001352909894194454, 0.40068864822387695, 0...  |
|AI technologies, such as machine learning and d...  |[0.2519180178642273, 0.3343959450721741, -0.007...  |
|The intersection of sports and politics often m...  |[-0.4530579745769501, 0.24810077250003815, 0.19...  |
|AI refers to the simulation

In [16]:
# Calculate distances to search query
# Embed the search text with the same UDF and vector type as the stored rows.
search_embedding = embed_arctic_xs_opt(F.lit(search_text)).cast(T.VectorType(float, 384))

# Keep the DataFrame in `distances`; show() only prints and returns None,
# so chaining it onto the assignment would leave `distances` bound to None.
distances = embedded_df_xs.with_column(
    'VECTOR_DISTANCE_XS_EMBEDDING',
    F.vector_l2_distance(F.col('EMBEDDING_XS'), search_embedding),
).order_by('VECTOR_DISTANCE_XS_EMBEDDING')

distances.show()

--------------------------------------------------------------------------------------------------------------------------------------------
|"TEXT"                                              |"EMBEDDING_XS"                                      |"VECTOR_DISTANCE_XS_EMBEDDING"  |
--------------------------------------------------------------------------------------------------------------------------------------------
|AI technologies, such as machine learning and d...  |[0.2519180178642273, 0.3343959450721741, -0.007...  |5.26300961887902                |
|As AI continues to evolve, the need for skilled...  |[-0.2427491992712021, 0.6662875413894653, -0.07...  |5.374090073325059               |
|The potential for AI to improve healthcare is v...  |[-0.0001352909894194454, 0.40068864822387695, 0...  |5.620593230951895               |
|AI refers to the simulation of human intelligen...  |[0.28594380617141724, 0.23065607249736786, -0.1...  |6.300694067127768               |
|Ethical cons