# Imports

In [1]:
import warnings
warnings.filterwarnings("ignore")

# Load variables
import os
from dotenv import load_dotenv
load_dotenv()

# Snowpark Imports
from snowflake.snowpark.session import Session
import snowflake.snowpark.functions as F
from snowflake.snowpark.functions import pandas_udf
import snowflake.snowpark.types as T

# Other
import pandas as pd
from cachetools import cached

# Connect to Snowflake

In [2]:
# Connection settings are read from SF_* environment variables, one per config key,
# so no credentials are written into the notebook itself.
snowflake_connection_cfg = {
    key: os.getenv(f'SF_{key}')
    for key in ("ACCOUNT", "USER", "ROLE", "PASSWORD", "DATABASE", "SCHEMA", "WAREHOUSE")
}

# Open the Snowpark session, then switch to the Snowpark-optimized warehouse
# (this replaces whatever warehouse the connection config selected).
session = Session.builder.configs(snowflake_connection_cfg).create()
session.use_warehouse('snowpark_opt_wh')

# Echo the active context so the reader can see where the following cells run.
for label, current_value in (
    ('Role:     ', session.get_current_role()),
    ('Warehouse:', session.get_current_warehouse()),
    ('Database: ', session.get_current_database()),
    ('Schema:   ', session.get_current_schema()),
):
    print(label, current_value)

Role:      "ACCOUNTADMIN"
Warehouse: "SNOWPARK_OPT_WH"
Database:  "MACHINE_LEARNING"
Schema:    "PUBLIC"


## Create an External Access Integration
This allows Snowflake to download models from Hugging Face.

In [3]:
# Egress network rule listing the two Hugging Face hosts the UDFs are allowed to contact.
create_network_rule_sql = """CREATE OR REPLACE NETWORK RULE hf_rule
                  MODE = EGRESS
                  TYPE = HOST_PORT
                  VALUE_LIST = ('huggingface.co','cdn-lfs-us-1.huggingface.co')"""

# External access integration built on that rule; the UDFs below reference it as HF_INT.
create_integration_sql = """CREATE OR REPLACE EXTERNAL ACCESS INTEGRATION hf_int
                  ALLOWED_NETWORK_RULES = (hf_rule)
                  ENABLED = true;"""

session.sql(create_network_rule_sql).collect()
session.sql(create_integration_sql).collect()

[Row(status='Integration HF_INT successfully created.')]

## Create some Test Data
The goal is to retrieve the sentences related to AI; the other sentences should have a larger distance to the search text.

In [4]:
# Sentences that are about AI: these should end up closest to the search text.
ai_sentences = [
    "AI refers to the simulation of human intelligence in machines that are programmed to think like humans and mimic their actions.",
    "The potential for AI to improve healthcare is vast, with applications ranging from diagnostic tools to personalized medicine.",
    "AI technologies, such as machine learning and deep learning, are increasingly integral to advancements in sectors like finance, transportation, and security.",
    "Ethical considerations are crucial in the development and deployment of AI, ensuring technologies are used responsibly and do not perpetuate biases.",
    "As AI continues to evolve, the need for skilled professionals to design, manage, and oversee these systems is growing exponentially.",
]

# Sentences about politics, sports and business, used as contrast.
mixed_topics_sentences = [
    "Political campaigns increasingly leverage data analytics to target voters and optimize their messages.",
    "Major sporting events, such as the Olympics and the FIFA World Cup, significantly boost the economy of the host country through tourism and infrastructure improvements.",
    "In business, strategic mergers and acquisitions can reshape industries, creating new market leaders and altering competitive dynamics.",
    "The intersection of sports and politics often manifests when athletes take public stands on political issues, influencing public opinion and policy.",
    "Technological innovations in business, like blockchain and AI, are becoming pivotal in enhancing transparency and efficiency in financial transactions and governance.",
]

# Query text whose embedding every stored sentence is compared against.
search_text = (
    'AI is transforming industries by automating complex processes, enhancing decision-making with predictive analytics, and personalizing user experiences at an unprecedented scale.'
)

# One-column Snowpark DataFrame holding all ten sentences (AI ones first).
df = session.create_dataframe(
    [*ai_sentences, *mixed_topics_sentences],
    schema=['TEXT'],
)
df.show()

------------------------------------------------------
|"TEXT"                                              |
------------------------------------------------------
|AI refers to the simulation of human intelligen...  |
|The potential for AI to improve healthcare is v...  |
|AI technologies, such as machine learning and d...  |
|Ethical considerations are crucial in the devel...  |
|As AI continues to evolve, the need for skilled...  |
|Political campaigns increasingly leverage data ...  |
|Major sporting events, such as the Olympics and...  |
|In business, strategic mergers and acquisitions...  |
|The intersection of sports and politics often m...  |
|Technological innovations in business, like blo...  |
------------------------------------------------------



## Create the Functions for Arctic Models from Snowflake
<b>Note:</b>  
These functions will download the model when being executed.  
For <b><u>optimal performance</u></b>, I strongly recommend uploading the model files to a Snowflake stage first and then pointing the functions to the model files on the stage.  
You will find this optimized approach at the end of this notebook.

In [5]:
# Stage that holds the code of the permanent UDFs registered below
# (they all use stage_location='@FUNCTIONS').
create_functions_stage_sql = 'CREATE OR REPLACE STAGE FUNCTIONS'
session.sql(create_functions_stage_sql).collect()

[Row(status='Stage area FUNCTIONS successfully created.')]

# Arctic XS Model

In [6]:
@cached(cache={})
def load_model() -> tuple:
    """Load the tokenizer and model for ``Snowflake/snowflake-arctic-embed-xs``.

    The ``cached`` decorator stores the result in a dict, so repeated calls in the
    same Python process reuse the already loaded pair instead of loading it again.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # Imports are local so they are resolved in the environment where the UDF runs.
    import os
    os.environ["HF_HOME"] = '/tmp'  # Hugging Face cache directory
    from transformers import AutoModel, AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("Snowflake/snowflake-arctic-embed-xs")
    model = AutoModel.from_pretrained("Snowflake/snowflake-arctic-embed-xs", add_pooling_layer=False)
    return tokenizer, model


def embed_arctic_xs(text: pd.Series) -> pd.Series:
    """Embed a batch of strings with the Arctic XS model.

    Args:
        text: Batch of input strings passed in by the vectorized UDF.

    Returns:
        A Series with one embedding (list of floats) per input row, in input order.
    """
    import torch
    tokenizer, model = load_model()
    encoded = tokenizer(text.tolist(), padding=True, truncation=True, return_tensors='pt', max_length=512)
    # Inference only: skip autograd bookkeeping so no gradient graph is kept per batch.
    with torch.no_grad():
        # Element 0 of the model output, then position 0 of every sequence
        # (presumably the [CLS] token of the last hidden state — TODO confirm).
        embeddings = model(**encoded)[0][:, 0]
    # Return a Series so the result matches the declared return annotation.
    return pd.Series(embeddings.tolist())

# Register the handler as a permanent vectorized UDF; from here on the name
# `embed_arctic_xs` refers to the registered UDF, not to the local Python function.
embed_arctic_xs = pandas_udf(
    session=session,
    name="EMBED_ARCTIC_XS", 
    func=embed_arctic_xs,
    return_type=T.ArrayType(), 
    input_types=[T.StringType()], 
    packages=['pytorch','transformers','cachetools'], 
    is_permanent=True, 
    replace=True,
    stage_location='@FUNCTIONS', 
    external_access_integrations=['HF_INT']
    )

Package 'pytorch' is not installed in the local environment. Your UDF might not work when the package is installed on the server but not on your local environment.
The version of package 'cachetools' in the local environment is 5.3.3, which does not fit the criteria for the requirement 'cachetools'. Your UDF might not work when the package version is different between the server and your local environment.


In [7]:
# Embed every sentence with the XS UDF, cast the array to a 384-dimensional
# float vector and persist the result.
df.with_column(
    'EMBEDDING_XS',
    embed_arctic_xs('TEXT').cast(T.VectorType(float,384)),
).write.save_as_table('EMBEDDINGS_XS', mode='overwrite')

# Work with the persisted table from here on and preview it.
embedded_df_xs = session.table('EMBEDDINGS_XS')
embedded_df_xs.show()

-----------------------------------------------------------------------------------------------------------
|"TEXT"                                              |"EMBEDDING_XS"                                      |
-----------------------------------------------------------------------------------------------------------
|The potential for AI to improve healthcare is v...  |[-0.0001352909894194454, 0.40068864822387695, 0...  |
|Political campaigns increasingly leverage data ...  |[-0.16542716324329376, 0.32436394691467285, -0....  |
|Major sporting events, such as the Olympics and...  |[0.16280654072761536, 1.099736213684082, 0.1863...  |
|The intersection of sports and politics often m...  |[-0.4530579745769501, 0.24810077250003815, 0.19...  |
|As AI continues to evolve, the need for skilled...  |[-0.2427491992712021, 0.6662875413894653, -0.07...  |
|AI technologies, such as machine learning and d...  |[0.2519180178642273, 0.3343959450721741, -0.007...  |
|Ethical considerations are 

In [8]:
# Calculate distances to search query
# Keep the sorted DataFrame in `distances`: DataFrame.show() only prints and returns
# None, so chaining .show() into the assignment would leave `distances` empty.
distances = embedded_df_xs.with_column(
    'VECTOR_DISTANCE_XS_EMBEDDING', 
    F.vector_l2_distance(
        F.col('EMBEDDING_XS'),
        embed_arctic_xs(F.lit(search_text)).cast(T.VectorType(float,384))
    )
).order_by('VECTOR_DISTANCE_XS_EMBEDDING')
distances.show()

--------------------------------------------------------------------------------------------------------------------------------------------
|"TEXT"                                              |"EMBEDDING_XS"                                      |"VECTOR_DISTANCE_XS_EMBEDDING"  |
--------------------------------------------------------------------------------------------------------------------------------------------
|AI technologies, such as machine learning and d...  |[0.2519180178642273, 0.3343959450721741, -0.007...  |5.26300961887902                |
|As AI continues to evolve, the need for skilled...  |[-0.2427491992712021, 0.6662875413894653, -0.07...  |5.374090073325059               |
|The potential for AI to improve healthcare is v...  |[-0.0001352909894194454, 0.40068864822387695, 0...  |5.620593230951895               |
|AI refers to the simulation of human intelligen...  |[0.28594380617141724, 0.23065607249736786, -0.1...  |6.300694067127768               |
|Ethical cons

# Arctic S Model

In [9]:
@cached(cache={})
def load_model() -> tuple:
    """Load the tokenizer and model for ``Snowflake/snowflake-arctic-embed-s``.

    The ``cached`` decorator stores the result in a dict, so repeated calls in the
    same Python process reuse the already loaded pair instead of loading it again.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # Imports are local so they are resolved in the environment where the UDF runs.
    import os
    os.environ["HF_HOME"] = '/tmp'  # Hugging Face cache directory
    from transformers import AutoModel, AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("Snowflake/snowflake-arctic-embed-s")
    model = AutoModel.from_pretrained("Snowflake/snowflake-arctic-embed-s", add_pooling_layer=False)
    return tokenizer, model


def embed_arctic_s(text: pd.Series) -> pd.Series:
    """Embed a batch of strings with the Arctic S model.

    Args:
        text: Batch of input strings passed in by the vectorized UDF.

    Returns:
        A Series with one embedding (list of floats) per input row, in input order.
    """
    import torch
    tokenizer, model = load_model()
    encoded = tokenizer(text.tolist(), padding=True, truncation=True, return_tensors='pt', max_length=512)
    # Inference only: skip autograd bookkeeping so no gradient graph is kept per batch.
    with torch.no_grad():
        # Element 0 of the model output, then position 0 of every sequence
        # (presumably the [CLS] token of the last hidden state — TODO confirm).
        embeddings = model(**encoded)[0][:, 0]
    # Return a Series so the result matches the declared return annotation.
    return pd.Series(embeddings.tolist())

# Register the handler as a permanent vectorized UDF; from here on the name
# `embed_arctic_s` refers to the registered UDF, not to the local Python function.
embed_arctic_s = pandas_udf(
    session=session,
    name="EMBED_ARCTIC_S", 
    func=embed_arctic_s,
    return_type=T.ArrayType(), 
    input_types=[T.StringType()], 
    packages=['pytorch','transformers','cachetools'], 
    is_permanent=True, 
    replace=True,
    stage_location='@FUNCTIONS', 
    external_access_integrations=['HF_INT']
    )

Package 'pytorch' is not installed in the local environment. Your UDF might not work when the package is installed on the server but not on your local environment.
The version of package 'cachetools' in the local environment is 5.3.3, which does not fit the criteria for the requirement 'cachetools'. Your UDF might not work when the package version is different between the server and your local environment.


In [10]:
# Embed every sentence with the S UDF, cast the array to a 384-dimensional
# float vector and persist the result.
df.with_column(
    'EMBEDDING_S',
    embed_arctic_s('TEXT').cast(T.VectorType(float,384)),
).write.save_as_table('EMBEDDINGS_S', mode='overwrite')

# Work with the persisted table from here on and preview it.
embedded_df_s = session.table('EMBEDDINGS_S')
embedded_df_s.show()

-----------------------------------------------------------------------------------------------------------
|"TEXT"                                              |"EMBEDDING_S"                                       |
-----------------------------------------------------------------------------------------------------------
|Ethical considerations are crucial in the devel...  |[-0.07469506561756134, -0.08195497840642929, -0...  |
|As AI continues to evolve, the need for skilled...  |[-0.049789149314165115, -0.3286069929599762, -0...  |
|In business, strategic mergers and acquisitions...  |[0.18188203871250153, -0.2920404374599457, -0.1...  |
|Technological innovations in business, like blo...  |[-0.21616318821907043, -0.3488922715187073, -0....  |
|The intersection of sports and politics often m...  |[-0.17553821206092834, 0.049565475434064865, -0...  |
|The potential for AI to improve healthcare is v...  |[-0.12555864453315735, -0.1989528238773346, -0....  |
|Political campaigns increas

In [11]:
# Calculate distances to search query
# Keep the sorted DataFrame in `distances`: DataFrame.show() only prints and returns
# None, so chaining .show() into the assignment would leave `distances` empty.
distances = embedded_df_s.with_column(
    'VECTOR_DISTANCE_S_EMBEDDING', 
    F.vector_l2_distance(
        F.col('EMBEDDING_S'),
        embed_arctic_s(F.lit(search_text)).cast(T.VectorType(float,384))
    )
).order_by('VECTOR_DISTANCE_S_EMBEDDING')
distances.show()

-------------------------------------------------------------------------------------------------------------------------------------------
|"TEXT"                                              |"EMBEDDING_S"                                       |"VECTOR_DISTANCE_S_EMBEDDING"  |
-------------------------------------------------------------------------------------------------------------------------------------------
|AI technologies, such as machine learning and d...  |[-0.01815970055758953, -0.3376675248146057, -0....  |2.2046929589845377             |
|As AI continues to evolve, the need for skilled...  |[-0.049789149314165115, -0.3286069929599762, -0...  |2.2446368194301183             |
|The potential for AI to improve healthcare is v...  |[-0.12555864453315735, -0.1989528238773346, -0....  |2.3709271292298126             |
|Technological innovations in business, like blo...  |[-0.21616318821907043, -0.3488922715187073, -0....  |2.7660335950155925             |
|AI refers to the si

# Arctic M Model

In [12]:
@cached(cache={})
def load_model() -> tuple:
    """Load the tokenizer and model for ``Snowflake/snowflake-arctic-embed-m``.

    The ``cached`` decorator stores the result in a dict, so repeated calls in the
    same Python process reuse the already loaded pair instead of loading it again.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # Imports are local so they are resolved in the environment where the UDF runs.
    import os
    os.environ["HF_HOME"] = '/tmp'  # Hugging Face cache directory
    from transformers import AutoModel, AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("Snowflake/snowflake-arctic-embed-m")
    model = AutoModel.from_pretrained("Snowflake/snowflake-arctic-embed-m", add_pooling_layer=False)
    return tokenizer, model


def embed_arctic_m(text: pd.Series) -> pd.Series:
    """Embed a batch of strings with the Arctic M model.

    Args:
        text: Batch of input strings passed in by the vectorized UDF.

    Returns:
        A Series with one embedding (list of floats) per input row, in input order.
    """
    import torch
    tokenizer, model = load_model()
    encoded = tokenizer(text.tolist(), padding=True, truncation=True, return_tensors='pt', max_length=512)
    # Inference only: skip autograd bookkeeping so no gradient graph is kept per batch.
    with torch.no_grad():
        # Element 0 of the model output, then position 0 of every sequence
        # (presumably the [CLS] token of the last hidden state — TODO confirm).
        embeddings = model(**encoded)[0][:, 0]
    # Return a Series so the result matches the declared return annotation.
    return pd.Series(embeddings.tolist())

# Register the handler as a permanent vectorized UDF; from here on the name
# `embed_arctic_m` refers to the registered UDF, not to the local Python function.
embed_arctic_m = pandas_udf(
    session=session,
    name="EMBED_ARCTIC_M", 
    func=embed_arctic_m,
    return_type=T.ArrayType(), 
    input_types=[T.StringType()], 
    packages=['pytorch','transformers','cachetools'], 
    is_permanent=True, 
    replace=True,
    stage_location='@FUNCTIONS', 
    external_access_integrations=['HF_INT']
    )

Package 'pytorch' is not installed in the local environment. Your UDF might not work when the package is installed on the server but not on your local environment.
The version of package 'cachetools' in the local environment is 5.3.3, which does not fit the criteria for the requirement 'cachetools'. Your UDF might not work when the package version is different between the server and your local environment.


In [13]:
# Embed every sentence with the M UDF, cast the array to a 768-dimensional
# float vector and persist the result.
df.with_column(
    'EMBEDDING_M',
    embed_arctic_m('TEXT').cast(T.VectorType(float,768)),
).write.save_as_table('EMBEDDINGS_M', mode='overwrite')

# Work with the persisted table from here on and preview it.
embedded_df_m = session.table('EMBEDDINGS_M')
embedded_df_m.show()

-----------------------------------------------------------------------------------------------------------
|"TEXT"                                              |"EMBEDDING_M"                                       |
-----------------------------------------------------------------------------------------------------------
|As AI continues to evolve, the need for skilled...  |[0.26411405205726624, 0.2557848393917084, -0.26...  |
|Ethical considerations are crucial in the devel...  |[-0.15665480494499207, 0.6648065447807312, -0.4...  |
|Political campaigns increasingly leverage data ...  |[-0.07612626254558563, 0.11559521406888962, -0....  |
|The intersection of sports and politics often m...  |[0.15341803431510925, 0.15423737466335297, -0.7...  |
|AI refers to the simulation of human intelligen...  |[0.0372220054268837, 0.5137598514556885, -0.245...  |
|The potential for AI to improve healthcare is v...  |[0.5622907280921936, 0.2611202001571655, -0.300...  |
|AI technologies, such as ma

In [14]:
# Calculate distances to search query
# Keep the sorted DataFrame in `distances`: DataFrame.show() only prints and returns
# None, so chaining .show() into the assignment would leave `distances` empty.
distances = embedded_df_m.with_column(
    'VECTOR_DISTANCE_M_EMBEDDING', 
    F.vector_l2_distance(
        F.col('EMBEDDING_M'),
        embed_arctic_m(F.lit(search_text)).cast(T.VectorType(float,768))
    )
).order_by('VECTOR_DISTANCE_M_EMBEDDING')
distances.show()

-------------------------------------------------------------------------------------------------------------------------------------------
|"TEXT"                                              |"EMBEDDING_M"                                       |"VECTOR_DISTANCE_M_EMBEDDING"  |
-------------------------------------------------------------------------------------------------------------------------------------------
|AI technologies, such as machine learning and d...  |[0.11811910569667816, 0.06497251987457275, -0.2...  |6.228082296681562              |
|The potential for AI to improve healthcare is v...  |[0.5622907280921936, 0.2611202001571655, -0.300...  |6.797850089449093              |
|Technological innovations in business, like blo...  |[0.1994236260652542, 0.2557758390903473, -0.275...  |7.604252589188865              |
|AI refers to the simulation of human intelligen...  |[0.0372220054268837, 0.5137598514556885, -0.245...  |7.612079116386081              |
|As AI continues to 

# Arctic L Model

In [15]:
@cached(cache={})
def load_model() -> tuple:
    """Load the tokenizer and model for ``Snowflake/snowflake-arctic-embed-l``.

    The ``cached`` decorator stores the result in a dict, so repeated calls in the
    same Python process reuse the already loaded pair instead of loading it again.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # Imports are local so they are resolved in the environment where the UDF runs.
    import os
    os.environ["HF_HOME"] = '/tmp'  # Hugging Face cache directory
    from transformers import AutoModel, AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("Snowflake/snowflake-arctic-embed-l")
    model = AutoModel.from_pretrained("Snowflake/snowflake-arctic-embed-l", add_pooling_layer=False)
    return tokenizer, model


def embed_arctic_l(text: pd.Series) -> pd.Series:
    """Embed a batch of strings with the Arctic L model.

    Args:
        text: Batch of input strings passed in by the vectorized UDF.

    Returns:
        A Series with one embedding (list of floats) per input row, in input order.
    """
    import torch
    tokenizer, model = load_model()
    encoded = tokenizer(text.tolist(), padding=True, truncation=True, return_tensors='pt', max_length=512)
    # Inference only: skip autograd bookkeeping so no gradient graph is kept per batch.
    with torch.no_grad():
        # Element 0 of the model output, then position 0 of every sequence
        # (presumably the [CLS] token of the last hidden state — TODO confirm).
        embeddings = model(**encoded)[0][:, 0]
    # Return a Series so the result matches the declared return annotation.
    return pd.Series(embeddings.tolist())

# Register the handler as a permanent vectorized UDF; from here on the name
# `embed_arctic_l` refers to the registered UDF, not to the local Python function.
embed_arctic_l = pandas_udf(
    session=session,
    name="EMBED_ARCTIC_L", 
    func=embed_arctic_l,
    return_type=T.ArrayType(), 
    input_types=[T.StringType()], 
    packages=['pytorch','transformers','cachetools'], 
    is_permanent=True, 
    replace=True,
    stage_location='@FUNCTIONS', 
    external_access_integrations=['HF_INT']
    )

Package 'pytorch' is not installed in the local environment. Your UDF might not work when the package is installed on the server but not on your local environment.
The version of package 'cachetools' in the local environment is 5.3.3, which does not fit the criteria for the requirement 'cachetools'. Your UDF might not work when the package version is different between the server and your local environment.


In [16]:
# Embed every sentence with the L UDF, cast the array to a 1024-dimensional
# float vector and persist the result.
df.with_column(
    'EMBEDDING_L',
    embed_arctic_l('TEXT').cast(T.VectorType(float,1024)),
).write.save_as_table('EMBEDDINGS_L', mode='overwrite')

# Work with the persisted table from here on and preview it.
embedded_df_l = session.table('EMBEDDINGS_L')
embedded_df_l.show()

-----------------------------------------------------------------------------------------------------------
|"TEXT"                                              |"EMBEDDING_L"                                       |
-----------------------------------------------------------------------------------------------------------
|AI refers to the simulation of human intelligen...  |[0.926182746887207, -1.996230125427246, -0.8437...  |
|The intersection of sports and politics often m...  |[0.23365822434425354, -1.5001283884048462, -0.4...  |
|As AI continues to evolve, the need for skilled...  |[0.3951736390590668, -0.7839894890785217, -0.01...  |
|The potential for AI to improve healthcare is v...  |[0.4619120657444, -1.2435344457626343, -0.52466...  |
|Major sporting events, such as the Olympics and...  |[0.2846733629703522, -1.8977553844451904, -1.29...  |
|Technological innovations in business, like blo...  |[0.3795325458049774, -1.3688700199127197, -0.48...  |
|AI technologies, such as ma

In [17]:
# Calculate distances to search query
# Keep the sorted DataFrame in `distances`: DataFrame.show() only prints and returns
# None, so chaining .show() into the assignment would leave `distances` empty.
distances = embedded_df_l.with_column(
    'VECTOR_DISTANCE_L_EMBEDDING', 
    F.vector_l2_distance(
        F.col('EMBEDDING_L'),
        embed_arctic_l(F.lit(search_text)).cast(T.VectorType(float,1024))
    )
).order_by('VECTOR_DISTANCE_L_EMBEDDING')
distances.show()

-------------------------------------------------------------------------------------------------------------------------------------------
|"TEXT"                                              |"EMBEDDING_L"                                       |"VECTOR_DISTANCE_L_EMBEDDING"  |
-------------------------------------------------------------------------------------------------------------------------------------------
|AI technologies, such as machine learning and d...  |[0.936636745929718, -1.4382154941558838, -0.645...  |8.980712143966812              |
|The potential for AI to improve healthcare is v...  |[0.4619120657444, -1.2435344457626343, -0.52466...  |9.82795266393022               |
|As AI continues to evolve, the need for skilled...  |[0.3951736390590668, -0.7839894890785217, -0.01...  |10.337995974394726             |
|AI refers to the simulation of human intelligen...  |[0.926182746887207, -1.996230125427246, -0.8437...  |11.223088607665352             |
|Technological innov

# Optimal performance
Note:  
This approach is <b><u>much faster</u></b> than downloading the model files with every call.

In [18]:
# Stage for the serialized Hugging Face artifacts; IF NOT EXISTS keeps files
# that were uploaded by an earlier run.
create_models_stage_sql = 'CREATE STAGE IF NOT EXISTS HF_MODELS'
session.sql(create_models_stage_sql).collect()

[Row(status='HF_MODELS already exists, statement succeeded.')]

In [19]:
import os
import joblib
# Set before transformers is imported — presumably so the /tmp cache location is
# picked up by the library; TODO confirm.
os.environ["HF_HOME"] = '/tmp'

from transformers import AutoModel, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("Snowflake/snowflake-arctic-embed-xs")
model = AutoModel.from_pretrained("Snowflake/snowflake-arctic-embed-xs", add_pooling_layer=False)

# Save the tokenizer and model using joblib
joblib.dump(tokenizer, '/tmp/arctic_tokenizer_xs.joblib')
joblib.dump(model, '/tmp/arctic_model_xs.joblib')

# Upload to Snowflake (uncompressed, so the UDF can load the files as they are).
# NOTE: the upload can report status='SKIPPED', as in the recorded output of this cell —
# presumably because the file is already on the stage; verify before relying on a refresh.
session.file.put('/tmp/arctic_tokenizer_xs.joblib', stage_location='@HF_MODELS/', auto_compress=False)
session.file.put('/tmp/arctic_model_xs.joblib', stage_location='@HF_MODELS/', auto_compress=False)

[PutResult(source='arctic_model_xs.joblib', target='arctic_model_xs.joblib', source_size=90321645, target_size=0, source_compression='NONE', target_compression='NONE', status='SKIPPED', message='')]

In [20]:
@cached(cache={})
def load_model_opt(import_dir) -> tuple:
    """Load the Arctic XS tokenizer and model from joblib files in ``import_dir``.

    The ``cached`` decorator stores the result per ``import_dir`` value, so repeated
    calls in the same Python process reuse the already loaded pair.

    Args:
        import_dir: Directory that contains ``arctic_tokenizer_xs.joblib`` and
            ``arctic_model_xs.joblib``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    import os
    import joblib
    # NOTE(security): joblib.load unpickles the files, which can execute arbitrary
    # code — only import artifacts from a stage you control.
    # os.path.join works whether or not import_dir ends with a path separator.
    tokenizer = joblib.load(os.path.join(import_dir, 'arctic_tokenizer_xs.joblib'))
    model = joblib.load(os.path.join(import_dir, 'arctic_model_xs.joblib'))
    return tokenizer, model


def embed_arctic_xs_opt(text: pd.Series) -> pd.Series:
    """Embed a batch of strings with the Arctic XS model loaded from the UDF imports.

    Args:
        text: Batch of input strings passed in by the vectorized UDF.

    Returns:
        A Series with one embedding (list of floats) per input row, in input order.
    """
    import sys
    import torch
    # Directory of the files listed under `imports=` in the registration below.
    IMPORT_DIRECTORY_NAME = "snowflake_import_directory"
    import_dir = sys._xoptions[IMPORT_DIRECTORY_NAME]
    tokenizer, model = load_model_opt(import_dir)
    encoded = tokenizer(text.tolist(), padding=True, truncation=True, return_tensors='pt', max_length=512)
    # Inference only: skip autograd bookkeeping so no gradient graph is kept per batch.
    with torch.no_grad():
        # Element 0 of the model output, then position 0 of every sequence
        # (presumably the [CLS] token of the last hidden state — TODO confirm).
        embeddings = model(**encoded)[0][:, 0]
    # Return a Series so the result matches the declared return annotation.
    return pd.Series(embeddings.tolist())

# Register the handler as a permanent vectorized UDF; from here on the name
# `embed_arctic_xs_opt` refers to the registered UDF, not to the local Python function.
embed_arctic_xs_opt = pandas_udf(
    session=session,
    name="EMBED_ARCTIC_XS_OPT", 
    func=embed_arctic_xs_opt,
    return_type=T.ArrayType(), 
    input_types=[T.StringType()], 
    packages=['pytorch','transformers','cachetools','joblib'], 
    is_permanent=True, 
    replace=True,
    stage_location='@FUNCTIONS', 
    external_access_integrations=['HF_INT'],
    imports=[
        '@hf_models/arctic_model_xs.joblib',
        '@hf_models/arctic_tokenizer_xs.joblib'
        ]
    )

Package 'pytorch' is not installed in the local environment. Your UDF might not work when the package is installed on the server but not on your local environment.
The version of package 'cachetools' in the local environment is 5.3.3, which does not fit the criteria for the requirement 'cachetools'. Your UDF might not work when the package version is different between the server and your local environment.
The version of package 'joblib' in the local environment is 1.4.0, which does not fit the criteria for the requirement 'joblib'. Your UDF might not work when the package version is different between the server and your local environment.


In [21]:
# Embed every sentence with the stage-backed XS UDF, cast the array to a
# 384-dimensional float vector and persist the result (replaces EMBEDDINGS_XS).
df.with_column(
    'EMBEDDING_XS',
    embed_arctic_xs_opt('TEXT').cast(T.VectorType(float,384)),
).write.save_as_table('EMBEDDINGS_XS', mode='overwrite')

# Work with the persisted table from here on and preview it.
embedded_df_xs = session.table('EMBEDDINGS_XS')
embedded_df_xs.show()

-----------------------------------------------------------------------------------------------------------
|"TEXT"                                              |"EMBEDDING_XS"                                      |
-----------------------------------------------------------------------------------------------------------
|As AI continues to evolve, the need for skilled...  |[-0.2427491992712021, 0.6662875413894653, -0.07...  |
|In business, strategic mergers and acquisitions...  |[0.21651607751846313, 0.7585282325744629, 0.070...  |
|AI refers to the simulation of human intelligen...  |[0.28594380617141724, 0.23065607249736786, -0.1...  |
|AI technologies, such as machine learning and d...  |[0.2519180178642273, 0.3343959450721741, -0.007...  |
|Technological innovations in business, like blo...  |[0.1598115861415863, 0.6208478808403015, -0.335...  |
|Major sporting events, such as the Olympics and...  |[0.16280654072761536, 1.099736213684082, 0.1863...  |
|The potential for AI to imp

In [22]:
# Calculate distances to search query
# Keep the sorted DataFrame in `distances`: DataFrame.show() only prints and returns
# None, so chaining .show() into the assignment would leave `distances` empty.
distances = embedded_df_xs.with_column(
    'VECTOR_DISTANCE_XS_EMBEDDING', 
    F.vector_l2_distance(
        F.col('EMBEDDING_XS'),
        embed_arctic_xs_opt(F.lit(search_text)).cast(T.VectorType(float,384))
    )
).order_by('VECTOR_DISTANCE_XS_EMBEDDING')
distances.show()

--------------------------------------------------------------------------------------------------------------------------------------------
|"TEXT"                                              |"EMBEDDING_XS"                                      |"VECTOR_DISTANCE_XS_EMBEDDING"  |
--------------------------------------------------------------------------------------------------------------------------------------------
|AI technologies, such as machine learning and d...  |[0.2519180178642273, 0.3343959450721741, -0.007...  |5.26300961887902                |
|As AI continues to evolve, the need for skilled...  |[-0.2427491992712021, 0.6662875413894653, -0.07...  |5.374090073325059               |
|The potential for AI to improve healthcare is v...  |[-0.0001352909894194454, 0.40068864822387695, 0...  |5.620593230951895               |
|AI refers to the simulation of human intelligen...  |[0.28594380617141724, 0.23065607249736786, -0.1...  |6.300694067127768               |
|Ethical cons

# Performance Test

In [23]:
# Build a 1000-row table that repeats one constant sentence, used as benchmark input.
benchmark_sentence = F.lit('Snowflake is awesome.').as_('TEXT')
session.generator(benchmark_sentence, rowcount=1000).write.save_as_table(
    'PERFORMANCE_TEST',
    mode='overwrite',
)

df_test = session.table('PERFORMANCE_TEST')
df_test.show()
# Last expression of the cell: displays the row count.
df_test.count()

-------------------------
|"TEXT"                 |
-------------------------
|Snowflake is awesome.  |
|Snowflake is awesome.  |
|Snowflake is awesome.  |
|Snowflake is awesome.  |
|Snowflake is awesome.  |
|Snowflake is awesome.  |
|Snowflake is awesome.  |
|Snowflake is awesome.  |
|Snowflake is awesome.  |
|Snowflake is awesome.  |
-------------------------



1000

In [24]:
%%time
# Run Embedding Model and materialize
# Write to a dedicated benchmark table: reusing 'EMBEDDINGS_XS' would overwrite the
# sentence embeddings that `embedded_df_xs` reads from.
df_test.with_column('EMBEDDING_XS', embed_arctic_xs('TEXT').cast(T.VectorType(float,384))).write.save_as_table('PERFORMANCE_TEST_EMBEDDINGS_XS', mode='overwrite')

CPU times: user 7.69 ms, sys: 2.94 ms, total: 10.6 ms
Wall time: 15.8 s


In [25]:
%%time
# Run Embedding Model (Optimized) and materialize
# Write to a dedicated benchmark table: reusing 'EMBEDDINGS_XS' would overwrite the
# sentence embeddings that `embedded_df_xs` reads from.
df_test.with_column('EMBEDDING_XS', embed_arctic_xs_opt('TEXT').cast(T.VectorType(float,384))).write.save_as_table('PERFORMANCE_TEST_EMBEDDINGS_XS_OPT', mode='overwrite')

CPU times: user 7.87 ms, sys: 2.4 ms, total: 10.3 ms
Wall time: 13.9 s
