# Using a language model in Snowflake

Example notebook showing how to use a large language model inside Snowflake.
We need the specific Python `transformers` version 4.14.1, because that version is also available in the Snowpark Anaconda channel and the local and server-side package versions should match.

Install:
pip install transformers==4.14.1 --user
pip install torch

In short, perform the following steps:
* load a model with the `transformers` package,
* dump the model to disk with joblib,
* create a STAGE in Snowflake and upload the dumped model there,
* write a Python UDF that reads/imports the model and scores a text column in a table.


In [2]:
#### function to see the current Snowflake Environment Details
def current_snowflake_env(snowpark_session=None):
    """Print user, role, database, schema, warehouse and version of a Snowflake session.

    Parameters
    ----------
    snowpark_session : optional
        Any object exposing ``sql(query).collect()``. When omitted, the
        notebook-level ``session`` global is used, which is what the
        zero-argument call has always relied on.

    Returns
    -------
    None
        The details are written to stdout.
    """
    if snowpark_session is None:
        # Looked up at call time, so the function can be defined before the
        # notebook creates `session`.
        snowpark_session = session

    env_row = snowpark_session.sql(
        'select current_user(), current_role(), current_database(), current_schema(), current_version(), current_warehouse()'
    ).collect()[0]

    # (output template, position in the SELECT list). The warehouse is printed
    # before the version although it is selected after it.
    report_layout = (
        ('User                     : {}', 0),
        ('Role                     : {}', 1),
        ('Database                 : {}', 2),
        ('Schema                   : {}', 3),
        ('Warehouse                : {}', 5),
        ('Snowflake version        : {}', 4),
    )
    for template, position in report_layout:
        print(template.format(env_row[position]))


In [3]:
import pandas as pd

from snowflake.snowpark import Session
from snowflake.snowpark import functions as F

### import connection parameters such as account, user, password, warehouse, database, schema
from connection_config import connection_parameters

#### Open a Snowpark session from the imported connection parameters,
#### then print the environment that session is connected to
session = (
    Session.builder
    .configs(connection_parameters)
    .create()
)
current_snowflake_env()

User                     : LONGHOW
Role                     : SNOWPARK_USERS
Database                 : DS_DATA
Schema                   : PUBLIC
Warehouse                : SNOWPARKOPTIMIZEDM
Snowflake version        : 7.19.2


Download Facebook's bart-large-mnli language model, see [here](https://huggingface.co/facebook/bart-large-mnli)

In [4]:
from transformers import pipeline
# Build a zero-shot classification pipeline backed by facebook/bart-large-mnli.
classifier = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli")



  from .autonotebook import tqdm as notebook_tqdm


### Example of classifying a movie description

In [5]:
# A movie synopsis and the candidate genres it is ranked against.
sequence_to_classify = (
    "In a postapocalyptic world, rag-doll robots hide in fear from dangerous machines "
    "out to exterminate them, until a brave newcomer joins the group"
)
movie_genres = [
    "Action", "Comedy", "Drama", "Thriller", "Horror",
    "Science Fiction", "Romance", "Adventure", "Fantasy", "Documentary",
]

### default (single-label) mode: the genres compete with each other and the
### returned scores add up to 1
classifier(sequence_to_classify, movie_genres)

{'sequence': 'In a postapocalyptic world, rag-doll robots hide in fear from dangerous machines out to exterminate them, until a brave newcomer joins the group',
 'labels': ['Science Fiction',
  'Action',
  'Documentary',
  'Thriller',
  'Adventure',
  'Fantasy',
  'Horror',
  'Romance',
  'Drama',
  'Comedy'],
 'scores': [0.2539590001106262,
  0.20386196672916412,
  0.11335238069295883,
  0.10166659206151962,
  0.09251900017261505,
  0.0795774832367897,
  0.0608665831387043,
  0.038360532373189926,
  0.033384546637535095,
  0.022451993077993393]}

In [7]:
# Re-uses `sequence_to_classify` and `movie_genres` from the single-label example
# instead of re-declaring identical copies of both.

### multi_label=True: each genre is scored independently of the others, so
### several genres can score high at once and the scores no longer sum to 1
classifier(sequence_to_classify, movie_genres, multi_label=True)

{'sequence': 'In a postapocalyptic world, rag-doll robots hide in fear from dangerous machines out to exterminate them, until a brave newcomer joins the group',
 'labels': ['Science Fiction',
  'Action',
  'Thriller',
  'Documentary',
  'Fantasy',
  'Adventure',
  'Horror',
  'Drama',
  'Romance',
  'Comedy'],
 'scores': [0.13881969451904297,
  0.04137067124247551,
  0.005452941171824932,
  0.004392283037304878,
  0.0035810391418635845,
  0.0020033405162394047,
  0.0005535491509363055,
  0.00013339013094082475,
  0.00012060704466421157,
  7.123382238205522e-05]}

In [10]:
## serialise the in-memory pipeline to a local joblib file
import joblib
joblib.dump(value=classifier, filename='bart-large-mnli.joblib')

['bart-large-mnli.joblib']

### create stage in the snowflake environment

In [9]:
# Create the stage that will hold the model file; a no-op when it already exists.
session.sql(
    "CREATE STAGE IF NOT EXISTS DS_DATA.PUBLIC.ZERO_SHOT_CLASSIFICATION"
).collect()

[Row(status='ZERO_SHOT_CLASSIFICATION already exists, statement succeeded.')]

In [11]:
### upload the model file dumped earlier into the Snowflake STAGE
### (auto_compress=False keeps the staged file name equal to the local one)
session.file.put(
    local_file_name='bart-large-mnli.joblib',
    stage_location='DS_DATA.PUBLIC.ZERO_SHOT_CLASSIFICATION',
    auto_compress=False,
    overwrite=True,
)

[PutResult(source='bart-large-mnli.joblib', target='bart-large-mnli.joblib', source_size=1630942026, target_size=1630942032, source_compression='NONE', target_compression='NONE', status='UPLOADED', message='')]

### Create a UDF so that we can use the language model in Snowflake

In [12]:
# Caching the model
import cachetools
import sys
@cachetools.cached(cache={})
def read_model():
    """Load the serialised classifier from the Snowflake UDF import directory.

    The function takes no arguments and is wrapped in ``cachetools.cached``,
    so the loaded object is memoised and repeated calls reuse it instead of
    reading the file again.

    Returns
    -------
    object or None
        Whatever is stored in ``bart-large-mnli.joblib`` inside the directory
        named by the ``snowflake_import_directory`` interpreter option, or
        ``None`` when that option is not set (presumably when called outside
        a Snowflake UDF).
    """
    # Imported here so the function does not depend on another notebook cell
    # having imported `joblib` before the UDF is registered.
    import os
    import joblib

    import_dir = sys._xoptions.get("snowflake_import_directory")
    if not import_dir:
        return None

    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary
    # code; only load model files that you staged yourself.
    return joblib.load(os.path.join(import_dir, 'bart-large-mnli.joblib'))

In [27]:
from snowflake.snowpark.functions import pandas_udf
from snowflake.snowpark.types import StringType, PandasSeriesType
@pandas_udf(
       name='DS_DATA.PUBLIC.classify_movie_into_genre',
       session=session,
       is_permanent=True,
       replace=True,
       imports=[
           # Fully qualified, like `name` and `stage_location`, so registration does
           # not depend on the session's current database/schema.
           '@DS_DATA.PUBLIC.ZERO_SHOT_CLASSIFICATION/bart-large-mnli.joblib'
       ],
       input_types=[PandasSeriesType(StringType())],
       return_type=PandasSeriesType(StringType()),
       stage_location='DS_DATA.PUBLIC.ZERO_SHOT_CLASSIFICATION',
       packages=['cachetools==4.2.2', 'transformers==4.14.1']
   )
def get_review_classification(sentences: pd.Series) -> pd.Series:
    """Return the highest-scoring movie genre for every text in ``sentences``.

    Parameters
    ----------
    sentences : pd.Series
        Batch of texts (e.g. movie descriptions) to classify.

    Returns
    -------
    pd.Series
        One entry per input, in input order: the label with the highest score,
        or ``None`` when the input is null or the classifier result has no
        ``'scores'``/``'labels'`` keys.
    """
    # Candidate labels for the zero-shot classifier
    movie_genres = [
        "Action",
        "Comedy",
        "Drama",
        "Thriller",
        "Horror",
        "Science Fiction",
        "Romance",
        "Adventure",
        "Fantasy",
        "Documentary"
    ]
    classifier = read_model()

    predictions = []
    for sentence in sentences:
        # A NULL value would otherwise be passed to the pipeline and fail the
        # whole batch; map it to a NULL result instead.
        if pd.isna(sentence):
            predictions.append(None)
            continue

        result = classifier(sentence, movie_genres)
        if 'scores' in result and 'labels' in result:
            scores = result['scores']
            # Position of the first maximum score, without building a Series per row
            best_idx = max(range(len(scores)), key=scores.__getitem__)
            predictions.append(result['labels'][best_idx])
        else:
            predictions.append(None)
    return pd.Series(predictions)

The version of package cachetools in the local environment is 5.3.0, which does not fit the criteria for the requirement cachetools==4.2.2. Your UDF might not work when the package version is different between the server and your local environment


In [12]:
### now you can call the classify_movie_into_genre UDF on data in SQL.
### The UDF is referenced by its fully qualified name (the name it was registered
### under), so the query does not depend on the session's current database/schema.

SQL = """ 
SELECT
    TITLE,
    LISTED_IN,
    DESCRIPTION,
    DS_DATA.PUBLIC.classify_movie_into_genre(DESCRIPTION::VARCHAR)  as genre
FROM 
    NFLX_SHARE2.PUBLIC.NFLX 
WHERE TYPE = 'Movie'
LIMIT 100
"""

movies = session.sql(SQL)

In [13]:
# Print the first rows of the result; n=10 is the documented default of show().
movies.show(n=10)

---------------------------------------------------------------------------------------------------------------------------------------
|"TITLE"  |"LISTED_IN"                                         |"DESCRIPTION"                                       |"GENRE"          |
---------------------------------------------------------------------------------------------------------------------------------------
|7:19     |Dramas, International Movies                        |After a devastating earthquake hits Mexico City...  |Action           |
|23:59    |Horror Movies, International Movies                 |When an army recruit is found dead, his fellow ...  |Horror           |
|9        |Action & Adventure, Independent Movies, Sci-Fi ...  |In a postapocalyptic world, rag-doll robots hid...  |Science Fiction  |
|21       |Dramas                                              |A brilliant group of students become card-count...  |Action           |
|122      |Horror Movies, International Movies  