# Imports

In [None]:
# Import python packages
import warnings
warnings.filterwarnings("ignore")
import streamlit as st

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
from snowflake.core import Root
from snowflake.cortex import complete
# Obtain the Snowpark session that is currently active for this notebook.
session = get_active_session()
# Entry point of the Snowflake Python API (snowflake.core), bound to that session.
# NOTE(review): `root` is not referenced by any cell visible here - confirm it is still needed.
root = Root(session)

# 1. Unstructured Data

## 1.1 Overview
In this notebook we are preparing __audio recordings__ from customer reviews and call center inquiries for usage in Cortex Agents.

Users will be able to ask questions like this to the Agent:
> ___"How did sentiment of customer reviews change for coffee products in 2024 per month?"___ 

This requires turning the audio recordings into a structured dataset which we can then query with standard SQL (using [Cortex Analyst](https://docs.snowflake.com/en/user-guide/snowflake-cortex/cortex-analyst)).

## 2. Turning Unstructured Data into Structured Data
  
With Snowflake's AISQL functions, you can easily generate structured insights from unstructured data.

In this example we transcribe customer reviews from `audio files` and combine them with the reviews already stored in the `__CUSTOMER_REVIEWS` table before we explore sentiments for different categories.

In [None]:
-- Preview a few of the customer reviews that were collected previously
SELECT
  *
FROM
  __CUSTOMER_REVIEWS
LIMIT 5;

In [None]:
-- List a few files from the AUDIO stage together with the MIME type and
-- file modality that Snowflake detects for each of them
SELECT
  RELATIVE_PATH,
  FL_GET_CONTENT_TYPE(TO_FILE(FILE_URL)) AS MIME_TYPE,
  FL_GET_FILE_TYPE(TO_FILE(FILE_URL))    AS FILE_MODALITY
FROM
  DIRECTORY(@AUDIO)
LIMIT 5;

## 2.1 Transcribing Audio Files

Snowflake's [AI_TRANSCRIBE](https://docs.snowflake.com/en/sql-reference/functions/ai_transcribe) function transcribes audio files from a Snowflake stage.

In [None]:
-- Transcribe every audio file in the AUDIO stage and persist one row per file
-- (date key, path, transcript text, audio duration) for the sentiment analysis
-- further below.
CREATE OR REPLACE TABLE _CUSTOMER_REVIEW_RAW_TRANSCRIPTS AS
WITH TRANSCRIBED_CUSTOMER_REVIEWS AS (
    -- Transcribe audio files
    SELECT
      RELATIVE_PATH,
      -- File name = second '/'-separated segment of the relative path
      SPLIT_PART(RELATIVE_PATH, '/', 2)::TEXT AS FILENAME,
      -- The text before the first '_' of the file name is cast to the review date.
      -- SPLIT_PART is 1-based (a part number of 0 is merely treated as 1),
      -- so request the first part explicitly.
      SPLIT_PART(FILENAME, '_', 1)::DATE AS REVIEW_DATE,
      -- Integer date key in YYYYMMDD form
      TO_NUMBER(TO_CHAR(REVIEW_DATE, 'YYYYMMDD')) AS DATE_KEY,
      TO_FILE(FILE_URL) AS AUDIO_FILE,
      AI_TRANSCRIBE(AUDIO_FILE) AS TRANSCRIPTION
    FROM
      -- Same unquoted stage reference as the directory listing cell above
      DIRECTORY(@AUDIO)
    WHERE
      -- Filter on audio files
      FL_GET_FILE_TYPE(TO_FILE(FILE_URL)) = 'audio'
)
SELECT
  DATE_KEY,
  RELATIVE_PATH,
  -- Unpack the fields of interest from the AI_TRANSCRIBE result object
  TRANSCRIPTION['text']::TEXT AS REVIEW_TEXT,
  TRANSCRIPTION['audio_duration']::FLOAT AS TRANSCRIPTION_DURATION
FROM
  TRANSCRIBED_CUSTOMER_REVIEWS;

SELECT * FROM _CUSTOMER_REVIEW_RAW_TRANSCRIPTS LIMIT 10;

## 3. Sentiment Analysis

Once we have transcribed the audio files, we use [AI_SENTIMENT](https://docs.snowflake.com/en/sql-reference/functions/ai_sentiment) to analyze the sentiment for different categories:
* Overall
* Brand
* Product
* Price
* Quality

This approach is called __Aspect-Based Sentiment Analysis__ and allows a more fine-grained analysis than simple sentiment analysis.
For example, a customer might say:
> _"I like the quality of the product but the price is too high"_  

The overall sentiment is mixed, but if we analyze the different aspects (price & quality) we understand that quality is perceived well by this customer while we need to improve on pricing.

___Note:___  
The dataset contains __more than 74,000 customer reviews__, totaling over __5 million tokens__.  
In order to save time (and credits), we only run the following code on a subset of the data to see how it works.  
The entire result set is provided as an existing table called `FACT_CUSTOMER_REVIEWS`.

In [None]:
-- Aspect-based sentiment analysis on a small sample of reviews.
-- Result: one row per review with one numeric sentiment column per aspect
-- (1 = positive, 0 = neutral/mixed, -1 = negative, NULL = unknown).
CREATE OR REPLACE TEMPORARY TABLE _SENTIMENTS_CUSTOMER_REVIEWS AS (
    -- Combine transcripts with existing reviews
    WITH COMBINED_DATA AS (
        SELECT DATE_KEY, REVIEW_TEXT FROM __CUSTOMER_REVIEWS
        UNION ALL BY NAME
        SELECT DATE_KEY, REVIEW_TEXT FROM _CUSTOMER_REVIEW_RAW_TRANSCRIPTS
        -- ORDER BY / LIMIT apply to the combined result of the UNION
        ORDER BY DATE_KEY
        -- Only use 10 rows for demo purposes
        LIMIT 10
    ),
    -- Sentiment analysis for 4 aspects; the pivot below also reads an
    -- 'overall' entry from the returned categories.
    -- The sample size is controlled only by the LIMIT in COMBINED_DATA; a second,
    -- unordered LIMIT here would pick an arbitrary subset of the sample.
    BASIC_ANALYSIS AS (
        SELECT
            DATE_KEY,
            REVIEW_TEXT,
            LENGTH(REVIEW_TEXT) AS REVIEW_LENGTH,
            AI_SENTIMENT(
              REVIEW_TEXT,
              ['brand', 'product', 'price', 'quality']
            )['categories'] AS SENTIMENT_CATEGORIES
        FROM COMBINED_DATA
    ),
    -- Flatten the returned JSON from AI_SENTIMENT into 1 row per category
    -- and decode the returned labels into numbers for easier aggregation
    FLATTENED_DATA AS (
        SELECT
            DATE_KEY,
            REVIEW_LENGTH,
            REVIEW_TEXT,
            category_entry.VALUE['name']::TEXT AS SENTIMENT_CATEGORY,
            DECODE(
              category_entry.VALUE['sentiment']::TEXT,
              'unknown', NULL,
              'positive', 1,
              'neutral', 0,
              'mixed', 0,
              'negative', -1
            ) AS SENTIMENT_VALUE
        FROM BASIC_ANALYSIS,
          LATERAL FLATTEN(SENTIMENT_CATEGORIES) category_entry
    )
    -- Pivot the category rows into one column per category.
    -- The pivot values are listed explicitly (instead of ANY) because the outer
    -- SELECT references each pivot column by name; a fixed list guarantees those
    -- columns exist regardless of which categories occur in the data.
    SELECT
        DATE_KEY,
        REVIEW_LENGTH,
        REVIEW_TEXT,
        pivoted_data."'overall'" AS SENTIMENT_OVERALL,
        pivoted_data."'brand'" AS SENTIMENT_BRAND,
        pivoted_data."'price'" AS SENTIMENT_PRICE,
        pivoted_data."'product'" AS SENTIMENT_PRODUCT,
        pivoted_data."'quality'" AS SENTIMENT_QUALITY
    FROM
        FLATTENED_DATA
    PIVOT (
        MAX(SENTIMENT_VALUE)
        FOR SENTIMENT_CATEGORY IN ('overall', 'brand', 'price', 'product', 'quality')
    ) AS pivoted_data
);

SELECT * FROM _SENTIMENTS_CUSTOMER_REVIEWS LIMIT 10;

## 4. Matching Reviews to Product Hierarchy via LLMs

As a next step, we want to match the reviews to our product hierarchy to analyze sentiments for specific product categories or even subcategories.  
An easy way of doing this is to use [AI_CLASSIFY](https://docs.snowflake.com/en/sql-reference/functions/ai_classify) which receives the REVIEW_TEXT and all potential Product Categories.

This will allow business users to ask questions like:
> ___"How did customers' sentiment regarding our product quality for coffee products improve in 2025 compared to 2024?"___  

In [None]:
-- Match each review to the product hierarchy in two AI_CLASSIFY steps
-- (level 1 category, then level 2 subcategory within that category) and
-- resolve the predicted names to their hierarchy IDs.
CREATE OR REPLACE TEMPORARY TABLE _SENTIMENTS_PRODUCTS_CUSTOMER_REVIEWS AS
-- Attach the list of all level-1 category names to each review
WITH REVIEWS_X_PRODUCT_CATEGORIES AS (
    SELECT
        *
    FROM
        -- NOTE(review): this double-underscore table is not created in this
        -- notebook (the cell above creates _SENTIMENTS_CUSTOMER_REVIEWS).
        -- Presumably it is a pre-provided table - confirm.
        __SENTIMENTS_CUSTOMER_REVIEWS
    CROSS JOIN
    (
      SELECT
        ARRAYAGG(DISTINCT CATEGORY_HIER_1_NAME) CATEGORIES
      FROM
        DIM_PRODUCT_HIERARCHY
    )
    -- Only use 10 rows for demo purposes
    LIMIT 10
),
-- Use AI_CLASSIFY to extract hierarchy level 1.
-- The classification is done in its own step so that it is evaluated per
-- review, before any join against the hierarchy multiplies the rows.
PRODUCT_CATEGORIES AS (
    SELECT
        * EXCLUDE CATEGORIES,
        AI_CLASSIFY(
            REVIEW_TEXT,
            CATEGORIES,
            {'output_mode': 'single'}
        )['labels'][0]::TEXT AS REVIEW_PRODUCT_CATEGORY
    FROM
        REVIEWS_X_PRODUCT_CATEGORIES
),
-- Attach the level-2 subcategory names of the predicted category to each review
REVIEWS_X_PRODUCT_SUBCATEGORIES AS (
    SELECT
        CLASSIFIED.*,
        SUBCATS.SUBCATEGORIES
    FROM
        PRODUCT_CATEGORIES CLASSIFIED
    JOIN
        (
          SELECT
            CATEGORY_HIER_1_NAME,
            ARRAYAGG(DISTINCT CATEGORY_HIER_2_NAME) SUBCATEGORIES
          FROM
            DIM_PRODUCT_HIERARCHY
          GROUP BY
            CATEGORY_HIER_1_NAME
        ) SUBCATS
      ON CLASSIFIED.REVIEW_PRODUCT_CATEGORY = SUBCATS.CATEGORY_HIER_1_NAME
),
-- Extract hierarchy level 2 using AI_CLASSIFY, guided by two labeled examples
PRODUCT_SUBCATEGORIES AS (
    SELECT
        * EXCLUDE SUBCATEGORIES,
        AI_CLASSIFY(
            REVIEW_TEXT,
            SUBCATEGORIES,
            {
              'output_mode': 'single',
              'examples':[
                {
                  'input': 'I love the Champagne Chocolate Truffles.',
                  'labels': ['Premium Chocolate'],
                  'explanation': 'The review mentions a chocolate that contains champagne as a premium ingredient'
                },
                {
                  'input': 'The Milk Chocolate Classic exceeded all my expectations',
                  'labels': ['Chocolate bars'],
                  'explanation': 'The review does not mention any premium ingredient for the chocolate.'
                }
              ]
            }
        )['labels'][0]::TEXT AS REVIEW_PRODUCT_SUBCATEGORY
    FROM
        REVIEWS_X_PRODUCT_SUBCATEGORIES
)
-- Read from the CTE chain above (not from a separate table), otherwise none of
-- the classification steps would contribute to the result.
SELECT
  REVIEWS.DATE_KEY,
  REVIEWS.REVIEW_LENGTH,
  REVIEWS.SENTIMENT_OVERALL,
  REVIEWS.SENTIMENT_BRAND,
  REVIEWS.SENTIMENT_PRICE,
  REVIEWS.SENTIMENT_PRODUCT,
  REVIEWS.SENTIMENT_QUALITY,
  PROD_HIER.CATEGORY_HIER_1_ID,
  PROD_HIER.CATEGORY_HIER_2_ID
FROM PRODUCT_SUBCATEGORIES REVIEWS
JOIN (
  -- One row per (level 1, level 2) pair so that a review is not duplicated
  -- should the hierarchy table hold several rows per subcategory.
  SELECT DISTINCT
    CATEGORY_HIER_1_ID,
    CATEGORY_HIER_1_NAME,
    CATEGORY_HIER_2_ID,
    CATEGORY_HIER_2_NAME
  FROM
    DIM_PRODUCT_HIERARCHY
) PROD_HIER
  ON REVIEWS.REVIEW_PRODUCT_CATEGORY = PROD_HIER.CATEGORY_HIER_1_NAME AND
     REVIEWS.REVIEW_PRODUCT_SUBCATEGORY = PROD_HIER.CATEGORY_HIER_2_NAME;

SELECT * FROM _SENTIMENTS_PRODUCTS_CUSTOMER_REVIEWS LIMIT 10;

In [None]:
-- Preview of the complete, pre-computed result of the SQL above (all 74K reviews)
SELECT
  *
FROM
  FACT_CUSTOMER_REVIEWS
LIMIT 10;