# Create Virtual Environment

It is a best practice to create and activate a new Virtual environment when working with lab samples.  This helps to not overwrite the global settings of your main Python install or overwrite packages across your projects.

NOTE: These notebooks were tested with Python 3.11.

# Install Libraries

Install the necessary libraries to your Python environment.  It is a best practice to create a virtual environment to run these samples.  NOTE: these take ~10-15 mins to install.

In [None]:
!pip install psycopg2
!pip install pgvector
!pip install transformers
!pip install sentence_transformers
!pip install scikit-image
!pip install azure-ai-translation-text
!pip install matplotlib
!pip install azureml-sdk
!pip install langchain
!pip install openai
!pip install tiktoken

Import any modules, functions and classes.

In [None]:
import psycopg2
from pgvector.psycopg2 import register_vector
import json
import skimage

# Test loading Sentence Transformers

Use the following cell to test if your install worked correctly.

In [None]:
from sentence_transformers import SentenceTransformer
# Instantiate the all-MiniLM-L6-v2 model once as a check that the install works.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Translate the data

Here you will use the Translation API to convert the source data into English and then save it to a file.

Update the translate api key and region to the one created in the deployment/setup.

In [None]:
import urllib.request
import os
import json

from azure.core.credentials import AzureKeyCredential   
from azure.ai.translation.text import TextTranslationClient, TranslatorCredential
from azure.ai.translation.text.models import InputTextItem

# Raw dataset file: written by download_metadata() and read by download_and_translate().
filename = 'metadata.json'
# Translated dataset file: loaded by the "Load the translated file" cell.
translated_filename = 'metadata_translated.json'

# Translator resource credentials; replace the placeholders with your own values.
translate_api_key  = 'YOUR_API_KEY'
translate_api_region = 'YOUR_REGION'

#if you would like to start from scratch, run this function
def download_and_translate():
    """Download the source metadata and translate its descriptions to English.

    The raw file is fetched by ``download_metadata`` (skipped when ``filename``
    already exists). Every entry of each item's ``descriptions`` list is passed
    through ``translate`` from 'de' to 'en', and the result is written as JSON
    to ``translated_filename``.

    Note: ``translate`` returns ``None`` when a request fails, so a failed
    description ends up as ``null`` in the output file.
    """
    #download the data...
    download_metadata('https://raw.githubusercontent.com/zalandoresearch/feidegger/master/data/FEIDEGGER_release_1.2.json')

    # Be explicit about UTF-8 so the source text is not decoded with a
    # platform-specific default encoding.
    with open(filename, encoding='utf-8') as json_file:
        data = json.load(json_file)

    #translate the data
    for item in data:
        item['descriptions'] = [
            translate(description, 'de', 'en')
            for description in item['descriptions']
        ]

    #save the translated data
    # Use the shared translated_filename so this writes the same file that the
    # load step reads.
    with open(translated_filename, 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile)
    
def download_metadata(url):
    """Fetch ``url`` into the local ``filename`` unless that file already exists."""
    if os.path.exists(filename):
        # A previous run already produced the file; do not download it again.
        return
    urllib.request.urlretrieve(url, filename)

def translate(text, source_language, target_language):
    """Translate ``text`` with the Translator client and return the result.

    Returns the text of the first translation of the first response item, or
    ``None`` when the response is empty or carries no translations. Any
    exception is printed and swallowed; the function then returns ``None``.
    """
    try:
        client = TextTranslationClient(
            credential = TranslatorCredential(translate_api_key, translate_api_region)
        )
        request_items = [ InputTextItem(text = text) ]
        response = client.translate(
            content = request_items,
            to = [target_language],
            from_parameter = source_language,
        )
        if not response:
            return None
        first_item = response[0]
        if not first_item:
            return None
        for candidate in first_item.translations:
            # Only the first translation is ever returned.
            return candidate.text
        return None
    except Exception as e:
        print(e)
        return None
            

# Translate the file

The translated file has already been provided as part of the repo. If you would like to translate to a different language (after changing the language codes passed to `translate`), you can uncomment the following and execute the method.

In [None]:
#download_and_translate()

# Load the translated file

In [None]:
# Read the translated metadata as UTF-8 explicitly instead of relying on the
# platform default encoding.
with open(translated_filename, encoding='utf-8') as json_file:
    data = json.load(json_file)

# Enable Extensions

- Browse to the Azure Portal
- Browse to the `pgsqldevSUFFIXflex16` Azure Database for PostgreSQL Flexible Server
- Under Settings, select **Server parameters**
- Search for **azure.extensions**
- Set the `azure.extensions` to enable `vector` and `azure_ai` and `azure_storage`
- Select **Save**

The following code will create a connection to the server and then register pgvector. If you get a connection error, be sure that a firewall rule has been added for your client IP. Be sure to replace the `SUFFIX`.

In [None]:
# Connection settings for the lab's PostgreSQL Flexible Server; replace SUFFIX in the host name.
dbhost = 'pgsqldevSUFFIXflex16.postgres.database.azure.com'
dbport = '5432'
dbuser = 'wsuser'
# NOTE(review): the password is hard-coded in the notebook — presumably the lab default; do not reuse it outside the lab.
dbpass = 'Solliance123'
dbname = 'products'

# connect_timeout=10 makes an unreachable server (e.g. missing firewall rule) fail fast instead of hanging.
dbconn = psycopg2.connect(host=dbhost, user=dbuser, password=dbpass,
    port=dbport, database=dbname , connect_timeout=10)
# Autocommit: every statement takes effect immediately, so no explicit commit() calls are needed
# (the later VACUUM statement also cannot run inside a transaction block).
dbconn.set_session(autocommit=True)

In [None]:
#This will take ~8-10mins to execute...
cur = dbconn.cursor()
# Enable the pgvector extension in this database if it is not enabled yet.
cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")

# Register the vector type on this connection so embeddings can be passed as query parameters.
register_vector(dbconn)

# Encode method

This method is used to encode a set of sentences using the Hugging Face transformers.

In [None]:
#https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
from functools import lru_cache


@lru_cache(maxsize=1)
def _load_sentence_model():
    """Build the all-MiniLM-L6-v2 SentenceTransformer once and reuse it afterwards."""
    from sentence_transformers import SentenceTransformer
    return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


def encode_sentence(sentences):
    """Encode ``sentences`` with the all-MiniLM-L6-v2 sentence-transformer model.

    Args:
        sentences: the value handed to ``model.encode``; the notebook passes
            either a single string or a list of description strings.

    Returns:
        Whatever ``model.encode`` returns for that input (the embeddings).

    The model is loaded on the first call only. Loading it on every call, as
    the notebook did before, repeats the expensive model construction for each
    record in the embedding loop.
    """
    return _load_sentence_model().encode(sentences)

# Generate Embeddings

This method wraps the call to the encoding method and returns the record's fields together with the embeddings of its descriptions.

In [None]:
def generate_embeddings(data):
    """Build the record that is stored for one dataset item.

    Copies ``url``, ``descriptions`` and ``split`` from ``data`` and adds
    ``descriptions_embeddings``, the result of ``encode_sentence`` applied to
    the item's descriptions.
    """
    return {
        'url': data['url'],
        'descriptions': data['descriptions'],
        'split': data['split'],
        'descriptions_embeddings': encode_sentence(data['descriptions']),
    }

# Create table

Create a products table for storing the information.  Notice the vector dimension of 384.

In [None]:
# Start from a clean slate so re-running the cell does not keep rows from an earlier run.
cur.execute("DROP TABLE IF EXISTS products;")

# One row per product; vector(384) is the dimension used for the sentence-transformer embeddings.
cur.execute("""CREATE TABLE IF NOT EXISTS products(
  id bigserial primary key,
  description text,
  url text,
  split int,
  descriptions_embeddings vector(384)
);""")

# Generate Embeddings

This code will generate embeddings from the descriptions and save the information into the products table.  This will take about 5 minutes to run across 250 records.

In [None]:
#take top 250 for now...
top_250_data = data[:250]

for x in top_250_data:
    r = generate_embeddings(x)
    # Only the first description of each item, and its embedding, are stored.
    cur.execute("INSERT INTO products (url, description, split, descriptions_embeddings) VALUES (%s, %s, %s, %s)", (r['url'], r['descriptions'][0], r['split'], r['descriptions_embeddings'][0]))
    
# Build an IVFFlat index for L2-distance searches once the rows are loaded.
# NOTE(review): lists = 100 for ~250 rows leaves only a few rows per list; pgvector's
# guidance is roughly rows / 1000 lists — confirm this value is intended.
cur.execute("""CREATE INDEX ON products
  USING ivfflat (descriptions_embeddings vector_l2_ops) WITH (lists = 100);""")

# Clean up and refresh the statistics of the freshly loaded table.
cur.execute("VACUUM ANALYZE products;")

cur.close()

# Test your Data

The following code will encode a query describing a product, then use the resulting embedding to search the products table and its stored embeddings for the closest matches.  This will be looking for anything with the terms `red`, `sleeveless`, `summer` and `wear`.

In [None]:
import numpy as np
from skimage import io
import matplotlib.pyplot as plt
import requests

cur = dbconn.cursor()

# Embed the query text with the same encoder that produced the stored embeddings.
result = encode_sentence("red sleeveless summer wear")

# `<->` orders rows by L2 distance to the query vector; keep the two nearest.
cur.execute("""SELECT id, url, description, descriptions_embeddings
  FROM products
  ORDER BY descriptions_embeddings <-> %s limit 2;""",
  (np.array(result),))

r = cur.fetchall()
plt.rcParams["figure.figsize"] = [7.50, 3.50]
plt.rcParams["figure.autolayout"] = True

for x in r:
    # Drop the query string so the bare image URL is requested.
    url = x[1].split('?')[0]
    print("Product Item Id: " + str(x[0]))
    # io.imread fetches the image from the URL itself, so the separate
    # requests.get() download (whose result was never used) is not needed.
    a = io.imread(url)
    plt.imshow(a)
    plt.axis('off')
    plt.show()

cur.close()

# Azure AI Studio

In addition to using the built in transformers via Python, you can also call models that are deployed in Azure.  In this example you will deploy an AI model in Azure Machine Learning and then call it from python. Note that this model is a text generation model and not an embedding model.

## Create Workspace

- Open the Azure Portal, browse to your lab resource group
- In the top search textbox, search for **Azure AI Studio**
- Select the **postgres** Azure AI resource
- Select the **Launch Azure AI Studio** link
- Select **New AI Project**, then select the **postgres** resource
- Select **Create an AI Project**
- Under **Components**, select **Deployments**
- Select **Create->Real-time endpoint**
- Select the **gpt2** model and then select **Confirm**
- Enter the following:
  - Virtual Machine : Select the smallest available image
  - Instance count **2**
- Select **Deploy**, it may take a couple minutes for the deployment to complete. You will know the deployment is complete when the **Provisioning state** changes to **Succeeded**
- Select the **Consume** tab
- Copy the **REST endpoint** and **primary key**
- Run the following cells

## Set your API Key and URL

In [None]:
#Enter the REST endpoint and primary key copied from the deployment's Consume tab
uri = 'YOUR_API_URL'
api_key = 'YOUR_API_KEY'

## Test your model endpoint

You will need to wait until your deployment is provisioned, this can take 10-15 minutes.

In [57]:
import requests
import json

# Text to send to the deployed model.
# Use a dedicated name here: binding this dict to `data` would overwrite the
# dataset loaded from the translated file, and the later `data[:250]` slice in
# the embedding cells would then fail.
payload = {"input_data": "red sleeveless summer wear"}

# Call the endpoint with the key as a bearer token and decode the JSON reply.
res = requests.post(uri, headers={'Content-Type':'application/json', 'Authorization':('Bearer '+ api_key)}, json=payload).json()

print(res)

{'message': 'An unexpected error occurred in scoring script. Check the logs for more info.'}


## Azure OpenAI Studio

## GPT Text Embeddings

Now that you have deployed a simple model via Azure Machine Learning Studio, let's look at redo-ing our embeddings using a different embedding model called `text-embedding-ada-002`.

This model is not deployed via Azure Machine Learning Studio, but is a part of Azure OpenAI.

- Open the Azure Portal
- Search for **Azure Open AI**
- Under **Resource Management**, select **Keys and Endpoint**
- Record the endpoint and the key
- Under **Resource Management**, select **Model deployments**
- Select **Manage Deployments**
- Select **Create new deployment**
- Select the **text-embedding-ada-002** model
- For the deployment name, type **embeddings**
- Select **Create**
- Once the model is deployed, run the following cells to regenerate your embeddings. Be sure to replace the endpoint and key with the ones you just recorded.

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.azure_openai import AzureOpenAIEmbeddings

#get the openai embeddings
embedding_model = "embeddings" #this is the name of the model deployment in azure open ai (not the type of model)
azure_endpoint = "YOUR_API_URL"  #https://your-name.openai.azure.com/
azure_key = 'YOUR_API_KEY'  

# Client used by the encode_openai_* helpers below to request embeddings from the deployment.
embeddings = AzureOpenAIEmbeddings(
                deployment=embedding_model,
                openai_api_base=azure_endpoint,
                openai_api_key=azure_key,
                openai_api_type="azure",
            )

## Reconfigure the methods

Change the methods to get the embeddings from Azure Open AI and the text-embedding-ada-002 model.

In [None]:
def encode_openai_sentence(sentence):
    """Return the embedding of one sentence from the Azure OpenAI deployment."""
    # embed_documents works on a list, so wrap the sentence and unwrap the single result.
    vectors = embeddings.embed_documents([sentence])
    return vectors[0]

def encode_openai_sentences(sentences):
    """Return the embeddings of ``sentences`` from the Azure OpenAI deployment."""
    vectors = embeddings.embed_documents(sentences)
    return vectors

def generate_openai_embeddings(data):
    """Build the record that is stored for one dataset item, using Azure OpenAI.

    Copies ``url``, ``descriptions`` and ``split`` from ``data`` and adds
    ``descriptions_embeddings``, the result of ``encode_openai_sentences``
    applied to the item's descriptions.
    """
    return {
        'url': data['url'],
        'descriptions': data['descriptions'],
        'split': data['split'],
        'descriptions_embeddings': encode_openai_sentences(data['descriptions']),
    }

## Clear the database

In [None]:
#reset the embeddings table to 1536 dimensions (the hugging face model was only 384)
cur = dbconn.cursor()
cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")

# Dropping the table also discards the rows stored with the previous embeddings.
cur.execute("DROP TABLE IF EXISTS products;")

# Same columns as before; only the vector dimension changes to 1536.
cur.execute("""CREATE TABLE IF NOT EXISTS products(
  id bigserial primary key,
  description text,
  url text,
  split int,
  descriptions_embeddings vector(1536)
);""")

## Regenerate the embeddings

The next set of code will overwrite the older Hugging Face embeddings with the newer Azure OpenAI embeddings.

In [None]:
#take top 250 for now...
top_250_data = data[:250]

#open a cursor for the inserts
cur = dbconn.cursor()

for x in top_250_data:
    r = generate_openai_embeddings(x)
    # Only the first description of each item, and its embedding, are stored.
    cur.execute("INSERT INTO products (url, description, split, descriptions_embeddings) VALUES (%s, %s, %s, %s)", (r['url'], r['descriptions'][0], r['split'], r['descriptions_embeddings'][0]))

# NOTE(review): unlike the first load, no vector index is created after these inserts — confirm that is intended.
cur.close()

## Test your embeddings

Re-run the query to see if you get similar results.

In [None]:
#encode the sentence
result = encode_openai_sentence("red sleeveless summer wear")

cur = dbconn.cursor()

#do a search...
# `<->` orders rows by L2 distance to the query vector; keep the two nearest.
cur.execute("""SELECT id, url, description, descriptions_embeddings
  FROM products
  ORDER BY descriptions_embeddings <-> %s limit 2;""",
  (np.array(result),))

r = cur.fetchall()
plt.rcParams["figure.figsize"] = [7.50, 3.50]
plt.rcParams["figure.autolayout"] = True

for x in r:
    # Drop the query string so the bare image URL is requested.
    url = x[1].split('?')[0]
    print("Product Item Id: " + str(x[0]))
    # io.imread fetches the image from the URL itself, so the separate
    # requests.get() download (whose result was never used) is not needed.
    a = io.imread(url)
    plt.imshow(a)
    plt.axis('off')
    plt.show()

cur.close()