# Load environment variables and keys 

In [1]:
from dotenv import dotenv_values
# specify the name of the .env file name 
env_name = "example.env"
config = dotenv_values(env_name)

# Upload data to SQL DB

## Connect to database

In [2]:
import pyodbc

# Define Azure SQL database connection details
server = config["server"] 
database = config["database"] 
username = config["username"] 
password = config["password"] 
driver = '{ODBC Driver 18 for SQL Server}'

# Create a connection string
conn_str = f"DRIVER={driver};SERVER={server};DATABASE={database};UID={username};PWD={password}"

# Establish a connection to the Azure SQL database
conn = pyodbc.connect(conn_str, autocommit=True)
cursor = conn.cursor()

## Create a table in the database

We will create a new table "foodreview" and load the data from the csv file.

In [3]:
table_name = "foodreview" 

# Drop previous table of same name if one exists
cursor.execute(f"DROP TABLE IF EXISTS {table_name};")
print("Finished dropping table (if existed)")

# Create a table
cursor.execute(f"CREATE TABLE {table_name} (Id int NOT NULL, CONSTRAINT PK_{table_name}_Id PRIMARY KEY CLUSTERED (Id), ProductId text, UserId text, ProfileName text, HelpfulnessNumerator integer, HelpfulnessDenominator integer, Score integer, Time bigint, Summary text, Text text);")
print("Finished creating table")

# Create a index
cursor.execute(f"CREATE INDEX idx_Id ON {table_name}(Id);")
print("Finished creating index")

Finished dropping table (if existed)
Finished creating table
Finished creating index


## Enable change tracking

This allows the us to automatically update the index when changes are made to the data.

In [4]:
try:
    cursor.execute(f"ALTER DATABASE {database} SET CHANGE_TRACKING = ON (CHANGE_RETENTION = 2 DAYS, AUTO_CLEANUP = ON)")
except Exception as e:
    print(e)

('42000', "[42000] [Microsoft][ODBC Driver 18 for SQL Server][SQL Server]Change tracking is already enabled for database 'test_vector_notebook'. (5088) (SQLExecDirectW); [42000] [Microsoft][ODBC Driver 18 for SQL Server][SQL Server]ALTER DATABASE statement failed. (5069)")


In [5]:
try:
    cursor.execute(f"ALTER TABLE {table_name} ENABLE CHANGE_TRACKING WITH (TRACK_COLUMNS_UPDATED = ON)")
except Exception as e:
    print(e)

## Upload data

In [6]:
##Load Data
import numpy as np
import pandas as pd
df_all = pd.read_csv('../../DataSet/Reviews_small.csv')

# Split data into 'initial' and 'extra' data
df_initial = df_all[:50].copy()
df_extra = df_all[50:].copy().reset_index(drop = True)

In [7]:
df = df_initial 

# Specify the batch size
batch_size = 30

# Split the dataframe into batches
batches = [df[i:i + batch_size] for i in range(0, len(df), batch_size)]

#Iterate over each batch and insert the data into the database
for batch in batches:
    # Convert the batch dataframe to a list of tuples for bulk insertion
    rows = [tuple(row) for row in batch.itertuples(index=False)]
    
    # Define the SQL query for bulk insertion
    query = f"INSERT INTO {table_name} (Id, ProductId, UserId, ProfileName, HelpfulnessNumerator, HelpfulnessDenominator, Score, Time, Summary, Text) \
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
    cursor.executemany(query, rows)

## Example query

In [8]:
# Assuming you have already established a connection and have a cursor object

# Execute the SELECT statement
try:
    cursor.execute(f"SELECT count(Id) FROM {table_name};")
    rows = cursor.fetchall()
    for row in rows:
        print(row)
except (Exception, Error) as e:
    print(f"Error executing SELECT statement: {e}")

(50, )


## Commit changes

In [9]:
cursor.commit()
cursor.close()

# Set up data source connection in Cog Search

## Import needed CogSearch functions

In [10]:
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient  
from azure.search.documents.models import (
    QueryAnswerType,
    QueryCaptionType,
    QueryLanguage,
    QueryType,
    RawVectorQuery,
    VectorizableTextQuery,
    VectorFilterMode,    
)
from azure.search.documents.indexes.models import (  
    AzureOpenAIEmbeddingSkill,  
    AzureOpenAIParameters,  
    AzureOpenAIVectorizer,  
    ExhaustiveKnnParameters,  
    ExhaustiveKnnVectorSearchAlgorithmConfiguration,
    FieldMapping,  
    HnswParameters,  
    HnswVectorSearchAlgorithmConfiguration,  
    IndexProjectionMode,  
    InputFieldMappingEntry,  
    OutputFieldMappingEntry,  
    PrioritizedFields,    
    SearchField,  
    SearchFieldDataType,  
    SearchIndex,  
    SearchIndexer,  
    SearchIndexerDataContainer,  
    SearchIndexerDataSourceConnection,  
    SearchIndexerIndexProjectionSelector,  
    SearchIndexerIndexProjections,  
    SearchIndexerIndexProjectionsParameters,  
    SearchIndexerSkillset,  
    SemanticConfiguration,  
    SemanticField,  
    SemanticSettings,  
    SplitSkill,  
    VectorSearch,  
    VectorSearchAlgorithmKind,  
    VectorSearchAlgorithmMetric,  
    VectorSearchProfile,  
)  

## Create data source connection

In [11]:
key = config["cogsearch_api_key"]
cogsearch_credential = AzureKeyCredential(key)
service_endpoint = config["cogsearch_endpoint"]
index_name = config["cogsearch_index_name"]

ds_conn_str = f'Encrypt=True;TrustServerCertificate=False;Connection Timeout=30;Server=tcp:{server};Database={database};User ID={username};Password={password};'

In [12]:
ds_client = SearchIndexerClient(service_endpoint, cogsearch_credential)
container = SearchIndexerDataContainer(name=table_name)
data_source_connection = SearchIndexerDataSourceConnection(
    name=f"{index_name}-azuresql-connection",
    type="azuresql",
    connection_string=ds_conn_str,
    container=container
)
data_source = ds_client.create_or_update_data_source_connection(data_source_connection)

print(f"Data source '{data_source.name}' created or updated")

Data source 'amazon-review-jordan-v1-azuresql-connection' created or updated


# Set up automatic indexing + vectorization

## Create index 

In [13]:
import openai
openai.api_type = config["openai_api_type"] #"azure"
openai.api_key = config['openai_api_key']
openai.api_base = config['openai_api_base'] #"https://synapseml-openai.openai.azure.com/"
openai.api_version = config['openai_api_version'] 
openai_deployment = config["openai_deployment_embedding"]
EMBEDDING_LENGTH = 1536

In [14]:
# Create a search index
index_client = SearchIndexClient(
    endpoint=service_endpoint, credential=cogsearch_credential)

fields = [
    SearchField(name="Id", type=SearchFieldDataType.String, key=True,
                sortable=True, filterable=True, facetable=True, analyzer_name="keyword"),
    SearchField(name="parent_id", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True),
    SearchField(name="chunk", type=SearchFieldDataType.String, sortable=False, filterable=False, facetable=False),
    SearchField(name="vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), 
                vector_search_dimensions=EMBEDDING_LENGTH, vector_search_profile="my-vector-search-profile")
]

# Configure the vector search configuration  
vector_search = VectorSearch(
    algorithms=[
        HnswVectorSearchAlgorithmConfiguration(
            name="my-hnsw-config",
            kind=VectorSearchAlgorithmKind.HNSW
        )
    ],
    profiles=[
        VectorSearchProfile(
            name="my-vector-search-profile",
            algorithm="my-hnsw-config",
            vectorizer="my-openai"
        )
    ],
    vectorizers=[
        AzureOpenAIVectorizer(
            name="my-openai",
            kind="azureOpenAI",
            azure_open_ai_parameters=AzureOpenAIParameters(
                resource_uri=openai.api_base,
                deployment_id=openai_deployment,
                api_key=openai.api_key
            )
        )  
    ]  
)

semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=PrioritizedFields(
        prioritized_content_fields=[SemanticField(field_name="Id")]
    )
)

# Create the semantic settings with the configuration
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Create the search index with the semantic settings
index = SearchIndex(name=index_name, fields=fields,
                    vector_search=vector_search, semantic_settings=semantic_settings)
result = index_client.create_or_update_index(index)
print(f'{result.name} created')


amazon-review-jordan-v1 created


## Create skillset

In [15]:
# Create a skillset  
skillset_name = f"{index_name}-skillset"  

split_skill = SplitSkill(  
    description="Split skill to chunk documents",  
    text_split_mode="pages",  
    context="/document",  
    maximum_page_length=300,  
    page_overlap_length=20,  
    inputs=[  
        InputFieldMappingEntry(name="text", source="/document/Text"),  
    ],  
    outputs=[  
        OutputFieldMappingEntry(name="textItems", target_name="pages")  
    ]  
)

embedding_skill = AzureOpenAIEmbeddingSkill(  
    description="Skill to generate embeddings via Azure OpenAI",  
    context="/document/pages/*",  
    resource_uri=openai.api_base,  
    deployment_id=openai_deployment,  
    api_key=openai.api_key,  
    inputs=[  
        InputFieldMappingEntry(name="text", source="/document/pages/*"),  
    ],  
    outputs=[  
        OutputFieldMappingEntry(name="embedding", target_name="vector")  
    ]  
)  

index_projections = SearchIndexerIndexProjections(  
    selectors=[  
        SearchIndexerIndexProjectionSelector(  
            target_index_name=index_name,  
            parent_key_field_name="parent_id",  
            source_context="/document/pages/*",  
            mappings=[  
                InputFieldMappingEntry(name="chunk", source="/document/pages/*"),
                InputFieldMappingEntry(name="vector", source="/document/pages/*/vector")
            ],  
        ),  
    ],  
    parameters=SearchIndexerIndexProjectionsParameters(  
        projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS  
    ),  
)  

skillset = SearchIndexerSkillset(  
    name=skillset_name,  
    description="Skillset to chunk documents and generating embeddings",  
    skills=[split_skill, embedding_skill],
    index_projections=index_projections  
)
  
client = SearchIndexerClient(service_endpoint, cogsearch_credential)  
client.create_or_update_skillset(skillset)  
print(f' {skillset.name} created')

 amazon-review-jordan-v1-skillset created


## Create indexer

TODO: Check connection; make timeout logic

In [16]:
# Create an indexer  
indexer_name = f"{index_name}-indexer"  

indexer = SearchIndexer(  
    name=indexer_name,  
    description="Indexer to generate embeddings",  
    skillset_name=skillset_name,  
    target_index_name=index_name,  
    data_source_name=data_source.name#,
    # output_field_mappings=[FieldMapping(source_field_name="document/vector", target_field_name="vector")]
    # output_field_mappings=[FieldMapping(source_field_name="document/pages", target_field_name="chunk")]
)  
  
indexer_client = SearchIndexerClient(service_endpoint, cogsearch_credential)
indexer_result = indexer_client.create_or_update_indexer(indexer)  
  
# Run the indexer  
indexer_client.run_indexer(indexer_name)
print(f' {indexer_name} created')

 amazon-review-jordan-v1-indexer created


In [17]:
# Get the status of the indexer  
indexer_status = indexer_client.get_indexer_status(indexer_name)
print(f"Indexer status: {indexer_status.status}")

Indexer status: running


# Perform queries

In [23]:
# Pure Vector Search
query = "Canned dog food"  
  
search_client = SearchClient(service_endpoint, index_name, credential=cogsearch_credential)
vector_query = VectorizableTextQuery(text=query, k=3, fields="vector", exhaustive=True)
# Use the below query to pass in the raw vector query instead of the query vectorization
# vector_query = RawVectorQuery(vector=generate_embeddings(query), k=3, fields="vector")
  
results = search_client.search(
    search_text=None,  
    vector_queries= [vector_query],
    select=["Id", "vector"],
    top=3
)  

for result in results:  
    print(result)


{'vector': [-0.00321785337, -5.506248e-05, -0.0167448167, -0.005976489, -0.006968133, 0.03263775, 0.000584005436, -0.04781191, -0.0271005146, -0.0291237347, -0.006292617, 0.0368173, -0.02382609, -0.0139761986, -0.0171707571, 0.00674850727, 0.04097023, 0.0024491623, -0.005254385, -0.00312800636, -0.02923022, 0.002094766, -0.000687995, -0.0109147457, -0.0115603125, 0.002612218, 0.0154004395, -0.0112741329, 0.00121875771, 0.0166516416, 0.0250639822, 0.0114005841, -0.0229076538, -0.004655405, 0.00492161838, -0.00314630847, 0.00756711327, -0.009949722, 0.0214567911, -0.0239325762, -0.004425796, 0.007347487, 0.0187813491, -0.020352006, -0.01981958, 0.0144420713, -0.0200458616, 0.00085105066, -0.00241089426, -0.0006351683, 0.0458951741, 0.0109679876, 0.000351484749, -0.005763518, 0.0143222753, -0.006449017, 0.01920729, -0.0105553577, 0.0153338863, -0.0161857679, 0.00637913635, 0.008279233, -0.0066187284, -0.005976489, 0.00358389644, -0.027526455, -0.0124587826, 0.00340753025, 0.0128581021, 0.