In [23]:
## Requirements:
!pip install azure-search-documents --pre
!pip install pydantic==1.10.11

In [2]:
# Import required libraries  
import os
import re
import pandas as pd
import json  
import openai  
from dotenv import load_dotenv
from dotenv import dotenv_values
from tenacity import retry, wait_random_exponential, stop_after_attempt  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import Vector  
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    HnswVectorSearchAlgorithmConfiguration
)
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
# Notebook-level settings; edit these before running the cells below.
folder_path = "../DATA/transcripts" ## Step 1: folder whose files are listed and read
chunk_size = 5000 ## Step 2: maximum chunk size, in characters
chunk_overlap  = 100 ## Step 2: overlap passed to the text splitter
env_name = "../../llm.env" # path of the .env file holding the Azure OpenAI / Cognitive Search settings; change to your own

In [4]:
# Load all service settings from the .env file into a plain mapping.
config = dotenv_values(env_name)

# Fail fast with one readable message instead of a bare KeyError on the first
# missing entry (or a late failure at the first embedding call). Keys that are
# absent, empty, or declared without a value are all reported together.
required_keys = [
    "OPENAI_API_TYPE",
    "OPENAI_API_KEY",
    "OPENAI_API_BASE",
    "OPENAI_API_VERSION",
    "OPENAI_DEPLOYMENT_EMBEDDING",  # read later by createEmbeddings
    "COGSEARCH_NAME",
    "COGSEARCH_INDEX_NAME",
    "COGSEARCH_API_KEY",
]
missing_keys = [name for name in required_keys if not config.get(name)]
if missing_keys:
    raise KeyError(f"Missing settings in {env_name!r}: {', '.join(missing_keys)}")

#Azure OpenAI
openai.api_type = config["OPENAI_API_TYPE"] #"azure"
openai.api_key = config['OPENAI_API_KEY']
openai.api_base = config['OPENAI_API_BASE']
openai.api_version = config['OPENAI_API_VERSION']

## Cog Search
cogsearch_name = config["COGSEARCH_NAME"]  # name of the Cognitive Search service
index_name = config["COGSEARCH_INDEX_NAME"]  # must only contain lowercase, numbers, and dashes
key = config["COGSEARCH_API_KEY"]  # admin key (needed to create the index and upload documents)
service_endpoint = f"https://{cogsearch_name}.search.windows.net"

credential = AzureKeyCredential(key)

## Step 1: Read all files from a folder

In [5]:
## Read file
def read_file(file_path, encoding="utf-8"):
    """Return the full text content of ``file_path``.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The file content as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()

# Parse "<company>, Q<n> <yyyy>....txt" filenames into their parts
def extractCompanyDetailsFromFilename(filename):
    """Split a transcript filename into ``[company_name, year, quarter]``.

    The filename must look like ``"<company>, Q<digit> <4-digit year>...txt"``.
    The parsed list is printed on success; on failure a "No match found."
    message is printed and a list of three empty strings is returned.
    """
    parsed = re.match(r"^(.*?), (Q\d) (\d{4}).*\.txt$", filename)

    # Guard clause: anything that does not follow the naming scheme
    if parsed is None:
        print("No match found.", filename)
        return ["", "", ""]

    company, quarter_label, fiscal_year = parsed.groups()
    # Returned order is name, year, quarter (year and quarter swap places
    # relative to their order in the filename).
    details = [company, fiscal_year, quarter_label]
    print(details)
    return details

# Pull the ticker symbol out of the transcript text
def extractTicker(text):
    """Return the word following the first ``"Ticker:"`` label in ``text``.

    Returns ``None`` (after printing "Ticker not found") when the label is
    absent. Only word characters are captured.
    """
    found = re.search(r"Ticker:\s+(\w+)", text)
    if not found:
        print("Ticker not found")
        return None
    return found.group(1)


## Main Code
# Keep only regular *.txt files: os.listdir also returns sub-directories and
# stray files (e.g. checkpoints, OS metadata) that read_file cannot handle.
# Sorting makes the row order, and therefore the row-index-based document ids
# used at upload time, stable from one run to the next.
file_names = sorted(
    name
    for name in os.listdir(folder_path)
    if name.endswith(".txt") and os.path.isfile(os.path.join(folder_path, name))
)

# Column-oriented accumulator for the per-file metadata
data = {"filename":[],
        "Company Name":[],
        "Year":[],
        "Quarter":[]}

for filename in file_names:
    company_name, year, quarter = extractCompanyDetailsFromFilename(filename)
    data["filename"].append(os.path.join(folder_path, filename))
    data["Company Name"].append(company_name)
    data["Year"].append(year)
    data["Quarter"].append(quarter)

df = pd.DataFrame(data)

# Read each transcript into a new 'FileContents' column
df['FileContents'] = df['filename'].apply(read_file)

# Extract the ticker from each transcript into a new 'Ticker' column
df['Ticker'] = df['FileContents'].apply(extractTicker)


['Advance Auto Parts, Inc.', '2023', 'Q1']
['Target Corporation', '2021', 'Q4']
['The Kroger Co.', '2020', 'Q4']


In [6]:
# Preview the first rows of the per-file table (one row per transcript file)
df.head()

Unnamed: 0,filename,Company Name,Year,Quarter,FileContents,Ticker
0,"../DATA/transcripts/Advance Auto Parts, Inc., ...","Advance Auto Parts, Inc.",2023,Q1,"Company: Advance Auto Parts, Inc.\nTicker: AAP...",AAP
1,"../DATA/transcripts/Target Corporation, Q4 202...",Target Corporation,2021,Q4,Company: Target Corporation\nTicker: TGT\nISIN...,TGT
2,"../DATA/transcripts/The Kroger Co., Q4 2020.txt",The Kroger Co.,2020,Q4,Company: The Kroger Co.\nTicker: KR\nISIN: US5...,KR


## Step 2: Chunk FileContent

In [7]:


def createchunks(text, chunk_size = 25000, chunk_overlap  = 100):
    """Split ``text`` into a list of chunk strings.

    Args:
        text: The text to split.
        chunk_size: Chunk size handed to the splitter; measured with ``len``,
            i.e. in characters.
        chunk_overlap: Overlap handed to the splitter, in the same unit.

    Returns:
        A list with the ``page_content`` of every document the splitter
        produced, in order.
    """
    # No `separators` argument is passed, so the splitter's own defaults apply.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    return [document.page_content for document in splitter.create_documents([text])]

# Split every transcript into chunks: one list of strings per file in 'FileContentsChunked'
df['FileContentsChunked'] = df['FileContents'].apply(
    lambda content: createchunks(content, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
)

# One row per chunk. A file that yields an empty chunk list explodes into a single
# NaN placeholder row; such a row cannot be embedded or indexed, so it is dropped
# and the index is rebuilt to stay contiguous.
exploded_df = (
    df.explode('FileContentsChunked', ignore_index=True)
      .dropna(subset=['FileContentsChunked'])
      .reset_index(drop=True)
)

# 1-based position of each chunk within its source file
exploded_df['ChunkNumber'] = exploded_df.groupby(['filename', 'Company Name', 'Year','Quarter']).cumcount() + 1

exploded_df

Unnamed: 0,filename,Company Name,Year,Quarter,FileContents,Ticker,FileContentsChunked,ChunkNumber
0,"../DATA/transcripts/Advance Auto Parts, Inc., ...","Advance Auto Parts, Inc.",2023,Q1,"Company: Advance Auto Parts, Inc.\nTicker: AAP...",AAP,"Company: Advance Auto Parts, Inc.\nTicker: AAP...",1
1,"../DATA/transcripts/Advance Auto Parts, Inc., ...","Advance Auto Parts, Inc.",2023,Q1,"Company: Advance Auto Parts, Inc.\nTicker: AAP...",AAP,This work helped us refine price targets for e...,2
2,"../DATA/transcripts/Advance Auto Parts, Inc., ...","Advance Auto Parts, Inc.",2023,Q1,"Company: Advance Auto Parts, Inc.\nTicker: AAP...",AAP,"With that, I'll now turn the call over to Jeff...",3
3,"../DATA/transcripts/Advance Auto Parts, Inc., ...","Advance Auto Parts, Inc.",2023,Q1,"Company: Advance Auto Parts, Inc.\nTicker: AAP...",AAP,"With that, let's open it up for questions. Op...",4
4,"../DATA/transcripts/Advance Auto Parts, Inc., ...","Advance Auto Parts, Inc.",2023,Q1,"Company: Advance Auto Parts, Inc.\nTicker: AAP...",AAP,The challenge is twofold. We're not getting en...,5
5,"../DATA/transcripts/Advance Auto Parts, Inc., ...","Advance Auto Parts, Inc.",2023,Q1,"Company: Advance Auto Parts, Inc.\nTicker: AAP...",AAP,So they established those targets for each cat...,6
6,"../DATA/transcripts/Advance Auto Parts, Inc., ...","Advance Auto Parts, Inc.",2023,Q1,"Company: Advance Auto Parts, Inc.\nTicker: AAP...",AAP,Most of the progress we've made so far has com...,7
7,"../DATA/transcripts/Advance Auto Parts, Inc., ...","Advance Auto Parts, Inc.",2023,Q1,"Company: Advance Auto Parts, Inc.\nTicker: AAP...",AAP,to think about it.<br>The next question comes ...,8
8,"../DATA/transcripts/Advance Auto Parts, Inc., ...","Advance Auto Parts, Inc.",2023,Q1,"Company: Advance Auto Parts, Inc.\nTicker: AAP...",AAP,I think I've got very strong resilient team he...,9
9,"../DATA/transcripts/Advance Auto Parts, Inc., ...","Advance Auto Parts, Inc.",2023,Q1,"Company: Advance Auto Parts, Inc.\nTicker: AAP...",AAP,We're going to continue to look for ways to dr...,10


## Step 3: Create Embeddings

In [8]:
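## Optional: uncomment the decorator below to retry failed embedding calls with randomized exponential backoff (up to 6 attempts); lower min/max to make the retries faster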
# @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def createEmbeddings(text):
    """Return the embedding of ``text`` from the configured embedding deployment.

    Sends ``text`` to ``openai.Embedding.create`` using the deployment named by
    ``config["OPENAI_DEPLOYMENT_EMBEDDING"]`` and returns the ``'embedding'``
    entry of the first item in the response's ``'data'`` list.
    """
    deployment = config["OPENAI_DEPLOYMENT_EMBEDDING"]
    api_response = openai.Embedding.create(input=text, engine=deployment)
    first_item = api_response['data'][0]
    return first_item['embedding']

In [9]:
# Embed every chunk (one createEmbeddings call per row) and store the vector in a new 'embeddings' column
exploded_df['embeddings'] = exploded_df['FileContentsChunked'].apply(createEmbeddings)

exploded_df.head()


Unnamed: 0,filename,Company Name,Year,Quarter,FileContents,Ticker,FileContentsChunked,ChunkNumber,embeddings
0,"../DATA/transcripts/Advance Auto Parts, Inc., ...","Advance Auto Parts, Inc.",2023,Q1,"Company: Advance Auto Parts, Inc.\nTicker: AAP...",AAP,"Company: Advance Auto Parts, Inc.\nTicker: AAP...",1,"[-0.004789761733263731, -0.01100434735417366, ..."
1,"../DATA/transcripts/Advance Auto Parts, Inc., ...","Advance Auto Parts, Inc.",2023,Q1,"Company: Advance Auto Parts, Inc.\nTicker: AAP...",AAP,This work helped us refine price targets for e...,2,"[-0.019902214407920837, -0.0029587557073682547..."
2,"../DATA/transcripts/Advance Auto Parts, Inc., ...","Advance Auto Parts, Inc.",2023,Q1,"Company: Advance Auto Parts, Inc.\nTicker: AAP...",AAP,"With that, I'll now turn the call over to Jeff...",3,"[-0.0056537180207669735, -0.00646374374628067,..."
3,"../DATA/transcripts/Advance Auto Parts, Inc., ...","Advance Auto Parts, Inc.",2023,Q1,"Company: Advance Auto Parts, Inc.\nTicker: AAP...",AAP,"With that, let's open it up for questions. Op...",4,"[-0.007415952626615763, -0.01084926351904869, ..."
4,"../DATA/transcripts/Advance Auto Parts, Inc., ...","Advance Auto Parts, Inc.",2023,Q1,"Company: Advance Auto Parts, Inc.\nTicker: AAP...",AAP,The challenge is twofold. We're not getting en...,5,"[-0.00788076687604189, -0.009041925892233849, ..."


## Step 4: Store the embeddings in Azure Cognitive Search Vector Store

[Azure Cognitive Search](https://learn.microsoft.com/en-us/azure/search/search-what-is-azure-search) provides a simple interface to create a vector database and to store and retrieve data using vector search. You can read more about Vector Search [here](https://github.com/Azure/cognitive-search-vector-pr/tree/main).

There are two steps to store data in AzureCogSearch vector database:
- First, we create the index (or schema) of the vector database
- Second, we add the chunked documents and their embeddings to the vector datastore

In [10]:
# Create a search index
# Client used for index-management calls against the search service endpoint.
index_client = SearchIndexClient(
    endpoint=service_endpoint, credential=credential)
# Index schema. "id" is the document key; every metadata field is declared as a
# String (the upload code sends Year/Quarter as text and str()-converts ChunkNumber).
# All metadata fields except FileContentsChunked are filterable.
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
    SearchableField(name="CompanyName", type=SearchFieldDataType.String, filterable=True),
    SearchableField(name="Year", type=SearchFieldDataType.String, filterable=True),
    SearchableField(name="Quarter", type=SearchFieldDataType.String, filterable=True),
    SearchableField(name="Ticker", type=SearchFieldDataType.String, filterable=True),
    SearchableField(name="FileContentsChunked", type=SearchFieldDataType.String, searchable=True),
    SearchableField(name="ChunkNumber", type=SearchFieldDataType.String, filterable=True),
    # Vector field: a collection of single-precision floats tied to the
    # "my-vector-config" algorithm configuration defined below.
    # NOTE(review): 1536 presumably matches the output size of the embedding
    # deployment used by createEmbeddings; confirm if the model changes.
    SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_configuration="my-vector-config"),
]

# Vector search setup: a single HNSW configuration using the cosine metric.
# Its name must match the vector_search_configuration referenced by contentVector.
vector_search = VectorSearch(
    algorithm_configurations=[
        HnswVectorSearchAlgorithmConfiguration(
            name="my-vector-config",
            kind="hnsw",
            parameters={
                "m": 4,
                "efConstruction": 400,
                "efSearch": 500,
                "metric": "cosine"
            }
        )
    ]
)

# Semantic configuration: Ticker is the title field and the chunk text is the
# only prioritized content field; no keyword fields are configured.
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=PrioritizedFields(
        title_field=SemanticField(field_name="Ticker"),
#         prioritized_keywords_fields=[SemanticField(field_name="category")],
        prioritized_content_fields=[SemanticField(field_name="FileContentsChunked")]
    )
)

# Create the semantic settings with the configuration
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Create the search index with the semantic settings
index = SearchIndex(name=index_name, fields=fields,
                    vector_search=vector_search, semantic_settings=semantic_settings)
# create_or_update_index is used, so re-running this cell targets the same index
# name again; the message below says "created" in either case.
result = index_client.create_or_update_index(index)
print(f' {result.name} created')


 example_v1 created


In [11]:
## Upload data to Index
def batch_append_payload(df, search_client, batch_size=1000):
    """Upload every row of ``df`` to Cognitive Search in batches.

    Each row becomes one document whose ``id`` is the row's index label
    (as a string) and whose ``contentVector`` is the row's ``embeddings`` value.

    Args:
        df: DataFrame with the columns "Company Name", "Year", "Quarter",
            "Ticker", "FileContentsChunked", "ChunkNumber" and "embeddings".
        search_client: Object exposing ``upload_documents(documents)``.
        batch_size: Maximum number of documents per upload call
            (note: max 1000 rows per insertion).

    Returns:
        The list of all documents built from ``df``, in row order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    documents = [
        {
            "id": str(index),
            "CompanyName": row["Company Name"],
            "Year": row["Year"],
            "Quarter": row["Quarter"],
            "Ticker": row["Ticker"],
            "FileContentsChunked": row["FileContentsChunked"],
            "ChunkNumber": str(row["ChunkNumber"]),
            "contentVector": row['embeddings'],
        }
        for index, row in df.iterrows()
    ]

    # Stepping through slices means no upload call is ever made with an empty
    # batch (empty frame, or a row count that is an exact multiple of batch_size).
    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]
        results = search_client.upload_documents(batch)

        # The service reports success per document; surface failures instead of
        # silently counting them as uploaded.
        failed_keys = [
            getattr(item, "key", None)
            for item in (results or [])
            if not getattr(item, "succeeded", True)
        ]
        if failed_keys:
            print(f"Failed to index {len(failed_keys)} document(s): {failed_keys}")
        print(f"Uploaded {len(batch) - len(failed_keys)} payload")

    return documents


# Client bound to the index named by index_name; used here to upload documents.
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
# Send every chunk row of exploded_df (text, metadata and embedding) to the index.
payload = batch_append_payload(exploded_df, search_client)
 
# print(f"Uploaded {len(payload)} payload") 


payload of size 51
Uploaded 51 payload


## Search Types 1: Pure Vector Search

In [12]:
# Pure vector search: no search text and no filter, only the embedded query
query = "Advanced Auto parts earning call for year 2022 for Quarter 2"

search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
vector = Vector(value=createEmbeddings(query), k=2, fields="contentVector")

results = search_client.search(search_text=None, vectors=[vector])

# Show only the first hit
for result in results:
    print(result['Ticker'], result['Quarter'], result['Year'], sep="\n")
    break

AAP
Q1
2023


## Search Types 2: Pure Filter

In [13]:
# Filter-only query: no search text and no vector, just the OData filter
results = search_client.search(
    search_text=None,
    filter="(Ticker eq 'AAP') and (Year eq '2023') and (Quarter eq 'Q1') ",
)

# Print the identifying fields of every hit, separated by a blank line
for result in results:
    for field_name in ("Ticker", "Quarter", "Year", "ChunkNumber"):
        print(f"{field_name}: {result[field_name]}")
    print()

Ticker: AAP
Quarter: Q1
Year: 2023
ChunkNumber: 1

Ticker: AAP
Quarter: Q1
Year: 2023
ChunkNumber: 4

Ticker: AAP
Quarter: Q1
Year: 2023
ChunkNumber: 10

Ticker: AAP
Quarter: Q1
Year: 2023
ChunkNumber: 3

Ticker: AAP
Quarter: Q1
Year: 2023
ChunkNumber: 5

Ticker: AAP
Quarter: Q1
Year: 2023
ChunkNumber: 6

Ticker: AAP
Quarter: Q1
Year: 2023
ChunkNumber: 12

Ticker: AAP
Quarter: Q1
Year: 2023
ChunkNumber: 8

Ticker: AAP
Quarter: Q1
Year: 2023
ChunkNumber: 7

Ticker: AAP
Quarter: Q1
Year: 2023
ChunkNumber: 9

Ticker: AAP
Quarter: Q1
Year: 2023
ChunkNumber: 2

Ticker: AAP
Quarter: Q1
Year: 2023
ChunkNumber: 11



## Search Types 3: Vector Search with filters

In [14]:
# Vector search restricted by an OData filter (no search text)
query = "What are the KPIs?"

search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
vector = Vector(value=createEmbeddings(query), k=5, fields="contentVector")

results = search_client.search(
    search_text=None,
    vectors=[vector],
    filter="(Ticker eq 'AAP') and (Year eq '2023') and (Quarter eq 'Q1') ",
)

# Print the identifying fields of every hit, separated by a blank line
for result in results:
    for field_name in ("Ticker", "Quarter", "Year", "ChunkNumber"):
        print(f"{field_name}: {result[field_name]}")
    print()

Ticker: AAP
Quarter: Q1
Year: 2023
ChunkNumber: 6

Ticker: AAP
Quarter: Q1
Year: 2023
ChunkNumber: 10

Ticker: AAP
Quarter: Q1
Year: 2023
ChunkNumber: 1

Ticker: AAP
Quarter: Q1
Year: 2023
ChunkNumber: 5

Ticker: AAP
Quarter: Q1
Year: 2023
ChunkNumber: 7



## Search Types 4: Hybrid Search with filters

In [15]:
# Hybrid search with filter: the query is sent both as search text and as a vector
query = "What are the KPIs?"

search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
vector = Vector(value=createEmbeddings(query), k=5, fields="contentVector")

results = search_client.search(
    search_text=query,
    vectors=[vector],
    filter="(Ticker eq 'AAP') and (Year eq '2023') and (Quarter eq 'Q1') ",
    top=3,
)

# Print the identifying fields of every hit, separated by a blank line
for result in results:
    for field_name in ("Ticker", "Quarter", "Year", "ChunkNumber"):
        print(f"{field_name}: {result[field_name]}")
    print()

Ticker: AAP
Quarter: Q1
Year: 2023
ChunkNumber: 1

Ticker: AAP
Quarter: Q1
Year: 2023
ChunkNumber: 10

Ticker: AAP
Quarter: Q1
Year: 2023
ChunkNumber: 6

