In [None]:
%pip install --upgrade pip
%pip install openai
%pip install azure-cosmos

In [5]:
import os
from openai import AzureOpenAI
from azure.cosmos import CosmosClient, PartitionKey

# Set up environment variables for Azure OpenAI
oai_client = AzureOpenAI(
    api_key = os.getenv('AZURE_OPENAI_API_KEY'), 
    api_version = os.getenv('AZURE_OPENAI_API_VERSION'),
    azure_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT')
)

# Set up environment variables for Azure OpenAI Embeddings
emb_client = AzureOpenAI(
    api_key = os.getenv('AZURE_OPENAI_API_KEY'),  
    api_version = os.getenv('AZURE_OPENAI_API_VERSION'),
    azure_endpoint = os.getenv('AZURE_OPENAI_EMBEDDINGS_ENDPOINT')
)

# Set up environment variables for Azure Cosmos DB
url = os.getenv('COSMOS_DB_ENDPOINT')
key = os.getenv('COSMOS_DB_KEY')
db_client = CosmosClient(url, credential=key)

In [6]:
# Create database if it doesn't exist
database = db_client.create_database_if_not_exists(id='interview-assistant')

# Create container if it doesn't exist
container = database.create_container_if_not_exists(
    id='chunks',
    partition_key=PartitionKey(path='/source')
)

In [12]:
import pandas as pd

# Specify the paths to text files
data_paths = ['../data/company.txt', '../data/job_description.txt', '../data/interview_tips.txt', '../data/resume.txt']

# Read the text files and create a DataFrame
rows = []

for path in data_paths:
    with open(path, 'r', encoding='utf-8') as file:
        file_content = file.read()
        rows.append({'path': path, 'text': file_content})

df = pd.DataFrame(rows)

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,path,text
0,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...
1,../data/job_description.txt,Job Title: Data Analytics & AI\nArtificial Int...
2,../data/interview_tips.txt,"# General Interview Tips\n\n- Be concise, clea..."
3,../data/resume.txt,﻿Mahmoud Jahanshahi | https://mahmoudjahanshah...


In [19]:
import re

# Function to split text into chunks based on word count
def split_text_with_overlap(text, max_words=100, min_words=40, overlap=20):
    """
    Splits text into sentence-based chunks with overlapping word windows.
    
    Args:
        text (str): The input text.
        max_words (int): Max words per chunk.
        min_words (int): Min words per chunk (ignored for final chunk).
        overlap (int): Number of words to overlap between chunks.
        
    Returns:
        list: List of overlapping sentence-based text chunks.
    """

    # Split by sentence boundaries: punctuation and newlines
    sentence_endings = re.compile(r'(?<=[.!?])\s+|\n+')
    sentences = sentence_endings.split(text.strip())

    chunks = []
    current_chunk = []
    word_count = 0

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        words_in_sentence = sentence.split()
        current_chunk.extend(words_in_sentence)
        word_count += len(words_in_sentence)

        if word_count >= max_words:
            # Finalize current chunk
            chunk_text = ' '.join(current_chunk)
            chunks.append(chunk_text)

            # Create overlap for next chunk
            current_chunk = current_chunk[-overlap:]
            word_count = len(current_chunk)

    # Handle final chunk
    if current_chunk:
        if len(current_chunk) >= min_words or not chunks:
            chunks.append(' '.join(current_chunk))
        else:
            # Append to previous if too small
            chunks[-1] += ' ' + ' '.join(current_chunk)

    return chunks

In [20]:
# Split the text in the DataFrame into chunks
splitted_df = df.copy()
splitted_df['chunks'] = splitted_df['text'].apply(lambda x: split_text_with_overlap(x))

splitted_df.head()

Unnamed: 0,path,text,chunks
0,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,[# Avanade website: https://www.avanade.com/ W...
1,../data/job_description.txt,Job Title: Data Analytics & AI\nArtificial Int...,[Job Title: Data Analytics & AI Artificial Int...
2,../data/interview_tips.txt,"# General Interview Tips\n\n- Be concise, clea...","[# General Interview Tips - Be concise, clear,..."
3,../data/resume.txt,﻿Mahmoud Jahanshahi | https://mahmoudjahanshah...,[﻿Mahmoud Jahanshahi | https://mahmoudjahansha...


In [None]:
# Flatten the DataFrame to have one chunk per row
flattened_df = splitted_df.explode('chunks')

flattened_df.head()

Unnamed: 0,path,text,chunks
0,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,# Avanade website: https://www.avanade.com/ We...
0,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,only limit is what we can imagine together.​ -...
0,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,Build the right foundation​: Modernize your cl...
0,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,customers at risk.​ The Avanade story We are p...
0,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,than any other Microsoft partner Our partnersh...


In [24]:
# Function to create embeddings for text chunks
def create_embeddings(text, model=os.getenv("AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT")):
    # Create embeddings for each document chunk
    embeddings = emb_client.embeddings.create(input = text, model=model).data[0].embedding
    return embeddings

In [25]:
# create embeddings for the whole data chunks and store them in a list
embeddings = []
for chunk in flattened_df['chunks']:
    embeddings.append(create_embeddings(chunk))

# store the embeddings in the dataframe
flattened_df['embeddings'] = embeddings

flattened_df.head()

Unnamed: 0,path,text,chunks,embeddings
0,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,# Avanade website: https://www.avanade.com/ We...,"[0.00120907137170434, -0.029357684776186943, 0..."
0,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,only limit is what we can imagine together.​ -...,"[-0.01146011520177126, -0.02632567286491394, 0..."
0,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,Build the right foundation​: Modernize your cl...,"[-0.007249513640999794, -0.03372425585985184, ..."
0,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,customers at risk.​ The Avanade story We are p...,"[-0.0033569142688065767, -0.03979252651333809,..."
0,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,than any other Microsoft partner Our partnersh...,"[-0.0029829819686710835, -0.03445209935307503,..."


In [26]:
from sklearn.neighbors import NearestNeighbors

# Create the search index
nbrs = NearestNeighbors(n_neighbors=8, algorithm='ball_tree').fit(embeddings)

# To query the index, you can use the kneighbors method
distances, indices = nbrs.kneighbors(embeddings)

# Store the indices and distances in the DataFrame
flattened_df['indices'] = indices.tolist()
flattened_df['distances'] = distances.tolist()

flattened_df.head()

Unnamed: 0,path,text,chunks,embeddings,indices,distances
0,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,# Avanade website: https://www.avanade.com/ We...,"[0.00120907137170434, -0.029357684776186943, 0...","[0, 5, 4, 3, 12, 14, 2, 13]","[0.0, 0.4071333121535178, 0.43251759677292095,..."
0,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,only limit is what we can imagine together.​ -...,"[-0.01146011520177126, -0.02632567286491394, 0...","[1, 2, 5, 0, 3, 4, 13, 14]","[0.0, 0.4306872446037053, 0.49495046231599626,..."
0,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,Build the right foundation​: Modernize your cl...,"[-0.007249513640999794, -0.03372425585985184, ...","[2, 1, 5, 9, 0, 3, 7, 13]","[0.0, 0.4306872446037053, 0.4624938364504999, ..."
0,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,customers at risk.​ The Avanade story We are p...,"[-0.0033569142688065767, -0.03979252651333809,...","[3, 4, 5, 0, 12, 13, 2, 1]","[0.0, 0.3564433860068678, 0.415974198320961, 0..."
0,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,than any other Microsoft partner Our partnersh...,"[-0.0029829819686710835, -0.03445209935307503,...","[4, 3, 5, 0, 12, 13, 14, 1]","[0.0, 0.3564433860068678, 0.42238356793682935,..."


In [29]:
# text question
question = "whay are you a good fit for this job?"

# Convert the question to a query vector
query_vector = create_embeddings(question)

# Find the most similar documents
distances, indices = nbrs.kneighbors([query_vector], n_neighbors=5)

# Loop through the top 3 results
for i in range(5):
    idx = indices[0][i]
    print(f"\n--- Result {i+1} ---")
    print("Chunk Text:\n", flattened_df['chunks'].iloc[idx])
    print("Source Path:", flattened_df['path'].iloc[idx])
    print("Distance:", distances[0][i])


--- Result 1 ---
Chunk Text:
 # General Interview Tips - Be concise, clear, and structured in all your answers. - Use the STAR method (Situation, Task, Action, Result) for behavioral questions. - Practice common openers like "Tell me about yourself" and "Why do you want this role?" - Align your answers with the company’s mission, culture, and the job description. # STAR Method Example Q: Tell me about a time you solved a difficult problem. A: "At my previous job, we encountered X (Situation). My task was Y (Task). I did A, B, and C (Action). As a result, Z happened (Result)." # Technical Interview Tips
Source Path: ../data/interview_tips.txt
Distance: 0.6526668044130024

--- Result 2 ---
Chunk Text:
 the latest in AI/ML? - Describe a time when you had to explain a complex concept to a non-technical stakeholder. - Why do you want to work at Avanade? # Final Tips - Prepare 1–2 smart questions to ask at the end (e.g., “How does Avanade define success in this role?”). - Practice answers a