In [None]:
# Install the packages this notebook needs via the %pip magic: upgrade pip itself,
# then install the OpenAI SDK and the Azure Cosmos DB SDK.
# NOTE(review): later cells also import pandas, numpy and scikit-learn, which are
# not installed here — presumably they come from the base environment; confirm.
%pip install --upgrade pip
%pip install openai
%pip install azure-cosmos

In [1]:
import os
from openai import AzureOpenAI
from azure.cosmos import CosmosClient, PartitionKey

# Azure OpenAI client used for chat completions.
# Key, API version and endpoint are all read from environment variables.
oai_client = AzureOpenAI(
    azure_endpoint=os.environ.get('AZURE_OPENAI_ENDPOINT'),
    api_version=os.environ.get('AZURE_OPENAI_API_VERSION'),
    api_key=os.environ.get('AZURE_OPENAI_API_KEY'),
)

# Azure OpenAI client used for embeddings.
# Same key and API version as above, but pointed at the embeddings endpoint.
emb_client = AzureOpenAI(
    azure_endpoint=os.environ.get('AZURE_OPENAI_EMBEDDINGS_ENDPOINT'),
    api_version=os.environ.get('AZURE_OPENAI_API_VERSION'),
    api_key=os.environ.get('AZURE_OPENAI_API_KEY'),
)

# Azure Cosmos DB client, built from the account endpoint and key in the environment.
url = os.environ.get('COSMOS_DB_ENDPOINT')
key = os.environ.get('COSMOS_DB_KEY')
db_client = CosmosClient(url, credential=key)

In [2]:
# Identifiers of the Cosmos DB resources used by this notebook
DATABASE_ID = 'interview-assistant'
CONTAINER_ID = 'chunks'

# Get the database, creating it first when it is not there yet
database = db_client.create_database_if_not_exists(id=DATABASE_ID)

# Get the container (creating it when missing), partitioned on the '/source' path
chunk_partition_key = PartitionKey(path='/source')
container = database.create_container_if_not_exists(
    id=CONTAINER_ID,
    partition_key=chunk_partition_key,
)

In [3]:
import pandas as pd

# Specify the paths to text files
data_paths = [
    '../data/company.txt',
    '../data/job_description.txt',
    '../data/interview_tips.txt',
    '../data/resume.txt',
]

# Read the text files and create a DataFrame with one row per file
# (columns: 'path' and 'text').
rows = []

for path in data_paths:
    # 'utf-8-sig' decodes plain UTF-8 exactly like 'utf-8' but drops a leading
    # byte-order mark. With plain 'utf-8' the BOM at the start of a file stays in
    # the text as U+FEFF and would end up in the first chunk and its embedding.
    with open(path, 'r', encoding='utf-8-sig') as file:
        file_content = file.read()
    rows.append({'path': path, 'text': file_content})

df = pd.DataFrame(rows)

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,path,text
0,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...
1,../data/job_description.txt,Job Title: Data Analytics & AI\nArtificial Int...
2,../data/interview_tips.txt,"# General Interview Tips\n\n- Be concise, clea..."
3,../data/resume.txt,﻿Mahmoud Jahanshahi | https://mahmoudjahanshah...


In [4]:
import re

# Function to split text into chunks based on word count
def split_text_with_overlap(text, max_words=100, min_words=40, overlap=20):
    """
    Splits text into sentence-based chunks with overlapping word windows.

    Sentences are never cut: whole sentences are added to the current chunk and
    the chunk is closed as soon as it holds at least ``max_words`` words, so a
    chunk can be longer than ``max_words``. The last ``overlap`` words of a
    closed chunk are repeated at the start of the next one.

    Args:
        text (str): The input text.
        max_words (int): Word count at which the current chunk is closed.
        min_words (int): Minimum size of the final chunk; a smaller remainder
            is appended to the previous chunk instead (if there is one).
        overlap (int): Number of words to overlap between chunks. Values
            <= 0 disable the overlap. Should be smaller than ``max_words``.

    Returns:
        list: List of overlapping sentence-based text chunks (strings with
        words joined by single spaces). Empty when the text has no words.
    """

    # Split by sentence boundaries: whitespace following . ! or ?, or any run of newlines
    sentence_endings = re.compile(r'(?<=[.!?])\s+|\n+')
    sentences = sentence_endings.split(text.strip())

    chunks = []
    current_chunk = []  # words of the chunk being built: carried overlap + new words
    carried = 0         # number of leading words in current_chunk that are overlap

    for sentence in sentences:
        words_in_sentence = sentence.split()
        if not words_in_sentence:
            continue

        current_chunk.extend(words_in_sentence)

        if len(current_chunk) >= max_words:
            # Finalize current chunk
            chunks.append(' '.join(current_chunk))

            # Create overlap for next chunk. The explicit guard matters:
            # current_chunk[-0:] is the WHOLE list, not an empty one.
            current_chunk = current_chunk[-overlap:] if overlap > 0 else []
            carried = len(current_chunk)

    # Handle final chunk. Only words added after the last finalized chunk count
    # as a remainder; a buffer holding nothing but carried overlap is already
    # fully contained in the previous chunk and must not be emitted again.
    new_words = current_chunk[carried:]
    if new_words:
        if len(current_chunk) >= min_words or not chunks:
            chunks.append(' '.join(current_chunk))
        else:
            # Too small to stand alone: append to the previous chunk, skipping the
            # overlap words because that chunk already ends with them.
            chunks[-1] += ' ' + ' '.join(new_words)

    return chunks

In [5]:
# Chunk every document with the splitter's default settings; each entry of
# chunk_lists is the list of chunks for one row of df.
chunk_lists = df['text'].apply(split_text_with_overlap)

# Work on a copy so df itself keeps only its original columns
splitted_df = df.copy()
splitted_df['chunks'] = chunk_lists

splitted_df.head()

Unnamed: 0,path,text,chunks
0,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,[# Avanade website: https://www.avanade.com/ W...
1,../data/job_description.txt,Job Title: Data Analytics & AI\nArtificial Int...,[Job Title: Data Analytics & AI Artificial Int...
2,../data/interview_tips.txt,"# General Interview Tips\n\n- Be concise, clea...","[# General Interview Tips - Be concise, clear,..."
3,../data/resume.txt,﻿Mahmoud Jahanshahi | https://mahmoudjahanshah...,[﻿Mahmoud Jahanshahi | https://mahmoudjahansha...


In [6]:
# One row per chunk: expand the list-valued 'chunks' column, then renumber the
# rows from 0 so positions and index labels agree.
flattened_df = splitted_df.explode('chunks').reset_index(drop=True)

flattened_df.head()

Unnamed: 0,path,text,chunks
0,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,# Avanade website: https://www.avanade.com/ We...
1,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,only limit is what we can imagine together.​ -...
2,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,Build the right foundation​: Modernize your cl...
3,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,customers at risk.​ The Avanade story We are p...
4,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,than any other Microsoft partner Our partnersh...


In [7]:
# Function to create embeddings for text chunks
def create_embeddings(text, model=os.getenv("AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT")):
    """
    Request an embedding for ``text`` from the embeddings client.

    Args:
        text: The input passed to the embeddings endpoint.
        model: Deployment name to use. The default is read from the
            AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT environment variable once, when
            this function is defined, not on every call.

    Returns:
        The ``embedding`` of the first item in the response data.
    """
    response = emb_client.embeddings.create(input=text, model=model)
    first_item = response.data[0]
    return first_item.embedding

In [8]:
# Embed every chunk, one create_embeddings call per row, keeping row order
embeddings = [create_embeddings(chunk_text) for chunk_text in flattened_df['chunks']]

# Attach the vectors to the DataFrame as a new 'embeddings' column
flattened_df['embeddings'] = embeddings

flattened_df.head()

Unnamed: 0,path,text,chunks,embeddings
0,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,# Avanade website: https://www.avanade.com/ We...,"[0.00120907137170434, -0.029357684776186943, 0..."
1,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,only limit is what we can imagine together.​ -...,"[-0.01146011520177126, -0.02632567286491394, 0..."
2,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,Build the right foundation​: Modernize your cl...,"[-0.007249513640999794, -0.03372425585985184, ..."
3,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,customers at risk.​ The Avanade story We are p...,"[-0.0033569142688065767, -0.03979252651333809,..."
4,../data/company.txt,# Avanade\nwebsite: https://www.avanade.com/\n...,than any other Microsoft partner Our partnersh...,"[-0.0029829819686710835, -0.03445209935307503,..."


In [9]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Stack the per-chunk embedding vectors into one 2-D matrix (one row per chunk)
embedding_rows = flattened_df['embeddings'].to_numpy()
X = np.vstack(embedding_rows)

# Fit the nearest-neighbour index on that matrix; 8 neighbours is its default query size
nbrs = NearestNeighbors(n_neighbors=8, algorithm='auto')
nbrs.fit(X)

In [13]:
# Function to handle user input and generate a response
def chatbot(user_input, neighbors=8):
    """
    Handles user input, retrieves relevant documents, and generates a response using Azure OpenAI.

    The question is embedded, the nearest chunks are looked up in the
    nearest-neighbour index, and those chunks are sent as context together
    with the question to the chat-completions deployment named by the
    AZURE_OPENAI_DEPLOYMENT environment variable.

    Args:
        user_input (str): The user's question or input.
        neighbors (int): Number of nearest chunks to retrieve as context.
    Returns:
        str: The generated response from the AI assistant, stripped of
        surrounding whitespace (empty string if the model returned no text).
    """
    # Convert the question to a query vector
    query_vector = create_embeddings(user_input)

    # Find the most similar documents; one row of indices per query, nearest first
    _, indices = nbrs.kneighbors([query_vector], n_neighbors=neighbors)

    # Keep the nearest-first ranking. A set would discard it; dict.fromkeys
    # still drops any repeated index but preserves first-seen order.
    ranked_indices = list(dict.fromkeys(int(i) for i in indices[0]))

    # Retrieve text chunks, most relevant first
    context_chunks = [flattened_df['chunks'].iloc[i] for i in ranked_indices]

    # Combine context and user question
    context_text = "\n\n".join(context_chunks)
    prompt = f"Context:\n{context_text}\n\nQuestion: {user_input}"

    # Create message payload
    messages = [
        {"role": "system", "content": "You are an AI assistant that helps answering job interview questions."},
        {"role": "user", "content": prompt}
    ]

    # use chat completion to generate a response
    response = oai_client.chat.completions.create(
        model=os.getenv('AZURE_OPENAI_DEPLOYMENT'),
        temperature=0.7,
        max_tokens=200,
        messages=messages
    )

    # message.content may be None; avoid an AttributeError on .strip() in that case
    content = response.choices[0].message.content
    return (content or "").strip()

In [14]:
# Example question, answered using only the 3 nearest chunks as context
user_input = "why are you a good fit for this job?"
chatbot(user_input, neighbors=3)

"In considering my suitability for this role at Avanade, I would highlight three key aspects: my technical expertise in AI/ML, my proven communication and leadership skills, and my alignment with Avanade's values and mission.\n\nFirstly, my technical background is directly aligned with the requirements of this position. I have extensive experience in building solutions using large language models (LLMs), which includes prompt engineering, semantic search, and the orchestration of multi-agent workflows. This expertise allows me to not only understand but also innovate on the forefront of AI technology, ensuring that the solutions I develop are both cutting-edge and practical.\n\nSecondly, my ability to communicate complex concepts effectively to both technical and non-technical stakeholders has been demonstrated on several occasions. For example, in a previous project, I was tasked with explaining the implications and potential of AI-driven analytics to a group of senior stakeholders wh