In [5]:
import pandas as pd
from langchain_google_genai import GoogleGenerativeAIEmbeddings

import os

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Source CSV for the assignment dataset, kept in a named constant so the
# location is easy to spot and change.
DATASET_URL = "https://raw.githubusercontent.com/Bluedata-Consulting/GAAPB01-training-code-base/refs/heads/main/Assignments/assignment2dataset.csv"
df = pd.read_csv(DATASET_URL)

In [7]:
# Peek at the first two rows to check which columns were loaded.
df.head(2)

Unnamed: 0,course_id,title,description
0,C001,Foundations of Machine Learning,Understand foundational machine learning algor...
1,C002,Deep Learning with TensorFlow and Keras,Explore neural network architectures using Ten...


In [8]:
import chromadb
chroma_client = chromadb.Client()
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# once a collection with this name already exists on the client.
collection = chroma_client.get_or_create_collection(name="madhan_assignment_2")
# Embedding model shared by the indexing cell and the query function below.
embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

In [10]:


# Prepare data for ChromaDB
ids = df['course_id'].tolist()
documents = df['description'].tolist()  # ChromaDB stores the original text as documents

# Embed all descriptions through the batched document-side API instead of
# issuing one embed_query request per row; embed_query is meant for the
# search query itself (see get_course_recommendation).
description_embeddings = embeddings.embed_documents(documents)

# Metadata carries the course id and title alongside each stored description,
# so a query hit can be mapped back to a course without touching the DataFrame.
metadatas = df[['course_id', 'title']].to_dict('records')

# Add data to collection
collection.add(
    embeddings=description_embeddings,
    documents=documents,
    metadatas=metadatas,
    ids=ids
)

print(f"Successfully added {len(df)} documents to ChromaDB collection")
print(f"Collection count: {collection.count()}")

Successfully added 25 documents to ChromaDB collection
Collection count: 25


In [20]:
from google import genai
from pydantic import BaseModel
import json

# Define the output schema
class CourseRecommendation(BaseModel):
    """Structured answer requested from the model.

    The two fields are parallel lists: the prompt asks for the ids and the
    titles of the recommended courses. Item types are declared so the
    generated JSON schema states that each entry is a string.
    """

    # Ids of the recommended courses (e.g. "C014").
    course_id: list[str]
    # Titles of the recommended courses.
    title: list[str]

def get_course_recommendation(user_query: str, project_id: str, location: str,
                              n_results: int = 10, model: str = 'gemini-2.0-flash'):
    """Recommend courses for a free-text user query.

    The query is embedded, the nearest course descriptions are fetched from
    the Chroma collection, and the model is asked to pick the most suitable
    ones from those candidates.

    Args:
        user_query: The learner's request in natural language.
        project_id: Google Cloud project used for the Vertex AI client.
        location: Google Cloud region used for the Vertex AI client.
        n_results: How many candidate courses to retrieve from the collection.
        model: Name of the generative model that makes the final selection.

    Returns:
        The parsed JSON response as a dict with the keys ``course_id`` and
        ``title`` (both lists). When the vector search yields no candidates,
        both lists are empty and the model is not called.
    """
    vector = embeddings.embed_query(user_query)
    query_results = collection.query(
        query_embeddings=[vector],
        n_results=n_results
    )

    # The result holds one list per query embedding; only one was sent, hence [0].
    # Built exactly once (the previous loop rebuilt the same list per hit and
    # left the name undefined when there were no hits).
    course_data = [
        {
            "course_id": meta['course_id'],
            "title": meta['title'],
            "description": doc
        }
        for meta, doc in zip(query_results['metadatas'][0], query_results['documents'][0])
    ]

    # Nothing retrieved: there is nothing for the model to choose from.
    if not course_data:
        return {"course_id": [], "title": []}

    client = genai.Client(vertexai=True, project=project_id, location=location)
    
    # Create the prompt
    prompt = f"""
    Based on the user query and available course data, recommend the most  suitable courses.
    
    User Query: {user_query}
    
    Available Courses:
    {json.dumps(course_data, indent=2)}
    
    Return one or more course_ids and title of the most recommended courses in the following JSON format:
    {{"course_id": [], "title": []}}
    
    Do not include any other text or explanation.
    """
    
    response = client.models.generate_content(
        model=model,
        contents=[prompt],
        config={
            "response_schema": CourseRecommendation.model_json_schema(),
            "response_mime_type" : "application/json"
        }
    )
    
    # Parse and return the structured response
    return json.loads(response.text)

# Example usage
if __name__ == "__main__":
    # Project and region can both be overridden from the environment; the
    # defaults are the values this notebook was originally run with.
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "bdc-trainings")
    LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

    # Sample learner requests used to exercise the recommender.
    user_queries = [
        "I’ve completed the ‘Python Programming for Data Science’ course and enjoy data visualization. What should I take next?",
        "I know Azure basics and want to manage containers and build CI/CD pipelines. Recommend courses.",
        "My background is in ML fundamentals; I’d like to specialize in neural networks and production workflows.",
        "I want to learn to build and deploy microservices with Kubernetes—what courses fit best?",
        "I’m interested in blockchain and smart contracts but have no prior experience. Which courses do you suggest?",
    ]

    # user_query = input()
    for user_query in user_queries:
        result = get_course_recommendation(user_query, PROJECT_ID, LOCATION)
        print()
        print("Your query : ")
        print(user_query)
        print("Recommended course for you : ")
        # .get keeps the demo running if the response lacks one of the keys.
        print(result.get('course_id', []))
        print(result.get('title', []))
        print("********************")
        print()
    # Both values are lists, e.g. for the first query:
    # {"course_id": ["C014", ...], "title": ["Data Visualization with Tableau", ...]}


Your query : 
I’ve completed the ‘Python Programming for Data Science’ course and enjoy data visualization. What should I take next?
Recommended course for you : 
['C014', 'C015', 'C017']
['Data Visualization with Tableau', 'Business Intelligence with Power BI', 'R Programming and Statistical Analysis']
********************


Your query : 
I know Azure basics and want to manage containers and build CI/CD pipelines. Recommend courses.
Recommended course for you : 
['C009', 'C008']
['Containerization with Docker and Kubernetes', 'DevOps Practices and CI/CD']
********************


Your query : 
My background is in ML fundamentals; I’d like to specialize in neural networks and production workflows.
Recommended course for you : 
['C002', 'C025']
['Deep Learning with TensorFlow and Keras', 'MLOps: Productionizing Machine Learning']
********************


Your query : 
I want to learn to build and deploy microservices with Kubernetes—what courses fit best?
Recommended course for you : 
['C0

1321324
