In [None]:
#!python -m pip install pypdf sentence-transformers faiss-cpu chromadb langchain_chroma --quiet



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pandas as pd
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#load embedding model
embedding_model_name = "models/gemini-embedding-001"
model_name = "gemini-2.0-flash"

In [3]:
#read data
df=pd.read_csv("https://raw.githubusercontent.com/Bluedata-Consulting/GAAPB01-training-code-base/refs/heads/main/Assignments/assignment2dataset.csv")

#concat text
df["full_text"]=df["title"]+"++"+df["description"]

df.head(2)


Unnamed: 0,course_id,title,description,full_text
0,C001,Foundations of Machine Learning,Understand foundational machine learning algor...,Foundations of Machine Learning++Understand fo...
1,C002,Deep Learning with TensorFlow and Keras,Explore neural network architectures using Ten...,Deep Learning with TensorFlow and Keras++Explo...


In [4]:
#generate embeddings

from langchain_google_genai import GoogleGenerativeAIEmbeddings
embeddings = GoogleGenerativeAIEmbeddings(model=embedding_model_name)

#db details

vector_db_path = "Mainak_Assignment_2"
os.makedirs(vector_db_path,exist_ok=True)

In [5]:
from langchain_chroma import Chroma
vectorstore = Chroma.from_texts(texts=df["full_text"].to_list(),
                                 embedding= embeddings, 
                                 persist_directory=vector_db_path,collection_name="course_catalog",
                                    collection_metadata={"use_type":"TRAINING AND EXPERIMENTATION"} ,
                                    metadatas=[{"course_id": cid} for cid in df["course_id"]]) 


In [6]:
print("docs in vector DB")
print(len(vectorstore.get()['ids']))

docs in vector DB
25


In [8]:
# using vector db object to initialize a retriever object - to perform vector search/retrieval
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})
retrieved_docs = retriever.invoke("Python Programming for data science")
len(retrieved_docs)

5

In [9]:
#### testing
ret_docs = vectorstore.similarity_search_with_score("Python programming for data science",k=5)
print(ret_docs[0])

(Document(id='34fb1d3e-c3e5-41b1-9086-d3a40c74a0f2', metadata={'course_id': 'C016'}, page_content='Python Programming for Data Science++Learn Python fundamentals for data science: variables, control flow, functions, and object-oriented programming. Advance to data handling with pandas, numerical computing with NumPy, and basic plotting with matplotlib. You’ll build reproducible data workflows, clean and transform datasets, and perform exploratory analysis, laying the groundwork for machine learning and statistical modeling projects.'), 0.15304824709892273)


In [10]:
retrieved_docs[1].page_content

'Foundations of Machine Learning++Understand foundational machine learning algorithms including regression, classification, clustering, and dimensionality reduction. This course covers data pre-processing, feature engineering, model selection, hyperparameter tuning, and evaluation metrics. Hands-on labs use scikit-learn and Python to implement end-to-end workflows on real-world datasets, preparing learners for practical machine learning applications with interactive engaging exercises.'

In [11]:
# Perform similarity search in Chroma vector store
results = vectorstore.similarity_search_with_score("python programming for data science", k=5)

# Extract course IDs and similarity scores
recommendations = [(doc.metadata["course_id"], score) for doc, score in results]

print(results)

[(Document(id='34fb1d3e-c3e5-41b1-9086-d3a40c74a0f2', metadata={'course_id': 'C016'}, page_content='Python Programming for Data Science++Learn Python fundamentals for data science: variables, control flow, functions, and object-oriented programming. Advance to data handling with pandas, numerical computing with NumPy, and basic plotting with matplotlib. You’ll build reproducible data workflows, clean and transform datasets, and perform exploratory analysis, laying the groundwork for machine learning and statistical modeling projects.'), 0.2013748437166214), (Document(id='6a6bb430-efb0-4a99-b0cf-f3a674e4e662', metadata={'course_id': 'C001'}, page_content='Foundations of Machine Learning++Understand foundational machine learning algorithms including regression, classification, clustering, and dimensionality reduction. This course covers data pre-processing, feature engineering, model selection, hyperparameter tuning, and evaluation metrics. Hands-on labs use scikit-learn and Python to im

In [12]:
print(recommendations)

[('C016', 0.2013748437166214), ('C001', 0.3806152939796448), ('C017', 0.3873451054096222), ('C003', 0.40436050295829773), ('C011', 0.4241919219493866)]


In [13]:
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough

message = """
Answer this question using the provided context only. list recommendations and comment on relevance. Give the course id as well. ensure to give top 5 recoomended courses
.Give the response in markdown format.
{question}

Context:
{context}
"""

prompt = PromptTemplate.from_template(message)
prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='\nAnswer this question using the provided context only. list recommendations and comment on relevance. Give the course id as well. ensure to give top 5 recoomended courses\n.Give the response in markdown format.\n{question}\n\nContext:\n{context}\n')

In [14]:
from langchain.chat_models import init_chat_model
llm = init_chat_model(model_name, model_provider="google_genai")

rag_chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | llm
response = rag_chain.invoke("python programming")

print(response.content)

Based on the context provided, here are the top 5 recommended courses related to "Python Programming":

1.  **C016 - Python Programming for Data Science++**
    *   **Relevance:** This course directly focuses on Python programming within the context of data science, covering fundamental Python concepts and libraries like pandas, NumPy, and matplotlib. It's highly relevant for anyone interested in using Python for data analysis and machine learning.

2.  **C001 - Foundations of Machine Learning++**
    *   **Relevance:** While not exclusively a Python programming course, it heavily utilizes Python and scikit-learn for implementing machine learning algorithms.  A solid understanding of Python is likely a prerequisite or co-requisite for this course.

3.  **C003 - Natural Language Processing Fundamentals++**
    *   **Relevance:** This course uses Python libraries like Hugging Face and spaCy.

4.  **C004 - Computer Vision and Image Processing++**
    *   **Relevance:** This course uses Py

In [15]:
#INPUT 1
response_1 = rag_chain.invoke("""I’ve completed the ‘Python Programming for Data Science’ course and enjoy data 
visualization. What should I take next?""")

print(response_1.content)

Based on your interest in data visualization and having completed 'Python Programming for Data Science' (C016), here are my top 5 recommended courses:

1.  **Data Visualization with Tableau (C014)**

    *   **Relevance:** This course directly aligns with your interest in data visualization. It will expand your visualization skills beyond Python's `matplotlib` and introduce you to Tableau, a widely used data visualization tool.

2.  **R Programming and Statistical Analysis (C017)**

    *   **Relevance:** Since you enjoy data visualization, learning R and `ggplot2` will offer you another powerful tool for creating compelling visuals. Also, gaining statistical analysis skills will enable you to derive deeper insights from data.

3.  **SQL for Data Analysis (C012)**

    *   **Relevance:** To effectively visualize data, you often need to extract and prepare it first. SQL is essential for querying and manipulating data from databases. This course will enhance your data wrangling capabilit

In [16]:
#INPUT 2
response_2 = rag_chain.invoke("""I know Azure basics and want to manage containers and build CI/CD pipelines. 
Recommend courses.""")

print(response_2.content)

Based on your knowledge of Azure basics and your interest in managing containers and building CI/CD pipelines, here are the top 3 recommended courses from the provided context:

1.  **Containerization with Docker and Kubernetes (C009)**:
    *   **Relevance:**  This course directly addresses your interest in container management. It covers Docker and Kubernetes, essential tools for containerizing applications.
2.  **DevOps Practices and CI/CD (C008)**:
    *   **Relevance:** This course is highly relevant as it focuses on building CI/CD pipelines using Git, Jenkins/GitHub Actions, and infrastructure-as-code with Terraform. It will help you automate your software delivery process.
3.  **Cloud Computing with Azure (C007)**:
    *   **Relevance:** Since you already know Azure basics, this course will help you deepen your knowledge of Azure services, including Azure Kubernetes Service (AKS), which is relevant for container management on Azure.


In [17]:
#INPUT 3
response_3 = rag_chain.invoke("""My background is in ML fundamentals; I’d like to specialize in neural networks and 
production workflows..""")

print(response_3.content)

Based on your background in ML fundamentals and your interest in specializing in neural networks and production workflows, here are the top 5 recommended courses:

1.  **Course ID: C002** - Deep Learning with TensorFlow and Keras
    *   **Relevance:** This course directly addresses your interest in neural networks, providing hands-on experience with TensorFlow and Keras to build, train, and optimize deep learning models. It covers various neural network architectures, which is essential for specialization.

2.  **Course ID: C025** - MLOps: Productionizing Machine Learning
    *   **Relevance:** This course is highly relevant to your interest in production workflows. It focuses on deploying and maintaining ML models at scale, covering essential MLOps practices such as model versioning, CI/CD, and monitoring.

3.  **Course ID: C001** - Foundations of Machine Learning
    *   **Relevance:** While you mentioned having a background in ML fundamentals, this course will help reinforce concep

In [18]:
#INPUT 4
response_4 = rag_chain.invoke("""I want to learn to build and deploy microservices with Kubernetes—what courses fit 
best?""")

print(response_4.content)

Here are the top 5 recommended courses for learning to build and deploy microservices with Kubernetes, based on the provided context:

1.  **Course ID:** C009
    **Course Title:** Containerization with Docker and Kubernetes
    **Relevance:** Highly relevant. This course directly addresses the core technologies (Docker and Kubernetes) for containerizing and orchestrating microservices. It covers essential Kubernetes concepts like pods, deployments, services, and ingress, along with cluster provisioning, autoscaling, and Helm chart packaging, all crucial for microservices deployment.

2.  **Course ID:** C010
    **Course Title:** APIs and Microservices Architecture
    **Relevance:** Highly relevant. This course focuses on designing and implementing APIs and microservices architectures. It covers microservices patterns, containerized deployment, versioning strategies, and security best practices, providing a comprehensive understanding of building interconnected services.

3.  **Course

In [19]:
#INPUT 5
response_5 = rag_chain.invoke("""I’m interested in blockchain and smart contracts but have no prior experience. Which 
courses do you suggest?""")

print(response_5.content)

Based on your interest in blockchain and smart contracts with no prior experience, here's a recommendation:

**Top Recommended Courses:**

1.  **Blockchain Technology and Smart Contracts (C023)**

    *   **Relevance:** This course is highly relevant as it directly covers blockchain fundamentals, smart contract development using Solidity on Ethereum, token standards, decentralized application patterns, and security best practices. It's perfectly aligned with your stated interest.
