In [1]:
from dotenv import load_dotenv
from langchain_openai import AzureOpenAIEmbeddings
load_dotenv()

python-dotenv could not parse statement starting at line 17


True

In [2]:
import os
# Optional: kill legacy envs that cause the validation error

for bad in ("OPENAI_API_BASE", "OPENAI_API_TYPE"):
    os.environ.pop(bad, None)

embeddings_client = AzureOpenAIEmbeddings(
    # You can rely on env vars instead; passing explicitly is fine too:
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    openai_api_version=os.environ.get("AZURE_OPENAI_API_VERSION", "2024-02-01"),

    # Use your Azure **deployment name** for the embeddings model
    model="text-embedding-3-small",    # <-- your deployment name
    # dimensions=3072  # optional if you configured custom dims on text-embedding-3*
)

In [3]:
import pandas as pd
data = pd.read_csv("/home/zadmin/Desktop/GAAI-B4-Azure/datasets/assignment2dataset.csv")
data.head()

Unnamed: 0,course_id,title,description
0,C001,Foundations of Machine Learning,Understand foundational machine learning algor...
1,C002,Deep Learning with TensorFlow and Keras,Explore neural network architectures using Ten...
2,C003,Natural Language Processing Fundamentals,Dive into NLP techniques for processing and un...
3,C004,Computer Vision and Image Processing,Learn the principles of computer vision and im...
4,C005,Reinforcement Learning Basics,Get introduced to reinforcement learning parad...


In [4]:
# Assignment 2: Course Recommendation Engine using Azure OpenAI Embeddings

import os
from dotenv import load_dotenv, find_dotenv
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from typing import List, Tuple


df = data

# Combine title + description columns
text_cols = [c for c in df.columns if 'title' in c.lower() or 'desc' in c.lower()]
df['full_text'] = df[text_cols].fillna('').agg(' '.join, axis=1)

# 5. Compute embeddings for all courses
print("Computing embeddings for all courses...")
course_embeddings = embeddings_client.embed_documents(df['full_text'].tolist())
X = np.array(course_embeddings)

# Build NearestNeighbors index
nn = NearestNeighbors(n_neighbors=10, metric='cosine')
nn.fit(X)

# Identify ID and title columns
id_col = next((c for c in df.columns if 'id' in c.lower()), None)
title_col = text_cols[0] if text_cols else df.columns[0]

# 6. Recommendation function
def recommend_courses(profile: str, completed_ids: List[str]) -> List[Tuple[str, float]]:
    query_emb = embeddings_client.embed_query(profile)
    distances, indices = nn.kneighbors([query_emb], n_neighbors=10)
    recs = []
    for idx, dist in zip(indices[0], distances[0]):
        cid = str(df.iloc[idx][id_col]) if id_col else str(idx)
        if cid not in completed_ids:
            recs.append((cid, 1 - dist))  # similarity = 1 - cosine distance
        if len(recs) >= 5:
            break
    return recs

# 7. Evaluate on 5 sample profiles
sample_profiles = [
    "I've completed the 'Python Programming for Data Science' course and enjoy data visualization. What should I take next?",
    "I know Azure basics and want to manage containers and build CI/CD pipelines. Recommend courses.",
    "My background is in ML fundamentals; I'd like to specialize in neural networks and production workflows.",
    "I want to learn to build and deploy microservices with Kubernetes—what courses fit best?",
    "I'm interested in blockchain and smart contracts but have no prior experience. Which courses do you suggest?"
]

for i, profile in enumerate(sample_profiles, 1):
    print(f"\nProfile {i}: {profile}")
    for cid, score in recommend_courses(profile, []):
        title = df[df[id_col] == cid].iloc[0][title_col] if id_col else df.iloc[int(cid)][title_col]
        print("For Profile 5:I'm interested in blockchain and smart contracts but have no prior experience. Which courses do you suggest?")
        print(f"  {cid} | {title} | similarity: {score:.4f}")

Computing embeddings for all courses...

Profile 1: I've completed the 'Python Programming for Data Science' course and enjoy data visualization. What should I take next?
For Profile 5:I'm interested in blockchain and smart contracts but have no prior experience. Which courses do you suggest?
  C016 | Python Programming for Data Science | similarity: 0.5605
For Profile 5:I'm interested in blockchain and smart contracts but have no prior experience. Which courses do you suggest?
  C014 | Data Visualization with Tableau | similarity: 0.4504
For Profile 5:I'm interested in blockchain and smart contracts but have no prior experience. Which courses do you suggest?
  C011 | Big Data Analytics with Spark | similarity: 0.4304
For Profile 5:I'm interested in blockchain and smart contracts but have no prior experience. Which courses do you suggest?
  C017 | R Programming and Statistical Analysis | similarity: 0.4244
For Profile 5:I'm interested in blockchain and smart contracts but have no prior

In [5]:
print("For Profile 5:I'm interested in blockchain and smart contracts but have no prior experience. Which courses do you suggest?")
for cid, score in recommend_courses("I'm interested in blockchain and smart contracts but have no prior experience. Which courses do you suggest?", []):
        title = df[df[id_col] == cid].iloc[0][title_col] if id_col else df.iloc[int(cid)][title_col]
        print(f"  {cid} | {title} | similarity: {score:.4f}")

For Profile 5:I'm interested in blockchain and smart contracts but have no prior experience. Which courses do you suggest?
  C023 | Blockchain Technology and Smart Contracts | similarity: 0.5762
  C021 | Cybersecurity Fundamentals | similarity: 0.3147
  C024 | Augmented and Virtual Reality Development | similarity: 0.3113
  C022 | Internet of Things (IoT) Development | similarity: 0.2990
  C013 | NoSQL Databases and MongoDB | similarity: 0.2959
