In [None]:
# Enable the dotenv IPython extension, then run its %dotenv magic.
# NOTE(review): presumably this loads a local .env file that defines the
# PINECONE_* variables read from os.environ further down — confirm.
%load_ext dotenv
%dotenv

In [None]:
import pandas as pd
import os
from pinecone import Pinecone, ServerlessSpec
import pinecone
from sentence_transformers import SentenceTransformer

In [None]:
# Load the course catalogue. "ANSI" is not a portable codec name: Python only
# knows it as an alias of the Windows-only "mbcs" codec, so the original call
# raised LookupError on Linux/macOS. cp1252 is the Western-European Windows
# "ANSI" code page and is available on every platform.
file = pd.read_csv("../resources/course_descriptions.csv", encoding="cp1252")

In [None]:
# Display the loaded DataFrame as the cell output for a quick visual check.
file

In [None]:
def create_course_description(row):
    """Build a one-sentence summary of a course from its catalogue fields.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            ``course_name``, ``course_slug``, ``course_technology`` and
            ``course_topic``.

    Returns:
        str: A sentence naming the course, its slug, its technology and its topic.
    """
    template = (
        "The course name is {name}, the slug is {slug}, "
        "the technology is {technology} and the course topic is {topic}"
    )
    return template.format(
        name=row["course_name"],
        slug=row["course_slug"],
        technology=row["course_technology"],
        topic=row["course_topic"],
    )

In [None]:
# Build the summary sentence for every row (axis=1 hands each row to the
# helper), then store the resulting Series in the 'course_description' column.
course_descriptions = file.apply(create_course_description, axis=1)
file['course_description'] = course_descriptions

In [None]:
# Spot-check the generated text for the row labelled 0.
first_description = file['course_description'][0]
print(first_description)

In [None]:
# Build the Pinecone client from credentials held in environment variables
# (os.getenv returns None for a variable that is not set).
pc = Pinecone(
    api_key=os.getenv('PINECONE_API_KEY'),
    environment=os.getenv('PINECONE_ENV'),
)

In [None]:
# Settings for the Pinecone index used throughout the notebook.
index_name = "my-index"  # index the notebook connects to
dimension = 384          # vector size referenced by the (commented-out) create_index call
metric = "cosine"        # similarity metric referenced by the same call

In [None]:
# Index creation is left commented out; the next cell connects to an index
# named `index_name`, so it presumably already exists. Uncomment to create it
# (serverless, AWS us-east-1) — TODO confirm this is meant as one-time setup.
#pc.create_index(name=index_name, dimension=dimension, metric=metric, spec=ServerlessSpec(cloud='aws', region='us-east-1'))

In [None]:
# Handle to the index named by `index_name`; the upsert and query cells below all go through it.
index = pc.Index(index_name)

## Embedding the data

In [None]:
# Sentence-embedding model shared by every encode call in this notebook.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [None]:
def create_embeddings(row):
    """Embed a course row as a single vector.

    The ``course_description`` and ``course_description_short`` fields are
    joined with a space and encoded with the notebook-level ``model``.
    Missing values (NaN/None) are skipped; previously ``str()`` turned them
    into the literal text "nan", which was then embedded as if it were part
    of the description.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            ``course_description`` and ``course_description_short``.

    Returns:
        The embedding returned by ``model.encode`` for the combined text
        (the upsert cell later calls ``.tolist()`` on it).
    """
    text_fields = ("course_description", "course_description_short")
    combined_text = " ".join(
        str(row[field]) for field in text_fields if pd.notna(row[field])
    )
    return model.encode(combined_text, show_progress_bar=False)

In [None]:
# Encode every row (axis=1 passes one row at a time) and keep the vectors
# in the 'embedding' column.
course_embeddings = file.apply(create_embeddings, axis=1)
file['embedding'] = course_embeddings

In [None]:
# Pair every course name (stringified, used as the vector ID) with its
# embedding converted to a plain Python list.
vectors_to_upsert = [
    (str(course_name), embedding.tolist())
    for course_name, embedding in zip(file['course_name'], file['embedding'])
]
# Upload in chunks rather than one request, so the upsert does not depend on
# the whole dataset fitting into a single API call.
index.upsert(vectors=vectors_to_upsert, batch_size=100)
print("Data upserted to Pinecone index")

## Semantic Search

In [None]:
# Free-text search term and its embedding, converted to a plain list of floats.
query = "clustering"
query_vector = model.encode(query, show_progress_bar=False)
query_embedding = query_vector.tolist()

In [None]:
# `query_embedding` is already a flat list of floats (it comes from
# `.tolist()`); Pinecone's `vector` argument expects exactly that, so pass it
# unwrapped instead of nesting it inside another list.
query_results = index.query(vector=query_embedding, top_k=12, include_values=True)

In [None]:
# Show the raw query response as the cell output (the query above asked for
# the top 12 matches with include_values=True).
query_results

In [None]:
# Keep only the matches scoring at least 0.3 and list their IDs and scores.
close_matches = [hit for hit in query_results["matches"] if hit['score'] >= 0.3]
for match in close_matches:
    print(f"Matched item ID: {match['id']}, score: {match['score']}")

In [None]:
# Load the per-section descriptions (this rebinds `file`). "ANSI" is only an
# alias of the Windows-only "mbcs" codec and raises LookupError on
# Linux/macOS; cp1252 is the Western-European Windows "ANSI" code page and
# works on every platform.
file = pd.read_csv("../resources/course_section_descriptions.csv", encoding="cp1252")

In [None]:
# Compose a per-section identifier of the form "<course_id>-<section_id>".
course_ids = file['course_id'].astype(str)
section_ids = file['section_id'].astype(str)
file['unique_id'] = course_ids + '-' + section_ids

In [None]:
def _section_metadata(row):
    """Collect the fields stored as metadata alongside a section's vector."""
    return {
        "course_name": row['course_name'],
        "section_name": row['section_name'],
        "section_description": row['section_description'],
    }


file['metadata'] = file.apply(_section_metadata, axis=1)

In [None]:
# Display the metadata column to check the dictionaries built above.
file['metadata']

In [None]:
def create_embeddings(row):
    """Embed a course-section row as a single vector.

    Note: this redefines the course-level ``create_embeddings`` from earlier
    in the notebook; from this cell on the name refers to this version.

    The course name, technology and description plus the section name and
    description are joined with single spaces and encoded with the
    notebook-level ``model``. The previous triple-quoted f-string fused
    ``section_name`` and ``section_description`` with no separator, embedded
    a newline plus source-code indentation in the text, and rendered missing
    values as the literal "nan"; missing fields are now skipped.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            ``course_name``, ``course_technology``, ``course_description``,
            ``section_name`` and ``section_description``.

    Returns:
        The embedding returned by ``model.encode`` for the combined text.
    """
    text_fields = (
        "course_name",
        "course_technology",
        "course_description",
        "section_name",
        "section_description",
    )
    combined_text = " ".join(
        str(row[field]) for field in text_fields if pd.notna(row[field])
    )
    return model.encode(combined_text, show_progress_bar=False)

In [None]:
# Encode every section row (axis=1 passes one row at a time) and keep the
# vectors in the "embedding" column.
section_embeddings = file.apply(create_embeddings, axis=1)
file["embedding"] = section_embeddings

In [None]:
# One (id, values, metadata) tuple per section, with the embedding converted
# to a plain Python list.
vectors_to_upsert = [
    (unique_id, embedding.tolist(), metadata)
    for unique_id, embedding, metadata in zip(
        file["unique_id"], file["embedding"], file["metadata"]
    )
]

In [None]:
# Upload in chunks rather than one request: every record carries both a
# vector and metadata, so a single call for the whole dataset risks exceeding
# the per-request limits of the API.
index.upsert(vectors=vectors_to_upsert, batch_size=100)
print("Data is successfully upserted")

In [None]:
# Re-run the search, this time asking for the stored metadata.
# `query_embedding` (built in the Semantic Search cells) is already a flat
# list of floats, which is what Pinecone's `vector` argument expects, so it
# is passed unwrapped instead of nested inside another list.
query_results = index.query(
    vector=query_embedding,
    top_k=12,
    include_metadata=True,
)

In [None]:
# Minimum similarity score a match needs in order to be reported by the loop below.
score_threshold = 0.5

In [None]:
# Report the matches that reach `score_threshold`, together with the metadata
# returned by the query (requested via include_metadata=True).
relevant_matches = [
    candidate for candidate in query_results['matches']
    if candidate['score'] >= score_threshold
]
for match in relevant_matches:
    # Fall back to placeholders when a match carries no (or only partial) metadata.
    course_details = match.get('metadata', {})
    course_name = course_details.get('course_name', 'N/A')
    section_name = course_details.get('section_name', 'N/A')
    section_description = course_details.get('section_description', 'No description available')

    print(
        f"Matched item ID: {match['id']}, Score: {match['score']}",
        f"Course: {course_name} \nSection: {section_name} \nDescription: {section_description}\n",
        sep="\n",
    )