In [1]:
%load_ext dotenv
%dotenv

In [2]:
import pandas as pd
import os
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv, find_dotenv
import pinecone
from sentence_transformers import SentenceTransformer

In [4]:
files = pd.read_csv("course_section_descriptions.csv", encoding = "ANSI")

In [6]:
# create course section bridge by creating unique ids for course and section
files["unique_id"] = files["course_id"].astype(str) + '-' + files["section_id"].astype(str)

In [7]:
# add a metadata column, upload it to the vector database, and identify courses and sections in the sources
files["metadata"] = files.apply(lambda row: {
    "course_name": row["course_name"],
    "section_name": row["section_name"],
    "section_description": row["section_description"],
}, axis = 1)

In [8]:
# function to cobine text columns into one single column
def create_embeddings(row):
    combined_text = f'''{row["course_name"]} {row["course_technology"]}
                        {row["course_description"]} {row["section_name"]}{row["section_description"]}'''
    return model.encode(combined_text, show_progress_bar = False)

In [10]:
# Invoke embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

In [11]:
# create file embeddings column
files["embedding"] = files.apply(create_embeddings, axis = 1)

  return forward_call(*args, **kwargs)


# Upserting data to Pincone

In [12]:
load_dotenv(find_dotenv(), override = True)

True

In [13]:
pc = Pinecone(api_key = os.environ.get("PINECONE_API_KEY"), environment = os.environ.get("PINECONE_ENV"))

In [17]:
index_name = "my-index"
dimension = 384
metric = "cosine"

In [15]:
if index_name in [index.name for index in pc.list_indexes()]:
    pc.delete_index(index_name)
    print(f"{index_name} succesfully deleted.")
else:
     print(f"{index_name} not in index list.")

my-index succesfully deleted.


In [18]:
pc.create_index(
    name = index_name, 
    dimension = dimension, 
    metric = metric, 
    spec = ServerlessSpec(
        cloud = "aws", 
        region = "us-east-1")
    )

{
    "name": "my-index",
    "metric": "cosine",
    "host": "my-index-67t99i9.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [19]:
index = pc.Index(index_name)

In [21]:
vectors_to_upsert = [(row["unique_id"], row["embedding"].tolist(), row["metadata"]) for index, row in files.iterrows()]

In [23]:
index.upsert(vectors = vectors_to_upsert)
print("Data succesfully upserted to Pinecone index")

Data succesfully upserted to Pinecone index


In [24]:
# Ensure you've already initialized and configured Pinecone and the model
# If not, you need to run the initialization code provided earlier

# Create the query embedding
query = "clustering"
query_embedding = model.encode(query, show_progress_bar=False).tolist()

  return forward_call(*args, **kwargs)


In [25]:
query_results = index.query(
    vector = [query_embedding],
    top_k = 12,
    include_metadata = True
)

In [26]:
score_threshold = 0.3

In [27]:
# Assuming query_results are fetched and include metadata
for match in query_results['matches']:
    if match['score'] >= score_threshold:
        course_details = match.get('metadata', {})
        course_name = course_details.get('course_name', 'N/A')
        section_name = course_details.get('section_name', 'N/A')
        section_description = course_details.get('section_description', 'No description available')
        
        print(f"Matched item ID: {match['id']}, Score: {match['score']}")
        print(f"Course: {course_name} \nSection: {section_name} \nDescription: {section_description}")

Matched item ID: 51-469, Score: 0.561120212
Course: Machine Learning in Excel 
Section: Cluster Analysis 
Description: Cluster analysis is the most intuitive and important example of unsupervised learning. However, to be able to understand cluster analysis, you must first become familiar with the mathematics behind it. Here we will explore the fundamentals of cluster analysis and have a look at the differences between clustering and classification.
Matched item ID: 37-374, Score: 0.542959213
Course: Machine Learning in Python 
Section: Other Types of Clustering 
Description: In previous sections, we focus extensively on k-means clustering, as it is the fastest and most efficient method for clustering. In this section, we explore other approaches that are less common.
Matched item ID: 51-470, Score: 0.508886516
Course: Machine Learning in Excel 
Section: K-means Clustering 
Description: Master K-means clustering in Excel by learning how to choose the number of clusters in your analysis 

In [28]:
# Ensure you've already initialized and configured Pinecone and the model
# If not, you need to run the initialization code provided earlier

# Create the query embedding
query = "regression in Python"
query_embedding = model.encode(query, show_progress_bar=False).tolist()

  return forward_call(*args, **kwargs)


In [29]:
query_results = index.query(
    vector = [query_embedding],
    top_k = 12,
    include_metadata = True
)

In [30]:
score_threshold = 0.4

In [31]:
# Assuming query_results are fetched and include metadata
for match in query_results['matches']:
    if match['score'] >= score_threshold:
        course_details = match.get('metadata', {})
        course_name = course_details.get('course_name', 'N/A')
        section_name = course_details.get('section_name', 'N/A')
        section_description = course_details.get('section_description', 'No description available')
        
        print(f"Matched item ID: {match['id']}, Score: {match['score']}")
        print(f"Course: {course_name} \nSection: {section_name} \nDescription: {section_description}")

Matched item ID: 37-369, Score: 0.753219903
Course: Machine Learning in Python 
Section: Linear Regression with sklearn 
Description: While there are many libraries that can compute a regression model, the most numerically stable one is sklearn. It is also the preferred choice of many machine learning professionals. In this section, we implement all we know about regressions in this amazing library.
Matched item ID: 36-363, Score: 0.67616713
Course: Python for Finance 
Section: Using Regressions for Financial Analysis 
Description: Understanding rates of return and risk is not all there is about finance. Working with regression analysis is a must, and you will see that Python only helps you to be quicker and more precise when doing such estimations.
Matched item ID: 37-368, Score: 0.637206793
Course: Machine Learning in Python 
Section: Linear Regression 
Description: In this part of the course, we will discuss what the course covers, why you need to learn advanced statistics, what’s t