In [1]:
%load_ext dotenv
%dotenv

In [2]:
import pandas as pd
import os
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv, find_dotenv
import pinecone
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm


In [3]:
files = pd.read_csv("course_section_descriptions.csv", encoding = "ANSI")

In [4]:
files["unique_id"] = files["course_id"].astype(str) + '-' + files["section_id"].astype(str)

In [5]:
files["metadata"] = files.apply(lambda row: {
    "course_name": row["course_name"],
    "section_name": row["section_name"],
    "section_description": row["section_description"],
}, axis = 1)

In [6]:
def create_embeddings(row):
    combined_text = f'''{row["course_name"]} {row["course_technology"]}
                        {row["course_description"]} {row["section_name"]}{row["section_description"]}'''
    return model.encode(combined_text, show_progress_bar = False)

In [7]:
model = SentenceTransformer("all-MiniLM-L6-v2")

In [8]:
files["embedding"] = files.apply(create_embeddings, axis = 1)

## Upserting data to Pinecone

In [9]:
load_dotenv(find_dotenv(), override = True)

True

In [10]:
pc = Pinecone(api_key = os.environ.get("PINECONE_API_KEY"), environment = os.environ.get("PINECONE_ENV"))

In [11]:
index_name = "my-index"
dimension = 384
metric = "cosine"

In [12]:
if index_name in [index.name for index in pc.list_indexes()]:
    pc.delete_index(index_name)
    print(f"{index_name} succesfully deleted.")
else:
     print(f"{index_name} not in index list.")

my-index succesfully deleted.


In [13]:
pc.create_index(
    name = index_name, 
    dimension = dimension, 
    metric = metric, 
    spec = ServerlessSpec(
        cloud = "aws", 
        region = "us-east-1")
    )

In [14]:
index = pc.Index(index_name)

In [15]:
vectors_to_upsert = [(row["unique_id"], row["embedding"].tolist(), row["metadata"]) for index, row in files.iterrows()  ]

In [16]:
index.upsert(vectors = vectors_to_upsert)
print("Data succesfully upserted to Pinecone index")

Data succesfully upserted to Pinecone index


In [27]:
query = "regression in Python"
query_embedding = model.encode(query, show_progress_bar=False).tolist()

In [28]:
query_results = index.query(
    vector = [query_embedding],
    top_k = 12,
    include_metadata=True
)

In [29]:
score_threshold = 0.4

In [30]:
# Assuming query_results are fetched and include metadata
for match in query_results['matches']:
    if match['score'] >= score_threshold:
        course_details = match.get('metadata', {})
        course_name = course_details.get('course_name', 'N/A')
        section_name = course_details.get('section_name', 'N/A')
        section_description = course_details.get('section_description', 'No description available')
        
        print(f"Matched item ID: {match['id']}, Score: {match['score']}")
        print(f"Course: {course_name} \nSection: {section_name} \nDescription: {section_description}")

Matched item ID: 37-369, Score: 0.75285089
Course: Machine Learning in Python 
Section: Linear Regression with sklearn 
Description: While there are many libraries that can compute a regression model, the most numerically stable one is sklearn. It is also the preferred choice of many machine learning professionals. In this section, we implement all we know about regressions in this amazing library.
Matched item ID: 36-363, Score: 0.675853431
Course: Python for Finance 
Section: Using Regressions for Financial Analysis 
Description: Understanding rates of return and risk is not all there is about finance. Working with regression analysis is a must, and you will see that Python only helps you to be quicker and more precise when doing such estimations.
Matched item ID: 37-368, Score: 0.636714876
Course: Machine Learning in Python 
Section: Linear Regression 
Description: In this part of the course, we will discuss what the course covers, why you need to learn advanced statistics, what’s t