### Indexing: Creating a Pinecone Vector Index

In [1]:
# Load the `dotenv` IPython extension, then invoke its %dotenv line magic
# (presumably reads variables from a local .env file into the environment — confirm).
%load_ext dotenv
%dotenv

In [2]:
from langchain.document_loaders import Docx2txtLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter, CharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.document_loaders.csv_loader import CSVLoader
from langchain_community.vectorstores import FAISS 

In [3]:
import pandas as pd
import torch

In [4]:
# Load the course/section descriptions table.
# 'ANSI' is only registered as a codec alias on Windows (it maps to the system
# code page) and raises LookupError on other platforms. 'cp1252' names the
# Western-European Windows code page explicitly and is available everywhere.
# NOTE(review): confirm the CSV was exported with code page 1252.
files = pd.read_csv("course_section_descriptions.csv", encoding='cp1252')

## Create text blobs and unite courses and sections

In [5]:
# Lookup table course_id -> course_name; when an id appears on several rows,
# the name from the last such row is the one kept
id_to_name = dict(zip(files.course_id, files.course_name))

### Create weighted versions of the parameters

In [6]:
from sentence_transformers import SentenceTransformer
# Load the pretrained 'all-MiniLM-L6-v2' sentence-embedding model.
# The same `model` object is used later in this notebook to embed both the
# course text and the search query, so both live in one vector space.
model = SentenceTransformer('all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
import numpy as np

In [8]:
# Collapse the section-level rows into one row per course.
# Course-level columns keep the value from the first row of each group;
# section names are gathered into a list and section descriptions are
# concatenated into a single space-separated string.
course_agg = (
    files
    .groupby('course_id')
    .agg({
        **dict.fromkeys(
            [
                'course_name',
                'course_slug',
                'course_description',
                'course_description_short',
                'course_technology',
                'course_topic',
                'course_instructor_quote',
            ],
            'first',
        ),
        'section_name': list,
        'section_description': ' '.join,
    })
    .reset_index()
)


In [9]:
def create_course_embedding(row):
    """Build one composite embedding for a course from its weighted text fields.

    Each text field is encoded with the module-level ``model`` and multiplied
    by a weight (course name 5, section names 4, every other field 1). The
    section-name vectors are averaged into one vector, and the result is the
    element-wise mean of the six resulting vectors.

    Args:
        row: Mapping-like course record (e.g. one row of ``course_agg``) that
            provides 'course_id', 'course_name', 'course_slug',
            'course_description', 'course_description_short' and
            'course_instructor_quote'.

    Returns:
        numpy.ndarray: 1-D vector of length
        ``model.get_sentence_embedding_dimension()``.

    Note:
        Section names are looked up in the module-level ``files`` dataframe by
        ``row['course_id']``, so both ``model`` and ``files`` must be defined
        before this function is called.
    """
    # Weights
    weight_course_name = 5
    weight_section_name = 4
    weight_other = 1

    # Size of the zero vector used for missing text; looked up once per call.
    embedding_dim = model.get_sentence_embedding_dimension()

    def safe_encode(text, weight):
        """Encode ``text`` scaled by ``weight``; missing text yields a zero vector."""
        # pandas represents missing cells as NaN (a float), not None, so an
        # `is None` test alone lets NaN through to the encoder. pd.isna covers
        # None, NaN and pd.NA; the is_scalar guard keeps non-scalar inputs
        # (which pd.isna would turn into an array) on the normal encode path.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return np.zeros(embedding_dim)
        return model.encode(text) * weight

    # Generate embeddings for individual components with weights
    embedding_course_name = safe_encode(row['course_name'], weight_course_name)
    embedding_course_slug = safe_encode(row['course_slug'], weight_other)
    embedding_course_description = safe_encode(row['course_description'], weight_other)
    embedding_course_description_short = safe_encode(row['course_description_short'], weight_other)
    embedding_course_instructor_quote = safe_encode(row['course_instructor_quote'], weight_other)

    # Distinct section names of this course, read from the module-level `files` dataframe
    section_names = files[files['course_id'] == row['course_id']]['section_name'].unique().tolist()

    # Generate embeddings for section names with weights
    embeddings_section_names = [safe_encode(name, weight_section_name) for name in section_names]

    # A course without sections contributes a zero vector so np.mean below has input
    if not embeddings_section_names:
        embeddings_section_names = [np.zeros(embedding_dim)]

    # Average the section name embeddings into a single vector
    embeddings_section_names = np.mean(embeddings_section_names, axis=0)

    # Combine the weighted embeddings into a single composite embedding
    composite_embedding = np.mean([
        embedding_course_name,
        embedding_course_slug,
        embedding_course_description,
        embedding_course_description_short,
        embedding_course_instructor_quote,
        embeddings_section_names
    ], axis=0)

    return composite_embedding

# Call create_course_embedding once per course row (axis=1 passes each row to
# the function) and store the returned vectors in a new 'embedding' column.
course_agg['embedding'] = course_agg.apply(create_course_embedding, axis=1)


In [10]:
# Pair each course id (as a string) with its embedding (as a plain list):
# these (id, values) tuples are what is later handed to `index.upsert`
vectors_to_upsert = [
    (str(course_id), embedding.tolist())
    for course_id, embedding in zip(course_agg['course_id'], course_agg['embedding'])
]


## Connect to Pinecone Index

In [11]:
import os
from pinecone import Pinecone, ServerlessSpec

In [12]:
from dotenv import load_dotenv, find_dotenv

In [13]:
# Load the file located by find_dotenv(); override=True is passed so values
# from the file take precedence over variables already set in the environment.
load_dotenv(find_dotenv(), override = True)

True

In [14]:
import pinecone
# Create the Pinecone client from credentials read out of environment variables
# (os.environ.get yields None when a variable is not set).
# NOTE(review): recent `Pinecone` client versions appear to need only `api_key`;
# confirm whether the `environment` argument is still used.
pc = Pinecone(api_key = os.environ.get("PINECONE_API_KEY"), environment = os.environ.get("PINECONE_ENV"))

In [15]:
# Get a handle to the index named "my-index"; this notebook does not create it.
# NOTE(review): the index dimension presumably has to match the embedding
# model's output size — confirm against the index configuration.
index = pc.Index("my-index")

In [16]:
# Send all (id, vector) pairs built above to the index in a single upsert call.
index.upsert(vectors=vectors_to_upsert)

# Reached only when the upsert call above did not raise.
print("Data upserted to Pinecone index.")

Data upserted to Pinecone index.


## Query data

In [26]:
# Requires `model` (the SentenceTransformer loaded earlier) to be in scope;
# re-run the initialization cells first on a fresh kernel.

# Embed the search text with the same model used for the course vectors and
# convert the result to a plain Python list.
query = "clustering"
query_embedding = model.encode(query, show_progress_bar=False).tolist()

In [27]:
# Ask the index for the 12 stored vectors closest to the query embedding.
# `query_embedding` is already a flat list of floats (it comes from `.tolist()`),
# which is the form the `vector` argument takes; wrapping it in another list
# would send a nested list instead.
query_results = index.query(
    vector=query_embedding,
    top_k=12,
    include_values=True,  # also return each match's stored vector
)

In [28]:
score_threshold = 0.2

# Report every match whose score reaches the threshold, translating the
# stored string id back to a course name (ids missing from the lookup are
# shown as "Unknown Course")
for match in query_results['matches']:
    if not (match['score'] >= score_threshold):
        continue
    course_name = id_to_name.get(int(match['id']), "Unknown Course")
    print(f"Matched course name: {course_name}, score: {match['score']}")

Matched course name: Machine Learning in Python, score: 0.528213263
Matched course name: Machine Learning in Excel, score: 0.51362443
Matched course name: The Machine Learning Algorithms A-Z, score: 0.492124528
Matched course name: Machine Learning with K-Nearest Neighbors, score: 0.464035869
Matched course name: Linear Algebra and Feature Selection, score: 0.406494081
Matched course name: Customer Analytics in Python, score: 0.391526312
Matched course name: The Machine Learning Process A-Z, score: 0.388478428
Matched course name: Machine Learning with Support Vector Machines, score: 0.364138037
Matched course name: Power Query and Data Modeling, score: 0.35380885
Matched course name: Machine Learning with Naive Bayes, score: 0.350773185
Matched course name: Mathematics, score: 0.350522399
Matched course name: Machine Learning with Decision Trees and Random Forests, score: 0.350382417
