In [1]:
import langchain
from langchain_community.document_loaders import DataFrameLoader
import json
import pandas as pd
import getpass
import os
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings 
import openai 
from langchain_core.documents import Document

  from tqdm.autonotebook import tqdm


In [2]:
if not os.getenv("pinecone_API"):
    os.environ["pinecone_API"] = getpass.getpass("Enter your Pinecone API key: ")

pinecone_api_key = os.environ.get("pinecone_API")

pc = Pinecone(api_key=pinecone_api_key)

api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = api_key


In [3]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

index = pc.Index("course-description-db")
vector_store = PineconeVectorStore(index , embedding = embeddings)

In [10]:
df = pd.read_csv('../data/all_data.csv')

df_renamed = df.rename(columns = {'Course Description' : 'course_description',
                                  'Course Code' : 'course_id'})
df_final = df_renamed[['course_id', 'course_description', 'UUID']]

print(df_final.head())
print(df_final.shape)

    course_id                                 course_description  \
0  AFRAM  150  Introductory survey of topics and problems in ...   
1  AFRAM  272  Reconstruction and its aftermath, the Agrarian...   
2  AFRAM  318  Considers how generic forms and conventions ha...   
3  AFRAM  330  Focuses on cultural expressions created by peo...   
4  AES    150  Focusing on pre-Columbus era to 1970, students...   

                  UUID  
0  AFRAM150ASpring2025  
1  AFRAM272ASpring2025  
2  AFRAM318ASpring2025  
3  AFRAM330ASpring2025  
4    AES150ASummer2025  
(5018, 3)


In [16]:
df_final.dropna(subset=['course_description'], inplace=True)
print(df_final.shape)

(4821, 3)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final.dropna(subset=['course_description'], inplace=True)


In [17]:
loader = DataFrameLoader(
    df_final,
    page_content_column = 'course_description'
)


documents = loader.load()


In [18]:
vector_store.add_documents(documents = documents)

['58b2b209-c556-47fd-bdcf-bf3afc51cb15',
 'e1bad28e-4e48-4e75-b0e1-fdc04a5873de',
 'b19694c7-ac50-4f27-9d60-307fe058eee9',
 '5043985f-f63a-42d8-9b83-1e5f394e70c5',
 '0eacc1b8-e632-43d1-a63d-7df1ec25daad',
 '42d98ca2-12da-4473-bc9b-4952dc6f5b2c',
 '45aebc04-3e93-4001-ac2d-b3a68cc54902',
 'f8d52b1f-3a4d-4cd3-88f4-ce956daf366d',
 '0fc1a8fb-f3a0-4300-b4b9-424aaf618872',
 'a51b4a06-789d-4657-8d01-d8078c07cf73',
 '693dd076-1222-4748-9ea3-50f82f0362f5',
 '8a89ebbb-0946-4a91-a1cd-d121e29e26a4',
 '701c9dac-2728-48ab-9bf9-c3bc0b3267d9',
 'cfaf98a0-e7a2-4cc3-8d46-29ce1f3e0651',
 '936e4bf2-6194-4890-9860-6b56e64d0321',
 '6ab2f387-c9af-4fc1-ad96-19311dcfff60',
 'c815847c-435f-4eff-88e5-0a7a7a2367c2',
 '34245e05-c031-4a8c-9c62-db529a7c3636',
 '21466385-9c4d-4f17-8f94-c94c1f9eb651',
 'c88259ac-e7c1-4b4f-a1a4-94cd1d04d4ee',
 'ada90517-968d-4a02-81cd-bbb117b50e63',
 '8a993aa2-15a1-4859-883f-c39a707e937e',
 'f7fb5a90-41d9-4658-aa8c-933c203012f6',
 '6401802e-a78f-4e1f-87b0-9723c0f730cd',
 '9a51cef4-4efe-