In [1]:
import langchain
from langchain_community.document_loaders import DataFrameLoader
import json
import pandas as pd
import getpass
import os
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings 
import openai 
from langchain_core.documents import Document

  from tqdm.autonotebook import tqdm


In [2]:
if not os.getenv("pinecone_API"):
    os.environ["pinecone_API"] = getpass.getpass("Enter your Pinecone API key: ")

pinecone_api_key = os.environ.get("pinecone_API")

pc = Pinecone(api_key=pinecone_api_key)

api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = api_key


In [3]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

index = pc.Index("course-description-db")
vector_store = PineconeVectorStore(index , embedding = embeddings)

In [7]:
vector_store.delete(delete_all=True)

In [6]:
df = pd.read_csv('../data/df_lectures.csv')

df_renamed = df.rename(columns = {'Course Description' : 'course_description',
                                  'Course Code' : 'course_id'})
df_final = df_renamed[['course_id', 'course_description', 'UUID']]

print(df_final.head())
print(df_final.shape)

    course_id                                 course_description  \
0  AFRAM  150  Introductory survey of topics and problems in ...   
1  AFRAM  272  Reconstruction and its aftermath, the Agrarian...   
2  AFRAM  318  Considers how generic forms and conventions ha...   
3  AFRAM  330  Focuses on cultural expressions created by peo...   
4  AES    150  Focusing on pre-Columbus era to 1970, students...   

                  UUID  
0  AFRAM150ASpring2025  
1  AFRAM272ASpring2025  
2  AFRAM318ASpring2025  
3  AFRAM330ASpring2025  
4    AES150ASummer2025  
(6198, 3)


In [12]:
df_final.dropna(subset=['course_description'], inplace=True)
print(df_final.shape)

(5928, 3)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final.dropna(subset=['course_description'], inplace=True)


In [13]:
print(df_final.isna().sum())

course_id             0
course_description    0
UUID                  0
dtype: int64


In [None]:
loader = DataFrameLoader(
    df_final,
    page_content_column = 'course_description'
)


documents = loader.load()


In [15]:
vector_store.add_documents(documents = documents)

['fd26d79b-0e2d-4cae-9f8e-ee392b143e26',
 '6b670a05-db79-426c-badf-f04200c8fc53',
 '7b11f917-3711-4643-bc6f-b4a712c4052a',
 '13705a4a-15d2-4d50-b5f9-3e9957332fd8',
 '87bd9603-a41f-466c-9d6d-e988db181ca8',
 '2bc347b0-574e-469c-9fd1-57d72b8a1ed7',
 'b48f1f7b-fe8a-424b-a01b-349ed84f58d3',
 'f39f6ea9-7114-41d9-b9b0-954849748d98',
 '4d99d162-3783-4b68-ab3f-8dc7de35909a',
 '4500a34b-bfc8-4e22-b1a5-14186fbcc4ec',
 'bace5cf6-fea9-41b9-9686-a46a323db64e',
 '893671e7-6c00-4a56-a60d-5c948e79c2e5',
 '40976eba-6255-4b12-9bed-d74d6cc3643a',
 '5ccb8dea-f3c4-48a2-83dc-d87c1b5de038',
 '78a03925-f582-4ea9-8cd6-c5e4aaecf30b',
 '044fea9b-fd14-45fe-adcc-8d6b24eb8065',
 '0e6c8029-3556-41df-a6c8-e2bf278102c5',
 '0b809106-be08-4746-8ce1-88f9c9f28008',
 '8e37c143-7376-47a0-8436-40a5a21f86e5',
 '8dd70109-37a4-42ca-ad09-fca41e775612',
 '71287d3b-ff07-43c8-9c06-afcd038ea9dc',
 'e9f44653-dcbd-4488-9e83-22b3343ee587',
 '9765c68b-84ad-4fb9-9d10-68a5a736fd3d',
 'c6da4fa4-3ee0-4298-af08-28d28363a77b',
 'bc5300a6-257e-