In [1]:
import langchain
from langchain_community.document_loaders import DataFrameLoader
import json
import pandas as pd
import getpass
import os
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings 
import openai 
from langchain_core.documents import Document

  from tqdm.autonotebook import tqdm


In [2]:
if not os.getenv("pinecone_API"):
    os.environ["pinecone_API"] = getpass.getpass("Enter your Pinecone API key: ")

pinecone_api_key = os.environ.get("pinecone_API")

pc = Pinecone(api_key=pinecone_api_key)

api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = api_key


In [None]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# pc.create_index(
#     name= 'course-title-db',
#     dimension=1536,
#     metric="cosine",
#     spec=ServerlessSpec(cloud="aws", region="us-east-1"),
# )


index = pc.Index("course-title-db")
vector_store = PineconeVectorStore(index , embedding = embeddings)

In [None]:
with open('../data/minidb.json', 'r') as f:
    data = json.load(f)

df = pd.DataFrame(data)


df_final = df[['course_title', 'UUID']].dropna()
df_final.head()

Unnamed: 0,UUID,department_abbrv,course_id,credits,quarter_offered,mean_gpa,teacher_rating,start_time,end_time,meeting_days,professor_name,credit_type,course_title,enroll_status
0,LSJ434ASpring2025,LSJ,434,5,Spring 2025,3.57037,,13:30,15:20,MW,Katie Warden,"SSc, DIV",Civil and Human Rights Law for Disabled People,add code required
1,PRSAN513ASpring2025,PRSAN,513,5,Spring 2025,,,11:30,13:20,MW,Elham Monfaredi,,Elementary Persian ...,open
2,PHRMCY514ASpring2025,PHRMCY,514,2,Spring 2025,,,09:30,11:20,Th,Teresa O'Sullivan,,Design and Analysis of Medical Studies,open
3,PSYCH330ASummer2025,PSYCH,330,5,Summer 2025,3.56092,4.3,09:40,11:50,MWF,LomaJohn T Pendergraft,NSc,Laboratory in Animal Behavior,open
4,PSYCH330ASpring2025,PSYCH,330,5,Spring 2025,3.56092,4.3,09:30,11:20,MWF,LomaJohn T Pendergraft,NSc,Laboratory in Animal Behavior,open


In [17]:
df_final = df[['course_title', 'UUID']].dropna()
df_final.head()

Unnamed: 0,course_title,UUID
0,Civil and Human Rights Law for Disabled People,LSJ434ASpring2025
1,Elementary Persian ...,PRSAN513ASpring2025
2,Design and Analysis of Medical Studies,PHRMCY514ASpring2025
3,Laboratory in Animal Behavior,PSYCH330ASummer2025
4,Laboratory in Animal Behavior,PSYCH330ASpring2025


In [18]:
loader = DataFrameLoader(
    df_final,
    page_content_column = 'course_title'
)


documents = loader.load()


In [20]:
documents[0].metadata

{'UUID': 'LSJ434ASpring2025'}

In [21]:
vector_store.add_documents(documents = documents)

['23717473-bf16-41a2-a053-cbd31425c73a',
 '515c1678-b670-47f1-a0a1-e4241d8d7ad9',
 '412b62d8-173a-4a1a-85b1-7a6070ca82a7',
 '4a0ccbdb-7eeb-4b15-b3e1-8c25c40e7c10',
 'ca53ce2f-8a40-4808-817f-1f6e69b3759f',
 'a7306c54-5731-443a-b0cb-0384360c7f16',
 'a8422959-6a4c-4c90-9052-b0093046b8d4',
 '9b476ccd-914c-4a13-8a95-7a8d2000aeb0',
 'd0839c81-54d4-4353-b63b-57d6f7486da0',
 'c9c77309-10db-4bbc-b641-c2132d493b4a',
 '0609f807-ed09-4449-8dbf-a315ce1dc629',
 '2aee17ce-877d-4cd5-a84b-33ed44c38ba6',
 '3a4e3209-8f21-46f0-9fa9-8c66b4d4bd9f',
 '9ad79e5c-2d74-44d7-a2c8-d16750ca6eb9',
 '6eb618f6-430d-4152-b2f9-b8ac409e822e',
 '68ce318c-8b4e-41cc-a2c6-d000587e0e64',
 'bdf16646-c6f9-4a42-948b-9c29c6b25688',
 'dcfb33fb-eb10-47cb-9a1c-f52c47417bbd',
 'a7f700be-83e8-471b-bc5a-a5c014189356',
 'b8b2dd1e-2aab-4fe9-b908-213d3fefbfb3',
 '35f08311-5c56-4633-9fc6-e1c66bbb0786',
 '9bdd9ac4-5588-473b-bea6-10798c47ef7b',
 '51897379-2d83-4370-8a63-ab2b19173874',
 '86ea3e79-32cd-461f-b238-ced91c283c13',
 'efb1cf81-5c43-

In [16]:
course_titles = df[['course_title', 'UUID']].to_dict(orient='records')

course_titles[0]


{'course_title': 'Civil and Human Rights Law for Disabled People',
 'UUID': 'LSJ434ASpring2025'}