In [8]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [4]:
# Connect to Pinecone. The API key is read from the environment, which
# load_dotenv() in the imports cell populates from a local .env file.
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

# Creating an index whose name is already taken raises an error, so only
# create it when it is missing. This keeps the cell safe to re-run.
# dimension=1536 must match the length of the embedding vectors upserted
# later in this notebook (describe_index_stats reports dimension 1536).
if 'rag' not in pc.list_indexes().names():
    pc.create_index(
        name='rag',
        dimension=1536,
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )

In [6]:
import json

# Load the review dataset. The context manager closes the file handle as soon
# as parsing finishes instead of leaving it open until garbage collection.
with open('reviews.json') as reviews_file:
    data = json.load(reviews_file)

# Display the list of review records stored under the 'reviews' key.
data['reviews']

[{'professor': 'Dr. Elena Martinez',
  'subject': 'Organic Chemistry',
  'stars': 4,
  'review': 'Dr. Martinez explains complex concepts clearly. Her enthusiasm for the subject is contagious.'},
 {'professor': 'Prof. John Smith',
  'subject': 'World History',
  'stars': 3,
  'review': 'Lectures can be dry, but the content is interesting. More interactive sessions would help.'},
 {'professor': 'Dr. Samantha Lee',
  'subject': 'Artificial Intelligence',
  'stars': 5,
  'review': "Brilliant professor! Dr. Lee's practical approach and industry insights are invaluable."},
 {'professor': 'Prof. Michael Brown',
  'subject': 'English Literature',
  'stars': 2,
  'review': 'Assignments are unclear and grading seems arbitrary. Struggles to engage the class.'},
 {'professor': 'Dr. Ahmed Hassan',
  'subject': 'Quantum Physics',
  'stars': 4,
  'review': "Challenging course, but Dr. Hassan's patience and clear explanations make it manageable."},
 {'professor': 'Prof. Linda Chen',
  'subject': 'Mark

In [11]:
# Build one Pinecone record per review: the embedding of the review text as
# the vector values, the professor name as the record id, and the review
# text, subject and star rating as metadata.
processed_data = []
client = OpenAI()

reviews = data["reviews"]

# Number of review texts sent per embeddings request. The endpoint accepts a
# list of inputs, so batching replaces one network round-trip per review with
# one per batch.
EMBED_BATCH_SIZE = 100

# An empty review list yields an empty range, so no request is made.
for start in range(0, len(reviews), EMBED_BATCH_SIZE):
    batch = reviews[start:start + EMBED_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review["review"] for review in batch],
        model="text-embedding-3-small",
    )

    # Every returned item carries the position of its input text; sort on it
    # so each embedding is paired with the review it was computed from.
    embeddings = sorted(response.data, key=lambda item: item.index)

    for review, item in zip(batch, embeddings):
        # NOTE(review): the id is the professor name, so two reviews for the
        # same professor would share an id and the later one would replace
        # the earlier one on upsert -- confirm names are unique in the data.
        processed_data.append({
            "values": item.embedding,
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"],
            },
        })

In [12]:
# Inspect the first prepared record to sanity-check its structure.
processed_data[0]

{'values': [-0.009991659,
  0.0026676098,
  -0.052429054,
  0.057180524,
  0.018557876,
  0.010134202,
  0.016725168,
  0.08150804,
  -0.033233125,
  -0.05517133,
  0.020227678,
  0.020580644,
  -0.0027558515,
  -0.029947825,
  0.040156692,
  0.0051281913,
  -0.026350284,
  -0.026852582,
  0.004887224,
  0.010371776,
  0.026648948,
  -0.02897038,
  0.020553494,
  0.02943195,
  -0.043985017,
  -0.04485386,
  0.006014001,
  0.018571451,
  0.017580431,
  0.010208868,
  0.068746954,
  -0.024965571,
  0.009652268,
  -0.040265296,
  -0.045396883,
  0.04534258,
  -0.040862624,
  0.0017801035,
  -0.002377431,
  0.007018597,
  0.013779257,
  0.01658941,
  -0.042844664,
  0.018829389,
  0.02943195,
  -0.017200314,
  -0.033694696,
  -0.0088648815,
  0.03290731,
  0.033667546,
  -0.06673776,
  0.0048702545,
  0.045016766,
  0.00039687485,
  -0.051967483,
  0.044663798,
  0.034780744,
  0.045016766,
  0.01839497,
  -0.05055562,
  0.056148775,
  -0.023852369,
  0.017960548,
  0.015924206,
  -0.04319

In [13]:
# Get a handle on the "rag" index and write every prepared record into the
# "ns1" namespace. The upsert response is the cell's displayed value.
index = pc.Index("rag")
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [14]:
# Show index statistics (dimension, per-namespace vector counts) to confirm
# the upsert landed.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}