In [6]:
from dotenv import load_dotenv
load_dotenv()
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
import os

In [None]:
# Connect to Pinecone using the key from the environment (.env is loaded in
# the imports cell) and make sure the serverless "rag" index exists.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Only create the index when it is missing, so this cell can be re-run
# (e.g. Restart & Run All) without create_index failing on an existing name.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,  # must equal the length of the vectors upserted into this index
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [13]:
import json

# Load the review dataset. The context manager closes the file handle right
# after parsing instead of leaving it open for the life of the kernel, and the
# explicit encoding avoids depending on the platform default.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# The file is expected to hold a top-level "reviews" list; display it.
data['reviews']

[{'professor': 'Dr. John Smith',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Great professor! His lectures are clear and engaging. Highly recommend.'},
 {'professor': 'Dr. Emily Johnson',
  'subject': 'Mathematics',
  'stars': 4,
  'review': 'Good explanations, but sometimes the material is too advanced.'},
 {'professor': 'Dr. Mark Williams',
  'subject': 'Physics',
  'stars': 3,
  'review': 'The professor is knowledgeable, but his lectures can be a bit dry.'},
 {'professor': 'Dr. Susan Brown',
  'subject': 'Chemistry',
  'stars': 4,
  'review': 'She explains concepts well but can be strict with grading.'},
 {'professor': 'Dr. Michael Davis',
  'subject': 'Biology',
  'stars': 5,
  'review': 'Fantastic lecturer and very approachable. I learned a lot!'},
 {'professor': 'Dr. Linda Wilson',
  'subject': 'History',
  'stars': 2,
  'review': 'Lectures are too fast-paced and difficult to follow.'},
 {'professor': 'Dr. James Lee',
  'subject': 'Economics',
  'stars': 3,
  'rev

In [9]:
# Embed every review text and build the records that will be upserted into
# Pinecone: {"values": <embedding>, "id": <professor>, "metadata": {...}}.
EMBEDDING_MODEL = "text-embedding-3-small"
EMBEDDING_BATCH_SIZE = 100  # reviews sent per embeddings request

processed_data = []
client = OpenAI()

all_reviews = data['reviews']
for start in range(0, len(all_reviews), EMBEDDING_BATCH_SIZE):
    batch = all_reviews[start:start + EMBEDDING_BATCH_SIZE]

    # One request per batch instead of one request per review: the embeddings
    # endpoint accepts a list of input strings.
    response = client.embeddings.create(
        input=[review['review'] for review in batch],
        model=EMBEDDING_MODEL,
    )

    # Each returned item carries the position of its input in `index`; sort on
    # it so embeddings are paired with the right review regardless of the
    # order the items come back in.
    embedded_items = sorted(response.data, key=lambda item: item.index)

    for review, item in zip(batch, embedded_items):
        processed_data.append(
            {
                "values": item.embedding,
                # The professor name is the vector id, so two reviews for the
                # same professor would share an id.
                "id": review["professor"],
                "metadata": {
                    "review": review["review"],
                    "subject": review["subject"],
                    "stars": review["stars"],
                },
            }
        )

In [None]:
# Preview the first record, showing only the first few embedding values so the
# cell output is not flooded with the full vector.
{**processed_data[0], "values": processed_data[0]["values"][:5]}

In [11]:
# Open a handle to the "rag" index and write every prepared record into the
# "ns1" namespace; the upsert response is the cell's displayed result.
index = pc.Index("rag")
index.upsert(vectors=processed_data, namespace="ns1")

{'upserted_count': 20}

In [12]:
# Display the index statistics (dimension, per-namespace vector counts and the
# total vector count) to confirm the upsert landed in the expected namespace.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}