In [9]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [6]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [8]:
import json
data = json.load(open("reviews.json"))
data["reviews"]

[{'professor': 'Dr. John Smith',
  'subject': 'Data Structures',
  'stars': 5,
  'review': 'Dr. Smith explains complex topics very clearly and is always available for extra help.'},
 {'professor': 'Dr. Emily Davis',
  'subject': 'Thermodynamics',
  'stars': 4,
  'review': 'Knowledgeable and engaging, but sometimes hard to follow during lectures.'},
 {'professor': 'Dr. Michael Johnson',
  'subject': 'Calculus II',
  'stars': 3,
  'review': 'Good professor, but grading is very tough and the exams are difficult.'},
 {'professor': 'Dr. Linda Martinez',
  'subject': 'Introduction to Psychology',
  'stars': 4,
  'review': 'Her lectures are interesting and she encourages student participation.'},
 {'professor': 'Dr. Robert Lee',
  'subject': 'Physics I',
  'stars': 2,
  'review': 'The class is very challenging and the professor is not very approachable.'},
 {'professor': 'Dr. Jennifer Wilson',
  'subject': 'Organic Chemistry',
  'stars': 5,
  'review': "One of the best professors I've ever ha

In [13]:
processed_data = []
client = OpenAI()

for review in data["reviews"]:
    response = client.embeddings.create(
        input = review["review"],
        model="text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        "id": review["professor"],
        "metadata": {
            "reveiew": review["review"],
            "subject": review["subject"],
            "stars": review["stars"]
        }
    })
    


In [14]:
processed_data[0]

{'values': [-0.031023651361465454,
  0.0026497626677155495,
  -0.01803441345691681,
  0.01898212544620037,
  0.02115629054605961,
  0.003370999824255705,
  0.02355344593524933,
  0.03236159682273865,
  0.00455738278105855,
  -0.028765864670276642,
  0.023692814633250237,
  -0.017156384885311127,
  -0.017128512263298035,
  -0.011902155354619026,
  0.028361693024635315,
  0.024292103946208954,
  -0.026981934905052185,
  -0.002329212846234441,
  -0.008236736990511417,
  0.047970980405807495,
  0.0234837606549263,
  -0.00995098240673542,
  0.04239620268344879,
  -0.012320263311266899,
  -0.04019416496157646,
  -0.015330645255744457,
  0.019386297091841698,
  0.01931661181151867,
  0.015874186530709267,
  -0.008215831592679024,
  0.06293926388025284,
  -0.02135140635073185,
  -0.0039511253125965595,
  0.008626972325146198,
  -0.04457036405801773,
  0.015219149179756641,
  -0.020389758050441742,
  0.019037874415516853,
  -0.0032734409905970097,
  0.010703577660024166,
  0.026563826948404312,

In [15]:
index = pc.Index('rag')
index.upsert(
    vectors=processed_data,
    namespace="nsl"
)

{'upserted_count': 20}

In [16]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'nsl': {'vector_count': 20}},
 'total_vector_count': 20}