In [21]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [22]:
# Connect to Pinecone and (re)create the "rag" index from scratch so the
# notebook always starts from an empty index.
INDEX_NAME = "rag"

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Only drop the index when it is actually there: an unconditional delete fails
# on the very first run (or after a manual cleanup), which would stop a
# "Restart & Run All" at this cell.
if INDEX_NAME in pc.list_indexes().names():
    pc.delete_index(name=INDEX_NAME)

pc.create_index(
    name=INDEX_NAME,
    dimension=1536,  # must equal the length of the vectors upserted into this index
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

In [23]:
import json

# Load the review fixtures from disk. The context manager closes the file
# handle as soon as parsing is done (a bare open() leaves it dangling for the
# lifetime of the kernel), and the explicit encoding keeps the read
# independent of the platform's default locale.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the parsed reviews as the cell output.
data['reviews']

[{'professor': 'Dr. John Smith',
  'subject': 'Mathematics',
  'stars': 4,
  'review': 'Dr. Smith explains difficult concepts very clearly and is always willing to help during office hours.'},
 {'professor': 'Dr. Emily Johnson',
  'subject': 'Physics',
  'stars': 5,
  'review': 'Excellent professor! Makes learning physics fun and engaging.'},
 {'professor': 'Dr. Michael Brown',
  'subject': 'Chemistry',
  'stars': 3,
  'review': 'Dr. Brown has a lot of knowledge, but his lectures can be a bit dry at times.'},
 {'professor': 'Dr. Sarah Davis',
  'subject': 'Biology',
  'stars': 4,
  'review': 'Very knowledgeable and approachable. I learned a lot in her class.'},
 {'professor': 'Dr. William Wilson',
  'subject': 'Computer Science',
  'stars': 2,
  'review': "The course material was interesting, but Dr. Wilson's teaching style left much to be desired."},
 {'professor': 'Dr. Olivia Martin',
  'subject': 'English',
  'stars': 5,
  'review': 'Dr. Martin is passionate about literature and it 

In [25]:
# Turn every review into a Pinecone-ready record: the embedding of the review
# text as "values", the professor's name as "id", and the remaining review
# fields as metadata.
# NOTE(review): ids are professor names, so two reviews for the same professor
# would share an id — confirm names are unique in reviews.json.
client = OpenAI()
reviews = data['reviews']

processed_data = []
if reviews:  # nothing to embed -> skip the API request altogether
    # One batched request for all review texts instead of one blocking network
    # round trip per review.
    # NOTE(review): the API caps how many inputs a single request may carry;
    # chunk this list if reviews.json grows large.
    response = client.embeddings.create(
        input=[review['review'] for review in reviews],
        model="text-embedding-3-small",
    )
    # Every returned item reports the position of its input text in `.index`;
    # sorting on it makes the review/embedding pairing explicit.
    embeddings = sorted(response.data, key=lambda item: item.index)
    processed_data = [
        {
            "values": item.embedding,
            "id": review["professor"],
            "metadata": {
                "review": review['review'],
                "subject": review['subject'],
                "stars": review['stars'],
            },
        }
        for review, item in zip(reviews, embeddings)
    ]

In [26]:
# Spot-check the first record built above before sending anything to Pinecone.
first_record = processed_data[0]
first_record

{'values': [-0.03540482,
  0.012937372,
  -0.00058269897,
  0.026210947,
  0.023456663,
  -0.0048167626,
  0.016758455,
  0.01653863,
  -0.0023695885,
  -0.0291204,
  0.0066400203,
  -0.0047456427,
  -0.0076357,
  -0.029663498,
  0.03863755,
  0.026081637,
  -0.0029757246,
  0.010196019,
  -0.0029676429,
  0.038585823,
  0.034551382,
  0.0032359592,
  0.035327237,
  -0.010409379,
  -0.045594376,
  -0.0026702322,
  0.018038614,
  0.038042724,
  0.014534338,
  -0.011540834,
  0.064861424,
  -0.01739207,
  0.01713345,
  -0.0021239014,
  -0.06377523,
  0.03667205,
  -0.02068945,
  0.017728273,
  0.0068469145,
  0.0017569868,
  0.019629115,
  0.03625826,
  -0.031577274,
  -0.0060322676,
  0.029611776,
  -0.00777794,
  -0.020327384,
  -0.036206536,
  0.021245478,
  0.03137038,
  -0.04763746,
  0.028370408,
  0.033180706,
  -0.023767006,
  -0.052732233,
  0.027853172,
  0.024581652,
  0.025810089,
  0.019486876,
  -0.031447962,
  0.059740786,
  0.0056799003,
  -0.017017072,
  -0.023379078,
  

In [27]:
# Open a handle to the "rag" index and write all prepared records into the
# "ns1" namespace; the response from the upsert call is the cell output.
index = pc.Index('rag')
upsert_response = index.upsert(vectors=processed_data, namespace="ns1")
upsert_response

{'upserted_count': 20}

In [28]:
# Fetch the index statistics and show them as the cell output, to check the
# vector counts after the upsert.
index_stats = index.describe_index_stats()
index_stats

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}