In [8]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [5]:
# Create a Pinecone index
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [7]:
# Load the Data
import json
data = json.load(open("reviews.json"))
data['reviews']

[{'professor': 'Dr. John Smith',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Dr. Smith is very knowledgeable and makes the subject easy to understand.'},
 {'professor': 'Prof. Emily Brown',
  'subject': 'Mathematics',
  'stars': 4,
  'review': 'Prof. Brown is thorough but can be a bit fast-paced.'},
 {'professor': 'Dr. Alan Green',
  'subject': 'Physics',
  'stars': 3,
  'review': 'Dr. Green is okay, but his lectures can be dry.'},
 {'professor': 'Prof. Laura White',
  'subject': 'Chemistry',
  'stars': 4,
  'review': 'Prof. White explains concepts well, but her tests are tough.'},
 {'professor': 'Dr. Robert Black',
  'subject': 'Biology',
  'stars': 5,
  'review': 'Dr. Black is amazing! His passion for biology is contagious.'},
 {'professor': 'Prof. Linda Grey',
  'subject': 'English Literature',
  'stars': 4,
  'review': 'Prof. Grey is insightful but sometimes expects too much.'},
 {'professor': 'Dr. William Blue',
  'subject': 'History',
  'stars': 3,
  'review': 'Dr

In [10]:
# Process the data
processed_data = []
client = OpenAI()

for review in data['reviews']:
    response = client.embeddings.create(
        input=review['review'],
        model="text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        "id": review["professor"], 
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"]
        }
    })

In [11]:
# Display the processed data
processed_data[0]

{'values': [0.0021122096,
  -0.009777354,
  -0.0470936,
  0.019188222,
  0.029188085,
  0.0013056105,
  0.0007112959,
  0.017866248,
  0.009338878,
  -0.03738169,
  0.014764197,
  -0.027093872,
  -0.013704002,
  0.008586271,
  0.057486128,
  0.0041655195,
  -0.021387143,
  -0.008965847,
  0.029947236,
  0.027408004,
  0.0705226,
  -0.010366351,
  0.023049425,
  -0.051962644,
  -0.028193334,
  -0.04476379,
  0.025483947,
  0.02968546,
  -0.008553549,
  -0.00068757246,
  0.06471116,
  -0.013141182,
  0.00544168,
  0.020025905,
  -0.016832232,
  0.010510328,
  -0.029161908,
  0.0051144594,
  -0.019253666,
  -0.002941714,
  0.042355444,
  0.00859936,
  -0.04363815,
  -0.009495944,
  0.047695685,
  -0.011740678,
  -0.0094305,
  -0.022565138,
  0.018363625,
  0.05198882,
  -0.06073216,
  0.03125612,
  0.04442348,
  -0.05081083,
  -0.05222442,
  0.049030747,
  0.030941987,
  0.04643916,
  0.007499898,
  -0.057067286,
  0.06450174,
  -0.003354012,
  -0.00904438,
  -0.029057197,
  -0.029633105,

In [12]:
# Insert the data into the Pinecone index
index = pc.Index('rag')
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 20}

In [13]:
#Display your Pinecone Index Stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}