In [1]:
# load_dotenv() is invoked right after its import, ahead of the remaining
# imports, so that whatever it loads into the environment (python-dotenv reads
# a .env file) is already in place before the client libraries are imported
# and before later os.getenv(...) lookups such as PINECONE_API_KEY.
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm


In [2]:
# Connect to Pinecone with the API key taken from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only when it does not exist yet. Without this guard,
# re-running the cell (e.g. Restart Kernel -> Run All) fails because
# create_index is called for an index that is already present.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,  # vector length of the index; upserted embeddings must have this size
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [3]:
import json

# Read the reviews file inside a context manager so the file handle is closed
# as soon as parsing finishes, and decode it explicitly as UTF-8 (the JSON
# standard encoding) instead of depending on the platform default.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the list stored under the "reviews" key.
data["reviews"]

[{'professor': 'Dr. John Smith',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Excellent professor! His lectures are engaging and very informative.'},
 {'professor': 'Dr. Emily White',
  'subject': 'Mathematics',
  'stars': 4,
  'review': 'Great teacher but sometimes goes too fast. Overall, learned a lot.'},
 {'professor': 'Dr. Michael Brown',
  'subject': 'Physics',
  'stars': 3,
  'review': 'Decent professor, but his grading is very strict.'},
 {'professor': 'Dr. Linda Johnson',
  'subject': 'Chemistry',
  'stars': 5,
  'review': 'Very approachable and always willing to help. Highly recommend!'},
 {'professor': 'Dr. William Davis',
  'subject': 'History',
  'stars': 4,
  'review': 'Interesting lectures, but the exams are tough.'},
 {'professor': 'Dr. Jennifer Wilson',
  'subject': 'Biology',
  'stars': 2,
  'review': "The material was difficult to follow, and the professor wasn't very helpful."},
 {'professor': 'Dr. Robert Martinez',
  'subject': 'English',
  'stars': 5

In [4]:
client = OpenAI()


def _to_vector_record(entry):
    """Embed one review entry and return the record that will be upserted.

    The review text is sent to the embeddings endpoint; the first returned
    embedding becomes "values", the professor name becomes "id", and the
    review text, subject and star rating are kept as "metadata".
    """
    embedding_response = client.embeddings.create(
        input=entry["review"],
        model="text-embedding-3-small"
    )
    return {
        "values": embedding_response.data[0].embedding,
        "id": entry["professor"],
        "metadata": {field: entry[field] for field in ("review", "subject", "stars")},
    }


# One embeddings request per review, in the order the reviews appear in the file.
processed_data = [_to_vector_record(entry) for entry in data["reviews"]]

In [5]:
# Sanity check: display the first prepared record ("values", "id", "metadata").
# Note that this prints the whole embedding list stored under "values", so the
# cell output is long.
processed_data[0]

{'values': [-0.018094143,
  -0.029851275,
  -0.054270823,
  0.018337874,
  -0.022643793,
  -0.009818887,
  0.010735781,
  0.044359088,
  -0.007752975,
  -0.008437743,
  -0.009110906,
  -0.032474287,
  -0.0026824945,
  0.0041608405,
  0.02289913,
  0.0128133,
  -0.015888955,
  -0.026694374,
  0.022411669,
  0.035886522,
  0.013416825,
  -0.036095437,
  0.021587625,
  0.0310119,
  -0.07423357,
  -0.0625809,
  -0.0043146233,
  0.04280385,
  0.02906205,
  0.020160055,
  0.079247475,
  -0.005954006,
  -0.0170844,
  -0.036350776,
  -0.020078812,
  0.020380575,
  -0.016689787,
  0.03270641,
  -0.012151743,
  0.01643445,
  0.041944984,
  -0.023328561,
  -0.018860156,
  0.012964181,
  0.060584623,
  -0.040250473,
  -0.00206156,
  -0.0415968,
  0.054177973,
  0.032845687,
  -0.039879072,
  0.014136412,
  0.0497676,
  0.017815594,
  -0.043430585,
  0.021413531,
  0.00084290386,
  0.034749113,
  -0.008501578,
  -0.034749113,
  0.037627462,
  0.009923344,
  0.0014783461,
  0.0041347262,
  -0.027808

In [6]:
# Get a handle to the "rag" index and write every prepared record into the
# "ns1" namespace in a single upsert call.
index = pc.Index("rag")
upsert_result = index.upsert(namespace="ns1", vectors=processed_data)

# Show the response returned by the upsert call as the cell output.
upsert_result

{'upserted_count': 20}

In [7]:
# Display the index statistics to check the result of the upsert, e.g. the
# per-namespace vector counts and the index dimension.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}