In [1]:
from dotenv import load_dotenv
load_dotenv()
from pinecone import Pinecone, ServerlessSpec
import os
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm


In [6]:
# Connect to Pinecone and make sure the "rag" index exists.
api_key = os.getenv("PINECONE_API_KEY")
if not api_key:
    # Fail early with a clear message instead of an opaque client error later.
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it or add it to your .env file"
    )
pc = Pinecone(api_key=api_key)

# Creating an index that already exists raises an error, so only create it
# when it is missing. This keeps the cell safe under Restart & Run All.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=384,  # must equal the length of the vectors upserted later
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [2]:
import json

# Load the review records from disk. The encoding is given explicitly so the
# file is decoded the same way on every platform rather than with the
# locale-dependent default.
with open("reviews.json", "r", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Last expression of the cell: display the list stored under "reviews".
data["reviews"]

[{'professor': 'Dr. Emily Watson',
  'subject': 'Computer Science',
  'stars': 5,
  'review': "Dr. Watson's lectures are engaging and insightful. She makes complex topics easy to understand."},
 {'professor': 'Dr. James Miller',
  'subject': 'Mathematics',
  'stars': 4,
  'review': 'Great at explaining concepts, but sometimes the pace is a bit fast. Overall, very knowledgeable.'},
 {'professor': 'Dr. Olivia Rodriguez',
  'subject': 'Physics',
  'stars': 5,
  'review': 'Dr. Rodriguez is an excellent professor. Her enthusiasm for physics is contagious!'},
 {'professor': 'Dr. Michael Lee',
  'subject': 'Chemistry',
  'stars': 3,
  'review': 'Dr. Lee knows his stuff, but his lectures can be a bit dry. More interactive sessions would help.'},
 {'professor': 'Dr. Sarah Johnson',
  'subject': 'Biology',
  'stars': 5,
  'review': 'Dr. Johnson is amazing! She makes the subject very interesting and is always willing to help.'},
 {'professor': 'Dr. William Brown',
  'subject': 'History',
  'stars

In [8]:
import numpy as np

# Build the records to upsert: one embedding per review, keyed by professor.

# Load the embedding model once. Constructing it inside the loop would
# re-initialise the model for every single review.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Encode every review text in a single batched call; row i of the result
# corresponds to data["reviews"][i].
review_texts = [review["review"] for review in data["reviews"]]
embeddings = model.encode(review_texts)

processed_data = []
for review, embedding in zip(data["reviews"], embeddings):
    # NOTE(review): the professor name is used as the record id, so two reviews
    # for the same professor would share an id. Confirm names are unique in
    # reviews.json.
    processed_data.append({
        "values": embedding.tolist(),  # plain Python floats instead of a numpy array
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"]
        }
    })



In [9]:
# Inspect the records built for upsert (last expression, so the notebook displays it).
# NOTE(review): this shows every record including its full embedding vector;
# consider displaying a slice or a summary to keep the output short.
processed_data

[{'values': [-0.0028287468012422323,
   -0.08071242272853851,
   0.03957352787256241,
   0.04409708455204964,
   -0.08826851844787598,
   0.03207426518201828,
   -0.0007590816239826381,
   0.07068163901567459,
   0.011872401461005211,
   -0.021874986588954926,
   -0.03573675453662872,
   0.12074890732765198,
   -0.02029818296432495,
   0.03645506873726845,
   -0.00886467844247818,
   0.05068610608577728,
   0.026335546746850014,
   -0.07412591576576233,
   -0.0025502443313598633,
   -0.051607754081487656,
   0.001123569323681295,
   0.04344811290502548,
   0.09303796291351318,
   0.014967422932386398,
   -0.05216773971915245,
   -0.0008958404650911689,
   8.169417560566217e-05,
   -0.08210095018148422,
   0.04093453660607338,
   -0.022892218083143234,
   -0.05926256254315376,
   0.05527882277965546,
   -0.02685311809182167,
   0.06324762850999832,
   -0.025810623541474342,
   0.03776577487587929,
   0.016998732462525368,
   0.07555349171161652,
   0.045942436903715134,
   0.01959993503

In [10]:
# Get a handle to the "rag" index and write all prepared records into the
# "ns1" namespace. The upsert call is the cell's last expression, so its
# response is displayed as the cell output.
index = pc.Index("rag")
index.upsert(vectors=processed_data, namespace="ns1")

{'upserted_count': 20}

In [12]:
# Display the statistics Pinecone reports for the index (last expression, so
# the notebook renders the returned value).
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}