In [14]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [10]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name = "rag", dimension = 1536, metric = "cosine", spec = ServerlessSpec(cloud = "aws", region = "us-east-1")

)

In [12]:
import json
data = json.load(open("reviews.json"))
data['reviews']


[{'professor': 'Dr. Jane Smith',
  'subject': 'Introduction to Computer Science',
  'stars': 5,
  'review': 'Dr. Smith is an amazing professor! Her lectures are clear and sheâ€™s always willing to help.'},
 {'professor': 'Dr. John Doe',
  'subject': 'Advanced Algorithms',
  'stars': 4,
  'review': 'Great class! Dr. Doe explains complex topics well, but the workload is heavy.'},
 {'professor': 'Dr. Emily Johnson',
  'subject': 'Database Systems',
  'stars': 3,
  'review': 'Good professor, but the lectures can be a bit dry at times.'},
 {'professor': 'Dr. Michael Brown',
  'subject': 'Software Engineering',
  'stars': 5,
  'review': 'Dr. Brown is an excellent professor! He makes sure everyone understands the material.'},
 {'professor': 'Dr. Sarah Davis',
  'subject': 'Operating Systems',
  'stars': 4,
  'review': 'Challenging course but Dr. Davis is very supportive.'},
 {'professor': 'Dr. Robert Wilson',
  'subject': 'Discrete Mathematics',
  'stars': 2,
  'review': 'The lectures were co

In [16]:
processed_data = []
client = OpenAI()

for review in data['reviews']:
    response = client.embeddings.create(
        input = review['review'],
        model = "text-embedding-3-small",
    )

    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        "id": review['professor'],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"]
        }
    })

In [17]:
processed_data[0]

{'values': [-0.011399935,
  0.000553636,
  -0.019399172,
  0.038143914,
  0.005895712,
  0.0032575417,
  0.01909533,
  0.002180946,
  -0.008495903,
  -0.03463804,
  0.0029946011,
  -0.023033595,
  -0.018394154,
  0.00475046,
  0.010564368,
  0.033422668,
  -0.012527658,
  0.0074733556,
  0.00067889795,
  0.042537943,
  0.027883388,
  -0.011388249,
  0.020754777,
  -0.03529247,
  -0.012282247,
  -0.031693105,
  0.005016322,
  0.034427688,
  0.015811494,
  0.009676213,
  0.066658355,
  -0.003327659,
  -0.0047475384,
  -0.00697669,
  -0.037279133,
  0.030501107,
  -0.033282433,
  -0.0026630037,
  -0.0076544923,
  -0.0021648775,
  0.006024261,
  0.027088722,
  -0.0060593197,
  -0.0016258493,
  0.04232759,
  -0.015449219,
  -0.022788184,
  0.00060293736,
  0.019960111,
  0.043963663,
  -0.039312538,
  0.015799807,
  0.03223067,
  -0.021759793,
  -0.06418087,
  0.03353953,
  0.04055128,
  0.054177444,
  0.017926704,
  -0.02912213,
  0.020392504,
  0.020544425,
  -0.02279987,
  -0.02715884,
 

In [18]:
index = pc.Index('rag')
index.upsert(
    vectors = processed_data,
    namespace = "ns1",
    
)

{'upserted_count': 20}

In [19]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}