In [15]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [16]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
try:
    pc.create_index(
        name="rag", 
        metric="cosine",
        spec = ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
        # dimension=768,
        dimension=1536
    )
    print("Index created.")
except Exception as e:
    print(f"Error creating index: {e}. If the index already exists, you can ignore this message.")

Index created.


In [17]:
import json
with open("reviews.json") as file:
    data = json.load(file)
data['reviews']

[{'professor': 'Dr. Emily Hartwell',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Engaging lectures and challenging assignments. Dr. Hartwell really knows her stuff!'},
 {'professor': 'Prof. Michael Chang',
  'subject': 'Literature',
  'stars': 5,
  'review': "Absolutely brilliant! Prof. Chang's passion for literature is contagious."},
 {'professor': 'Dr. Sarah Johnson',
  'subject': 'Biology',
  'stars': 3,
  'review': 'Decent instructor, but the course material could be more organized.'},
 {'professor': 'Prof. Robert Garcia',
  'subject': 'History',
  'stars': 4,
  'review': 'Fascinating insights into historical events. Prof. Garcia makes history come alive.'},
 {'professor': 'Dr. Lisa Thompson',
  'subject': 'Chemistry',
  'stars': 2,
  'review': 'Difficult to follow lectures. More office hours would be helpful.'},
 {'professor': 'Prof. David Wilson',
  'subject': 'Mathematics',
  'stars': 5,
  'review': 'Exceptional teacher. Prof. Wilson breaks down complex concepts with ease

In [18]:
processed_data = []
client = OpenAI()
for review in data['reviews']:
    response = client.embeddings.create(
        input=review['review'],
        model="text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        "id": review['professor'],
        "metadata": {
            "review": review['review'],
            "subject": review['subject'],
            "stars": review['stars']
        }
    })

In [19]:
processed_data[0]

{'values': [-0.020235642790794373,
  -0.03564247488975525,
  0.015353476628661156,
  0.052796751260757446,
  0.04135167598724365,
  -0.009510884992778301,
  0.011325023137032986,
  0.03449530154466629,
  0.003188080620020628,
  0.008183630183339119,
  0.03396173194050789,
  0.02411736361682415,
  -0.01575365476310253,
  -0.014966638758778572,
  0.017981309443712234,
  0.026971962302923203,
  -0.011531781405210495,
  0.0010971532901749015,
  0.020875925198197365,
  0.03548240289092064,
  0.03542904928326607,
  -0.03428187221288681,
  0.05130275338888168,
  -0.006996436510235071,
  -0.03516226261854172,
  -0.045620232820510864,
  0.009510884992778301,
  -0.004011779557913542,
  -0.013652722351253033,
  0.013499320484697819,
  0.0666428953409195,
  -0.0209026038646698,
  0.017447737976908684,
  -0.024824343621730804,
  -0.014966638758778572,
  0.09081361442804337,
  -0.005375717766582966,
  0.04185856506228447,
  0.029559778049588203,
  0.02963981404900551,
  0.03638947382569313,
  -0.020

In [20]:
index = pc.Index('rag')
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 20}

In [22]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}