In [28]:
from dotenv import load_dotenv
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [29]:
# Load environment variables
load_dotenv()

# Initialize Pinecone instance
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create a Pinecone index if it doesn't already exist
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [30]:
import json
data = json.load(open("reviews.json"))
data['professors']

[{'name': 'Dr. Jane Smith',
  'department': 'Computer Science',
  'courses_taught': [{'course_code': 'CS101',
    'course_name': 'Introduction to Computer Science',
    'semester': 'Fall 2023',
    'rating': 4.8,
    'difficulty': 3.0,
    'attendance_required': True,
    'review': 'Dr. Smith is an excellent professor who explains concepts clearly. Her lectures are engaging, and she is always available for extra help.'},
   {'course_code': 'CS202',
    'course_name': 'Data Structures',
    'semester': 'Spring 2024',
    'rating': 4.5,
    'difficulty': 4.0,
    'attendance_required': True,
    'review': 'The course is challenging, but Dr. Smith makes it manageable. Her assignments are tough but fair, and she provides great feedback.'}]},
 {'name': 'Dr. John Doe',
  'department': 'Mathematics',
  'courses_taught': [{'course_code': 'MATH201',
    'course_name': 'Calculus I',
    'semester': 'Fall 2023',
    'rating': 3.7,
    'difficulty': 4.5,
    'attendance_required': False,
    'revi

In [36]:
processed_data = []
client = OpenAI()

# Use a valid model name
model_name = "text-embedding-ada-002"  # or another valid model

# Iterate over each professor in the JSON data
for professor in data['professors']:
    # Iterate over each course the professor has taught
    for course in professor['courses_taught']:
        # Create an embedding for the course review
        response = client.embeddings.create(
            input=course['review'],
            model=model_name
        )
        embedding = response.data[0].embedding
        
        # Append the processed data with embedding and metadata
        processed_data.append({
            "values": embedding,
            "id": professor["name"],  # Use the professor's name as the ID
            "metadata": {
                "review": course["review"],
                "subject": course["course_name"],  # Use the course name as the subject
            }
        })

# 'procesed_data' now contains the embeddings and metadata
    

In [37]:
processed_data[0]

{'values': [-0.015942147,
  0.01887818,
  0.0029038403,
  -0.035515703,
  0.0128451465,
  0.02270275,
  -0.0007987975,
  -0.0036346293,
  -0.020835536,
  -0.024209399,
  -0.014113565,
  0.010037887,
  0.0037086739,
  0.017397286,
  0.00017313987,
  -0.019586435,
  0.024788879,
  0.007835861,
  0.014062055,
  -0.007958196,
  -0.034305234,
  0.0019782814,
  -0.005105865,
  0.005054356,
  -0.01713974,
  0.012104699,
  0.01408781,
  -0.0051670326,
  -0.014602903,
  -0.00024909605,
  0.0024048432,
  0.003525172,
  -0.032013066,
  -0.032399386,
  0.007822984,
  0.0024483043,
  -0.017435918,
  0.0094455285,
  0.0010543322,
  0.0030293942,
  0.0054310183,
  -0.016779173,
  0.007835861,
  0.0068185516,
  -0.011911539,
  -0.0066318302,
  -0.0040434846,
  -0.033815894,
  -0.018182805,
  0.010514348,
  0.0133924335,
  0.019045586,
  -0.014461253,
  -0.019354641,
  -0.0060877623,
  -0.013141326,
  -0.012915972,
  0.017925257,
  0.012130454,
  -0.019985631,
  -0.0031678258,
  -0.011293427,
  -0.0234

In [38]:
index = pc.Index('rag')
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 8}

In [39]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 4}},
 'total_vector_count': 4}