In [1]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm


In [3]:
# Connect to Pinecone; the API key is read from the PINECONE_API_KEY
# environment variable (load_dotenv() runs in the imports cell).
pinecone_ = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
index_name = 'rag'
namespace = ''  # empty-string namespace; appears as '' in the index stats

# Create the index only when it is missing so that this cell survives
# "Restart Kernel -> Run All": create_index fails if the name is already taken.
if index_name not in pinecone_.list_indexes().names():
    pinecone_.create_index(
        name=index_name,
        dimension=1536,     # must match the length of the embeddings upserted later
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1')
    )

In [4]:
import json

# Load the sample reviews. The context manager closes the file handle
# deterministically, and the explicit encoding keeps non-ASCII characters
# in the review text (e.g. ’) intact regardless of the platform default.
with open('reviews_example.json', encoding='utf-8') as reviews_file:
    data = json.load(reviews_file)

# Display the parsed list of review dicts as the cell output.
data['reviews']

[{'professor': 'Dr. Emily Carter',
  'subject': 'Physics',
  'stars': 5,
  'review': 'Amazing professor! She explains complex concepts in a very simple way and is always available for questions.'},
 {'professor': 'Dr. John Smith',
  'subject': 'History',
  'stars': 3,
  'review': 'His lectures are informative but can be a bit dry. Be prepared to read a lot.'},
 {'professor': 'Dr. Samantha Lee',
  'subject': 'Mathematics',
  'stars': 4,
  'review': 'Very knowledgeable and patient. Sometimes goes too fast, but she is willing to help after class.'},
 {'professor': 'Dr. Robert Jones',
  'subject': 'Chemistry',
  'stars': 2,
  'review': 'Difficult to follow his lectures, and his exams are extremely tough. Not the best for beginners.'},
 {'professor': 'Dr. Angela Martinez',
  'subject': 'Biology',
  'stars': 5,
  'review': 'One of the best professors I’ve ever had! She makes the subject fascinating and engages everyone in class.'},
 {'professor': 'Dr. Mark Davis',
  'subject': 'Philosophy',


In [5]:
# Build one Pinecone record per review: the embedding of the review text as
# `values`, the professor name as `id`, and the remaining fields as metadata.
process_data = []
client = OpenAI()

# The embeddings endpoint accepts a list of inputs, so send the reviews in
# batches instead of making one network round trip per review.
EMBED_BATCH_SIZE = 100
reviews = data['reviews']

for start in range(0, len(reviews), EMBED_BATCH_SIZE):
    batch = reviews[start:start + EMBED_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review['review'] for review in batch],
        model='text-embedding-3-small',
    )
    # Each returned item carries the position of its input; sort on it so the
    # pairing with `batch` does not depend on the response ordering.
    embedded_items = sorted(response.data, key=lambda item: item.index)
    for review, item in zip(batch, embedded_items):
        process_data.append({
            'values': item.embedding,
            # NOTE(review): the professor name is used as the vector id, so two
            # reviews for the same professor would share an id — confirm the
            # dataset has one review per professor.
            'id': review['professor'],
            'metadata': {
                'review': review['review'],
                'subject': review['subject'],
                'stars': review['stars']
            }
        })

In [7]:
# Open the index by the same `index_name` used when it was created, rather than
# repeating the literal, so renaming the index only needs one edit.
index = pinecone_.Index(index_name)

# Write all prepared records into the configured namespace; the call's return
# value is left as the last expression so the notebook displays it.
index.upsert(
    vectors=process_data,
    namespace=namespace
)

{'upserted_count': 20}

In [8]:
# Sanity check after the upsert: display the index statistics (dimension,
# per-namespace vector counts, total vector count) as the cell output.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 20}},
 'total_vector_count': 20}