In [8]:
# To install packages: pip install -r requirements.txt
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
import openai
from pinecone import Pinecone, ServerlessSpec

In [10]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [11]:
import json
data = json.load(open("reviews.json"))
data['reviews']

[{'professor': 'Dr. Sarah Johnson',
  'subject': 'Introduction to Psychology',
  'stars': 5,
  'review': 'Dr. Johnson is an amazing professor! Her lectures are clear, engaging, and she always encourages class participation.'},
 {'professor': 'Prof. Michael Smith',
  'subject': 'Advanced Calculus',
  'stars': 4,
  'review': 'The content is challenging, but Prof. Smith explains complex topics in a simple way. However, the exams are tough.'},
 {'professor': 'Dr. Emily Davis',
  'subject': 'Organic Chemistry',
  'stars': 3,
  'review': 'Dr. Davis is knowledgeable, but her lectures can be hard to follow. Be prepared to study a lot on your own.'},
 {'professor': 'Prof. Robert Brown',
  'subject': 'World History',
  'stars': 2,
  'review': 'Prof. Brown has a lot of knowledge, but his lectures are very monotonous. It’s easy to lose focus.'},
 {'professor': 'Dr. Laura Wilson',
  'subject': 'Sociology 101',
  'stars': 5,
  'review': 'Dr. Wilson is fantastic! She is approachable and makes sociolo

In [16]:
openai.api_key=os.getenv("OPENAI_API_KEY")

processed_data = []

# embeddings (searches how closesly related text is to one another)
for review in data['reviews']:
    response = openai.embeddings.create(
        input=review['review'],
        model="text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        "id": review['professor'],
        "metadata": {
            "review": review['review'],
            "subject": review["subject"],
            "stars": review["stars"]
        }
    })


In [17]:
processed_data[0]

{'values': [0.008644465,
  -0.0051777363,
  0.0016886515,
  0.05656461,
  0.013103817,
  -0.00445339,
  0.023036009,
  0.051127538,
  0.0037379859,
  0.0037260626,
  0.025587616,
  -0.01490425,
  -0.032002404,
  0.035364803,
  -0.0050763874,
  0.03956184,
  -0.026541488,
  0.008036371,
  0.03798795,
  0.05923545,
  0.021962903,
  -0.01777779,
  0.010319702,
  -0.023012161,
  -0.045404308,
  -0.03495941,
  -0.0034697093,
  0.022726001,
  0.033504754,
  -0.0037677945,
  0.08465614,
  0.0070705763,
  -0.031024687,
  -0.020877874,
  -0.024037573,
  0.027686134,
  -0.0038482773,
  -0.010248162,
  -0.004128477,
  0.00083314755,
  -0.009192942,
  -0.014331927,
  0.0053387024,
  0.0019748132,
  0.037892565,
  0.00086668215,
  -0.008429844,
  -0.040277246,
  0.04871901,
  0.035960976,
  -0.028878475,
  -0.000845071,
  0.044736598,
  -0.028568467,
  -0.036938693,
  -0.009544682,
  0.016501985,
  0.0450943,
  -0.019697458,
  -0.018636275,
  0.03598482,
  -0.004548777,
  -0.01978092,
  -0.00198673

In [19]:
index = pc.Index('rag')
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 20}

In [20]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}