In [8]:
from dotenv import load_dotenv
load_dotenv()
import os
from groq import Groq
from pinecone import Pinecone, ServerlessSpec

In [9]:
# Connect to Pinecone and make sure the serverless index used by this notebook exists.
INDEX_NAME = "rag"
# The vectors written to this index later in the notebook are 768-dimensional.
# Alternative value kept from the original cell: dimension=1536
EMBEDDING_DIMENSION = 768

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Check for the index explicitly instead of catching every exception: a blanket
# `except Exception` also hides real failures (bad API key, quota, network),
# which would then only surface later as confusing errors in other cells.
if INDEX_NAME in pc.list_indexes().names():
    print(f"Index '{INDEX_NAME}' already exists; skipping creation.")
else:
    pc.create_index(
        name=INDEX_NAME,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print("Index created.")

Index created.


In [10]:
import json

# Load the professor reviews. JSON text is UTF-8, so pin the encoding instead of
# relying on the platform default (which is not UTF-8 on every OS).
with open("reviews.json", encoding="utf-8") as file:
    data = json.load(file)

# data['reviews'] is a list of dicts with 'professor', 'subject', 'stars' and 'review' keys.
data['reviews']

[{'professor': 'Dr. Emily Hartwell',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Engaging lectures and challenging assignments. Dr. Hartwell really knows her stuff!'},
 {'professor': 'Prof. Michael Chang',
  'subject': 'Literature',
  'stars': 5,
  'review': "Absolutely brilliant! Prof. Chang's passion for literature is contagious."},
 {'professor': 'Dr. Sarah Johnson',
  'subject': 'Biology',
  'stars': 3,
  'review': 'Decent instructor, but the course material could be more organized.'},
 {'professor': 'Prof. Robert Garcia',
  'subject': 'History',
  'stars': 4,
  'review': 'Fascinating insights into historical events. Prof. Garcia makes history come alive.'},
 {'professor': 'Dr. Lisa Thompson',
  'subject': 'Chemistry',
  'stars': 2,
  'review': 'Difficult to follow lectures. More office hours would be helpful.'},
 {'professor': 'Prof. David Wilson',
  'subject': 'Mathematics',
  'stars': 5,
  'review': 'Exceptional teacher. Prof. Wilson breaks down complex concepts with ease

In [11]:
import torch
from transformers import AutoTokenizer, AutoModel

EMBEDDING_MODEL_NAME = 'jinaai/jina-embeddings-v2-base-en'

# NOTE(review): trust_remote_code=True executes Python code shipped with the model
# repository. Only use it with sources you trust; consider pinning a revision.
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True)
embedding_model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True)


def embed_text(text):
    """Return a single embedding for ``text`` as a list of floats.

    The text is tokenized (truncated to at most 512 tokens), passed through
    ``embedding_model``, and the last hidden state is averaged over the token
    axis to obtain one vector.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Inference only: disable autograd so no computation graph is built or kept
    # alive for each review.
    with torch.no_grad():
        outputs = embedding_model(**inputs)
    # Mean over dim=1 (tokens), then drop the batch axis of size 1.
    return outputs.last_hidden_state.mean(dim=1).squeeze().tolist()


# One record per review in the {"values", "id", "metadata"} layout passed to upsert.
# NOTE(review): the professor name is used as the vector id, so two reviews for
# the same professor would share an id — confirm names are unique in reviews.json.
processed_data = [
    {
        "values": embed_text(review['review']),
        "id": review['professor'],
        "metadata": {
            "review": review['review'],
            "subject": review['subject'],
            "stars": review['stars'],
        },
    }
    for review in data['reviews']
]

In [12]:
# Disabled example: embed a free-text user query using the same tokenizer, model
# and mean pooling that the review-embedding cell uses. Uncomment to produce
# `query_embeddings` (presumably intended for querying the index — confirm).
# user_query = "user query"
# inputs = tokenizer(user_query, return_tensors="pt", padding=True, truncation=True, max_length=512)
# outputs = embedding_model(**inputs)

# # Generate the query embedding
# query_embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().tolist()

In [13]:
# Inspect the first prepared record (keys: "values", "id", "metadata").
# NOTE: this displays the complete embedding list, so the output is long.
processed_data[0]

{'values': [0.029133327305316925,
  -0.4263046681880951,
  0.5776762962341309,
  -0.3369669020175934,
  0.029599228873848915,
  -0.19629095494747162,
  0.0048916577361524105,
  -0.08580959588289261,
  -0.1606977880001068,
  0.11894181370735168,
  -0.3839860260486603,
  -0.5869916677474976,
  -0.6004518866539001,
  -0.22095468640327454,
  -0.33176133036613464,
  0.8952028751373291,
  0.04087953642010689,
  0.797085165977478,
  -0.27014070749282837,
  -0.27300703525543213,
  -0.4464336931705475,
  -0.11150935292243958,
  -0.2705315351486206,
  -0.6636171936988831,
  0.5554832220077515,
  0.3433932960033417,
  0.5688004493713379,
  0.6161620020866394,
  0.8291590213775635,
  0.20295991003513336,
  -0.2785990238189697,
  0.06164850294589996,
  -0.028354693204164505,
  0.2624891400337219,
  -0.6648006439208984,
  -0.2783375084400177,
  0.16980981826782227,
  -0.16890838742256165,
  0.08309347927570343,
  0.9992668032646179,
  -0.8587056398391724,
  0.09819765388965607,
  -0.3692297339439392

In [14]:
# Open a handle to the "rag" index.
index = pc.Index('rag')

# Write every prepared review record into namespace "ns1"; the call's return
# value is the cell output.
index.upsert(vectors=processed_data, namespace="ns1")

{'upserted_count': 20}

In [15]:
# Display index statistics (dimension, per-namespace vector counts, total count)
# to confirm the upsert landed.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}