In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
from llama_index.core import Document, VectorStoreIndex, Settings, StorageContext, QueryBundle
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.vector_stores.elasticsearch import ElasticsearchStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Credentials are looked up from the environment; each is None when unset.
HF_TOKEN = os.environ.get("HUGGING_FACE_TOKEN")
API_KEY = os.environ.get("GEMINI_API_KEY")

# Route plain-HTTP traffic through a proxy listening on this machine.
# NOTE(review): only http_proxy is set here; HTTPS requests are presumably
# unaffected unless https_proxy is configured elsewhere - confirm intent.
os.environ.update({"http_proxy": "http://127.0.0.1:2081"})

In [2]:
import torch

# Environment report: one "label: value" line per entry, in this order.
torch_report = (
    ("PyTorch", torch.__version__),
    ("CUDA available", torch.cuda.is_available()),
    ("CUDA version compiled", torch.version.cuda),
    ("Device count", torch.cuda.device_count()),
)
for label, value in torch_report:
    print(f"{label}:", value)

PyTorch: 2.8.0+cu128
CUDA available: True
CUDA version compiled: 12.8
Device count: 1


In [4]:
# Location of the prepared review dataset, relative to this notebook.
csv_file = '../data/final/final_restaurant_reviews.csv'

# Read the whole CSV into a DataFrame with pandas' default parsing options.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved index; consider index_col=0 if it is not needed - confirm.
df = pd.read_csv(filepath_or_buffer=csv_file)

In [5]:
# Show the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,restaurant_name,review,rating,rating_category,location
0,St Honore Pastries,This is nice little Chinese bakery in the hear...,4.0,Positive,"Philadelphia, PA, 935 Race St"
1,St Honore Pastries,This is the bakery I usually go to in Chinatow...,4.0,Positive,"Philadelphia, PA, 935 Race St"
2,St Honore Pastries,"A delightful find in Chinatown! Very clean, an...",5.0,Positive,"Philadelphia, PA, 935 Race St"
3,St Honore Pastries,I ordered a graduation cake for my niece and i...,5.0,Positive,"Philadelphia, PA, 935 Race St"
4,St Honore Pastries,HK-STYLE MILK TEA: FOUR STARS\n\nNot quite su...,4.0,Positive,"Philadelphia, PA, 935 Race St"


In [35]:
# Display the full text of the review stored under row label 0.
df.loc[0, 'review']

"This is nice little Chinese bakery in the heart of Philadelphia's Chinatown! The female cashier was very friendly (flirtatious!) and the pastries shown in nicely adorned display cases. I stopped by early one evening had a sesame ball, which was filled with bean paste. The glutinous rice of the ball was nicely flavored, similar to Bai Tang Gao. Definitely as place worth stopping at if you are in the area."

In [5]:
# Step 1: Load the df into LlamaIndex as Document objects
# Each DataFrame row becomes one Document whose text is the review body and
# whose metadata carries the restaurant attributes.
records = df.to_dict('records')

# Columns copied from each row into the Document metadata (order preserved).
metadata_columns = ['restaurant_name', 'review', 'rating_category', 'rating', 'location']

documents = [
    Document(
        text=record['review'],
        metadata={column: record[column] for column in metadata_columns},
        # 'review' is already the Document text. It stays in the metadata so
        # existing lookups keep working, but it is excluded from the metadata
        # string given to the embedding model and the LLM; otherwise the
        # review would be included twice and would inflate the metadata
        # length that is counted against the chunk size.
        excluded_embed_metadata_keys=['review'],
        excluded_llm_metadata_keys=['review'],
    )
    for record in records
]

In [7]:
# Step 2: Generate embeddings for the review field
# Embedding backend: a HuggingFace sentence-transformers model.
HF_EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
Settings.embed_model = HuggingFaceEmbedding(model_name=HF_EMBED_MODEL_NAME)

# Raise the chunk size so that large per-document metadata still fits.
# NOTE(review): presumably larger than the embedding model's own input
# limit, in which case long chunks would be truncated - confirm.
Settings.chunk_size = 2048

In [None]:
# Alternative: use Gemini embeddings instead of the HuggingFace model (uncomment to enable)
# Settings.embed_model = GeminiEmbedding(model="models/text-embedding-004", api_key=API_KEY)
# Settings.chunk_size = 2048

In [8]:
# Configure LlamaIndex vector store
# Index name, field names and URL passed to the Elasticsearch-backed store.
es_store_options = dict(
    index_name='yelp_restaurant_reviews',
    vector_field='review_vector',
    text_field='review',
    es_url='http://localhost:9200/',
)
vector_store = ElasticsearchStore(**es_store_options)

# Storage context that hands the Elasticsearch store to LlamaIndex.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [9]:
# Index documents
# Identifier assigned to the index and directory the storage context is
# persisted to.
INDEX_ID = "vector_index"
PERSIST_DIR = "./rag_storage"

# Build the index from the documents, using the configured storage context.
# NOTE(review): re-running this cell presumably indexes every document again
# and may create duplicates in the store - confirm before re-executing.
index = VectorStoreIndex.from_documents(documents=documents, storage_context=storage_context)
index.set_index_id(INDEX_ID)
index.storage_context.persist(persist_dir=PERSIST_DIR)

NLTK download error: File is not a zip file


In [None]:
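# Use this block later to reload the persisted index from ./rag_storage
# (requires: from llama_index.core import load_index_from_storage, which is not in the import cell)
# storage_context = StorageContext.from_defaults(persist_dir="./rag_storage")
# index = load_index_from_storage(storage_context=storage_context, index_id="vector_index")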