# Build RAG for reviews data

In [None]:
%pip install llama-index
%pip install llama-index-embeddings-ollama
%pip install llama-index-llms-ollama
!pip install colab-xterm
%load_ext colabxterm
%pip install llama-index-embeddings-huggingface

In [None]:
# Open a terminal via the colabxterm extension loaded in the install cell.
%xterm

# The commands below are NOT executed by this cell; type them manually in the
# terminal. They install Ollama, start its server in the background, and pull
# the tinyllama model that the notebook later uses as its LLM.
# curl -fsSL https://ollama.com/install.sh | sh
# ollama serve &
# ollama pull tinyllama

In [None]:
import pandas as pd
from pathlib import Path

from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import Document, DocumentSummaryIndex, SimpleDirectoryReader, VectorStoreIndex, PromptTemplate, Settings
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

## Preprocess data

In [None]:
# Mount Google Drive at /content/drive so the reviews CSV stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Alternate (disabled) location of the CSV inside the "Colab Notebooks" folder:
# p = '/content/drive/MyDrive/Colab Notebooks/MEKARI_REVIEWS.csv'
# Path of the reviews CSV on the mounted Drive.
p = '/content/drive/MyDrive/MEKARI_REVIEWS.csv'
# Load the reviews; later cells read the `review_text` column of this frame.
df = pd.read_csv(p)

In [None]:
# Upper bound on how many reviews are indexed (keeps embedding time manageable).
MAX_REVIEWS = 50000

# Take at most MAX_REVIEWS rows: `.head()` never reads past the end of the
# frame, so a CSV with fewer rows no longer raises IndexError.
# Missing reviews are dropped instead of being indexed as the literal text "nan".
review_texts = df['review_text'].head(MAX_REVIEWS).dropna().astype(str)

# One llama-index Document per review text.
documents = [Document(text=review_text) for review_text in review_texts]

## Build vector index from documents

In [None]:
# Generation model: tinyllama accessed through Ollama (request_timeout=120.0).
llm = Ollama(
    model="tinyllama:latest",
    request_timeout=120.0,
)

# Embedding model: a HuggingFace BGE model is used for the vector index.
embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
)

# Disabled alternative that would compute embeddings through Ollama instead:
# embed_model = OllamaEmbedding(
#     model_name="tinyllama:latest",
#     base_url="http://localhost:11434",
# )

# Register both models on the llama-index global Settings object.
Settings.llm = llm
Settings.embed_model = embed_model

In [None]:
# NOTE: this whole cell is disabled. It sketches an alternative pipeline based on
# DocumentSummaryIndex with an LLM-driven retriever; the notebook uses the
# VectorStoreIndex built in the next cell instead.
# from llama_index.core.node_parser import SentenceSplitter
# from llama_index.core.indices.document_summary import DocumentSummaryIndexLLMRetriever

# splitter = SentenceSplitter(chunk_size=1024)

# index = DocumentSummaryIndex.from_documents(
#     documents,
#     llm=llm,
#     transformations=[splitter],
#     show_progress=True,
# )

# retriever = DocumentSummaryIndexLLMRetriever(
#     index,
#     choice_top_k=2,
# )

In [None]:
# Build the vector index over all review documents.
# NOTE(review): presumably embeds with the model registered on `Settings` above — confirm.
index = VectorStoreIndex.from_documents(documents)

In [None]:
# Retriever over the vector index; each query returns the 3 most similar nodes.
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=3,
)

## Prepare prompt

In [None]:
# Question asked of the reviews corpus.
query = 'what are most frequent spotify reviews?'
# Nodes most similar to the query, as returned by the retriever.
query_context = retriever.retrieve(query)

# Concatenate the content of EVERY retrieved node into the prompt context.
# (The previous loop always read index 0, so the context contained the top
# hit repeated once per result instead of the distinct hits.)
context_str = ''.join(node.get_content() + '. \n' for node in query_context)

In [None]:
# Show each retrieved review together with its similarity score.
for hit in query_context:
    print(hit.text, hit.score)

Spotify is number one 0.8136731813565282
I listen to Spotify every day! Great quality, and it has most of the songs I look for! 0.8049787874569584
Best of on spotify 0.8030445132185673


In [None]:
# QA prompt layout: the retrieved context sits between two dashed rules and is
# followed by the question. `{context_str}` and `{query_str}` are filled in later.
template = "".join([
    "We have provided context information below. \n",
    "---------------------\n",
    "{context_str}",
    "\n---------------------\n",
    "Given this information, please answer the question: {query_str}\n",
])
qa_template = PromptTemplate(template)

In [None]:
# Fill the template with the retrieved context and the user's question.
prompt = qa_template.format(context_str=context_str, query_str=query)

## Get response from LLM

In [None]:
# Send the fully formatted prompt to the LLM configured earlier (`llm`).
resp = llm.complete(prompt)

In [None]:
# Display the text of the completion as the cell's output.
resp.text

## Save vector index

In [None]:
# Write the index's storage context to the "bot_index" directory
# (relative to the current working directory) so it can be reused later.
index.storage_context.persist("bot_index")

In [None]:
!zip -r bot_index.zip bot_index

  adding: bot_index/ (stored 0%)
  adding: bot_index/graph_store.json (stored 0%)
  adding: bot_index/image__vector_store.json (deflated 19%)
  adding: bot_index/index_store.json (deflated 68%)
  adding: bot_index/.ipynb_checkpoints/ (stored 0%)
  adding: bot_index/docstore.json (deflated 72%)
  adding: bot_index/default__vector_store.json (deflated 58%)


In [None]:
# Trigger a browser download of the zipped index from the Colab runtime.
# NOTE(review): the absolute path assumes the zip was created with /content as
# the working directory — confirm if the notebook's cwd ever changes.
from google.colab import files
files.download('/content/bot_index.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>