# Setup

In [None]:
%pip install -qU langchain-text-splitters
%pip install llama-index-readers-file pymupdf
%pip install llama-index-vector-stores-postgres
%pip install llama-index-embeddings-huggingface
%pip install llama-index-llms-llama-cpp
%pip install llama_index
%pip install llama-cpp-python
%pip install psycopg2-binary pgvector asyncpg "sqlalchemy[asyncio]" greenlet
%pip install markupsafe==2.0.1

In [2]:
import json
import psycopg2
import numpy as np
import pandas as pd
from llama_index.core import Document
from typing import List, Any, Optional, Dict
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.core.schema import TextNode, BaseNode
from langchain_text_splitters import RecursiveCharacterTextSplitter
# from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# from llama_index.core.vector_stores.types import BasePydanticVectorStore
# from llama_index.core.vector_stores import (VectorStoreQuery,VectorStoreQueryResult,)






# Dataset

In [3]:
# Read Data
f = open('corpus.json')
corpus = json.load(f)
f.close()

corpus_df = pd.DataFrame(corpus)

In [4]:
# Text Chunker
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=256,
    chunk_overlap=16,
    length_function=len,
    separators = ['\n']
)

# Chunking Text
corpus_df['chunks'] = corpus_df['body'].apply(lambda x: text_splitter.create_documents([x]))

In [5]:
# Add metadata and change format of chunk to node
def add_metadata(x):
    metadata = {
        'title': x['title'],
        'author': x['author'],
        'source': x['source'],
        'published_at': x['published_at'],
        'category': x['category'],
        'url': x['url']
    }
    
    node_chunks = []
    for chunk in x['chunks']:
        node = TextNode(
            text = chunk.page_content,
            )
        node.metadata = metadata
        node_chunks.append(node)
        
    return node_chunks

corpus_df['chunks'] = corpus_df.apply(lambda x: add_metadata(x), 1)

In [6]:
nodes = []

for i in range(len(corpus_df)):
    for n in corpus_df['chunks'][i]:
        nodes.append(n)

# Vector DB

In [6]:
db_name = "vector_db"
host = "localhost"
password = "password"
port = "5432"
user = "nikhil"
conn = psycopg2.connect(
    dbname="postgres",
    host=host,
    password=password,
    port=port,
    user=user,
)
conn.autocommit = True

with conn.cursor() as c:
    c.execute(f"DROP DATABASE IF EXISTS {db_name}")
    c.execute(f"CREATE DATABASE {db_name}")

# Embedding

In [None]:
# LLM
model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf"

llm = LlamaCPP(
    model_url=model_url,
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": 1},
    verbose=True,
)

In [1]:
# Sentence transformers for embedding
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")
for node in nodes:
    node_embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode="all")
    )
    node.embedding = node_embedding

NameError: name 'HuggingFaceEmbedding' is not defined