In [None]:
# Patch asyncio so that event loops can be nested.
# NOTE(review): presumably required because the notebook kernel already runs
# its own event loop and later async calls would otherwise fail — confirm.
import nest_asyncio
nest_asyncio.apply()

In [1]:
import os
import logging
import sys
from llama_index.core import (
    SummaryIndex,
    VectorStoreIndex,
    StorageContext, 
    MockEmbedding
)
from llama_index.core.storage.index_store import SimpleIndexStore
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.vector_stores.simple import SimpleVectorStore
from llama_index.core.node_parser import SentenceSplitter, MarkdownNodeParser
from llama_index.core.ingestion import IngestionPipeline, IngestionCache
from llama_index.embeddings.openai import OpenAIEmbedding


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Route INFO-level logs to stdout. `basicConfig` already installs a stdout
# handler on the root logger, so no additional StreamHandler is attached:
# adding one made every log record appear twice in the cell output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# DB initial config
PERSIST_DB_DIR = "./db/db_storage/"
DATA_DIR = "./data/"
FILENAME = "llama_index_blog_posts.csv"

# In-memory stores shared by the ingestion pipeline and the indices built later.
docstore = SimpleDocumentStore()
index_store = SimpleIndexStore()
vector_store = SimpleVectorStore()

# init the pipeline with transformations
ingestion_pipeline = IngestionPipeline(
    transformations=[
        # SentenceSplitter(chunk_size=1024, chunk_overlap=10),
        MarkdownNodeParser(),
        # TitleExtractor(),
        # QuestionsAnsweredExtractor(questions=3),
        OpenAIEmbedding(model="text-embedding-ada-002"),
        # MockEmbedding(embed_dim=1536),
    ],
    vector_store=vector_store,
    docstore=docstore,
)

# Restore the pipeline's local cache from a previous run, if there is one.
# (Persisting happens *after* the run below; persisting before the run would
# only ever write an empty cache.)
if os.path.exists(PERSIST_DB_DIR):
    ingestion_pipeline.load(PERSIST_DB_DIR)

# load the documents
import csv_processor
documents = csv_processor.csv_load(os.path.join(DATA_DIR, FILENAME))
print(f"Loaded {len(documents)} documents.")

# run the pipeline to get nodes
nodes = ingestion_pipeline.run(
    documents=documents,
    in_place=True,
    show_progress=True,
)
print(f"Created {len(nodes)} nodes.")

# Save the pipeline state now that the run has populated it, so the next
# execution of this cell can load it instead of starting from scratch.
ingestion_pipeline.persist(PERSIST_DB_DIR)

Loaded 159 documents.


Parsing nodes: 100%|██████████| 159/159 [00:00<00:00, 933.45it/s] 
Generating embeddings:   0%|          | 0/1643 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Generating embeddings:   6%|▌         | 100/1643 [00:02<00:42, 36.13it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Generating embeddings:  12%|█▏        | 200/1643 [00:04<00:33, 42.69it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Generating embeddings:  18%|█▊        | 300/1643 [00:07<00:30, 43.67it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Generating embeddings:  24%|██▍       | 400/1643 [00:09<00:27, 45.05it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Generating embeddings:  30%|███       | 500/1643 [00:11<00:25, 44.62it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Generating embeddings:  37%|███▋      | 600/1643 [00:13<00:22, 47.00it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Generating embeddings:  43%|████▎     | 700/1643 [00:15<00:19, 47.75it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Generating embeddings:  49%|████▊     | 800/1643 [00:17<00:17, 47.99it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Generating embeddings:  55%|█████▍    | 900/1643 [00:19<00:15, 48.57it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Generating embeddings:  61%|██████    | 1000/1643 [00:21<00:13, 47.99it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Generating embeddings:  67%|██████▋   | 1100/1643 [00:24<00:11, 45.63it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Generating embeddings:  73%|███████▎  | 1200/1643 [00:26<00:10, 44.19it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Generating embeddings:  79%|███████▉  | 1300/1643 [00:28<00:07, 43.77it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Generating embeddings:  85%|████████▌ | 1400/1643 [00:30<00:05, 45.19it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Generating embeddings:  91%|█████████▏| 1500/1643 [00:33<00:03, 44.47it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Generating embeddings:  97%|█████████▋| 1600/1643 [00:35<00:00, 45.39it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Generating embeddings: 100%|██████████| 1643/1643 [00:36<00:00, 44.81it/s]


Created 1643 nodes.


In [3]:
# Bundle the three in-memory stores into one storage context; the indices
# created afterwards are built on top of it.
storage_context = StorageContext.from_defaults(
    index_store=index_store, vector_store=vector_store, docstore=docstore
)

# Register the nodes produced by the ingestion pipeline in the context's
# document store.
storage_context.docstore.add_documents(nodes)


In [4]:
# Build both indices over the same nodes and the same storage context, and
# give each a fixed id so it can be identified after reloading from disk.
vector_index = VectorStoreIndex(nodes, storage_context=storage_context)
vector_index.set_index_id("vector_index")

summary_index = SummaryIndex(nodes, storage_context=storage_context)
summary_index.set_index_id("summarise_index")

# Both indices were constructed with `storage_context`, so a single persist
# after both are registered is enough. Persisting once per index rewrote the
# whole context (including all embeddings) to the same directory twice.
storage_context.persist(persist_dir=PERSIST_DB_DIR)


In [5]:
# Loader helpers for reading persisted indices back from disk.
from llama_index.core import load_graph_from_storage, load_index_from_storage, load_indices_from_storage

# Rebuild a storage context from the persisted directory, then load the
# indices stored in it (no ids are passed, so nothing is filtered out).
storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DB_DIR)
indices = load_indices_from_storage(storage_context=storage_context)

INFO:llama_index.core.indices.loading:Loading all indices.
Loading all indices.


In [6]:
# Print the id of each index that was loaded from storage, one per line.
for index in indices:
    print(index.index_id)

vector_index
summarise_index
