In [8]:
!pip install pinecone-client


Collecting pinecone-client
  Using cached pinecone_client-2.2.4-py3-none-any.whl (179 kB)
Collecting dnspython>=2.0.0
  Using cached dnspython-2.4.2-py3-none-any.whl (300 kB)
Collecting loguru>=0.5.0
  Using cached loguru-0.7.2-py3-none-any.whl (62 kB)
Collecting win32-setctime>=1.0.0
  Using cached win32_setctime-1.1.0-py3-none-any.whl (3.6 kB)
Installing collected packages: win32-setctime, dnspython, loguru, pinecone-client
Successfully installed dnspython-2.4.2 loguru-0.7.2 pinecone-client-2.2.4 win32-setctime-1.1.0


In [1]:
import nest_asyncio
nest_asyncio.apply()

In [6]:
from llama_index import (
    GPTVectorStoreIndex,
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
    SQLDatabase, 
    
)
from llama_index.readers import WikipediaReader
from llama_index.indices.struct_store import GPTSQLStructStoreIndex

In [27]:
import pinecone
import os
from dotenv import load_dotenv

load_dotenv()

pinecone_api_key = os.getenv("PINECONE_API_KEY")
pinecone_env = os.getenv("PINECONE_ENVIRONMENT")
index_name = os.getenv("PINECONE_INDEX_NAME")

pinecone.init(api_key=pinecone_api_key, environment=pinecone_env)
## Can run pinecone.create_index(index_name, dimension=dimension of embedding model, metrics="euclidean")
pinecone_index = pinecone.Index(index_name)

In [10]:
from llama_index.node_parser import TokenTextSplitter
from llama_index import ServiceContext, LLMPredictor
from llama_index.storage import StorageContext
from llama_index.vector_stores import PineconeVectorStore
from langchain.chat_models import ChatOpenAI

from llama_index import SimpleDirectoryReader
from llama_index.ingestion import IngestionPipeline


In [11]:
chunk_size= 1024
llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-1106", streaming=True))
service_context = ServiceContext.from_defaults(chunk_size=chunk_size, llm_predictor=llm_predictor)
node_parser = TokenTextSplitter(chunk_size = chunk_size)


# Define pinecone vector_index
vector_store = PineconeVectorStore(pinecone_index=pinecone_index, namespace="wiki_cities")
storage_context = StorageContext.from_defaults(vector_store=vector_store)
vector_index = GPTVectorStoreIndex([], storage_context=storage_context)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jeffr\AppData\Local\llama_index...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [12]:
from sqlalchemy import create_engine, MetaData, Table, Column, String, Integer, select, column

In [13]:
engine = create_engine("sqlite:///:memory:", future=True)
metadata_obj = MetaData()

In [14]:
# Create city SQL table
table_name = "city_stats"
city_stats_table = Table(
    table_name,
    metadata_obj, 
    Column("city_name", String(16), primary_key=True),
    Column("population",Integer),
    Column("country", String(20), nullable=False)
)

metadata_obj.create_all(engine)

In [15]:
metadata_obj.tables.keys()

dict_keys(['city_stats'])

In [16]:
# Introduce data into the city_stats table
from sqlalchemy import insert

rows= [
    {"city_name": "Toronto", "population": 293000, "country":"Canada"},
    {"city_name": "Tokyo", "population": 13960000, "country": "Japan"},
    {"city_name": "Berlin", "population": 3645000, "country": "Germany"}
]

for row in rows:
    statement =  insert(city_stats_table).values(**row)
    with engine.connect() as connection:
        cursor = connection.execute(statement)
        connection.commit()
        

In [18]:
with engine.connect() as connection:
    cursor = connection.exec_driver_sql("SELECT * FROM city_stats")
    print(cursor.fetchall())

[('Toronto', 293000, 'Canada'), ('Tokyo', 13960000, 'Japan'), ('Berlin', 3645000, 'Germany')]


In [19]:
!pip install wikipedia

cities = {"Torono", "Berlin", "Tokyo"}
wiki_docs = WikipediaReader().load_data(pages=cities)

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py): started
  Building wheel for wikipedia (setup.py): finished with status 'done'
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11707 sha256=6233ab5eb95be028690a5c083d7bd56914fa810070fe189437da1a21224d9e2a
  Stored in directory: c:\users\jeffr\appdata\local\pip\cache\wheels\b2\7f\26\524faff9145e274da278dc97d63ab0bfde1f791ecf101a9c95
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


**Build SQL Index**

In [20]:
sql_database = SQLDatabase(engine, include_tables = ["city_stats"])

In [21]:
sql_index = GPTSQLStructStoreIndex.from_documents(
    [],
    sql_database = sql_database,
    table_name="city_stats",
)

In [29]:
for city, wiki_doc in zip(cities, wiki_docs):
    nodes = node_parser.get_nodes_from_documents([wiki_doc])
    #for node in nodes:
        #node.extra_info = {"title": city}
    vector_index.insert_nodes(nodes)

Upserted vectors:   0%|          | 0/11 [00:00<?, ?it/s]

PineconeProtocolError: Failed to connect; did you specify the correct index name?

In [None]:
from llama_index.query_engine import SQLAutoVectorQueryEngine, RetrieverQueryEngine
from llama_index.tools.query_engine import QueryEngineTool
from llama_index.indices.vector_store import VectorIndexAutoRetriever

sql_query_engine = sql_index.as_query_engine(synthesize_response=True)

In [31]:
from llama_index.indices.vector_store.retrievers import VectorIndexAutoRetriever
from llama_index.vector_stores.types import MetadataInfo,VectorStoreInfo
from llama_index.query_engine.retriever_query_engine import RetrieverQueryEngine

In [None]:
vector_store_info  = VectorStoreInfo(
    content_info = "articles about different cities",
    metadata_info = [
        MetadataInfo(
            name = "title",
            type = "str",
            description="The name of the city"
        )
    ]
)

vector_auto_retriever = VectorIndexAutoRetriever(vector_index, vector_store_info=vector_store_info)
retriever_query_engine = RetrieverQueryEngine.from_args(
    vector_auto_retriever, service_context = service_context
)

In [None]:
sql_tool = QueryEngineTool.from_args(
    query_engine = sql_query_engine,
    description =(
        "Useful for translating a natural language qury into a sql query over a table containing: "
        "city_stats, containing the population/country of each city"
    )
)

vector_tool = QueryEngineTool.from_defaults(
    query_engine = retriever_query_engine,
    description = f"Useful for answering semantic questions about different cities."
)

**Define SQLAutoVectorQueryEngine**

In [32]:
query_engine = SQLAutoVectorQueryEngine(
    sql_tool,
    vector_tool,
    service_context = service_context
)

NameError: name 'SQLAutoVectorQueryEngine' is not defined

In [None]:
response = query_engine.query("Tell me about the arts and culture of the city with the highest population")

In [None]:
print(str(response))

In [None]:
response = query_engine.query("Tellme about the history of Berlin")

In [None]:
print(str(response))

In [None]:
response = query_engine.query("Can you give me the country corresponding to each city?")

In [None]:
print(str(response))