In [69]:
import pandas as pd
import sqlite3
import json
import os
import re
from typing import Any
import requests
from bs4 import BeautifulSoup, ResultSet
from copy import deepcopy
from IPython.display import HTML, display
from pathlib import Path
from pinecone import ServerlessSpec
from pinecone.grpc import PineconeGRPC
from langchain.document_loaders import TextLoader
from llama_index.core.indices.vector_store.base import VectorStoreIndex
from llama_index.core.readers import download_loader
from llama_index.core.ingestion.pipeline import IngestionPipeline
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.schema import Document, TransformComponent
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.readers.file import PDFReader
from llama_index.vector_stores.pinecone import PineconeVectorStore
from unstructured.partition.pdf import partition_pdf
from openai import OpenAI




In [64]:
!pip install -qU \
    "pinecone-client[grpc]"==3.2.2 \
    "unstructured[pdf]"==0.12.4 \
    langchain==0.1.9 \
    llama-index==0.10.23 \
    llama-index-vector-stores-pinecone==0.1.4 \
    pillow==10.0.0 \
    poppler-utils==0.1.0 \
    pytesseract==0.3.10

## Load + Clean +SQL Queries + Question and Answers

In [47]:
#Load the CSV file
huddf = pd.read_csv('data/hud_insured_multifamily.csv')

  huddf = pd.read_csv('data/hud_insured_multifamily.csv')


In [48]:
# Data cleaning
huddf = huddf.drop(columns=["PROJECT_MANAGER_NAME_TEXT", "AUTOMATIC_GEOCODE_IND"])
huddf = huddf[["STD_CITY","PROPERTY_ID","PROPERTY_CATEGORY_NAME","PROPERTY_NAME_TEXT"]]
huddf.to_csv('data/hud_insured_multifamily_cle.csv', index=False)


In [49]:
# Connect to a SQLite database
conn = sqlite3.connect('database.db')


In [None]:
###

In [50]:
# TABLE command
create_table_command = """
CREATE TABLE IF NOT EXISTS HUD_Insured_Multifamily (
    STD_CITY,
    PROPERTY_ID,
    PROPERTY_CATEGORY_NAME,
    Property_NAME_TEXT 
)
"""
conn.execute(create_table_command)
conn.commit()

In [51]:
# Function to insert values from CSV to the table
def insert_values_from_csv(conn, csv_file_path):
    df = pd.read_csv(csv_file_path)
    df.to_sql('HUD_Insured_Multifamily', conn, if_exists='replace', index=False)


In [52]:
# Insert values into the table from a CSV file
insert_values_from_csv(conn, 'data/hud_insured_multifamily_cle.csv')

In [53]:
# Query the table and print the result
table = pd.read_sql_query("SELECT * FROM HUD_Insured_Multifamily", conn)
print(table)

                           STD_CITY  PROPERTY_ID PROPERTY_CATEGORY_NAME  \
0      Pennington                    800225589.0   Insured-Unsubsidized   
1      White Settlement              800225933.0   Insured-Unsubsidized   
2      Carlinville                   800225983.0   Insured-Unsubsidized   
3      Grand Coulee                  800226573.0   Insured-Unsubsidized   
4      Vidalia                       800226611.0   Insured-Unsubsidized   
...                             ...          ...                    ...   
16519  Maplewood                     800254374.0   Insured-Unsubsidized   
16520  Garland                       800255132.0   Insured-Unsubsidized   
16521  Pleasant Hill                 800255134.0   Insured-Unsubsidized   
16522  Grasonville                   800255144.0   Insured-Unsubsidized   
16523  Southgate                     800255510.0   Insured-Unsubsidized   

                               PROPERTY_NAME_TEXT  
0                           Capital Health Syst

In [54]:
# Prepare for answering the questions
answers = []

In [55]:
# What city has the most properties?
cursor = conn.execute("SELECT STD_CITY, COUNT(*) FROM HUD_Insured_Multifamily GROUP BY STD_CITY ORDER BY COUNT(*) DESC LIMIT 1")
answers.append(f"City with most properties: {cursor.fetchone()}")


In [56]:
# How many property names start with the letter 'H'
cursor = conn.execute("SELECT COUNT(*) FROM HUD_Insured_Multifamily WHERE Property_NAME_TEXT LIKE 'H%'")
answers.append(f"Properties starting with 'H': {cursor.fetchone()[0]}")


In [57]:
# What % of the properties in the property category is 'Insured-Unsubsidized'
cursor = conn.execute("SELECT COUNT(*) FROM HUD_Insured_Multifamily WHERE PROPERTY_CATEGORY_NAME = 'Insured-Unsubsidized'")
total_count = cursor.fetchone()[0]
cursor = conn.execute("SELECT COUNT(*) FROM HUD_Insured_Multifamily")
overall_count = cursor.fetchone()[0]
percentage = (total_count / overall_count) * 100 if overall_count > 0 else 0
answers.append(f"Percentage of 'Insured-Unsubsidized': {percentage:.2f}%")

In [58]:
# How many unique property id are there in this data set
cursor = conn.execute("SELECT COUNT(DISTINCT PROPERTY_ID) FROM HUD_Insured_Multifamily")
answers.append(f"Unique property IDs: {cursor.fetchone()[0]}")

In [59]:
# Close the connection
conn.close()

In [60]:
# Combine questions and answers into a DataFrame
qa_df = pd.DataFrame({
    "Questions": [
        "What city has the most properties?",
        "How many property names start with the letter 'H'",
        "What % of the properties in the property category is 'Insured-Unsubsidized'",
        "How many unique property IDs are there in this data set"
    ],
    "Answers": answers
})

In [61]:
# Export the DataFrame to a CSV file
qa_df.to_csv(r'data/Questions_and_Answers.csv', index=False)

In [62]:
print(qa_df)

                                           Questions  \
0                 What city has the most properties?   
1  How many property names start with the letter 'H'   
2  What % of the properties in the property categ...   
3  How many unique property IDs are there in this...   

                                             Answers  
0  City with most properties: ('Chicago          ...  
1                  Properties starting with 'H': 801  
2       Percentage of 'Insured-Unsubsidized': 68.70%  
3                         Unique property IDs: 16523  


## PINECONE and OPENAI setup

In [72]:
os.environ['PINECONE_API_KEY'] = "d7ad4843-8f48-4f21-929e-5e293224994e"  
pinecone_api_key = os.getenv("PINECONE_API_KEY")

# Initialize connection to Pinecone
pc = PineconeGRPC(api_key=pinecone_api_key)
index_name = "structured-data-example"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=1536,  # Dimensions match encoder (embedder/vectorizer) you will use downstream, ada-002 from OpenAI.
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Initialize your index
pinecone_index = pc.Index(index_name)

In [73]:
# Confirm creation of your index & that (if new) it has no vectors in it yet.
pinecone_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0}},
 'total_vector_count': 0}

In [74]:
# If for any reason you want to delete your Pinecone index and start over, execute this code:
# pc.delete_index(index_name)

In [75]:
# Set/Get your OpenAI API Key

os.environ['OPENAI_API_KEY'] = "sk-proj-G5BRCE3LFzqPAvzcz5dAT3BlbkFJ0QzhFQRn517N4nPqPF4I"  
openai_api_key = os.getenv("OPENAI_API_KEY")

In [76]:
# Declare embedding model you will use throughout notebook:
# OpenAI's ada-002 text embedding modal is the model you will use both for Node parsing and for vectorization of PDF contents
EMBED_MODEL = OpenAIEmbedding(api_key=openai_api_key)

In [78]:
# You will need to re-define Pinecone as a LlamaIndex PineconeVectorStore obj when you add namespaces, so build a
# function to help you do that:
def initialize_vector_store(index: PineconeGRPC, namespace: str) -> PineconeVectorStore:
    """
    Initialize Pinecone index as a VectorStore obj.

    :param index: Pinecone serverless index.
    :param namespace: Namespace constraint you want on your queries, indexing operations, etc. when using this vector store.
    :return: PineconeVectorStore obj.
    """
    return PineconeVectorStore(pinecone_index=index, namespace=namespace)

In [79]:
def run_indexing_pipeline(vector_store, documents, embed_model=EMBED_MODEL):
    # Define pipeline stages
    pipeline = IngestionPipeline(
        transformations=[
            # CleanTextForOpenAI(),  # Clean doc text
            SemanticSplitterNodeParser(
                buffer_size=1,
                breakpoint_percentile_threshold=95,
                embed_model=embed_model,
                ),
            embed_model,  # Vectorize nodes
            ],
        vector_store=vector_store # Index into Pinecone
        )

    # Run documents through pipeline
    return pipeline.run(documents=documents)

In [80]:


# Declare namespace you will put your first batch of vectors into:
ctrl_namespace = 'control'

# Initialize vector store w/control namespace
ctrl_vector_store = initialize_vector_store(pinecone_index, ctrl_namespace)

# Run pipeline
output = run_indexing_pipeline(ctrl_vector_store, ctrl_docs)



NameError: name 'ctrl_docs' is not defined

In [81]:
# Confirm your docs made it to the index, in the right namespace
pinecone_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0}},
 'total_vector_count': 0}