In [23]:
import os
from dotenv import load_dotenv
import json
import csv
# Raise the csv module's per-field size cap to one million characters.
# Presumably needed because the cleaned report texts exceed the default cap — confirm.
# Left as the cell's last expression so the notebook displays the previous limit.
csv.field_size_limit(1_000_000)

131072

In [24]:
load_dotenv()  # This will load the environment variables from your .env file

# Fail fast with a readable message when the key is missing. The previous
# `os.environ[k] = os.getenv(k)` round-trip changed nothing when the key was
# set, and raised an opaque TypeError (environment values must be str) when
# os.getenv returned None.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it "
        "before running this notebook."
    )

In [25]:
from langchain.document_loaders.csv_loader import CSVLoader

# Load the cleaned report texts from the project CSV into LangChain documents.
csv_file_path = "../data/Cleaned_Text/doris_cleaned_texts.csv"
loader = CSVLoader(csv_file_path)
documents = loader.load()

In [28]:
# Imported here because no earlier cell in the notebook brings the splitter
# into scope; without it this cell raises NameError on a fresh kernel.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split documents into chunks of at most 1000 characters, with no overlap
# between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

# chunked_documents is a list of lists: one inner list of chunks per source
# document, in the same order as `documents`.
chunked_documents = []
for document in documents:
    # Chunk the document
    texts = text_splitter.split_documents([document])
    # Keep each document's chunks grouped together
    chunked_documents.append(texts)
    print(f"chunked_document length: {len(texts)}")

chunked_document length: 70
chunked_document length: 27
chunked_document length: 214
chunked_document length: 29
chunked_document length: 27
chunked_document length: 37
chunked_document length: 24
chunked_document length: 145
chunked_document length: 84
chunked_document length: 47
chunked_document length: 50
chunked_document length: 146
chunked_document length: 110
chunked_document length: 928
chunked_document length: 184
chunked_document length: 35
chunked_document length: 31
chunked_document length: 39


In [31]:
!pip install pinecone-client

Collecting pinecone-client
  Downloading pinecone_client-2.2.1-py3-none-any.whl (177 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 177.2/177.2 kB 4.2 MB/s eta 0:00:00
Collecting loguru>=0.5.0
  Downloading loguru-0.6.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting dnspython>=2.0.0
  Downloading dnspython-2.3.0-py3-none-any.whl (283 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m283.7/283.7 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: loguru, dnspython, pinecone-client
Successfully installed dnspython-2.3.0 loguru-0.6.0 pinecone-client-2.2.1


In [33]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.3.3-cp39-cp39-macosx_11_0_arm64.whl (706 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m706.8/706.8 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.3.3


In [35]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone
import os
from dotenv import load_dotenv

import hashlib

# Load API keys from the .env file
load_dotenv()

OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')

embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Initialize Pinecone
pinecone.init(api_key=PINECONE_API_KEY)

index_name = "doris-project-vectors"

# Upsert the report chunks to Pinecone via LangChain, one source document per call.
#
# Two changes from the plain from_texts(texts, embeddings) call:
#   * metadatas: each chunk's metadata is stored next to its vector instead of
#     being discarded, so retrieved chunks can be traced back to their origin.
#     A copy is passed so the vector store cannot modify the chunk's own dict.
#     Assumes the metadata values are types Pinecone accepts — TODO confirm.
#   * ids: each ID is derived from the chunk's position and content, so running
#     this cell again overwrites the same vectors. Without explicit IDs every
#     run would add a fresh set of duplicates (LangChain generates random IDs
#     when none are given — verify against the installed version).
for doc_number, chunks in enumerate(chunked_documents):
    chunk_ids = [
        hashlib.sha256(
            f"{doc_number}:{chunk_number}:{chunk.page_content}".encode("utf-8")
        ).hexdigest()
        for chunk_number, chunk in enumerate(chunks)
    ]
    Pinecone.from_texts(
        [chunk.page_content for chunk in chunks],
        embeddings,
        metadatas=[dict(chunk.metadata) for chunk in chunks],
        ids=chunk_ids,
        index_name=index_name,
    )

In [37]:
# Retrieve the document vector embeddings from Pinecone.
# Reuses `index_name` rather than repeating the literal, so the index that is
# queried cannot drift from the index the chunks were upserted into.
vectorstore = Pinecone.from_existing_index(index_name=index_name, embedding=embeddings)

In [38]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

# Completion model with temperature 0, wrapped in LangChain's question-answering
# chain; the query cells below pass retrieved documents and a question to it.
llm = OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0)
chain = load_qa_chain(llm=llm)

In [39]:
query = "What year is NYC's most recent Vision report from?"
# Fetch the stored chunks closest to the question, then let the QA chain answer from them.
docs = vectorstore.similarity_search(query=query, include_metadata=True)
answer = chain.run(question=query, input_documents=docs)
print(answer)

 The most recent Vision report from NYC is from 2018.


In [40]:
query = "Can you summarize the main goals of Vision Zero?"
# Fetch the stored chunks closest to the question, then let the QA chain answer from them.
docs = vectorstore.similarity_search(query=query, include_metadata=True)
answer = chain.run(question=query, input_documents=docs)
print(answer)

 The main goals of Vision Zero are to eliminate traffic fatalities and crash-related serious injuries by 2024 through safer street designs, engineering, enforcement, deterrence of traffic safety violations, and education and outreach efforts to encourage safer choices by drivers, cyclists, and pedestrians.


In [41]:
query = "What is DOT doing about off hour deliveries?"
# Fetch the stored chunks closest to the question, then let the QA chain answer from them.
docs = vectorstore.similarity_search(query=query, include_metadata=True)
answer = chain.run(question=query, input_documents=docs)
print(answer)

 DOT is encouraging off hour deliveries with a focus on large buildings in areas with high pedestrian and bicycle activity. They are also working with the trucking industry to pilot low-noise truck technologies and deploy a network of noise monitors and cameras to monitor off hour delivery activity.


In [42]:
query = "Which neighborhoods is DOT focusing its work on off hour deliveries?"
# Fetch the stored chunks closest to the question, then let the QA chain answer from them.
docs = vectorstore.similarity_search(query=query, include_metadata=True)
answer = chain.run(question=query, input_documents=docs)
print(answer)

 Manhattan, Downtown Brooklyn, and Jamaica.


In [43]:
query = "what are the main recommendations from the 2016 report of the NYC electric vehicle advisory committee?"
# Fetch the stored chunks closest to the question, then let the QA chain answer from them.
docs = vectorstore.similarity_search(query=query, include_metadata=True)
answer = chain.run(question=query, input_documents=docs)
print(answer)

 The main recommendations from the 2016 report of the NYC electric vehicle advisory committee are to implement a grant to encourage workplace charging, improve data collection, and update on workplace charging.


In [44]:
query = "what are the most innovative ideas from the dot report?"
# Fetch the stored chunks closest to the question, then let the QA chain answer from them.
docs = vectorstore.similarity_search(query=query, include_metadata=True)
answer = chain.run(question=query, input_documents=docs)
print(answer)

 The DOT report suggests allocating more street space to walking, biking, and buses, exploring new sensor and camera technology, curb regulations and pricing strategies to better manage streets and curb space, adapting proactively to shared-use mobility services and autonomous vehicles, and developing videos on projects and priorities.


In [45]:
query = "what are the most innovative ideas from the dot reports? Mention the names of the reports that each idea comes from."
# Fetch the stored chunks closest to the question, then let the QA chain answer from them.
docs = vectorstore.similarity_search(query=query, include_metadata=True)
answer = chain.run(question=query, input_documents=docs)
print(answer)

 The most innovative ideas from the DOT reports include: 1) Partnering with employees to identify key safety issues (Employee Safety Survey); 2) Using data to better understand risks to employees (Employee Safety Survey); 3) Providing comprehensive training programs for all staff (Strategic Plan 2016); 4) Creating a searchable in-house digital warehouse of public outreach materials (Strategic Plan 2016); 5) Hiring a senior economist to help quantify the benefits and costs of current and proposed initiatives (Strategic Plan 2016); 6) Incorporating triple bottom line social-economic-environmental accounting into appraisals of agency projects and initiatives (Strategic Plan 2016); 7) Initiating development of an online map of agency projects (Strategic Plan 2016).


In [46]:
query = "what were the top 5 pedestrian crash intersections in 2014?"
# Fetch the stored chunks closest to the question, then let the QA chain answer from them.
docs = vectorstore.similarity_search(query=query, include_metadata=True)
answer = chain.run(question=query, input_documents=docs)
print(answer)

 W 23rd St & Avenue of the Americas (Manhattan) - 5; W 57th St & 10th Ave (Manhattan) - 5; Bath Ave & Bay Pkwy (Brooklyn) - 4; Bruckner & E 138th St/Bruckner Blvd (Bronx) - 4; E 233rd St & Grenada Pl/Baychester Ave (Bronx) - 4.


In [47]:
query = "what specific actions has dot taken to reduce pedestrian crashes in bronx community board 6?"
# Fetch the stored chunks closest to the question, then let the QA chain answer from them.
docs = vectorstore.similarity_search(query=query, include_metadata=True)
answer = chain.run(question=query, input_documents=docs)
print(answer)

 DOT has implemented Vision Zero capital redesigns on major streets including the Grand Concourse in the Bronx. They have also conducted targeted outreach in 500 schools each year to educate students about protecting themselves as safe pedestrians and working with their families for safer school zones. They have also completed 50 street improvement projects that enhanced safety by reengineering intersections and corridors.


In [48]:
query = "what specific actions has dot taken to reduce pedestrian crashes in bronx community board 6? For each action, mention the name of the report that it comes from."
# Fetch the stored chunks closest to the question, then let the QA chain answer from them.
docs = vectorstore.similarity_search(query=query, include_metadata=True)
answer = chain.run(question=query, input_documents=docs)
print(answer)

 According to the Vision Zero Year Three Report, DOT has implemented pedestrian safety improvements on the Grand Concourse in Bronx Community Board 6. Additionally, DOT has conducted targeted outreach in 500 schools each year educating students about protecting themselves as safe pedestrians and working with their families for safer school zones. This information can also be found in the Vision Zero Year Three Report.


In [49]:
query = "How many total citibike trips were there in december 2013?"
# Fetch the stored chunks closest to the question, then let the QA chain answer from them.
docs = vectorstore.similarity_search(query=query, include_metadata=True)
answer = chain.run(question=query, input_documents=docs)
print(answer)

 448350


In [50]:
query = "what is being done to increase accessability on ferries? cite the reports you are using."
# Fetch the stored chunks closest to the question, then let the QA chain answer from them.
docs = vectorstore.similarity_search(query=query, include_metadata=True)
answer = chain.run(question=query, input_documents=docs)
print(answer)

 According to the reports, as of March of 2010 ferry schedules are offered in Braille and in six foreign languages such as Italian, Spanish, Chinese, Haitian, Korean, and Russian. In March of 2011, other documents were made available in Braille including ferry safety announcements and fire/emergency procedures. Additionally, as of September 2017, lower level boarding is now available to all passengers. Furthermore, DOT budgeted 14 million for fiscal year 2008 towards adapting city-owned commuter ferry facilities to comply with Chapter 7 of Title 19 of the Administrative Code. All construction was completed by year end 2012.
