In [13]:
import os
import nltk
import pinecone
import openai
from langchain_openai import OpenAIEmbeddings
from langchain_openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain.docstore.document import Document
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.vectorstores.pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain.chains import RetrievalQA
from langchain import OpenAI

## Load documents

In [14]:
# Directory that holds the source PDFs. The default is the original studio
# location; set PRODUCTION_INCENTIVES_DATA_DIR to run the notebook on another
# machine without editing this cell.
data_path = os.environ.get(
    "PRODUCTION_INCENTIVES_DATA_DIR",
    "/teamspace/studios/this_studio/production-incentives-insider/data/motionpicture_dot_org",
)

# Recursively pick up every PDF under data_path and parse it with PyPDFLoader.
# The loaded Document objects carry 'source' and 'page' metadata.
loader = DirectoryLoader(data_path, glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader)
documents = loader.load()

# Number of Document objects loaded.
len(documents)

 31%|███▏      | 5/16 [00:11<00:24,  2.20s/it]Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 34 0 (offset 0)
Ignoring wrong pointing object 36 0 (offset 0)
Ignoring wrong pointing object 40 0 (offset 0)
Ignoring wrong pointing object 53 0 (offset 0)
Ignoring wrong pointing object 60 0 (offset 0)
Ignoring wrong pointing object 95 0 (offset 0)
Ignoring wrong pointing object 128 0 (offset 0)
Ignoring wrong pointing object 130 0 (offset 0)
Ignoring wrong pointing object 163 0 (offset 0)
Ignoring wrong pointing object 236 0 (offset 0)
Ignoring wrong pointing object 742 0 (offset 0)
Ignoring wrong pointing object 801 0 (offset 0)
 44%|████▍     | 7/16 [00:16<00:23,  2.61s/it]Ignoring wrong pointing object 27 0 (offset 0)
Ignoring wrong pointing object 29 0 (offset 0)
Ignoring wr

575

## Split documents

In [15]:
# Split the loaded documents into small, slightly overlapping chunks
# (chunk_size=500, chunk_overlap=20) so each chunk can be embedded on its own.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
split_texts = text_splitter.split_documents(documents)

# Show the chunk count and a short sample instead of echoing every chunk,
# which would flood the notebook output.
print(f"{len(split_texts)} chunks")
split_texts[:3]

[Document(page_content='Economic Impact of the New Mexico \nFilm Production Tax Credit  \nA Study for the New Mexico \nFilm Office by Olsberg •SPI \n19th November 2021', metadata={'source': '/teamspace/studios/this_studio/production-incentives-insider/data/motionpicture_dot_org/NMFO_EconomicImpactStudy_NMFilmProductionIncentiveProgram_2021.pdf', 'page': 0}),
 Document(page_content='Economic Impact of the New Mexico Film Production Tax Credit  \n© Olsberg•SPI 20 21  19th November  2021  ii Contents  \n1. Summary of Key Findings  ................................ ................................ ................................ ...................  4 \n2. Executive Summary  ................................ ................................ ................................ ...........................  5', metadata={'source': '/teamspace/studios/this_studio/production-incentives-insider/data/motionpicture_dot_org/NMFO_EconomicImpactStudy_NMFilmProductionIncentiveProgram_2021.pdf', 'page': 1}

## Initialize Pinecone

In [16]:
# Pinecone SDK classes. Note: this `Pinecone` (the SDK client) rebinds the name
# imported earlier from langchain.vectorstores.pinecone.
from pinecone import Pinecone, ServerlessSpec

# No credentials are passed explicitly, so the client has to find them itself
# (presumably via the PINECONE_API_KEY environment variable — confirm for the
# installed SDK version).
pc = Pinecone()

# Kept for reference: build the serverless spec from environment variables
# instead of the literals used when the index is created.
# cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
# region = os.environ.get('PINECONE_REGION') or 'us-east-1'
# spec = ServerlessSpec(cloud=cloud, region=region)

## Create or recreate index

In [17]:
import time

index_name = "motionpicturedotorg"
EMBEDDING_DIMENSION = 1536   # length of every vector stored in the index
INDEX_READY_TIMEOUT_S = 300  # give up waiting for the new index after this long

# Start from an empty index: drop any existing index with this name.
# WARNING: this permanently removes every vector already stored in it.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

# Create a fresh serverless index.
pc.create_index(
    name=index_name,
    dimension=EMBEDDING_DIMENSION,
    metric="dotproduct",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1",
    ),
)

# Poll until the index reports ready, but never wait forever: an index that
# does not become ready raises instead of hanging the kernel.
deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
while not pc.describe_index(index_name).status['ready']:
    if time.monotonic() >= deadline:
        raise TimeoutError(
            f"Pinecone index {index_name!r} was not ready after {INDEX_READY_TIMEOUT_S} seconds"
        )
    time.sleep(1)

## Display index stats

In [18]:
# Open a handle to the index and show its statistics.
index = pc.Index(index_name)
stats_before_load = index.describe_index_stats()
stats_before_load

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

## Initialize vectorstore

In [19]:
# Embedding client built with its default settings. The vectors it produces
# must have the same length as the index dimension used when the index was created.
embeddings = OpenAIEmbeddings()

# LangChain vector-store wrapper bound to the Pinecone index by name.
vectorstore = PineconeVectorStore(
    embedding=embeddings,
    index_name=index_name,
)

## Embed the chunks and load them into the index

In [20]:
# Embed every chunk in split_texts and write the vectors to the index named
# index_name; the returned store object is kept in vectorstore_from_docs.
vectorstore_from_docs = PineconeVectorStore.from_documents(
    split_texts,
    embedding=embeddings,
    index_name=index_name,
)

## Display index stats after loading

In [21]:
# Re-open the index handle and show its statistics again.
index = pc.Index(index_name)
stats_after_load = index.describe_index_stats()
stats_after_load

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 2983}},
 'total_vector_count': 2983}

## Simple similarity search

In [22]:
# Smoke test: run one natural-language question through the vector store and
# display the documents it returns.
query = "What are production incentives?"
matching_docs = vectorstore.similarity_search(query)
matching_docs

[Document(page_content='Developed markets boast a compelling blend of incentives, production expertise, state -of-the-\nart facilities, and a well -established supply chain.   \n3.2. Growth of Competing Production Incentive Programs  \nAs outlined, f ilm and television production incentives are a commonly utilized policy tool  \nfor legislators aiming to stimulate production activity. As well as attracting high -value out -\nof-state or international productions to a jurisdiction, an effective production incentive can', metadata={'page': 22.0, 'source': '/teamspace/studios/this_studio/production-incentives-insider/data/motionpicture_dot_org/The-Economic-Impact-of-Georgias-Entertainment-Industry-Tax-Credit-Final-Report-2023-11-06.pdf'}),
 Document(page_content='incentives that are currently active. \nIt includes all automatic national, state and \nprovince-level incentives relevant to inward \ninvestment productions rather than solely local \nprojects, and which generally return a share