In [1]:
import os
import nltk
import pinecone
import openai
from langchain_openai import OpenAIEmbeddings
from langchain_openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.docstore.document import Document
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.vectorstores.pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain.chains import RetrievalQA
from langchain import OpenAI

## Load documents

In [2]:
# Directory holding the Markdown articles to index. The location can be
# overridden with the SUBSTACK_DATA_PATH environment variable so the notebook
# is not tied to one machine; the default is the original studio path.
data_path = os.environ.get(
    "SUBSTACK_DATA_PATH",
    "/teamspace/studios/this_studio/production-incentives-insider/data/substack",
)

# Load every *.md file found under data_path (the glob is recursive), parsing
# each file with UnstructuredMarkdownLoader and showing a progress bar.
loader = DirectoryLoader(
    data_path,
    glob="**/*.md",
    show_progress=True,
    loader_cls=UnstructuredMarkdownLoader,
)
documents = loader.load()

# Number of loaded documents, displayed as the cell output.
len(documents)

100%|██████████| 7/7 [00:01<00:00,  5.96it/s]


7

## Split documents

In [3]:
# Chunking parameters handed to RecursiveCharacterTextSplitter; named here so
# they are easy to find and tune.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 20

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Break the loaded documents into smaller chunks; these chunks are what gets
# embedded and stored in the vector index later in the notebook.
split_texts = text_splitter.split_documents(documents)

# Report the chunk count and preview only the first few chunks rather than
# dumping the entire list into the cell output.
print(f"{len(split_texts)} chunks from {len(documents)} documents")
split_texts[:3]

[Document(page_content='A number of spectacular locations around the world offer production incentives, making shooting in paradise more affordable than you might think.\n\nThey include:\n\nTHE DOMINICAN REPUBLIC\n\nThe Dominican Republic has been attracting productions for decades, including Pirates of the Caribbean, Jurassic Park and the more recent The Lost City and Shotgun Wedding.', metadata={'source': '/teamspace/studios/this_studio/production-incentives-insider/data/substack/filming-for-less-in-paradise.md'}),
 Document(page_content='On the television front, the country has hosted countless international versions of Survivor, the Bachelor/Bachelorette franchise, and Paradise Hotel.\n\n“The DR” offers a wide range of locations, from tropical jungles and beautiful beaches to deserts and rich historical architecture.\n\nGiven its robust production scene, you’ll also find experienced crew, equipment, stellar private resorts accustomed to hosting productions, and studio space, includ

## Initialize Pinecone

In [4]:
# NOTE: this import rebinds the name `Pinecone`, which the first cell also
# imported from langchain.vectorstores.pinecone. From here on `Pinecone`
# refers to the Pinecone SDK client class.
from pinecone import Pinecone, ServerlessSpec

# Client handle used by the following cells to manage and open indexes.
# No arguments are passed, so credentials are presumably picked up from the
# environment (e.g. PINECONE_API_KEY) -- TODO confirm.
pc = Pinecone()

# The serverless cloud/region could instead be read from the PINECONE_CLOUD
# and PINECONE_REGION environment variables (falling back to 'aws' and
# 'us-east-1') and wrapped in a ServerlessSpec; that is not done here.

## Create or recreate index

In [5]:
import time

index_name = "production-incentives"

# Upper bound (seconds) on how long to wait for the new index to report ready,
# so that a failed creation cannot hang the notebook forever.
INDEX_READY_TIMEOUT_S = 300

# Start from a clean slate: if an index with this name already exists it is
# deleted, which discards every vector stored in it. Re-running this cell is
# therefore destructive.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

# Create a fresh serverless index.
# NOTE(review): dimension=1536 is assumed to match the length of the vectors
# produced by the embedding model used later in the notebook -- confirm if
# the embedding model is changed.
pc.create_index(
    name=index_name,
    dimension=1536,
    metric="dotproduct",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1",
    ),
)

# Poll once per second until the index reports ready, giving up with an
# explicit error once the deadline has passed.
ready_deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
while not pc.describe_index(index_name).status['ready']:
    if time.monotonic() >= ready_deadline:
        raise TimeoutError(
            f"Pinecone index {index_name!r} was not ready after "
            f"{INDEX_READY_TIMEOUT_S} seconds"
        )
    time.sleep(1)

## Display index stats

In [6]:
# Open a handle to the index and fetch its statistics (dimension, fullness,
# namespaces, total vector count) as a baseline before any data is loaded.
index = pc.Index(index_name)
index_stats = index.describe_index_stats()
index_stats

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

## Initialize vectorstore

In [7]:
# Embedding model used for both the stored chunks and the search queries.
# Built with default settings; the API key is presumably taken from the
# environment (e.g. OPENAI_API_KEY) -- TODO confirm.
embeddings = OpenAIEmbeddings()

# LangChain vector store bound to the Pinecone index by name; the similarity
# searches further down go through this object.
vectorstore = PineconeVectorStore(
    index_name=index_name,
    embedding=embeddings,
)

## Load data

In [8]:
# Embed every chunk in split_texts and store the vectors in the Pinecone
# index named index_name. The returned store is kept in
# vectorstore_from_docs; the search cells below query `vectorstore` instead.
vectorstore_from_docs = PineconeVectorStore.from_documents(
    split_texts,
    index_name=index_name,
    embedding=embeddings,
)

## Display index stats

In [9]:
import time

index = pc.Index(index_name)

# Freshly written vectors may not be reflected in the index statistics right
# away, so reading the stats immediately after loading can still report a
# count of zero. Poll for a bounded time until the count reaches the number
# of chunks that were loaded, then display whatever the index reports.
expected_vector_count = len(split_texts)
stats_deadline = time.monotonic() + 60  # seconds; best-effort wait, no error on expiry

index_stats = index.describe_index_stats()
while (
    index_stats.total_vector_count < expected_vector_count
    and time.monotonic() < stats_deadline
):
    time.sleep(1)
    index_stats = index.describe_index_stats()

index_stats

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

## Simple similarity search

In [11]:
# Sanity-check retrieval: embed a sample question and display the chunks the
# vector store considers most similar to it.
query = "What are production incentives?"
vectorstore.similarity_search(query=query)

[Document(page_content='I’m often asked about the difference between the types of production incentives offered. I’ll caveat that I’m no accountant, but I can offer a basic breakdown of the four main types of production incentives you’ll encounter and what they mean for your production.\n\nREBATE', metadata={'source': '/teamspace/studios/this_studio/production-incentives-insider/data/substack/refundable-transferable-rebate-what-does-it-all-mean.md'}),
 Document(page_content='These are the holy grail of production incentives (note: some places have grants, which are similar). It’s straight-up cash that the jurisdiction will pay to a production entity (generally a percentage of qualifying spend in the jurisdiction). Because this type of incentive is not contingent on filing a tax return, they generally pay out quickly—sometimes in less than three months. Malta, Mongolia, Morocco, Mississippi, and Minnesota all offer production rebates. And lest you think it’s just an', metadata={'source'

In [12]:
# Second sample question for the retrieval check; the matching chunks are
# displayed as the cell output.
query = "What are the production pain points?"
vectorstore.similarity_search(query)

[Document(page_content='Pain Point #1: Incentives Research\n\nFor those new to the process, there are some basic FREE TOOLS that help take the pain out of researching incentives. Of course, rules, requirements, and funding around incentives are constantly changing, so always verify!\n\nRELIABLE & COMPREHENSIVE: The Olsberg Global Incentives Index', metadata={'source': '/teamspace/studios/this_studio/production-incentives-insider/data/substack/production-pain-points-number-one.md'}),
 Document(page_content='Budgets for Content Have Never Been Tighter\n\nYet Many Unscripted Producers Continue to Ignore a Valuable Source of Potential Funding, at Their Own Peril: Production Incentives\n\nMention incentives to producers of unscripted content (e.g. documentary, lifestyle, factual, reality) and they might shrug, thinking they’re only for big budget scripted productions.\n\nOr that they’re too complicated. Or simply not worth the hassle.', metadata={'source': '/teamspace/studios/this_studio/pr