In [6]:
# from langchain.document_loaders import Document
from langchain.indexes import VectorstoreIndexCreator
from langchain.utilities import ApifyWrapper

In [3]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into os.environ.
load_dotenv()

# Credentials are taken from the environment / .env file only. Do not assign
# them in the notebook: a hard-coded value (even an empty string) would
# overwrite whatever load_dotenv() just provided and would be saved with the
# notebook. Instead, verify that both keys are present and non-empty.
_missing_keys = [
    key for key in ("OPENAI_API_KEY", "APIFY_API_TOKEN") if not os.environ.get(key)
]
if _missing_keys:
    # KeyError is what a plain os.environ[...] lookup raises for a missing key.
    raise KeyError(
        "Missing required environment variable(s): " + ", ".join(_missing_keys)
    )

# NOTE(review): ApifyWrapper is presumably configured from APIFY_API_TOKEN in
# the environment — confirm against the installed LangChain version.
apify = ApifyWrapper()

In [48]:
# Document is used by the mapping function below; import it in this cell so
# the cell does not depend on a name that no other cell defines.
from langchain.docstore.document import Document

# Run the Apify "website-content-crawler" actor and wrap its result dataset in
# a loader. Each dataset item is mapped to a Document whose page_content is
# the item's "text" (empty string when that value is falsy, e.g. None) and
# whose metadata records the item's "url" as the source.
# NOTE(review): the start URL is an empty string in this copy of the notebook
# — presumably redacted; set it to the site to crawl before running.
loader = apify.call_actor(
    actor_id="apify/website-content-crawler",
    run_input={"startUrls": [{"url": ""}]},
    dataset_mapping_function=lambda item: Document(
        page_content=item["text"] or "", metadata={"source": item["url"]}
    ),
)

In [60]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Materialise all documents from whichever loader `loader` currently refers to.
data = loader.load()

# Recursive character splitter: chunks of up to 1000 with an overlap of 100,
# lengths measured with len(), trying the separators in the listed order and
# treating them as literal strings rather than regular expressions.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    is_separator_regex=False,
    length_function=len,
    chunk_size=1000,
    chunk_overlap=100,
)

# Split the loaded documents and report how many chunks were produced.
docs_chunks = text_splitter.split_documents(data)
print(len(docs_chunks))

47


In [61]:
import pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone

# Name of the Pinecone index used by the cells that follow.
index_name = "esg-index"

# Configure the Pinecone client from environment variables; a missing
# variable raises KeyError from the os.environ lookup.
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENV"],
)

In [62]:
# Embedding model instance, constructed with no explicit arguments.
embeddings = OpenAIEmbeddings()

# Build a Pinecone-backed vector store from the document chunks under the
# index name chosen above.
# NOTE(review): whether this call creates the index or requires it to exist
# already depends on the installed LangChain/Pinecone versions — confirm.
docsearch = Pinecone.from_documents(
    documents=docs_chunks,
    embedding=embeddings,
    index_name=index_name,
)

In [None]:
# Build an index straight from the current `loader` using a
# VectorstoreIndexCreator constructed with its defaults. This is independent
# of the `docsearch` object built from the pre-split chunks.
index = VectorstoreIndexCreator().from_loaders(loaders=[loader])



In [None]:
# Question to ask against the index built from the loader.
query = "What is LangChain?"

# The result is read below through its "answer" and "sources" keys.
result = index.query_with_sources(question=query)

In [None]:
# Print the answer first, then the sources it was attributed to.
for field in ("answer", "sources"):
    print(result[field])

In [9]:
from langchain.document_loaders import CSVLoader

In [57]:
# Loader for the ESG CSV file, read as UTF-8.
# NOTE(review): this rebinds `loader`, which an earlier cell assigns to the
# Apify dataset loader. The execution counts (In [57] here, In [60] on the
# splitting cell) suggest the splitter was last run against this CSV loader —
# confirm which source is intended and consider a distinct variable name.
loader = CSVLoader(file_path='ESG_data_updated.csv', encoding='utf-8')

In [58]:
# Load the documents from the current `loader` into `data`. The same name
# `data` is also assigned by the text-splitting cell, so the value depends on
# which cell ran last.
data = loader.load()

In [59]:
# Show how many documents were loaded (displayed as the cell's output).
len(data)

22