# Pinecone Vector Generator

Import the required packages

In [None]:
import os
from dotenv import load_dotenv
from pathlib import Path
from langchain.document_loaders import ReadTheDocsLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
import pinecone

Load the environment variables from a `.env` file (fill in your own Pinecone API key and environment)

In [None]:
# Look for the .env file next to this file. `__file__` is not defined inside
# a Jupyter kernel, so fall back to the current working directory there.
try:
    env_dir = Path(__file__).parent
except NameError:
    env_dir = Path.cwd()
env_path = env_dir / ".env"

# Load the .env file, then read the Pinecone credentials from the environment.
load_dotenv(dotenv_path=env_path)
PINE_API_KEY = os.getenv("PINE_API_KEY")
PINE_ENV = os.getenv("PINE_ENV")

### Read The Docs website loader
Use this if you want to vectorize a documentation site hosted on Read the Docs

Before vectorizing, run this command in the local directory to download the site's HTML pages:
```
wget -r -A.html https://python.langchain.com/en/latest/
```

In [None]:
# ReadTheDocsLoader reads HTML files from a local directory; it does not fetch
# a URL. Point this at the folder created by the wget command above
# (wget -r mirrors a site into ./<host>/<path>).
DOCS_URL = "python.langchain.com/en/latest/"
# Target Pinecone index. Pinecone index names may only contain lowercase
# alphanumeric characters and hyphens.
INDEX_NAME = "index-name"

In [None]:
# Read the downloaded documentation pages.
loader = ReadTheDocsLoader(DOCS_URL)
raw_documents = loader.load()

# Break the pages into 1000-character chunks that overlap by 200 characters.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(raw_documents)

# Embed the chunks and store the vectors in the Pinecone index.
embeddings = OpenAIEmbeddings()
pinecone.init(api_key=PINE_API_KEY, environment=PINE_ENV)
index_name = INDEX_NAME
docsearch = Pinecone.from_documents(documents, embeddings, index_name=index_name)

### Confluence site loader
Use this if you want to vectorize a Confluence/Atlassian based wiki

In [None]:
# Confluence connection settings: replace the placeholders with your own.
URL = "https://yoursite.atlassian.com/wiki"
USERNAME = "yourusername"
APIKEY = "yourapikey"
# Target Pinecone index. Pinecone index names may only contain lowercase
# alphanumeric characters and hyphens.
INDEX_NAME = "index-name"

In [None]:
from langchain.document_loaders import ConfluenceLoader

# Fetch up to 50 pages at a time from the "SPACE" space, attachments included.
loader = ConfluenceLoader(url=URL, username=USERNAME, api_key=APIKEY)
documents = loader.load(
    space_key="SPACE",
    include_attachments=True,
    limit=50,
)

# Embed the pages and store the vectors in the Pinecone index.
embeddings = OpenAIEmbeddings()
pinecone.init(api_key=PINE_API_KEY, environment=PINE_ENV)
index_name = INDEX_NAME
docsearch = Pinecone.from_documents(documents, embeddings, index_name=index_name)

### CSV loader
Use this if you want to vectorize a CSV file

In [None]:
# Path of the CSV file to vectorize.
FILEPATH = "./path/to/your/file.csv"
# Target Pinecone index. Pinecone index names may only contain lowercase
# alphanumeric characters and hyphens.
INDEX_NAME = "index-name"

In [None]:
from langchain.document_loaders.csv_loader import CSVLoader

# Load the CSV file into documents.
loader = CSVLoader(file_path=FILEPATH)
data = loader.load()

# Embed the documents and store the vectors in the Pinecone index.
embeddings = OpenAIEmbeddings()
pinecone.init(api_key=PINE_API_KEY, environment=PINE_ENV)
index_name = INDEX_NAME
docsearch = Pinecone.from_documents(data, embeddings, index_name=index_name)

### Git loader - Local
Use this if you want to vectorize a git repo from a local clone (the cell below clones `REMOTE_REPO` into `LOCAL_PATH`)

In [None]:
# Repository to clone and the local directory to clone it into.
REMOTE_REPO = "https://github.com/masrad/ALFRED"
LOCAL_PATH = "./alfred"
# Target Pinecone index. Pinecone index names may only contain lowercase
# alphanumeric characters and hyphens.
INDEX_NAME = "index-name"

Requires the `gitpython` package:
```
pip install gitpython
```

In [None]:
from git import Repo
from langchain.document_loaders import GitLoader

# Clone only when LOCAL_PATH does not already hold a checkout. Repo.clone_from
# fails if the destination exists and is not empty, which would otherwise
# break re-running this cell.
if Path(LOCAL_PATH, ".git").exists():
    repo = Repo(LOCAL_PATH)
else:
    repo = Repo.clone_from(REMOTE_REPO, to_path=LOCAL_PATH)

# Load files from the branch the local checkout is currently on.
branch = repo.head.reference
loader = GitLoader(repo_path=LOCAL_PATH, branch=branch)
data = loader.load()

# Embed the documents and store the vectors in the Pinecone index.
embeddings = OpenAIEmbeddings()

pinecone.init(
    api_key=PINE_API_KEY,
    environment=PINE_ENV,
)

index_name = INDEX_NAME

docsearch = Pinecone.from_documents(data, embeddings, index_name=index_name)

### Git loader - Remote
Use this if you want to vectorize a remote git repo from the URL

In [None]:
# Repository URL, the local directory to clone it into, and the branch to load.
REMOTE_REPO = "https://github.com/masrad/ALFRED"
LOCAL_PATH = "./alfred"
BRANCH = "master"
# Target Pinecone index. Pinecone index names may only contain lowercase
# alphanumeric characters and hyphens.
INDEX_NAME = "index-name"

In [None]:
from langchain.document_loaders import GitLoader

# Hand the clone URL, target directory and branch to GitLoader and load the files.
loader = GitLoader(clone_url=REMOTE_REPO, repo_path=LOCAL_PATH, branch=BRANCH)
data = loader.load()

# Embed the documents and store the vectors in the Pinecone index.
embeddings = OpenAIEmbeddings()
pinecone.init(api_key=PINE_API_KEY, environment=PINE_ENV)
index_name = INDEX_NAME
docsearch = Pinecone.from_documents(data, embeddings, index_name=index_name)

### Huggingface Dataset
Use this if you want to vectorize a Hugging Face dataset

In [None]:
# Dataset to load and the column that holds the text to embed.
DATASET_NAME = "imdb"
PAGE_CONTENT = "text"
# Target Pinecone index. Pinecone index names may only contain lowercase
# alphanumeric characters and hyphens.
INDEX_NAME = "index-name"

In [None]:
from langchain.document_loaders import HuggingFaceDatasetLoader

# Load the dataset, using the configured column as each document's content.
dataset_name = DATASET_NAME
page_content_column = PAGE_CONTENT
loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)
data = loader.load()

# Embed the documents and store the vectors in the Pinecone index.
embeddings = OpenAIEmbeddings()
pinecone.init(api_key=PINE_API_KEY, environment=PINE_ENV)
index_name = INDEX_NAME
docsearch = Pinecone.from_documents(data, embeddings, index_name=index_name)

### PDF File
Use this if you want to vectorize a PDF file

There are many other ways to load PDFs; see the docs:

https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/pdf.html

In [None]:
# Path of the PDF file to vectorize.
PDF_PATH = "./path/to/your/file.pdf"
# Target Pinecone index. Pinecone index names may only contain lowercase
# alphanumeric characters and hyphens.
INDEX_NAME = "index-name"

In [None]:
from langchain.document_loaders import UnstructuredPDFLoader

# Load the PDF file into documents.
loader = UnstructuredPDFLoader(PDF_PATH)
data = loader.load()

# Embed the documents and store the vectors in the Pinecone index.
embeddings = OpenAIEmbeddings()
pinecone.init(api_key=PINE_API_KEY, environment=PINE_ENV)
index_name = INDEX_NAME
docsearch = Pinecone.from_documents(data, embeddings, index_name=index_name)