In [28]:
import os
from pinecone import Pinecone

# Initialize the connection to Pinecone (get an API key at app.pinecone.io).
# The key is a secret, so it is read from the PINECONE_API_KEY environment
# variable and never written into the notebook. When the variable is unset or
# empty, prompt for the key without echoing it.
# NOTE(review): a literal key used to be hard-coded on this line as a fallback;
# it is exposed in the notebook history and should be revoked/rotated.
api_key = os.environ.get('PINECONE_API_KEY')
if not api_key:
    from getpass import getpass

    api_key = getpass('Pinecone API key: ')

# configure client
pc = Pinecone(api_key=api_key)

In [29]:
from pinecone import ServerlessSpec

# Serverless deployment target. Each setting is taken from the environment and
# falls back to the default when the variable is unset or empty.
cloud = os.getenv("PINECONE_CLOUD") or "aws"
region = os.getenv("PINECONE_REGION") or "us-east-1"

spec = ServerlessSpec(region=region, cloud=cloud)

In [30]:
# Name under which the demo index is created, connected to and finally deleted
index_name: str = "hello-pinecone"

In [5]:
# Delete the index if one with the same name already exists, so that the
# create_index call in the next cell does not collide with a leftover index.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

In [6]:
import time

# Create a 3-dimensional cosine-similarity index on the serverless spec
# configured above.
dimensions = 3
pc.create_index(
    name=index_name,
    dimension=dimensions,
    metric="cosine",
    spec=spec
)

# Wait for the index to be ready before connecting. Polling is bounded by a
# deadline so the cell fails with a clear error instead of hanging forever
# when the index never reports ready.
ready_timeout_s = 300  # generous upper bound; adjust if index creation is slower
deadline = time.monotonic() + ready_timeout_s
while not pc.describe_index(index_name).status['ready']:
    if time.monotonic() >= deadline:
        raise TimeoutError(
            f"Index {index_name!r} was not ready after {ready_timeout_s} seconds"
        )
    time.sleep(1)

In [7]:
# Get the client object for the index created above; the upsert, stats and
# query cells below all go through it.
index = pc.Index(index_name)

In [50]:
import pandas as pd

# Two toy 3-dimensional vectors, one row per record, with columns "id" and
# "vector" (in that order).
df = pd.DataFrame(
    [
        {"id": "A", "vector": [1.0, 1.0, 1.0]},
        {"id": "B", "vector": [1.0, 2.0, 3.0]},
    ]
)
df

Unnamed: 0,id,vector
0,A,"[1.0, 1.0, 1.0]"
1,B,"[1.0, 2.0, 3.0]"


In [9]:
# insert vectors: pair every id with its vector and send the pairs in one upsert
index.upsert(vectors=zip(df["id"], df["vector"]))

{'upserted_count': 2}

In [10]:
# Show the index statistics.
# NOTE(review): in the recorded output total_vector_count is 0 even though the
# preceding upsert reported 2 vectors -- presumably the stats lag behind
# writes; re-run this cell after a short wait to confirm.
index.describe_index_stats()

{'dimension': 3,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [11]:
# Query the index with the vector [2, 2, 2] and ask for the stored values of
# each match. top_k is an upper bound: the recorded output holds only the two
# vectors that were upserted, not five.
index.query(
    vector=[2., 2., 2.],
    top_k=5,
    include_values=True) # returns at most top_k matches

{'matches': [{'id': 'A', 'score': 1.0, 'values': [1.0, 1.0, 1.0]},
             {'id': 'B', 'score': 0.925820112, 'values': [1.0, 2.0, 3.0]}],
 'namespace': '',
 'usage': {'read_units': 6}}

In [None]:
# Clean up: delete the demo index once the walkthrough above is finished.
pc.delete_index(index_name)

In [78]:
# Imports for the PDF/CSV -> embeddings -> vector store part of the notebook.
# NOTE(review): PdfReader has no use in the visible cells and FAISS appears
# only in commented-out lines -- confirm whether they are still needed.
from PyPDF2 import PdfReader
from dotenv import load_dotenv
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
# NOTE(review): load_dotenv() runs here, in the middle of the imports, and is
# called again in the next cell.
load_dotenv()
from langchain_pinecone import PineconeVectorStore
from langchain.document_loaders.csv_loader import CSVLoader

In [79]:
# Load variables from a .env file into the process environment. This repeats
# the call already made in the import cell above; the recorded result is True.
load_dotenv()

True

In [17]:
# Extract the text of every page of the PDF into one string. The context
# manager closes the document even if extraction fails, and join() avoids
# rebuilding the accumulated string once per page.
with fitz.open("1.pdf") as pdf_document:
    text = "".join(page.get_text() for page in pdf_document)

# Split the text into chunks of at most 1000 characters (length is measured
# with len) with an overlap setting of 200 between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)
chunks = text_splitter.split_text(text=text)

# Embedding model used by the vector-store cells below.
embeddings = OpenAIEmbeddings()
# Local alternative, currently disabled:
# VectorStore = FAISS.from_texts(chunks, embeddings)

In [86]:
# Embed the PDF text chunks and store them in the "cv-vector-database" index.
# NOTE(review): this is a different index from "hello-pinecone" used earlier,
# and no visible cell creates it -- presumably it already exists; confirm.
docsearch = PineconeVectorStore.from_texts(chunks, embeddings, index_name="cv-vector-database")

In [75]:
# Load resumes_scores.csv through CSVLoader; the result is kept in `documents`.
loader = CSVLoader(file_path="resumes_scores.csv")
documents = loader.load()
# Re-creates the embeddings object, built the same way as in the PDF cell.
embeddings = OpenAIEmbeddings()
# Local alternative, currently disabled:
# db = FAISS.from_documents(documents, embeddings)

In [None]:
# Embed the CSV documents loaded in the previous cell and store them in the
# "cv-vector-database" index. from_documents() expects Document objects, so
# it must receive `documents`; `chunks` holds the plain strings produced by
# split_text() and is what from_texts() takes.
docsearch = PineconeVectorStore.from_documents(documents, embeddings, index_name="cv-vector-database")

In [88]:
# Ask the vector store for the 3 entries most similar to the question. The
# result is stored in `ans`; this cell does not display it.
# NOTE(review): the recorded run of this cell failed with a Pinecone
# 401 Unauthorized ("Malformed domain") -- check the API key / environment
# configuration before relying on this cell.
query = "What is the university of Do Minh Quang"
ans = docsearch.similarity_search(query, k=3)

UnauthorizedException: (401)
Reason: Unauthorized
HTTP response headers: HTTPHeaderDict({'Date': 'Thu, 25 Apr 2024 13:43:33 GMT', 'Content-Type': 'text/plain', 'Content-Length': '12', 'Connection': 'keep-alive', 'x-pinecone-auth-rejected-reason': 'Malformed domain', 'www-authenticate': 'Malformed domain', 'server': 'envoy'})
HTTP response body: Unauthorized


In [89]:
# Add one more text to the same vector store; the recorded output is a list
# containing a single id string.
docsearch.add_texts(["More text!"])

['430e0dc8-b9f9-4a90-949c-cb1ecd13722e']