In [1]:
from decouple import config
from openai import OpenAI

In [2]:
# Resolve the three secrets this notebook needs through decouple's `config`,
# in the same order as before: OpenAI key, then the Upstash Vector URL and token.
OPENAI_API_KEY, UPSTASH_VECTOR_REST_URL, UPSTASH_VECTOR_REST_TOKEN = [
    config(setting_name)
    for setting_name in (
        "OPENAI_API_KEY",
        "UPSTASH_VECTOR_REST_URL",
        "UPSTASH_VECTOR_REST_TOKEN",
    )
]

In [3]:
import os
# Publish the key in the process environment.
# NOTE(review): presumably so OpenAI-backed clients built in later cells can
# find it without being handed the key explicitly — confirm.
os.environ.update({"OPENAI_API_KEY": OPENAI_API_KEY})

In [4]:
import wikipedia 

In [5]:
# Look up the Wikipedia page for New York City by title.
# NOTE(review): `ny` is not referenced by any other cell visible in this
# notebook — confirm whether this lookup is still needed.
ny = wikipedia.page(
    title="New York city, New York",
)

In [6]:
from langchain_openai import OpenAIEmbeddings
# Embedding client shared by the vector store below; the model is pinned
# explicitly so the same embedding model is requested on every run.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
)

In [7]:
from langchain_community.vectorstores import UpstashVectorStore

# Connection details for the Upstash Vector index, taken from the settings
# loaded at the top of the notebook.
upstash_connection = {
    "index_url": UPSTASH_VECTOR_REST_URL,
    "index_token": UPSTASH_VECTOR_REST_TOKEN,
}

# Vector store handle; it is given the `embeddings` client created above.
store = UpstashVectorStore(embedding=embeddings, **upstash_connection)


In [8]:
# Check which page titles Wikipedia offers for this query before using it as
# a page title further down. The bare name on the last line keeps the list
# displayed as the cell output.
search_hits = wikipedia.search("Uttara, Dhaka")
search_hits

['Uttara (neighbourhood)',
 'Uttara University',
 'Uttara North metro station',
 'RAJUK Uttara Model College',
 'Centrepoint, Dhaka',
 'Uttara High School and College',
 'Dhaka Metro Rail',
 'Uttara Center metro station',
 'Uttara West Thana',
 'Uttara Group of Companies']

In [9]:
from langchain_core.documents import Document
# Build one Document per city: the full article text as content, plus the
# article URL and the city string as metadata.
# NOTE(review): "title" stores the query string from `cities`, not the title
# of the page Wikipedia actually resolved to — confirm that is intended.
cities = ["Uttara, Dhaka", "Boise,Idaho"]
documents = []
for city in cities:
    page = wikipedia.page(title=city)
    documents.append(
        Document(
            page_content=page.content,
            metadata={"source": f"{page.url}", "title": city},
        )
    )

In [10]:
# Inspect the metadata ("source" URL and "title") attached to the first Document.
documents[0].metadata

{'source': 'https://en.wikipedia.org/wiki/Uttara_(neighbourhood)',
 'title': 'Uttara, Dhaka'}

In [14]:
# Number of Documents built so far; one is appended per entry in `cities`.
len(documents)

2

In [11]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import TokenTextSplitter

# Model name handed to the splitter so it can pick a tiktoken encoding.
# NOTE(review): this is a chat model name, while the embeddings are produced
# with "text-embedding-3-small" — confirm the tokenizer mismatch is acceptable.
OPENAI_LLM_MODEL = "gpt-4o"

# Chunking parameters: size 100 with no overlap between consecutive chunks.
splitter_settings = dict(chunk_size=100, chunk_overlap=0)

text_splitter = TokenTextSplitter.from_tiktoken_encoder(
    model_name=OPENAI_LLM_MODEL,
    **splitter_settings,
)

In [12]:
# Break the Documents into smaller chunks with the splitter configured above;
# `docs` is what gets written to the vector store.
docs = text_splitter.split_documents(
    documents,
)

In [13]:
# Number of chunks produced by the splitter from `documents`.
len(docs)

131

In [15]:
import uuid
from collections import Counter

# Give every chunk a deterministic ID derived from its source URL and its
# position among the chunks of that source. Without explicit IDs each run of
# this cell stores a brand-new copy of every chunk, so re-running the notebook
# fills the index with duplicates that then crowd the top-k search results.
# With stable IDs a re-run overwrites the existing vectors instead.
chunk_position_by_source = Counter()
chunk_ids = []
for chunk in docs:
    chunk_source = chunk.metadata.get("source", "")
    chunk_position = chunk_position_by_source[chunk_source]
    chunk_position_by_source[chunk_source] += 1
    chunk_ids.append(
        str(uuid.uuid5(uuid.NAMESPACE_URL, f"{chunk_source}#{chunk_position}"))
    )

# Embed and upsert the chunks under the IDs computed above; the call returns
# the IDs of the stored vectors.
inserted_vectors = store.add_documents(docs, ids=chunk_ids)

In [19]:
# Ask the vector store for the five chunks closest to the query, together
# with their similarity scores, then print each hit's metadata and score.
result = store.similarity_search_with_score(
    "The city named after trees",
    k=5,
)
for matched_chunk, similarity in result:
    print(f"{matched_chunk.metadata}-{similarity}")

{'source': 'https://en.wikipedia.org/wiki/Boise,_Idaho', 'title': 'Boise,Idaho'}-0.7419841
{'source': 'https://en.wikipedia.org/wiki/Boise,_Idaho', 'title': 'Boise,Idaho'}-0.7313181
{'source': 'https://en.wikipedia.org/wiki/Boise,_Idaho', 'title': 'Boise,Idaho'}-0.7095957
{'source': 'https://en.wikipedia.org/wiki/Boise,_Idaho', 'title': 'Boise,Idaho'}-0.6820013
{'source': 'https://en.wikipedia.org/wiki/Boise,_Idaho', 'title': 'Boise,Idaho'}-0.6707239
