# Semantic search

In [None]:
import os
from dotenv import load_dotenv, find_dotenv

In [None]:
# Locate a .env file and load the variables it defines into the process
# environment (presumably the API credentials the later cells need — confirm).
# Binding the result to `_` keeps the notebook from echoing the return value.
_ = load_dotenv(find_dotenv())

### Creating documents manually

In [None]:
from langchain_core.documents import Document

# Hand-built sample documents: each row below supplies the body text plus the
# title and author that go into the document's metadata.
documents = [
    Document(page_content=text, metadata={"title": title, "author": author})
    for text, title, author in (
        ("This is a test document", "Test Document", "Test Author"),
        (
            "This is another test document",
            "Another Test Document",
            "Another Test Author",
        ),
    )
]

### Document loaders

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# NOTE(review): machine-specific path to a local PDF — point it at your own
# file before running this cell elsewhere.
file_path = "~/Downloads/CV_petr_olivka_general.pdf"
loader = PyPDFLoader(file_path)

# Read the PDF; the loader presumably yields one Document per page — confirm
# against the loader documentation.
docs = loader.load()

# Report how many documents the loader produced.
print(len(docs))

In [None]:
# Preview the first loaded document: the first 100 characters of its text
# followed by an ellipsis, then its metadata.
print(f"{docs[0].page_content[:100]}...")
print(docs[0].metadata)

### Splitting the document content

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Break the loaded documents into overlapping chunks so each piece can be
# embedded on its own.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,  # keep each chunk's starting offset in its metadata
    chunk_overlap=200,  # amount shared between neighbouring chunks
    chunk_size=1000,  # upper bound on the size of a single chunk
)
all_splits = text_splitter.split_documents(docs)

# Trailing expression: the notebook displays the number of chunks produced.
len(all_splits)

### Embeddings

In [None]:
from langchain_openai import OpenAIEmbeddings

# Embedding client used by the following cell; the embedding model is chosen
# by name.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [None]:
# Embed the text of the first two chunks, one embed_query call per chunk.
vector_1 = embeddings.embed_query(all_splits[0].page_content)
vector_2 = embeddings.embed_query(all_splits[1].page_content)

# Sanity check: both embeddings must contain the same number of elements.
assert len(vector_1) == len(vector_2)
print(f"Generated vectors of length {len(vector_1)}")
# Show the first ten components of the first vector.
print(vector_1[:10])