In [2]:
import pdfplumber

pdf_file = 'knowledge-base/02 Embeddings.pdf'

# Collect the text of every page in the PDF into a single string.
with pdfplumber.open(pdf_file) as pdf:
    # Join pages with a newline so the last line of one page does not fuse
    # with the first line of the next. `or ''` guards against a page for
    # which extract_text() yields nothing (e.g. an image-only page).
    extracted_text = '\n'.join(
        page.extract_text() or '' for page in pdf.pages
    )

print(extracted_text)

02 Embeddings
02 Embeddings
What are Embeddings?
Embeddings are dense vector representations of data in a continuous vector space. They capture
semantic meaning and contextual relationships by mapping discrete entities (like words, sentences,
images, or user behaviors) to points in a multi-dimensional space where similar items are positioned
closer together.
Unlike traditional sparse representations (like one-hot encoding), embeddings are:
Dense: They use a fixed number of dimensions to represent information efficiently
Learned: They're derived from data rather than manually engineered
Continuous: They exist in a smooth vector space allowing for mathematical operations
Semantic: They capture meaningful relationships between entities
Real-World Example
Consider how we might represent colors. A traditional representation might use discrete RGB values
(255, 0, 0) for red. An embedding approach would learn that "scarlet" and "crimson" should be
positioned close to "red" in vector space, wh

In [3]:
import pdfplumber

from pprint import pprint

pdf_file = 'knowledge-base/02 Embeddings.pdf'

# Open the PDF through a context manager so the underlying file handle is
# closed once the tables have been read (the bare open() call never closed it).
with pdfplumber.open(pdf_file) as pdf:
    page = pdf.pages[0]  # only the first page is inspected here
    table_data = page.extract_tables()

pprint(table_data)

[]


In [4]:
import pdfplumber
import pandas as pd


def extract_table_from_pdf(pdf_path):
    """Gather table rows from every page of a PDF into one flat list.

    Args:
        pdf_path: Path of the PDF file handed to ``pdfplumber.open``.

    Returns:
        A list containing the rows returned by ``page.extract_table()`` for
        each page, in page order. Pages whose result is falsy (no table
        found, or an empty table) contribute nothing.
    """
    rows = []
    with pdfplumber.open(pdf_path) as document:
        for current_page in document.pages:
            page_table = current_page.extract_table()
            if not page_table:
                # Nothing detected on this page; move on.
                continue
            rows += page_table
    return rows


def save_to_excel(data, excel_path):
    """Write ``data`` to ``excel_path`` as a CSV file.

    Despite the function name, the output is produced with
    ``DataFrame.to_csv`` (no index column), not an Excel workbook.

    Args:
        data: Row data accepted by the ``pandas.DataFrame`` constructor.
        excel_path: Destination path of the CSV file.
    """
    pd.DataFrame(data).to_csv(excel_path, index=False)


if __name__ == "__main__":
    # Input PDF and output file; the output is CSV even though the variable
    # and the helper are named after Excel.
    pdf_path, excel_path = 'knowledge-base/02 Embeddings.pdf', 'test.csv'

    # Pull the table rows out of the PDF, then persist them to disk.
    extracted_data = extract_table_from_pdf(pdf_path)
    save_to_excel(extracted_data, excel_path)

In [6]:
import pdfplumber
from pdfminer.pdfpage import PDFPage
from pdfminer.pdftypes import PDFObjRef, resolve1
from pdfminer.psparser import PSLiteral

pdf_file = 'knowledge-base/02 Embeddings.pdf'

def resolve_dest(dest, doc):
    """Resolve a PDF destination object into its underlying data.

    Args:
        dest: The destination to resolve. May be a named destination
            (``str``, ``bytes`` or ``PSLiteral``), a dictionary with a
            ``'D'`` entry, an indirect reference (``PDFObjRef``), or an
            already-resolved value.
        doc: The document object; its ``get_dest`` method is used to look
            up named destinations.

    Returns:
        The resolved destination. Values that match none of the handled
        types are returned unchanged.
    """
    # Named destinations written as PDF string objects arrive as bytes, so
    # both str and bytes names must be looked up in the document.
    if isinstance(dest, (str, bytes)):
        dest = resolve1(doc.get_dest(dest))
    elif isinstance(dest, PSLiteral):
        dest = resolve1(doc.get_dest(dest.name))
    # A destination dictionary keeps the actual target under the 'D' key.
    if isinstance(dest, dict):
        dest = dest.get('D')
    # The target itself may still be an indirect reference.
    if isinstance(dest, PDFObjRef):
        dest = dest.resolve()
    return dest

# Walk the PDF outline (bookmarks) and print each entry with the page it
# points to, or page=None when the target cannot be mapped to a page.
with pdfplumber.open(pdf_file) as pdf:
    pdf_doc = pdf.doc
    outlines = pdf_doc.get_outlines()

    # Map page object IDs to 1-based page numbers
    pages = {page.pageid: pageno for (pageno, page)
             in enumerate(PDFPage.create_pages(pdf_doc), 1)}

    for (level, title, dest, a, se) in outlines:
        target = None

        if dest:
            # Entry carries a direct destination.
            target = resolve_dest(dest, pdf_doc)
        elif a:
            # Entry carries an action instead. The action may be stored as an
            # indirect reference, so resolve it before checking its type;
            # resolve1 leaves an already-direct dictionary as it is.
            action = resolve1(a)
            if isinstance(action, dict):
                subtype = action.get('S')
                # Only 'GoTo' actions with a destination are followed.
                if subtype and repr(subtype) == "/'GoTo'" and action.get('D'):
                    target = resolve_dest(action['D'], pdf_doc)

        # A usable target is a non-empty list whose first element refers to a
        # page object; its object ID is looked up in the page map.
        pageno = None
        if isinstance(target, list) and len(target) > 0 and hasattr(target[0], 'objid'):
            pageno = pages.get(target[0].objid)

        print(f"level={level}, title={title}, page={pageno}")


level=1, title=02 Embeddings, page=1
level=1, title=What are Embeddings?, page=1
level=2, title=Real-World Example, page=1
level=2, title=How Embeddings Work, page=1
level=3, title=The Training Process, page=1
level=3, title=Vector Operations, page=1
level=3, title=Dimensionality, page=1
level=3, title=Real-World Example, page=1
level=2, title=Use Cases of Embeddings, page=2
level=3, title=Natural Language Processing, page=2
level=3, title=Computer Vision, page=2
level=3, title=Recommender Systems, page=2
level=3, title=Graph Analysis, page=2
level=3, title=Multimodal Applications, page=2
level=3, title=Real-World Example, page=2
level=2, title=Implementing Embeddings with Hugging Face (Free Models), page=2
level=3, title=Text Embeddings, page=3
level=4, title=Sentence Transformers, page=3
level=4, title=Using Transformers Directly, page=3
level=3, title=Image Embeddings, page=3
level=3, title=Multimodal Embeddings (Text and Image), page=4
level=2, title=Building a Semantic Search Syst