In [None]:
from docling.chunking import HybridChunker
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
import pandas as pd

# PDF pipeline settings: OCR is switched off, table-structure recognition is on.
pdf_pipeline_options = PdfPipelineOptions(do_ocr=False, do_table_structure=True)

# Converter that applies the settings above to every PDF input.
pdf_format_option = PdfFormatOption(pipeline_options=pdf_pipeline_options)
doc_converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

# Chunker used to split each converted document into records.
chunker = HybridChunker(max_tokens=4096)

# Documents to process. Swap in any list of PDFs you like;
# 'file' may be a URL or the path of a local PDF.
PDFs = [
    dict(title="Institute of Business Administration Program Annoucments", file="data/pa.pdf"),
]

# One record per chunk, accumulated across every PDF. `chunk_id` keeps
# increasing from one document to the next so ids stay unique.
data = []
chunk_id = 0
for pdf in PDFs:
    print("Downloading and parsing", pdf['title'])
    doc = doc_converter.convert(pdf['file']).document

    # Preview the first detected table, if any. The table item is exported with
    # export_to_dataframe() (the same call the table-export cell of this
    # notebook uses) rather than being passed to pd.DataFrame() directly.
    tables = doc.tables
    if tables:
        first_table_df = tables[0].export_to_dataframe()
        print(first_table_df.head())  # View the extracted table

    for chunk in chunker.chunk(dl_doc=doc):
        meta = chunk.model_dump()['meta']
        origin = meta.get('origin') or {}
        headings = meta['headings']
        doc_items = meta['doc_items']
        # Guard against an empty doc_items / prov list: store None for the
        # page number instead of aborting the whole run with an IndexError.
        prov = doc_items[0]['prov'] if doc_items else []
        data.append(
            {
                "id": chunk_id,
                "text": chunk.text,
                "title": pdf['title'],
                "filename": origin.get('filename'),
                "heading": headings[0] if headings else None,
                "page_num": prov[0]['page_no'] if prov else None,
            }
        )
        chunk_id += 1
        print(chunk_id)
    print("done parsing document")



In [15]:
print(data)
# Save the chunk records to CSV. The path lives in one variable so the
# confirmation message always names the file that was actually written
# (the old message reported a different file name than the one saved).
extracted_csv_path = "data/extracted_text2.csv"
df = pd.DataFrame(data)
df.to_csv(extracted_csv_path, index=False)
print(f"saved data to {extracted_csv_path}")

[{'id': 0, 'text': '03', 'title': 'Institute of Business Administration Program Annoucments', 'filename': 'pa.pdf', 'heading': None, 'page_num': 3}, {'id': 1, 'text': 'Programs on Offer\nBBA\nBS (Accounting and Finance)\nMBA\nEMBA\nMS (Finance)\nMS (Islamic Banking and Finance)\nMS (Management)\nMS (Marketing)\n04\n05\n08\n10\n11\n14\n15\n16\n17\n20\n22\n25\n28\n29\n31\n34\n38\n39\n40\n42\n44\n46', 'title': 'Institute of Business Administration Program Annoucments', 'filename': 'pa.pdf', 'heading': 'School of Business Studies (SBS)', 'page_num': 3}, {'id': 2, 'text': '80\n105\nPrograms on Offer, 1 = 51. BS (Economics), 1 = 54. BS (Economics and Mathematics), 1 = 59. BS (Social Sciences and Liberal Arts), 1 = 64. MS (Development Studies), 1 = 75. MS (Economics), 1 = 77. MS (Journalism), 1 = 79. PhD (Economics), 1 = . School of Mathematics & Computer Science (SMCS), 1 = 82. Programs on Offer, 1 = 83. BS (Computer Science), 1 = 86. BS (Mathematics), 1 = 92. MS (Computer Science), 1 = 95. 

In [None]:

from nomic import embed
import numpy as np
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from dotenv import load_dotenv
import os
load_dotenv()

def embedding_data(data):
    """Embed a list of texts with the ``nomic-embed-text-v1.5`` model.

    Args:
        data: Sequence of strings. All of them are sent in a single
            ``embed.text`` call with ``task_type='search_document'``.

    Returns:
        A list holding one ``numpy`` array per entry of
        ``output['embeddings']``. An empty input returns ``[]`` without
        calling the embedding service.
    """
    # Nothing to embed: skip the remote call entirely.
    if len(data) == 0:
        return []

    output = embed.text(
        texts=data,
        model='nomic-embed-text-v1.5',
        task_type='search_document',
    )

    # The original author's note records each embedding as having shape (768,).
    return [np.array(embedding) for embedding in output['embeddings']]

# Collect the raw text of every chunk record, embed them all in one call,
# and report how many vectors came back.
text_data = [record['text'] for record in data]
embedded_data = embedding_data(text_data)
print(len(embedded_data))




# Pinecone client; the API key is read from the PINECONE_API_KEY environment variable.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

index_name = "nlp"

# list_indexes() returns index descriptions rather than bare names, so testing
# `index_name in pc.list_indexes()` never matches and create_index would be
# attempted on every run. Compare against the list of index names instead.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index(index_name)

print("Pinecone Indices: ", pc.list_indexes())

# Number of vectors sent per upsert request; batching avoids one network
# round trip per chunk.
UPSERT_BATCH_SIZE = 32

vectors = []
for i, (d, vector) in enumerate(zip(data, embedded_data)):
    metadata = {
        "text": d['text'],  # the actual document text
        "heading": d['heading'],
        "title": d['title'],
        "source": "trusted",
    }
    vectors.append(
        {
            "id": str(i),  # unique string identifier for the vector, must be provided
            # Send plain Python floats rather than a numpy array.
            "values": np.asarray(vector).tolist(),
            # Null metadata values are not accepted, so keys whose value is
            # None (e.g. a chunk without a heading) are left out.
            "metadata": {key: value for key, value in metadata.items() if value is not None},
        }
    )

for start in range(0, len(vectors), UPSERT_BATCH_SIZE):
    batch = vectors[start:start + UPSERT_BATCH_SIZE]
    index.upsert(
        vectors=batch,
        namespace="nlp-module-index",  # optional, defaults to "default"
    )
    print(f"upserted {start + len(batch)}/{len(vectors)} vectors")


In [None]:
# Reload the .env file, then show the chunk records and the Pinecone indexes.
# Relies on `data` and `pc` created by earlier cells.
load_dotenv()
print(data)
available_indexes = pc.list_indexes()
print(available_indexes)

[{'id': 0, 'text': '03', 'title': 'Institute of Business Administration Program Annoucments', 'filename': 'pa.pdf', 'heading': None, 'page_num': 3}, {'id': 1, 'text': 'Programs on Offer\nBBA\nBS (Accounting and Finance)\nMBA\nEMBA\nMS (Finance)\nMS (Islamic Banking and Finance)\nMS (Management)\nMS (Marketing)\n04\n05\n08\n10\n11\n14\n15\n16\n17\n20\n22\n25\n28\n29\n31\n34\n38\n39\n40\n42\n44\n46', 'title': 'Institute of Business Administration Program Annoucments', 'filename': 'pa.pdf', 'heading': 'School of Business Studies (SBS)', 'page_num': 3}, {'id': 2, 'text': '80\n105\nPrograms on Offer, 1 = 51. BS (Economics), 1 = 54. BS (Economics and Mathematics), 1 = 59. BS (Social Sciences and Liberal Arts), 1 = 64. MS (Development Studies), 1 = 75. MS (Economics), 1 = 77. MS (Journalism), 1 = 79. PhD (Economics), 1 = . School of Mathematics & Computer Science (SMCS), 1 = 82. Programs on Offer, 1 = 83. BS (Computer Science), 1 = 86. BS (Mathematics), 1 = 92. MS (Computer Science), 1 = 95. 

In [None]:
# Use the standard-library Path: anyio's Path is an async wrapper whose
# filesystem methods return coroutines, which this synchronous cell never awaits.
from pathlib import Path

# Set to True to actually write each table to disk. By default the cell only
# reports the file name each table would be saved under.
SAVE_TABLES = False

output_dir = Path("data/tables")
if SAVE_TABLES:
    output_dir.mkdir(parents=True, exist_ok=True)

# NOTE: `doc` is whatever document the parsing loop converted last.
for table_ix, table in enumerate(doc.tables):
    table_df: pd.DataFrame = table.export_to_dataframe()
    print(f"## Table {table_ix}")
    # File names are 1-based even though the printed heading is 0-based.
    element_csv_filename = output_dir / f"table-{table_ix+1}.csv"
    print(element_csv_filename)
    if SAVE_TABLES:
        table_df.to_csv(element_csv_filename)