In [None]:
import os, json,unstructured_client
from unstructured_client.models import operations, shared
from unstructured_client.models import operations, shared
from unstructured_client.models.errors import SDKError
from pathlib import Path
from typing import List, Tuple
from dotenv import load_dotenv
import PyPDF2
from PyPDF2 import PdfReader
import cohere
co = cohere.ClientV2(api_key=os.getenv('cohere_api_key'))
# Load environment variables from .env file
load_dotenv()
un_client = unstructured_client.UnstructuredClient(
    api_key_auth=os.getenv("UNSTRUCTURED_API_KEY")
)
def get_elements_from_file(file_path:str,strategy:shared.Strategy=shared.Strategy.FAST):
    with open(file_path, "rb") as f:          # pass bytes, not a file handle
        file_bytes = f.read()
            
    filename=os.path.basename(file_path)
    req = operations.PartitionRequest(
        partition_parameters=shared.PartitionParameters(
            strategy=strategy,
            files=shared.Files(
                content=file_bytes,
                file_name=filename,
                
            ),
            
            languages=["heb", "eng"],
        )
    )

    

    try:  
        res = un_client.general.partition(request=req)
        elements = res.elements                          # list of Element objects
        
        return elements
    except SDKError as e:                                # friendlier error handling
        print(f"{e.status_code}: {e.message}")

def list_all_files(
    start: str | Path,
    exts: Tuple[str, ...] = (".pdf", ".doc", ".docx"),
) -> List[str]:
    """
    Return a list of full (absolute) paths for every *PDF, DOC, or DOCX* file
    inside *start* and all its sub-directories.

    Parameters
    ----------
    start : str | Path
        The root folder to walk. Tilde (~) is expanded and the path is
        resolved so the result is always absolute.
    exts : tuple[str, ...], optional
        File-name suffixes to include (case-insensitive). Default picks
        '.pdf', '.doc', and '.docx'.

    Raises
    ------
    FileNotFoundError
        If *start* does not exist or is not a directory.
    """
    root = Path(start).expanduser().resolve()

    if not root.is_dir():
        raise FileNotFoundError(f"{root} is not an existing directory")

    # rglob('*') walks recursively; filter by suffix (case-insensitive)
    return [
        str(p)
        for p in root.rglob("*")
        if p.is_file() and p.suffix.lower() in exts
    ]

def check_for_scanned_file(file_path:str)->bool:
    
    """
    Check if the file is scanned or not.
    
    Args:
        file (File): The File object to be checked.
        
    Returns:
        bool: True if the file is scanned, False otherwise.
    """
    pdf = PyPDF2.PdfReader(file_path)
    pages= pdf.pages
    # Check if the file has any images
    if len(pages[0].extract_text()) == 0:
        # If the first page has no text, assume it's a scanned file
        return True
    else:
        # If the first page has text, assume it's not a scanned file
        return False

In [3]:
pathes=list_all_files(r"\\nts02\IT\Personal Directories\Test\llm_testing")
pathes

['\\\\nts02\\IT\\Personal Directories\\Test\\llm_testing\\01 Precalculus.formulas.pdf',
 '\\\\nts02\\IT\\Personal Directories\\Test\\llm_testing\\12mg data111.pdf',
 '\\\\nts02\\IT\\Personal Directories\\Test\\llm_testing\\2501.00663v1.pdf',
 '\\\\nts02\\IT\\Personal Directories\\Test\\llm_testing\\2510246 ELCAM.pdf',
 '\\\\nts02\\IT\\Personal Directories\\Test\\llm_testing\\2911.1988.pdf',
 '\\\\nts02\\IT\\Personal Directories\\Test\\llm_testing\\6447084_ForwarderInvoice_32650841.pdf',
 '\\\\nts02\\IT\\Personal Directories\\Test\\llm_testing\\ALT-DOC1985-1_180_001_20230530_pDDP_Rev. B_signed.pdf',
 '\\\\nts02\\IT\\Personal Directories\\Test\\llm_testing\\amazon.pdf',
 '\\\\nts02\\IT\\Personal Directories\\Test\\llm_testing\\ANSI ASQ-Z1_4;2003 - Sampling procedures and tables inspection.pdf',
 '\\\\nts02\\IT\\Personal Directories\\Test\\llm_testing\\Approaching (Almost) Any Machine Learning Problem.pdf',
 '\\\\nts02\\IT\\Personal Directories\\Test\\llm_testing\\Arburb_manual.pdf',
 '\\

In [4]:
len(pathes)

172

In [10]:
for path in pathes:
    if not path.endswith(".pdf"):
        continue

    try:
        if (check_for_scanned_file(path)):
            print(f"File {os.path.basename(path) } is scanned")
        else:
            print(f"File {os.path.basename(path) } is not scanned")
        # elements = get_elements_from_file(path)
        # Print the elements
    except Exception as e:
        print(f"Error processing file {os.path.basename(path)}: {e}")
        continue
    

File 01 Precalculus.formulas.pdf is scanned
File 12mg data111.pdf is not scanned
File 2501.00663v1.pdf is not scanned
File 2510246 ELCAM.pdf is not scanned
File 2911.1988.pdf is scanned
File 6447084_ForwarderInvoice_32650841.pdf is not scanned
File ALT-DOC1985-1_180_001_20230530_pDDP_Rev. B_signed.pdf is not scanned
File amazon.pdf is not scanned
File ANSI ASQ-Z1_4;2003 - Sampling procedures and tables inspection.pdf is not scanned
File Approaching (Almost) Any Machine Learning Problem.pdf is not scanned
File Arburb_manual.pdf is not scanned
File Arburg_Operating manual.pdf is not scanned
File bf42d066-ed4e-4d4e-866c-dd11550773d1.pdf is scanned
File Catheter securement systems comparison.pdf is not scanned
File Catheter_Securement_Systems_for_Peripherally.7.pdf is not scanned
File Collaborate in Qlik Sense.pdf is not scanned
File Compliance_response_octoplant_GxP_Edition_2024_JK_DR.pdf is not scanned
Error processing file Connect-newsletter-y24w05s.pdf: PyCryptodome is required for AES

In [8]:
unstructerd_files=[]
for i in range(3):
    if pathes[i].endswith(".pdf"):
        if check_for_scanned_file(pathes[i]):
            strategy=shared.Strategy.VLM
        else:
            strategy=shared.Strategy.FAST
    elif pathes[i].endswith(".doc") or pathes[i].endswith(".docx"):
        strategy=shared.Strategy.FAST
    el=get_elements_from_file(pathes[i],strategy)
    unstructerd_files.append(el)


INFO: HTTP Request: GET https://api.unstructuredapp.io/general/docs "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: GET https://api.unstructuredapp.io/general/docs "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: GET https://api.unstructuredapp.io/general/docs "HTTP/1.1 200 OK

In [14]:
for file in unstructerd_files:
    print(file[0]['metadata']['filename'])
    


01 Precalculus.formulas.pdf
12mg data111.pdf
2501.00663v1.pdf


In [5]:
import pandas as pd
cols=['type','element_id','text','metadata','filename','page_number','text_as_html','languages','filetype','partitioner_type']
metadata_cols=['filename','page_number','text_as_html','languages','filetype','partitioner_type']
# df=pd.DataFrame(columns=cols)
df=pd.read_csv('unstructured_df.csv')
df

INFO: NumExpr defaulting to 4 threads.


Unnamed: 0,type,element_id,text,metadata,filename,page_number,text_as_html,languages,filetype,partitioner_type
0,Title,"{'type': 'Title', 'element_id': '1b50c69fd4e27...",ArtPrompt: ASCII Art-based Jailbreak Attacks a...,"{'languages': ['heb', 'eng'], 'page_number': 1...",ArtPrompt- ASCII Art-based Jailbreak Attacks a...,1.0,,"['heb', 'eng']",application/pdf,
1,UncategorizedText,"{'type': 'UncategorizedText', 'element_id': '6...",♣,"{'languages': ['heb', 'eng'], 'page_number': 1...",ArtPrompt- ASCII Art-based Jailbreak Attacks a...,1.0,,"['heb', 'eng']",application/pdf,
2,NarrativeText,"{'type': 'NarrativeText', 'element_id': '8708f...",WARNING: This paper contains model outputs tha...,"{'languages': ['heb', 'eng'], 'page_number': 1...",ArtPrompt- ASCII Art-based Jailbreak Attacks a...,1.0,,"['heb', 'eng']",application/pdf,
3,UncategorizedText,"{'type': 'UncategorizedText', 'element_id': '4...",♣‡,"{'languages': ['heb', 'eng'], 'page_number': 1...",ArtPrompt- ASCII Art-based Jailbreak Attacks a...,1.0,,"['heb', 'eng']",application/pdf,
4,UncategorizedText,"{'type': 'UncategorizedText', 'element_id': '2...",♣‡,"{'languages': ['heb', 'eng'], 'page_number': 1...",ArtPrompt- ASCII Art-based Jailbreak Attacks a...,1.0,,"['heb', 'eng']",application/pdf,
...,...,...,...,...,...,...,...,...,...,...
2846,ListItem,"{'type': 'ListItem', 'element_id': 'c45eb708ec...",Document Revision Control:,"{'category_depth': 0, 'filename': 'Risk Manage...",Risk Management Plan - Bonded Manifolds.doc,,,['eng'],application/msword,
2847,Table,"{'type': 'Table', 'element_id': '666d7a3208420...",Revision History Tracking: Date: Revision 0- f...,{'emphasized_text_contents': ['Revision Histor...,Risk Management Plan - Bonded Manifolds.doc,,<table><tr><td>Revision History Tracking:</td>...,['eng'],application/msword,
2848,ListItem,"{'type': 'ListItem', 'element_id': 'beebf563ba...",Review and approval:,"{'category_depth': 0, 'filename': 'Risk Manage...",Risk Management Plan - Bonded Manifolds.doc,,,['eng'],application/msword,
2849,Table,"{'type': 'Table', 'element_id': 'a2f1fc0cec9c9...",Written by: Name Date Signature QA Eng. - - -,"{'emphasized_text_contents': ['Written by:', '...",Risk Management Plan - Bonded Manifolds.doc,,<table><tr><td>Written by:</td><td>Name</td><t...,"['eng', 'deu']",application/msword,


In [6]:

def add_file_to_df(file: list, df: pd.DataFrame):
    
    for element in file:
        # Check if the element already exists in the DataFrame
        if df[df['element_id'] == str(element)].empty:
            # If it doesn't exist, create a new row
            pass
        else:
            # If it exists, skip to the next element
            continue
        # Create a dictionary for the row
        row_dict = {
            'type': element['type'],
            'element_id': str(element),
            'text': element['text'],
            'metadata': element['metadata'] 
        }
        
        # Extract each metadata column if it exists
        for meta_col in metadata_cols:
            if meta_col in element['metadata']:
                row_dict[meta_col] = element['metadata'][meta_col]
            else:
                row_dict[meta_col] = None  # Set to None if metadata field doesn't exist
                
        # Using concat instead of append (which is deprecated)
        df = pd.concat([df, pd.DataFrame([row_dict])], ignore_index=True)
    return df
# Iterate over the list of files and add each to the DataFrame





In [9]:
for file in unstructerd_files:
    df = add_file_to_df(file, df)
    # for el in file:
        # print(el)


In [None]:
# df.to_csv('unstructured_df.csv', index=False)

In [10]:
len(df)

4187

In [19]:
import chromadb

client=chromadb.PersistentClient('chromedb')
collection_name='first_collection'
collection=client.get_or_create_collection(name=collection_name)
from chromadb.utils import embedding_functions
embed_model = "embed-v4.0" 
embedding_function=embedding_functions.CohereEmbeddingFunction(api_key=os.getenv("COHERE_API_KEY"), model_name=embed_model)
chunk_size=20
chunk_overlap=3

INFO: Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [28]:
(df.iloc[1].element_id)
import ast, json, textwrap

# Original string
s = df.iloc[1].element_id
s
import ast

# Python dictionary string


# Convert to Python dictionary
dict_object = ast.literal_eval(s)
# Or convert to JSON string first, then to JSON object
import json
json_object = json.loads(json.dumps(dict_object))
json_object

{'type': 'UncategorizedText',
 'element_id': '67f7ac4b605588d096605bc20444c036',
 'text': '♣',
 'metadata': {'languages': ['heb', 'eng'],
  'page_number': 1,
  'parent_id': '1b50c69fd4e278e6fc164601259be612',
  'filename': 'ArtPrompt- ASCII Art-based Jailbreak Attacks against Aligned LLMs.pdf',
  'filetype': 'application/pdf'}}

In [None]:
# def add_element_to_collection(element, collection):
    

In [301]:
def text_to_embedding(text):
    res=co.embed(
        model=embed_model,
        input_type="search_document",
        embedding_types=["float"],
        texts=[text],
    ).embeddings.float[0]
    return res
    

In [314]:
row=df.iloc[0]
# print(row['text'])
# text_to_embedding(row['text'])
row['metadata']['languages']=str(row['metadata']['languages'])
collection.add(
    documents=[row['filename']],
    metadatas=[row['metadata']],
    ids=[row['element_id']],
    embeddings=[text_to_embedding(row['text'])],
)

INFO: HTTP Request: POST https://api.cohere.com/v2/embed "HTTP/1.1 200 OK"


In [316]:
query="what is the name of the file?"
results=collection.query(
    query_embeddings=[text_to_embedding(query)],
    n_results=1,
    include=["documents", "metadatas", "distances"],
)
results

INFO: HTTP Request: POST https://api.cohere.com/v2/embed "HTTP/1.1 200 OK"


{'ids': [["{'type': 'Title', 'element_id': '1b50c69fd4e278e6fc164601259be612', 'text': 'ArtPrompt: ASCII Art-based Jailbreak Attacks against Aligned LLMs', 'metadata': {'languages': ['heb', 'eng'], 'page_number': 1, 'filename': 'ArtPrompt- ASCII Art-based Jailbreak Attacks against Aligned LLMs.pdf', 'filetype': 'application/pdf'}}"]],
 'embeddings': None,
 'documents': [['ArtPrompt- ASCII Art-based Jailbreak Attacks against Aligned LLMs.pdf']],
 'uris': None,
 'included': ['documents', 'metadatas', 'distances'],
 'data': None,
 'metadatas': [[{'page_number': 1,
    'languages': "['heb', 'eng']",
    'filetype': 'application/pdf',
    'filename': 'ArtPrompt- ASCII Art-based Jailbreak Attacks against Aligned LLMs.pdf'}]],
 'distances': [[1.593124508857727]]}