In [1]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tqdm import tqdm  # Correct import
import fitz  # PyMuPDF
import socket
import importlib
from aux_download_pdfs import download_pdfs_from_page, create_urls

In [1]:
import os
import pandas as pd

def count_files_in_folders(base_path, extension=None):
    """
    Count the files sitting directly inside every folder found under ``base_path``.

    The tree is walked recursively with ``os.walk``. Each sub-folder gets one
    entry holding the number of regular files located directly in it; files in
    deeper sub-folders are counted under those sub-folders, not their parents.
    ``base_path`` itself gets no entry, and a non-existent ``base_path`` yields
    an empty dictionary.

    Args:
        base_path (str): The directory to scan.
        extension (str, optional): When given (e.g., ".pdf" or ".txt"), only
            files whose name ends with it are counted. The comparison is
            case-insensitive. The default ``None`` counts every file.

    Returns:
        dict: A dictionary where the keys are folder names (basename only,
        not the full path) and the values are the file counts.

    Note:
        Because keys are bare folder names, two folders with the same name at
        different places in the tree share a single key; the one visited last
        by ``os.walk`` wins.
    """
    suffix = extension.lower() if extension else None
    folder_file_count = {}

    # Walk through the directory tree; only the sub-folder names are needed here.
    for root, dirs, _files in os.walk(base_path):
        for folder in dirs:
            folder_path = os.path.join(root, folder)

            # scandir exposes the file type without building each path by hand.
            with os.scandir(folder_path) as entries:
                folder_file_count[folder] = sum(
                    1
                    for entry in entries
                    if entry.is_file()
                    and (suffix is None or entry.name.lower().endswith(suffix))
                )

    return folder_file_count

# Root folders that hold one sub-folder per label: the PDFs and their text versions.
pdf_path = '/media/pablo/windows_files/00 - Master/05 - Research&Thesis/R2-Research_Internship_2/02-data/pdfs/'
txt_path = '/media/pablo/windows_files/00 - Master/05 - Research&Thesis/R2-Research_Internship_2/02-data/pdfs_txt/'

# Per-folder file counts for both trees
pdfs_counts = count_files_in_folders(pdf_path)
txt_counts = count_files_in_folders(txt_path)

# One row per PDF folder. 'Txt Count' is looked up by folder name, so a folder
# with no entry in txt_counts ends up with a missing value in that column.
df = pd.DataFrame(
    list(pdfs_counts.items()),
    columns=['Folder', 'Pdf Count'],
).assign(**{'Txt Count': lambda frame: frame['Folder'].map(txt_counts)})

# Largest PDF folders first
df_sorted = df.sort_values('Pdf Count', ascending=False)

# Write the table next to the pdfs/ folder (one level above pdf_path)
output_path = os.path.join(pdf_path, '../label_counts.csv')
df_sorted.to_csv(output_path, index=False)

# Display the sorted table as the cell output
df_sorted

Unnamed: 0,Folder,Pdf Count,Txt Count
1,controller-accessories,251,
25,batteries-non-rechargeable-primary,234,
11,speakers,200,
26,batteries-rechargeable-secondary,182,
5,microphones,164,
27,battery-chargers,137,
18,alarms-buzzers-and-sirens,133,
13,task-lighting,95,
9,rack-accessories,79,
19,aluminum-electrolytic-capacitors,71,


In [14]:
import os

def find_missing_files(pdf_path, txt_path):
    """
    Finds files that don't appear in both pdf_path and txt_path directories.

    Files are matched by name with the final extension removed
    (``os.path.splitext``). Extensions are compared case-insensitively, so
    ``A.PDF`` is treated as a PDF and ``A.TXT`` as a text file. Only the
    directories themselves are listed; sub-folders are not descended into.

    Note that only the last extension is stripped: a text file named
    ``report.pdf.txt`` has the stem ``report.pdf`` and therefore does not
    match ``report.pdf`` (stem ``report``).

    Args:
        pdf_path (str): Directory containing PDF files.
        txt_path (str): Directory containing TXT files.

    Returns:
        tuple: ``(missing_in_txt, missing_in_pdf)`` where
            missing_in_txt (set): Stems of PDFs that don't have a corresponding TXT file.
            missing_in_pdf (set): Stems of TXTs that don't have a corresponding PDF file.

    Raises:
        OSError: If either directory cannot be listed (e.g. it does not exist).
    """
    def stems_with_extension(directory, extension):
        # Stem = file name without its last extension; match ignores letter case.
        return {
            os.path.splitext(name)[0]
            for name in os.listdir(directory)
            if name.lower().endswith(extension)
        }

    pdf_files = stems_with_extension(pdf_path, ".pdf")
    txt_files = stems_with_extension(txt_path, ".txt")

    # Set differences give the one-sided leftovers in each direction.
    missing_in_txt = pdf_files - txt_files  # PDFs that don't have TXT counterparts
    missing_in_pdf = txt_files - pdf_files  # TXTs that don't have PDF counterparts

    return missing_in_txt, missing_in_pdf

# Compare a single label folder (coaxial-cables-rf) between the PDF tree and the text tree.
pdf_path = "/media/pablo/windows_files/00 - Master/05 - Research&Thesis/R2-Research_Internship_2/02-data/pdfs/coaxial-cables-rf/"
txt_path = "/media/pablo/windows_files/00 - Master/05 - Research&Thesis/R2-Research_Internship_2/02-data/pdfs_txt/coaxial-cables-rf/"

missing_in_txt, missing_in_pdf = find_missing_files(pdf_path, txt_path)

# Report both directions, one line each.
for label, stems in (
    ("PDF files missing a corresponding TXT file:", missing_in_txt),
    ("TXT files missing a corresponding PDF file:", missing_in_pdf),
):
    print(label, stems)


PDF files missing a corresponding TXT file: {'lmr-400-uf-coax-cables-datasheet', 'lmr-240-uf-coax-cables-datasheet', 'lpa-500-llpl-coax-cables-assemblies-datasheet', 'LMR-400', 'lmr-240-db-coax-cables-datasheet'}
TXT files missing a corresponding PDF file: {'1855A_techdata.pdf', '9223_techdata.pdf'}


In [1]:
from datasets import load_dataset
from transformers import BloomTokenizerFast, BloomForCausalLM

# The original ToTTo run used: load_dataset('totto', split='validation').
# Here the tables come from a local CSV file instead. Without `split`, the csv
# loader returns a DatasetDict keyed by split name (a single 'train' split by
# default), which cannot be indexed by row number. Requesting split='train'
# returns a plain Dataset so that integer row indexing works downstream.
ds = load_dataset('csv', data_files='/media/pablo/windows_files/00 - Master/05 - Research&Thesis/R2-Research_Internship_2/02-data/csv/test.csv', split='train')
print(ds)
from aux_preprocess import preprocess # This file is included in the repo

# Now we linearize the tables
# (presumably `preprocess` adds the 'linearized_table' column read later - confirm in aux_preprocess)
valid_dataset = ds.map(preprocess)

# BLOOM-560m checkpoint fine-tuned on ToTTo for table-to-text generation
model_ckpt = "mrm8488/bloom-560m-finetuned-totto-table-to-text"

tokenizer = BloomTokenizerFast.from_pretrained(model_ckpt)
# The model is placed on the GPU; this cell needs a CUDA device to run.
model = BloomForCausalLM.from_pretrained(model_ckpt).to("cuda")


def explain_hl_cells(text):
    """
    Generate text from a linearized table with the module-level BLOOM model.

    The input is tokenized with the global ``tokenizer``, moved to the "cuda"
    device, and passed to the global ``model``'s ``generate`` with a total
    length cap of 2048 and the tokenizer's EOS token as the stop token.

    Args:
        text (str): Model input (the caller passes a row's 'linearized_table').

    Returns:
        str: The first generated sequence decoded back to text, with special
        tokens kept in the output.
    """
    encoded = tokenizer(text, return_tensors='pt')

    generated = model.generate(
        encoded.input_ids.to("cuda"),
        attention_mask=encoded.attention_mask.to("cuda"),
        max_length=2048,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Only one input was given, so the first sequence is the only one.
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=False)

# Generate a description for the entry at index 1 of the preprocessed dataset.
example = valid_dataset[1]
linearized_table = example['linearized_table']

print(explain_hl_cells(linearized_table))

  from .autonotebook import tqdm as notebook_tqdm


In [33]:
import importlib
import aux_pdf_to_text as aux
# Test folders: PDFs to convert and the destination for the results.
input_path = '../02-data/pdfs/0-testing/'
output_path = '../02-data/txts/0-testing/'

# Reload the helper module first so that edits made to aux_pdf_to_text.py
# since the last import are picked up without restarting the kernel.
importlib.reload(aux)

aux.process_folder(input_path, output_path)

Processing PDFs: 100%|██████████| 2/2 [00:33<00:00, 16.71s/file]


In [6]:
import fitz
import os

def extract_images_from_pdf_fitz(pdf_path, pdf_name, output_folder):
    """
    Extracts images from each page in the PDF and saves them in the specified folder.

    Every image reference returned by ``page.get_images(full=True)`` is
    extracted and written as
    ``<pdf_name>_page<page number>_img<image number>.<ext>``, with both numbers
    starting at 1 and ``ext`` taken from the extracted image data. One
    "Saved image to ..." line is printed per file written.

    Args:
        pdf_path (str): Path to the PDF file.
        pdf_name (str): Base name for the output files.
        output_folder (str): Folder where extracted images will be saved.
            It is created if it does not exist.

    Returns:
        None
    """
    os.makedirs(output_folder, exist_ok=True)  # Ensure the output folder exists

    pdf = fitz.open(pdf_path)
    try:
        for page_num in range(len(pdf)):
            page = pdf[page_num]

            for img_index, img in enumerate(page.get_images(full=True), start=1):
                # The first item of each image entry is its xref number.
                xref = img[0]
                base_image = pdf.extract_image(xref)
                img_data = base_image["image"]
                img_ext = base_image["ext"]

                # Define the output path and save the image data
                img_path = os.path.join(
                    output_folder,
                    f"{pdf_name}_page{page_num + 1}_img{img_index}.{img_ext}",
                )
                with open(img_path, "wb") as img_file:
                    img_file.write(img_data)
                print(f"Saved image to {img_path}")
    finally:
        # Release the document even when extraction or a file write fails.
        pdf.close()

# Single-document run: extract the images of one test PDF into a folder named after it.
pdf_name = "pdf_file_4137"
pdf_path = '../02-data/pdfs/0-testing/pdf_file_4137.pdf'
output_folder = '../02-data/txts/0-testing/'

# Images go to <output_folder>/<pdf_name>/
pdf_output_folder = os.path.join(output_folder, pdf_name)

extract_images_from_pdf_fitz(
    pdf_path=pdf_path,
    pdf_name=pdf_name,
    output_folder=pdf_output_folder,
)