In [1]:
import fitz  # PyMuPDF
import PyPDF2
from PIL import Image
import pytesseract
import extract_msg
import json
import os
from langdetect import detect, DetectorFactory

In [2]:
# Set the seed for language detection to ensure consistent results
DetectorFactory.seed = 0

In [3]:
# Define the path to the folder containing documents
folder_path = './docs/'

In [4]:
def preprocess_image(image):
    """
    Preprocess the image to improve OCR accuracy.
    - Convert to grayscale
    - Apply binarization

    Args:
        image (PIL.Image): The image to preprocess.

    Returns:
        PIL.Image: The preprocessed image.
    """
    gray = image.convert('L')  # Convert to grayscale
    binarized = gray.point(lambda x: 0 if x < 128 else 255, '1')  # Binarization
    return binarized

In [5]:
def extract_text_with_ocr(page):
    """
    Extract text from a PDF page using OCR.

    Args:
        page (fitz.Page): The PDF page to extract text from.

    Returns:
        str: The extracted text.
    """
    pix = page.get_pixmap(dpi=300)  # Increase DPI for better OCR accuracy
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    # Preprocess the image before OCR
    preprocessed_img = preprocess_image(img)
    text = pytesseract.image_to_string(preprocessed_img)
    return text

In [6]:
def extract_text_from_pdf(file_path):
    """
    Extracts text from a PDF file using a hybrid approach.
    Tries direct text extraction with PyPDF2 first, and falls back to OCR if necessary.

    Args:
        file_path (str): The path to the PDF file.

    Returns:
        str: The extracted text from the PDF file.
    """
    full_text = ''
    try:
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page_num in range(len(reader.pages)):
                page = reader.pages[page_num]
                text = page.extract_text()

                if text and text.strip():  # If text extraction is successful
                    full_text += text.strip() + " "
                else:
                    # If PyPDF2 fails to extract text, use OCR as a fallback
                    with fitz.open(file_path) as document:
                        fitz_page = document.load_page(page_num)
                        ocr_text = extract_text_with_ocr(fitz_page)
                        full_text += ocr_text.strip() + " "
    except Exception as e:
        print(f"Error reading PDF {file_path}: {e}")
    return full_text.strip()


In [7]:
def read_msg(file_path):
    """
    Extracts text from a .msg file.

    Args:
        file_path (str): The path to the .msg file.

    Returns:
        str: The extracted text from the .msg file.
    """
    try:
        msg = extract_msg.Message(file_path)
        return msg.body.strip()
    except Exception as e:
        print(f"Error reading MSG {file_path}: {e}")
        return ""

In [8]:
def detect_language(text):
    """
    Detects the language of the given text.

    Args:
        text (str): The text to detect the language for.

    Returns:
        str: The detected language code, or "unknown" if detection fails.
    """
    try:
        return detect(text)
    except Exception:
        return "unknown"

In [9]:
def process_files_in_folder(folder_path):
    """
    Processes all files in a given folder, extracting text and language information.

    Args:
        folder_path (str): The path to the folder containing files.

    Returns:
        list: A list of dictionaries, each containing file information and extracted text.
    """
    all_documents_data = []
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if os.path.isfile(file_path):
            file_extension = os.path.splitext(file_path)[1].lower()
            document_content = ""

            if file_extension == '.pdf':
                document_content = extract_text_from_pdf(file_path)
            elif file_extension == '.msg':
                document_content = read_msg(file_path)

            if document_content:
                detected_language = detect_language(document_content)
                all_documents_data.append({
                    'file_name': file_name,
                    'file_type': file_extension,
                    'language': detected_language,
                    'content': document_content
                })

    return all_documents_data

In [10]:
def save_to_json(data, output_file):
    """
    Saves data to a JSON file.

    Args:
        data (list): The data to save.
        output_file (str): The path to the output JSON file.
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"Data has been saved to {output_file}")

In [11]:
documents_data = process_files_in_folder(folder_path)
documents_data

[{'file_name': '2023-08-28-Pool-SystemsGmbH＆CoKG-Tschechien-RegistrierungsbestätigungderSteuerbehörden.pdf',
  'file_type': '.pdf',
  'language': 'de',
  'content': 'Finanéni tad pro Moravskoslezsky kraj V Ostravé\nNa Jizdarné 3162/3 dne\n\n709 Q0 MORAVSKA OSTRAVA A PRIVOZ\n\nElektronicky podepsano\n\nUzemni pracovisté Ostrava I 11. 08. 2023\nJureckova 940/2— . Ing. Pavla Jirsova\n\n700 39 MORAVSKA OSTRAVA A PRIVOZ rada\n\nCj.: 3403424/23/3201-00512-810298\n\nVyrizuje: Jirsova Pavla, Ing.\n\nOddéleni registraéni II\n\nTelefon: 596 150 282 €. dvefi: 217a/B EINGEGANGEN AM = 17 AUG. 2023\n\nE-mail: podatelna320l@fs.mfcr.cz\nDafiovy subjekt\n\nDIC: CZ686682578\n\nPOOL-SYSTEMS GmbH & Co.KG\nAm Steinbruch 3-5\n\n92559 WINKLARN\n\nSpolkova republika Némecko\n\nROZHODNUTI\n0 registraci k dani z pfidané hodnoty\n\nShora uvedeny spravce dané Vas podle ust. § 129 odst. 1 zadkona\né. 280/2009 Sb., dahovy fad, ve znéni pozdéjsich pfedpisd (dale jen\n"dafovy fad"), ma zdkladé pfihldsky k registraci 

In [12]:
word_counts = [len(entry['content'].split()) for entry in documents_data]
word_counts

[956,
 695,
 462,
 438,
 178,
 241,
 249,
 273,
 631,
 95,
 1409,
 1537,
 799,
 232,
 182,
 181]

In [13]:
average_word_count = sum(word_counts) / len(word_counts)
average_word_count

534.875

In [14]:
for i, count in enumerate(word_counts):
    print(f"Word count in content of file {documents_data[i]['file_name']}: {count}")

print(f"Average word count in content: {average_word_count:.2f}")


Word count in content of file 2023-08-28-Pool-SystemsGmbH＆CoKG-Tschechien-RegistrierungsbestätigungderSteuerbehörden.pdf: 956
Word count in content of file 2022-09-26-SteuerbehördeNiederlande-Scan2022-09-26_103629.pdf: 695
Word count in content of file 2024-04-12-FJTradingGmbH-WGNouveaumessagedelaDirectionGénéraledesFinancesPubliques-17207401@taxhubeu.msg: 462
Word count in content of file 2021-09-22-EcomBrandsGmbH-SCN_2021_09_22_06_27_55_001.pdf: 438
Word count in content of file 2024-01-09-LeGouvernementduGrand-DuchédeLuxembourg-Scan2024-01-09_101639.pdf: 178
Word count in content of file 2023-01-16-FinanzamtÖsterreich-Scan2023-01-16_095504.pdf: 241
Word count in content of file 2024-04-12-IVYOAKGmbH-20240327145254513.pdf: 249
Word count in content of file 2024-02-01-AgenciaTributaria-BehördeschreibenPlastiksteuerRegistrierungspflichtGloryfeeelGmbH.pdf: 273
Word count in content of file 2023-10-16-ArteDhioGmbH-SchreibenFinanzamtPolen_919.pdf: 631
Word count in content of file 202

In [15]:
import numpy as np
from scipy import stats

# Sample data

# Calculate mean and standard error
mean_word_count = np.mean(word_counts)
standard_error = stats.sem(word_counts)

# Calculate 95% confidence interval
confidence_interval = stats.t.interval(0.95, len(word_counts)-1, loc=mean_word_count, scale=standard_error)

print(f"Mean Word Count: {mean_word_count:.2f}")
print(f"95% Confidence Interval: {confidence_interval}")


Mean Word Count: 534.88
95% Confidence Interval: (298.00464275094083, 771.7453572490592)


In [16]:

# Number of bootstrap samples
n_bootstrap_samples = 10000

# Bootstrap samples of means
bootstrap_means = [np.mean(np.random.choice(word_counts, size=len(word_counts), replace=True)) for _ in range(n_bootstrap_samples)]

# Calculate 95% confidence interval from bootstrap means
bootstrap_confidence_interval = np.percentile(bootstrap_means, [2.5, 97.5])

print(f"Bootstrap Mean Word Count: {np.mean(bootstrap_means):.2f}")
print(f"Bootstrap 95% Confidence Interval: {bootstrap_confidence_interval}")


Bootstrap Mean Word Count: 535.12
Bootstrap 95% Confidence Interval: [342.1859375 757.9484375]


In [17]:
output_file = 'extracted_data.json'
save_to_json(documents_data, output_file)

Data has been saved to extracted_data.json
