<a href="https://colab.research.google.com/github/phuocnguyen90/Random-projects/blob/main/doc_to_docx_conversion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
%%capture
!pip install -U olefile
!pip install python-docx
!pip install pypandoc python-docx
!pip install docxcompose
!sudo apt install libreoffice



In [31]:
import io
import os
import shutil
import subprocess
import tempfile
import zipfile

from docx import Document
from docxcompose.composer import Composer

def doc_to_docx_pipeline(input_doc_path, output_docx_path):
    """Convert a .doc file to .docx and append its embedded documents.

    The main document is converted first. Every object stored under the
    converted file's embeddings folder is then converted in turn, and each
    successfully converted object is appended to the end of the main document.

    Args:
        input_doc_path: Path of the source .doc file.
        output_docx_path: Path where the resulting .docx is written.

    Returns:
        None. Progress and failures are reported with ``print``. If the main
        document cannot be converted, nothing is written.
    """
    # Step 1: Convert the original .doc file to .docx
    main_docx_content = doc_to_docx(input_doc_path)
    if main_docx_content is None:
        print(f"Failed to convert the original .doc file: {input_doc_path}")
        return

    # The helpers below work on a path, so stage the converted bytes on disk.
    # delete=False keeps the file alive after the handle is closed; it is
    # removed explicitly in the ``finally`` clause.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp_main_docx:
        tmp_main_docx.write(main_docx_content)
        main_docx_path = tmp_main_docx.name

    try:
        # Step 2: Extract the embedded objects and convert each one
        appended_docs = []
        for ole_content in extract_ole_objects(main_docx_path):
            appended_docs.extend(process_ole_content(ole_content))

        # Step 3: Append the extracted contents to the main document
        if appended_docs:
            combined_doc = append_documents(main_docx_path, appended_docs)
            combined_doc.save(output_docx_path)
            print(f"Combined document saved at: {output_docx_path}")
        else:
            print("No content to append.")
            # Nothing to append: write the converted document unchanged.
            with open(output_docx_path, 'wb') as f:
                f.write(main_docx_content)
    finally:
        # Best-effort cleanup of the staging file; a failure to delete it
        # must not mask the real outcome of the conversion.
        try:
            os.remove(main_docx_path)
        except OSError:
            pass

def doc_to_docx(input_doc_path):
    """Convert a .doc file to .docx using LibreOffice.

    The input is copied into a private temporary directory under the fixed
    name ``input.doc`` so that the converter's output (``input.docx``) can be
    located regardless of the caller's file name.

    Args:
        input_doc_path: Path of the file to convert.

    Returns:
        The bytes of the converted .docx, or None when the conversion failed
        (``soffice`` could not be started, it exited with a non-zero status,
        or it produced no output file).

    Raises:
        OSError: If ``input_doc_path`` cannot be read.
    """
    with tempfile.TemporaryDirectory() as tmpdirname:
        # Copy the input file into the temporary directory
        doc_path = os.path.join(tmpdirname, 'input.doc')
        shutil.copyfile(input_doc_path, doc_path)

        # Convert the .doc file to .docx using LibreOffice
        try:
            subprocess.run(
                ['soffice', '--headless', '--convert-to', 'docx', doc_path,
                 '--outdir', tmpdirname],
                check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers a missing or non-executable ``soffice`` binary,
            # so that case is reported like any other failed conversion.
            print(f"Error during conversion: {e}")
            return None

        # Read the converted .docx content
        docx_path = os.path.join(tmpdirname, 'input.docx')
        if not os.path.exists(docx_path):
            print("Conversion failed: .docx file not found.")
            return None
        with open(docx_path, 'rb') as docx_file:
            return docx_file.read()

def extract_ole_objects(docx_path):
    """Extract the embedded object files from a .docx archive.

    Args:
        docx_path: Path of the .docx (zip) file to inspect.

    Returns:
        A list of ``bytes``, one per file stored under ``word/embeddings/``,
        in archive order. Directory entries are skipped so that no empty
        blob is returned for the folder itself.
    """
    ole_objects = []
    with zipfile.ZipFile(docx_path, 'r') as docx:
        for info in docx.infolist():
            # The trailing slash keeps unrelated names that merely share the
            # prefix (e.g. "word/embeddings.xml") out of the result.
            if info.is_dir() or not info.filename.startswith('word/embeddings/'):
                continue
            ole_objects.append(docx.read(info))
    return ole_objects

def process_ole_content(ole_content):
    """Convert one embedded object into zero or more .docx documents.

    The object is first converted as if it were a .doc file. If that yields
    nothing and the object is a zip archive, every ``.doc`` member of the
    archive (extension matched case-insensitively) is converted instead.

    Args:
        ole_content: Raw bytes of the embedded object.

    Returns:
        A list of ``bytes`` objects, each the content of a converted .docx.
        The list is empty when nothing could be converted.
    """
    processed_docs = []

    # Every intermediate file lives in this directory, so all of them are
    # removed together when the block exits.
    with tempfile.TemporaryDirectory() as tmpdirname:
        # Save the OLE content to a temporary file
        ole_path = os.path.join(tmpdirname, 'embedded_object')
        with open(ole_path, 'wb') as ole_file:
            ole_file.write(ole_content)

        # Try to handle OLE content as a .doc file directly
        try:
            docx_content = doc_to_docx(ole_path)
            if docx_content:
                processed_docs.append(docx_content)
        except Exception as e:
            # Deliberately broad: one unconvertible object must not abort
            # processing of the remaining ones.
            print(f"Failed to convert as a .doc file: {e}")

        # If the above fails, try to handle OLE content as a .zip file containing .doc files
        if not processed_docs and zipfile.is_zipfile(ole_path):
            try:
                with zipfile.ZipFile(ole_path, 'r') as zip_file:
                    for index, item in enumerate(zip_file.namelist()):
                        if not item.lower().endswith('.doc'):
                            continue
                        # Write the member to a closed file before converting;
                        # the index keeps names unique within the directory.
                        member_path = os.path.join(tmpdirname, f'zip_member_{index}.doc')
                        with open(member_path, 'wb') as member_file:
                            member_file.write(zip_file.read(item))
                        docx_content = doc_to_docx(member_path)
                        if docx_content:
                            processed_docs.append(docx_content)
            except Exception as e:
                print(f"Failed to process the OLE as a .zip file: {e}")

    return processed_docs


def append_documents(main_doc_path, appended_docs):
    """Append documents to the main document and return the combined document.

    Args:
        main_doc_path: Path of the .docx file that the others are appended to.
        appended_docs: Iterable of ``bytes`` objects, each holding the content
            of one .docx file; they are appended in iteration order.

    Returns:
        The ``Document`` loaded from ``main_doc_path`` after all appends.
        Nothing is written to disk; the caller saves the result.
    """
    # Load the main document
    main_doc = Document(main_doc_path)
    composer = Composer(main_doc)

    # Load each document straight from memory: no temporary files are created,
    # so there is nothing to clean up afterwards.
    for doc_content in appended_docs:
        composer.append(Document(io.BytesIO(doc_content)))

    return main_doc




Combined document saved at: /content/combined_output_with_ole.docx


In [None]:
# Example usage: convert a single .doc file and write the result to
# output_docx_path.
# NOTE(review): both paths are hard-coded absolute paths under /content/;
# change them to match your own environment before running this cell.
input_doc_path = '/content/1.5. 145_2020_ND-CP_459400.doc'
output_docx_path = '/content/combined_output_with_ole.docx'

doc_to_docx_pipeline(input_doc_path, output_docx_path)

In [None]:
import os
import shutil

def convert_all_docs(src_folder, dest_folder):
    """Batch-convert every .doc file found under a folder tree.

    The source tree is walked recursively and its directory layout is
    mirrored under the destination. Each successfully converted file yields
    two outputs: ``<name>_converted.docx`` (the plain conversion) and
    ``<name>_converted_with_appendix.docx`` (the conversion with any
    converted embedded documents appended, or a plain copy when there are
    none).

    Args:
    - src_folder: The source directory containing .doc files.
    - dest_folder: The destination directory to save converted .docx files.
    """
    for current_dir, _subdirs, filenames in os.walk(src_folder):
        for filename in filenames:
            if not filename.endswith('.doc'):
                continue

            source_path = os.path.join(current_dir, filename)

            # Mirror the source sub-directory inside the destination.
            target_dir = os.path.join(
                dest_folder, os.path.relpath(current_dir, src_folder)
            )
            os.makedirs(target_dir, exist_ok=True)

            stem, _extension = os.path.splitext(filename)
            plain_output = os.path.join(target_dir, f"{stem}_converted.docx")
            appendix_output = os.path.join(
                target_dir, f"{stem}_converted_with_appendix.docx"
            )

            # Plain conversion first; give up on this file if it fails.
            converted_bytes = doc_to_docx(source_path)
            if not converted_bytes:
                print(f"Failed to convert {source_path}")
                continue

            with open(plain_output, 'wb') as out_file:
                out_file.write(converted_bytes)

            # Convert whatever is embedded in the freshly written .docx.
            embedded_docs = []
            for ole_blob in extract_ole_objects(plain_output):
                embedded_docs.extend(process_ole_content(ole_blob))

            if embedded_docs:
                merged = append_documents(plain_output, embedded_docs)
                merged.save(appendix_output)
            else:
                # No appendix material: the second output is a plain copy.
                shutil.copy(plain_output, appendix_output)

            print(f"Converted and saved: {plain_output} and {appendix_output}")

    print("Conversion process completed.")

# Example usage: batch-convert every .doc file under src_folder into
# dest_folder.
# NOTE(review): both values are placeholder paths; replace them with real
# directories before running this cell.
src_folder = '/path/to/source_folder'
dest_folder = '/path/to/destination_folder'

convert_all_docs(src_folder, dest_folder)
