Preprocessing data and storing in a file

In [10]:
from docx import Document
import os

def preprocess_docx(file_path, output_txt_path):
    # Load the DOCX file
    doc = Document(file_path)

    # Extract file name
    file_name = os.path.basename(file_path)

    # Initialize variables to store heading and content
    current_heading = None
    content_by_heading = {}

    # Flag to skip specific content
    skip_content = False

    # Process paragraphs
    for paragraph in doc.paragraphs:
        # Skip specified content
        if "Foreword" in paragraph.text:
            skip_content = True
            continue
        elif "Version" in paragraph.text:
            skip_content = False
            continue

        # Process paragraphs when not skipping content
        if not skip_content:
            if paragraph.style.name.startswith('Heading'):
                current_heading = paragraph.text
                content_by_heading[current_heading] = []
            elif current_heading is not None:
                content_by_heading[current_heading].append(paragraph.text)

    # Print and save the output
    with open(output_txt_path, 'w', encoding='utf-8') as output_file:
        output_file.write(f'File Name: {file_name}\n\n')

        for heading, content in content_by_heading.items():
            output_file.write(f'{heading}\n')
            output_file.write('\n'.join(content))
            output_file.write('\n\n')

    print(f'0       {file_name}\n')
    for heading, content in content_by_heading.items():
        print(f'{heading}')
        print('\n'.join(content))
        print()

# Example usage
docx_file_path = r'D:\Work\IIT Bhilai\Internship\Specifications\series_33\Original\33108-h20.docx'  # Replace with the path to your DOCX file
output_txt_path = r'D:\Work\IIT Bhilai\Internship\Specifications\series_33\ExtractedText\33108-h20_extracted.txt'  # Replace with the desired output text file path
preprocess_docx(docx_file_path, output_txt_path)


0       33108-h20.docx

Introduction
This Technical Specification has been produced by 3GPP TSG SA to allow for the standardization in the area of lawful interception of telecommunications. This document addresses the handover interfaces for lawful interception of Packet-Data Services, Circuit Switched Services, Multimedia Services within the Universal Mobile Telecommunication System (UMTS) and Evolved Packet System (EPS). The specification defines the handover interfaces for delivery of lawful interception Intercept Related Information (IRI) and Content of Communication (CC) to the Law Enforcement Monitoring Facility.
Laws of individual nations and regional institutions (e.g. European Union), and sometimes licensing and operating conditions define a need to intercept telecommunications traffic and related information in modern telecommunications systems. It has to be noted that lawful interception shall always be done in accordance with the applicable national or regional laws and tec

Chunk creation till Sub-sub Heading(x.x.x)

In [11]:
import re

# Read text from the input file
with open(r'D:\Work\IIT Bhilai\Internship\Specifications\series_33\ExtractedText\33108-h20_extracted.txt', 'r') as file:
    text = file.read()

# Split the text into sections using regex for specific X, X.X, or X.X.X format
sections = re.split(r'\n(?=\d+(\.\d+){0,2}\s)(?!\.\d)', text)

# Print non-empty and relevant sections
for section in sections:
    if section and section.strip() and not re.match(r'^\.?\d+\s*$', section.strip()):
        print(section.strip())
        print("=" * 50)  # Separator for better visualization

File Name: 33108-h20.docx

Introduction
This Technical Specification has been produced by 3GPPÂ TSGÂ SA to allow for the standardization in the area of lawful interception of telecommunications. This document addresses the handover interfaces for lawful interception of Packet-Data Services, Circuit Switched Services, Multimedia Services within the Universal Mobile Telecommunication System (UMTS) and Evolved Packet System (EPS). The specification defines the handover interfaces for delivery of lawful interception Intercept Related Information (IRI) and Content of Communication (CC) to the Law Enforcement Monitoring Facility.
Laws of individual nations and regional institutions (e.g. European Union), and sometimes licensing and operating conditions define a need to intercept telecommunications traffic and related information in modern telecommunications systems. It has to be noted that lawful interception shall always be done in accordance with the applicable national or regional laws an