In [79]:
import os
import re
import docx

def extract_references_section(doc_text, occurrence):
    # Define the pattern for extracting the References section
    pattern = re.compile(r'\b2\s+References\b.*?(\b\d+\s+|$)', re.DOTALL)

    # Find all occurrences of the References section using the pattern
    matches = list(re.finditer(pattern, doc_text))

    # Check if the specified occurrence is within the valid range
    if 1 <= occurrence <= len(matches):
        start_index = matches[occurrence - 1].start()

        # Look for the header indicating the end of the References section
        end_marker = re.search(r'\b3\s\b', doc_text[start_index:])
        end_index = end_marker.start() + start_index if end_marker else None

        references_section = doc_text[start_index:end_index].strip()

        # Extract lines starting with positive numbers in square brackets
        lines = references_section.split('\n')
        relevant_lines = [line.strip() for line in lines if re.match(r'\[\d+\]', line)]

        return relevant_lines
    else:
        return f"References section occurrence {occurrence} not found."

def read_docx(file_path):
    # Open the DOCX file
    doc = docx.Document(file_path)

    # Extract text from paragraphs
    paragraphs = [paragraph.text for paragraph in doc.paragraphs]

    # Combine paragraphs into a single string
    doc_text = '\n'.join(paragraphs)

    return doc_text

def process_folder(input_folder, output_folder, occurrence_to_extract):
    # Create the output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # List all files in the input folder
    files = [f for f in os.listdir(input_folder) if f.endswith(".docx")]

    for file_name in files:
        file_path = os.path.join(input_folder, file_name)

        # Read the DOCX file
        doc_text = read_docx(file_path)

        # Extract the specified occurrence of the References section
        result = extract_references_section(doc_text, occurrence_to_extract)

        # Define the output file path in the output folder
        output_file_path = os.path.join(output_folder, f"{os.path.splitext(file_name)[0]}_references.txt")

        # Write the extracted references to a text file with UTF-8 encoding
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            if isinstance(result, str):
                output_file.write(result)
            else:
                for line in result:
                    output_file.write(line + '\n')

        print(f"References extracted from {file_name} and saved to {output_file_path}")

# Replace 'input_folder' and 'output_folder' with the actual folder paths
input_folder = r"D:\Work\IIT Bhilai\Internship\Specifications\series_12\docx"
output_folder = r"D:\Work\IIT Bhilai\Internship\Specifications\series_12\References\All"
occurrence_to_extract = 2

# Process the entire folder
process_folder(input_folder, output_folder, occurrence_to_extract)


References extracted from 1200-460.docx and saved to D:\Work\IIT Bhilai\Internship\Specifications\series_12\References\All\1200-460_references.txt
References extracted from 1201_441.docx and saved to D:\Work\IIT Bhilai\Internship\Specifications\series_12\References\All\1201_441_references.txt
References extracted from 1202_501.docx and saved to D:\Work\IIT Bhilai\Internship\Specifications\series_12\References\All\1202_501_references.txt
References extracted from 1203-800.docx and saved to D:\Work\IIT Bhilai\Internship\Specifications\series_12\References\All\1203-800_references.txt
References extracted from 1204-810.docx and saved to D:\Work\IIT Bhilai\Internship\Specifications\series_12\References\All\1204-810_references.txt
References extracted from 1205_701.docx and saved to D:\Work\IIT Bhilai\Internship\Specifications\series_12\References\All\1205_701_references.txt
References extracted from 1206_411.docx and saved to D:\Work\IIT Bhilai\Internship\Specifications\series_12\References

In [80]:
import os

# Define the categories
gpp_references = set()
other_references = set()

# Specify the folder containing TXT files
folder_path = r'D:\Work\IIT Bhilai\Internship\Specifications\series_12\References\All'  # Change this to your actual folder path

# Get the output folder from the user
output_folder = r'D:\Work\IIT Bhilai\Internship\Specifications\series_12\References' 

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Iterate through all TXT files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)

        # Read the references from the current text file
        with open(file_path, 'r', encoding='utf-8') as file:
            references = file.readlines()

        # Separate references into categories and add to sets
        for reference in references:
            reference = reference.strip()

            # Remove the [x] from the reference
            reference = reference.split(']')[1].strip() if reference.startswith('[') else reference

            if '3GPP' in reference:
                gpp_references.add(reference)
            else:
                other_references.add(reference)

# Print the results
print("3GPP References:")
for idx, reference in enumerate(gpp_references, start=1):
    print(f"{idx}. {reference}")

print("\nOther References:")
for idx, reference in enumerate(other_references, start=1):
    print(f"{idx}. {reference}")

# Output file paths within the user-specified folder
gpp_output_path = os.path.join(output_folder, 'gpp_references.txt')
other_output_path = os.path.join(output_folder, 'other_references.txt')

# Save the results to files
with open(gpp_output_path, 'w', encoding='utf-8') as gpp_file:
    for idx, reference in enumerate(gpp_references, start=1):
        gpp_file.write(f"{idx}. {reference}\n")

with open(other_output_path, 'w', encoding='utf-8') as other_file:
    for idx, reference in enumerate(other_references, start=1):
        other_file.write(f"{idx}. {reference}\n")

# Print a message indicating the saved files
print(f"\n3GPP References saved to: {gpp_output_path}")
print(f"Other References saved to: {other_output_path}")


3GPP References:

Other References:
1. CCITT X.735 (ISO/IEC 10164-6): "Information technology - Open Systems Interconnection - Systems Management: Log Control Function".
2. CCITT X.730 (ISO/IEC 10164-1): "Information technology - Open Systems Interconnection - Systems Management: Object Management Function".
3. GSM 04.07: "Digital cellular telecommunications system (Phase 2+); Mobile radio interface signalling layer 3; General aspects".
4. ETR 100 (GSM 01.04): Digital cellular telecommunications system (Phase 2+); "Abbreviations and acronyms".
5. GSM 04.71: Digital cellular telecommunications system (Phase 2+), Mobile radio interface layer
6. GSM 12.22 (ETS 300 624): "Digital cellular telecommunication system (Phase 2); Interworking of GSM Network Management (NM) procedures and messages at the Base Station Controller (BSC)".
7. ITU-T Recommendation M.3400 (1992): "TMN Management Functions".
8. ITU-T Recommendation X.722 (1992)  ISO/IEC 10165-4: "Information technology - Open Systems In

FOR ALL References

In [None]:
import os

# Define the categories
gpp_references = []
other_references = []

# Specify the folder containing TXT files
folder_path = r'D:\Work\IIT Bhilai\Internship\Specifications\series_33\References'  # Change this to your actual folder path

# Iterate through all TXT files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)

        # Read the references from the current text file
        with open(file_path, 'r', encoding='utf-8') as file:
            references = file.readlines()

        # Separate references into categories
        for reference in references:
            if '3GPP' in reference:
                gpp_references.append(reference.strip())
            else:
                other_references.append(reference.strip())

# Print the results
print("3GPP References:")
for idx, reference in enumerate(gpp_references, start=1):
    print(f"{idx}. {reference}")

print("\nOther References:")
for idx, reference in enumerate(other_references, start=1):
    print(f"{idx}. {reference}")


For cleaning duplicate lines

In [None]:
def process_file(input_file, output_file):
    lines_seen = set()

    with open(output_file, 'w', encoding='utf-8') as output, open(input_file, 'r', encoding='utf-8') as input:
        for i, line in enumerate(input, start=1):
            # Removing leading and trailing whitespaces
            clean_line = line.strip()

            # Skip empty lines
            if not clean_line:
                continue

            # Skip if line is a duplicate
            if clean_line not in lines_seen:
                output.write(f"{i}. {clean_line}\n")
                lines_seen.add(clean_line)

# Example usage:
input_file_path = r'D:\Work\IIT Bhilai\Internship\Specifications\External Links.txt'
output_file_path = r'D:\Work\IIT Bhilai\Internship\Specifications\External Links_real.txt'

process_file(input_file_path, output_file_path)


For better visual(removed the useless numbering)

In [None]:
# Specify the path to your input text file
input_file_path = r'D:\Work\IIT Bhilai\Internship\Specifications\External files list.txt'

# Specify the path to your output text file
output_file_path = r'D:\Work\IIT Bhilai\Internship\Specifications\External files.txt'

# Read the content from the input file
with open(input_file_path, 'r', encoding='utf-8') as file:
    input_text = file.read()

# Split the text by lines
lines = input_text.split('\n')

# Process each line
output_lines = []
for line in lines:
    # Find the first period (.)
    first_period_index = line.find('.')
    
    # Find the second period (.) starting from the position after the first period
    second_period_index = line.find('.', first_period_index + 1)
    
    # If both periods are found, remove the content between them
    if first_period_index != -1 and second_period_index != -1:
        modified_line = line[:first_period_index + 1] + line[second_period_index + 1:]
    else:
        # If one of the periods is not found, keep the original line
        modified_line = line
    
    output_lines.append(modified_line)

# Join the modified lines back into a single text
output_text = '\n'.join(output_lines)

# Write the modified content back to the output file
with open(output_file_path, 'w' ,encoding='utf-8') as file:
    file.write(output_text)

# Print the modified content
print(output_text)
