# Transforming all doc files in the 3GPP download to the docx format

The .doc files may sit directly in the download folder tree or be packed inside .zip archives.

In [4]:
# first, we need to get the list of all .doc files in the 3GPP folder and all of its subfolders (.zip archives are handled further below)
import os

# this function returns a list of [folder, filename] pairs for all .doc files in the given folder and all of its subfolders
def get_files(strFolder, extensions=(".doc",)):
    """Recursively collect files below ``strFolder`` whose names end in a given suffix.

    Args:
        strFolder: Root folder that is walked with ``os.walk``.
        extensions: A single suffix or an iterable of suffixes matched against
            each file name (case-sensitive). Defaults to ``(".doc",)``, which is
            what this function has always collected; pass e.g.
            ``(".doc", ".rtf", ".zip")`` to pick up more file types.

    Returns:
        A list of ``[folder, filename]`` pairs, one per matching file, in the
        order ``os.walk`` yields them. The list is empty when nothing matches
        or when ``strFolder`` does not exist.
    """
    # str.endswith only accepts a str or a tuple, so normalise the argument
    if isinstance(extensions, str):
        suffixes = (extensions,)
    else:
        suffixes = tuple(extensions)

    return [
        [root, name]
        for root, _dirs, names in os.walk(strFolder)
        for name in names
        if name.endswith(suffixes)
    ]

In [2]:
# for the conversion, we use the uno framework from the libreoffice package
# it requires additional installation of the unoconv package
# to install it, run the following command in the terminal:
# sudo apt-get install unoconv

import subprocess

def convert_doc_to_docx(doc_file):
    """Convert one document to .docx by invoking the ``unoconv`` command-line tool.

    Args:
        doc_file: Path of the file to convert; it is passed to unoconv unchanged.

    Returns:
        True if unoconv exited with status 0, False otherwise. unoconv's own
        output is discarded, so the exit status is the only failure signal.

    Raises:
        FileNotFoundError: If the ``unoconv`` executable cannot be found.
    """
    # The argument list is handed over without a shell, so spaces or other
    # special characters in the path need no quoting.
    completed = subprocess.run(
        ["unoconv", "-f", "docx", doc_file],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    return completed.returncode == 0

In [6]:
# first, let's collect the files to convert; get_files (defined above) only matches
# names ending in ".doc", so .zip and .rtf files are not part of this list
# NOTE: hard-coded absolute path to the local 3GPP download -- adjust per machine
str3GPPFolder = '/mnt/d/Specs'

# list of [folder, filename] pairs; later cells iterate over this variable
files = get_files(str3GPPFolder)

# number of files found
print(len(files))

450


In [5]:
# write the collected entries to a text file, one entry per line
# (each entry is formatted with "%s", so a [folder, filename] list is written as its Python repr)
with open('./3GPP_files.txt', 'w') as list_file:
    list_file.writelines("%s\n" % entry for entry in files)

In [9]:
# then, let's convert them

total = len(files)
iCounter = 0

# convert every .doc/.rtf entry that does not have a .docx next to it yet
for file in files:
    if file[1].endswith(".doc") or file[1].endswith(".rtf"):
        source_path = os.path.join(file[0], file[1])

        # The converted file is expected beside the source with the extension
        # swapped for ".docx". Swapping (rather than appending "x") makes the
        # check correct for .rtf too: "spec.rtf" -> "spec.docx", not "spec.rtfx".
        target_path = os.path.splitext(source_path)[0] + '.docx'

        if not os.path.exists(target_path):
            convert_doc_to_docx(source_path)
        iCounter += 1

        # print status every 100 files; kept inside the branch so the line is
        # printed once per hundred instead of once for every skipped entry
        # while the counter rests on a multiple of 100
        if iCounter % 100 == 0:
            print(f'{iCounter} of {total} files processed')


100 of 450 files processed
200 of 450 files processed


In [6]:
# now, if a file is a .zip file, then we need to extract it to a folder with the same name as the .zip file
# and then convert all .doc files in the folder to .docx files
# we can use the zipfile module to extract the .zip file
# and then reuse get_files and convert_doc_to_docx to convert the extracted .doc files to .docx files
import zipfile

def extract_zip(zip_file):
    """Unpack a .zip archive and convert the documents found inside it.

    The archive is extracted into a folder whose path is ``zip_file`` without
    its extension. Every file that ``get_files`` reports below that folder and
    whose name ends in ".doc" or ".rtf" is then passed to
    ``convert_doc_to_docx``.

    Args:
        zip_file: Path of the .zip archive to unpack.
    """
    # folder with the same name as the archive, minus the extension
    target_dir, _ext = os.path.splitext(zip_file)

    with zipfile.ZipFile(zip_file, 'r') as archive:
        archive.extractall(target_dir)

    for folder, name in get_files(target_dir):
        if not name.endswith((".doc", ".rtf")):
            continue
        convert_doc_to_docx(os.path.join(folder, name))