# Transforming all doc files in the 3GPP download to the docx format

The source files may be packed inside .zip archives or stored loose anywhere in the download folder.

In [None]:
# root folder of the local 3GPP download; the later cells scan this tree
str3GPPFolder = "/mnt/d/Specs"

In [None]:
# next, we need a helper that lists all .doc files in the 3GPP folder and all of its subfolders
# (.zip archives are handled separately further down)
import os

# this function returns all files in the given folder (and all of its subfolders)
# whose names end with one of the requested extensions (only .doc by default)
def get_files(strFolder, extensions=(".doc",)):
    """Return ``[folder, filename]`` pairs for the matching files under strFolder.

    Args:
        strFolder: Root directory that is walked recursively.
        extensions: A suffix, or an iterable of suffixes, that a file name
            must end with. The default returns only ``.doc`` files. The
            comparison is case-sensitive.

    Returns:
        A list of two-element lists ``[root, file]`` in ``os.walk`` order.
        The list is empty when nothing matches or strFolder does not exist.
    """
    # str.endswith accepts a single str or a tuple, so normalise other iterables
    if not isinstance(extensions, str):
        extensions = tuple(extensions)

    matches = []
    for root, _dirs, names in os.walk(strFolder):
        matches.extend([root, name] for name in names if name.endswith(extensions))
    return matches

In [None]:
import shutil

# this function copies every .docx file found in the given folder (and all of its
# subfolders) into one flat destination folder
def move_files(strFolder, strDestFolder):
    """Copy all ``.docx`` files below strFolder into strDestFolder.

    Despite the name, the files are copied with ``shutil.copy2``; the
    originals stay where they are. The destination is flat, so files with the
    same name in different subfolders overwrite each other there.

    Args:
        strFolder: Root directory that is walked recursively.
        strDestFolder: Directory receiving the copies. It is created when it
            does not exist yet.

    Returns:
        None. One 'Copying ... to ...' line is printed per copied file.
    """
    # copy2 does not create missing directories, so make sure the target exists
    os.makedirs(strDestFolder, exist_ok=True)
    strDestReal = os.path.realpath(strDestFolder)

    for root, dirs, files in os.walk(strFolder):
        # when the destination lies inside the walked tree, its files are already
        # in place; copying a file onto itself would raise shutil.SameFileError
        if os.path.realpath(root) == strDestReal:
            continue

        for file in files:
            if file.endswith(".docx"):
                # copy this file to the destination folder
                strSourceFile = os.path.join(root, file)
                strDestFile = os.path.join(strDestFolder, file)
                print('Copying ' + strSourceFile + ' to ' + strDestFile)
                shutil.copy2(strSourceFile, strDestFile)

In [None]:
# for the conversion, we use the uno framework from the libreoffice package
# it requires additional installation of the unoconv package
# to install it, run the following command in the terminal:
# sudo apt-get install unoconv

import subprocess

def convert_doc_to_docx(doc_file, new_folder=None):
    """Convert doc_file to the docx format by running the unoconv command.

    Args:
        doc_file: Path of the document to convert.
        new_folder: Currently ignored; no output location is passed to
            unoconv. It is optional so that callers passing only the
            document path keep working. The rest of this notebook expects
            the .docx to appear next to doc_file.

    Returns:
        True when unoconv exited with status 0, False otherwise.

    Raises:
        FileNotFoundError: If the unoconv executable cannot be found.
    """
    # Use unoconv to convert .doc to .docx; its console output is discarded,
    # so the exit status is the only indication of a failed conversion
    result = subprocess.run(
        ["unoconv", "-f", "docx", doc_file],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        check=False,
    )
    return result.returncode == 0


In [None]:
# walk the download tree once; each entry is a [folder, filename] pair
files = get_files(strFolder=str3GPPFolder)

# show how many candidate files were found
print(len(files))

In [None]:
# persist the collected [folder, filename] entries, one entry per line
with open('./3GPP_files.txt', 'w') as fhList:
    fhList.writelines(f"{entry}\n" for entry in files)

In [None]:
# then, let's convert them

total = len(files)
iCounter = 0

# convert every .doc/.rtf file that does not have a .docx counterpart yet
for file in files:
    # anything that is not a convertible document is skipped and not counted
    if not file[1].endswith((".doc", ".rtf")):
        continue

    strSourceFile = os.path.join(file[0], file[1])
    # the converted file keeps the base name and gets the .docx extension;
    # appending 'x' would only be right for .doc (an .rtf would become .rtfx)
    strDocxFile = os.path.splitext(strSourceFile)[0] + '.docx'

    # check if the same file with .docx exists
    if not os.path.exists(strDocxFile):
        convert_doc_to_docx(strSourceFile, './3gpp_downloaded')
    iCounter += 1

    # print status every 100 files (only after a file was actually counted,
    # so the same status line is never repeated for skipped entries)
    if iCounter % 100 == 0:
        print(f'{iCounter} of {total} files processed')


In [19]:
# gather every .docx from the download tree into one flat folder
move_files(strFolder=str3GPPFolder, strDestFolder='./3gpp_downloaded')

Copying /mnt/d/Specs/2023-12/Rel-10/21_series/21101-a40/21101-a40.docx to ./3gpp_downloaded/21101-a40.docx
Copying /mnt/d/Specs/2023-12/Rel-10/21_series/21101-a40/21101_CR0074_(Rel-10).docx to ./3gpp_downloaded/21101_CR0074_(Rel-10).docx
Copying /mnt/d/Specs/2023-12/Rel-10/21_series/21111-a00/21111-a00.docx to ./3gpp_downloaded/21111-a00.docx
Copying /mnt/d/Specs/2023-12/Rel-10/21_series/21201-a40/21201-a40.docx to ./3gpp_downloaded/21201-a40.docx
Copying /mnt/d/Specs/2023-12/Rel-10/21_series/21202-a31/21202-a31.docx to ./3gpp_downloaded/21202-a31.docx
Copying /mnt/d/Specs/2023-12/Rel-10/21_series/21801-a21/21801-a21.docx to ./3gpp_downloaded/21801-a21.docx
Copying /mnt/d/Specs/2023-12/Rel-10/21_series/21900-a10/21900-a10.docx to ./3gpp_downloaded/21900-a10.docx
Copying /mnt/d/Specs/2023-12/Rel-10/21_series/21902-a00/21902-a00.docx to ./3gpp_downloaded/21902-a00.docx
Copying /mnt/d/Specs/2023-12/Rel-10/21_series/21905-a30/21905-a30.docx to ./3gpp_downloaded/21905-a30.docx
Copying /mnt/

In [18]:
# now, if a file is a .zip file, then we need to extract it to a folder with the same name as the .zip file
# and then convert all .doc files in the folder to .docx files
# we can use the zipfile module to extract the .zip file
# and then use convert_doc_to_docx (unoconv) to convert the extracted .doc files to .docx files
import zipfile

def extract_zip(zip_file):
    """Extract zip_file next to itself and convert the documents it contains.

    The archive is unpacked into a folder named like the archive without its
    extension. Every file that get_files reports for that folder and whose
    name ends with .doc or .rtf is then passed to convert_doc_to_docx.

    Args:
        zip_file: Path of the .zip archive to extract.

    Returns:
        None.
    """
    # extract the .zip file to a folder with the same name as the .zip file
    strExtractFolder = os.path.splitext(zip_file)[0]
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(strExtractFolder)

    # convert the documents found in the extracted folder; which files are
    # listed at all is decided by get_files
    for root, name in get_files(strExtractFolder):
        if name.endswith((".doc", ".rtf")):
            # convert_doc_to_docx is declared with two parameters, so the
            # folder is passed explicitly instead of calling it with one argument
            convert_doc_to_docx(os.path.join(root, name), strExtractFolder)