# NLP data pre-processing I: converting .docx files into .txt files

### 1. Rename Word documents to UUID names

In [118]:
# If needed, inspect the module search path (sys.path) and the Python interpreter this kernel runs (sys.executable).
# Different conda environments use different interpreters, and pip must install packages into the one shown here.
# import sys
# print(sys.path)
# print(sys.executable)

In [None]:
# Install the docx2txt package into the Python interpreter that this kernel runs (the path printed by sys.executable).
# Example:
#!~/anaconda3/bin/python -m pip install docx2txt

In [140]:
import os
import shutil
import uuid
from shutil import copyfile

import docx2txt

In [None]:
# To start, the working directory must contain a single "source_files" directory whose sub-folders hold the Word files.

# NOTE: any zipped Word files must be unzipped in place (inside their own sub-folder) before proceeding.

In [141]:
def create_directory_if_not_exists(targetDir):
    """Ensure ``targetDir`` exists and report what happened.

    Creates the directory with ``os.mkdir`` when nothing exists at that path,
    otherwise leaves the path untouched. A one-line status message is printed
    in both cases.

    Args:
        targetDir: Path of the directory, relative to the current working
            directory or absolute.

    Returns:
        ``targetDir`` joined onto the current working directory.
    """
    already_present = os.path.exists(targetDir)
    if already_present:
        print("Directory '" + targetDir + "' already exists")
    else:
        os.mkdir(targetDir)
        print("Directory '" + targetDir + "' created ")

    working_dir = os.getcwd()
    return os.path.join(working_dir, targetDir)


def copytree(src, dst, symlinks=False, ignore=None):
    """Copy every entry of ``src`` into the directory ``dst``.

    Unlike a bare ``shutil.copytree(src, dst)``, the destination directory is
    allowed to exist already: each top-level entry of ``src`` is copied into
    it individually.

    Args:
        src: Directory whose entries are copied.
        dst: Destination directory; created (including parents) if missing.
        symlinks: Forwarded to ``shutil.copytree`` for sub-directories.
        ignore: Forwarded to ``shutil.copytree`` for sub-directories. It is
            not applied to the top-level entries of ``src``.

    Note:
        Sub-directories are copied with ``shutil.copytree``, which raises if
        the corresponding sub-directory already exists inside ``dst``.
    """
    # copy2 cannot write into a directory that does not exist, so make sure
    # the destination is there before copying any top-level file.
    os.makedirs(dst, exist_ok=True)

    for item in os.listdir(src):
        s = os.path.join(src, item)
        d = os.path.join(dst, item)
        if os.path.isdir(s):
            shutil.copytree(s, d, symlinks, ignore)
        else:
            # copy2 also preserves file metadata such as timestamps
            shutil.copy2(s, d)
                    
                    
def count_files(folder):
    """Return how many entries sit directly inside ``folder``.

    Every name reported by ``os.listdir`` is counted, so sub-directories and
    hidden entries are included in the total.

    Args:
        folder: Directory path, resolved against the current working
            directory when it is relative.
    """
    folder_path = os.path.join(os.getcwd(), folder)
    return len(os.listdir(folder_path))
    

def check_if_dir_exists(targetDir):
    """Print whether anything exists at the path ``targetDir``.

    The check uses ``os.path.exists``, so an existing regular file is also
    reported as existing. Nothing is returned.
    """
    if os.path.exists(targetDir):
        print("Directory '" + targetDir + "' exists.")
        return
    print("Directory '" + targetDir + "' does not exist.")

In [142]:
# 1. Work on a copy: mirror "source_files" into a scratch "temp1" directory.

target_dir = create_directory_if_not_exists("temp1")
source_dir = os.path.join(os.getcwd(), "source_files")

copytree(src=source_dir, dst=target_dir)

Directory 'temp1' created 


In [143]:
# 2. Give docx files UUID names and place them in temp2 directory.

source_dir = os.path.join(os.getcwd(), "temp1")
target_dir = create_directory_if_not_exists("temp2")

for foldername in os.listdir(source_dir):
    subdirectory = os.path.join(source_dir, foldername)

    # Skip hidden entries (e.g. ".DS_Store" on macOS) and any loose file that
    # sits next to the document folders: only directories can be listed below.
    if foldername.startswith(".") or not os.path.isdir(subdirectory):
        continue

    for filename in os.listdir(subdirectory):
        # Hidden files inside a document folder are left where they are.
        if filename.startswith("."):
            continue

        # Keep the original extension, replace the name itself with a UUID.
        extension = os.path.splitext(filename)[1]
        unique_filename = str(uuid.uuid4()) + extension

        # Move (not copy) the file out of its folder into the flat 'temp2' directory.
        os.rename(os.path.join(subdirectory, filename),
                  os.path.join(target_dir, unique_filename))

Directory 'temp2' created 


In [144]:
# Sanity check: number of entries now in target_dir (the "temp2" directory from the previous step).
print(count_files(target_dir))

65


### 2. Create simple text files from Word documents

In [145]:
source_dir = os.path.join(os.getcwd(), "temp2")
target_dir = create_directory_if_not_exists("text_files")

for process_file in os.listdir(source_dir):

    # Skip hidden entries such as ".DS_Store" on macOS.
    if process_file.startswith("."):
        continue

    # Name the text file after the document: same UUID stem, '.txt' extension.
    dest_file = os.path.splitext(process_file)[0] + '.txt'
    print(dest_file)

    # Extract the text from the Word document.
    content = docx2txt.process(os.path.join(source_dir, process_file))

    # Write as UTF-8 explicitly so non-ASCII text does not depend on the
    # platform's default encoding; 'with' closes the file even if the write fails.
    with open(os.path.join(target_dir, dest_file), "w", encoding="utf-8") as write_text_file:
        write_text_file.write(content)

Directory 'text_files' created 
2b6635e7-3123-42ca-ace7-5b9792a9013b.txt
5a589c93-af64-4405-b224-9b945183f2dd.txt
f622cee8-8e29-4d5a-960f-5da77bfd659f.txt
22e8155e-262d-4c10-93fd-52c8d04a18e0.txt
715b8995-770a-46b6-9992-774f5424a218.txt
4aae385b-e09f-43fd-bbb3-46fd67a15c85.txt
0b54026b-385c-410a-87f9-8a4fbf3ba553.txt
d82ee124-4b0a-4930-bc77-60a53e4066df.txt
b53c64b1-820a-44f3-a63a-5a23bda86508.txt
ca0ac6cc-0734-44fb-a023-32bfabf720b3.txt
8f06db39-3a93-4d13-8e31-414d6457968d.txt
e782836d-e99b-4629-a062-4ad9dc0945bb.txt
d94ed1f5-bc17-4731-9770-56064ba54141.txt
22a81926-009c-48ff-bca9-e026d505fc86.txt
b5906e66-47ff-48f2-8684-2207db8cf0d6.txt
673ca2ad-7ef7-405a-9330-7e699d87c2b1.txt
704c732d-4901-4fd1-ad32-fb27d9a08e35.txt
ac7622d4-7ef3-4f7e-878c-1f3edc794af3.txt
e408d310-1ecb-441c-b9f6-b3a9805f0f9f.txt
023cf4ec-6ee4-42a4-bd67-8ff602aff091.txt
fd6c44b0-162c-4451-8d53-5bfaa8c41f1e.txt
7fa67f86-9b80-4d07-8ab9-1b11f62c0f49.txt
66a49077-bd9d-4f82-8023-02502266ba79.txt
0a731899-5e1c-48b0-9dcc-c

In [146]:
# Sanity check: number of entries in target_dir (now "text_files"); expected to equal the temp2 count printed earlier.
print(count_files(target_dir))

65


In [147]:
# 3. Delete temp1 and temp2 directories.
dir1 = os.path.join(os.getcwd(), "temp1")
dir2 = os.path.join(os.getcwd(), "temp2")
shutil.rmtree(dir1)
shutil.rmtree(dir2)

In [148]:
# Confirm that both scratch directories are gone.
for scratch_name in ("temp1", "temp2"):
    check_if_dir_exists(scratch_name)

Directory 'temp1' does not exist.
Directory 'temp2' does not exist.
