# Scan a Microsoft Word document (*.docx) and count the number of characters in each section for use in a treemap
mike babb  
created: 2025 09 06

In [1]:
# standard libraries
import datetime
import os
from pathlib import Path

In [2]:
# external libraries
from docx import Document
import numpy as np
import pandas as pd

# control logic and inputs and outputs

In [3]:
# switch between the bundled demo document (True) and user-supplied
# input / output locations (False); read by the configuration cell below
use_demo = True

In [None]:
if not use_demo:
    # input: directory and file name of the Word document to scan
    input_file_path = "YOUR-INPUT-DIRECTORY"
    input_file_name = 'YOUR-INPUT-FILE'
    input_fpn = os.path.join(input_file_path, input_file_name)

    # output: directory that receives the tab-delimited results
    output_file_path = "YOUR-OUTPUT-DIRECTORY"

    # a timestamp in the file name keeps successive runs from overwriting
    # each other, so incremental changes can be compared
    curr_dt = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    output_file_name = f'chapter_data_{curr_dt}.txt'
    output_fpn = os.path.join(output_file_path, output_file_name)
else:
    # demo mode: read and write next to the notebook
    input_file_name = 'demo_text.docx'
    input_fpn = input_file_name
    output_fpn = 'demo_text.txt'

# open the document and read all of the paragraphs

In [5]:
# load the Word document from the configured input path
document = Document(input_fpn)
# the paragraph objects scanned by the cells below
# NOTE(review): presumably this excludes text inside tables - confirm
# against the python-docx documentation if table text should be counted
paras = document.paragraphs

## scan the document to count the maximum heading depth

In [6]:
# number of heading slots needed: index 0 is reserved and indices 1..N hold
# the text of heading levels 1..N. Stays 0 when the document has no headings.
max_heading_depth = 0

# collect the distinct heading styles used in the document
style_id_set = set()
for para in paras:
    # the style_id is the name of the style
    style_id = para.style.style_id

    # prefix match, the same test the paragraph scan uses; a substring match
    # would also count styles that merely contain the word 'Heading'
    if style_id[:7] == 'Heading':
        style_id_set.add(style_id)

style_id_set = sorted(style_id_set)

# size the depth from the DEEPEST heading level, not from the number of
# distinct heading styles: a document that uses Heading1 and Heading3 but
# skips Heading2 still needs a slot at index 3
heading_levels = [int(sid[-1]) for sid in style_id_set if sid[-1].isdigit()]
if heading_levels:
    # level 0 is treated as level 1 by the paragraph scan, hence the floor of 1
    max_heading_depth = max(max(heading_levels), 1) + 1

print(f"{max_heading_depth=}")

max_heading_depth=5


In [7]:
# one column per heading slot (h0, h1, ...), framed by the source file name
# and the character count
heading_cols = [f'h{level}' for level in range(max_heading_depth)]
col_names = ['file_name', *heading_cols, 'char_count']
print(f"{col_names=}")

col_names=['file_name', 'h0', 'h1', 'h2', 'h3', 'h4', 'char_count']


## scan the paragraphs to count the number of characters and tally based on the style in each section

In [8]:
print("Number of paragraphs:", len(paras))

# one row per counted paragraph: [file_name, heading text per depth, char_count]
heading_data_output_list = []

# count characters only in paragraphs with these text styles
keep_style_set = {'Normal', 'Caption', 'ListParagraph'}

# skip any paragraph whose text is exactly one of these headings
ignore_headings = {'List of Figures', 'List of Tables'}

# the heading text at each depth occupies the elements of this list
h_list = [''] * max_heading_depth

# numeric position at each depth; [1, 2, 1, 0, 0] is section 1.2.1.
# Created before the loop so that a document whose first heading is deeper
# than level 1 does not raise a NameError.
heading_idx_array = np.zeros(shape = (max_heading_depth, ), dtype = int)

# running counter used to give every heading a unique, sortable id
heading_count = 1

# running counter of level (0, 1) headings
h0_level_count = 1

# id -> [heading text, copy of the numeric position array]
heading_dict = {}

# current and previously seen heading levels
# necessary for properly resetting the deeper numeric positions
curr_depth = 0
prev_depth = 0

# walk the paragraphs in document order
for para in paras:

    # the id of the style - the name
    style_id = para.style.style_id

    # the text of the paragraph
    curr_text = para.text

    # let's remove the 'list of figures' and the 'list of tables'
    if curr_text in ignore_headings:
        continue

    if style_id[:7] == 'Heading':
        # the heading level is the trailing digit of the style id
        style_level = int(style_id[-1])

        # every time a heading of (0,1) is encountered, reset the depth lists
        if style_level in (0, 1):
            h_list = [''] * max_heading_depth
            style_level = 1

            # start a fresh numeric depth for the new top-level section
            heading_idx_array = np.zeros(shape = (max_heading_depth, ), dtype = int)
            heading_idx_array[0] = h0_level_count
            h0_level_count += 1

        # strip excess spaces and create a unique id for this heading
        stripped_text = curr_text.strip()
        id_stripped_text = str(heading_count).zfill(2) + ' ' + stripped_text

        # track the depth level via the 1.1.3.4 format:
        # when moving back up the hierarchy, clear the deeper positions
        prev_depth = curr_depth
        curr_depth = style_level

        if prev_depth > curr_depth:
            heading_idx_array[curr_depth:] = 0

        if style_level > 1:
            heading_idx_array[style_level - 1] += 1

        # store a copy: the working array keeps changing as the scan proceeds
        heading_dict[id_stripped_text] = [stripped_text, heading_idx_array.copy()]

        # reset the text in the deeper style levels
        # this is necessary when the current style level is 2 and the previous
        # style level is 3. Otherwise, the text for style level 3
        # would be incorrectly carried forward
        for sl in range(style_level + 1, max_heading_depth):
            h_list[sl] = ''

        h_list[style_level] = id_stripped_text

        heading_count += 1

    if style_id in keep_style_set:
        # file name, the current heading text at every depth, and finally
        # the number of characters in this paragraph
        heading_data_output_list.append([input_file_name, *h_list, len(curr_text)])

# nested heading depth dataframe
df = pd.DataFrame(data=heading_data_output_list, columns=col_names)

# drop heading columns that are all '' in the nested heading depth dataframe.
# Only the heading columns are candidates: when no paragraphs were counted the
# all-'' test is vacuously true and would also drop file_name and char_count.
drop_col_list = [cn for cn in col_names[1:-1] if np.all(a = df[cn] == '')]
df = df.drop(labels=drop_col_list, axis = 1)

# generate a list of column names pertaining to headings
heading_col_names = df.columns.to_list()[1:-1]

# heading depth dataframe
hd_df = pd.DataFrame.from_dict(data = heading_dict, orient='index',
                               columns = ['heading_text', 'heading_level']).reset_index(names = ['id_heading_text'])

Number of paragraphs: 60


In [9]:
# preview the heading lookup table; heading_level still holds the raw arrays
hd_df.head()

Unnamed: 0,id_heading_text,heading_text,heading_level
0,01 heading 1,heading 1,"[1, 0, 0, 0, 0]"
1,02 subheading 1.1,subheading 1.1,"[1, 1, 0, 0, 0]"
2,03 subheading 1.2,subheading 1.2,"[1, 2, 0, 0, 0]"
3,04 subheading 1.2.1,subheading 1.2.1,"[1, 2, 1, 0, 0]"
4,05 subheading 1.2.2,subheading 1.2.2,"[1, 2, 2, 0, 0]"


In [10]:
# preview the per-paragraph character counts
df.head()

Unnamed: 0,file_name,h1,h2,h3,h4,char_count
0,demo_text.docx,01 heading 1,,,,81
1,demo_text.docx,01 heading 1,02 subheading 1.1,,,230
2,demo_text.docx,01 heading 1,03 subheading 1.2,,,330
3,demo_text.docx,01 heading 1,03 subheading 1.2,04 subheading 1.2.1,,59
4,demo_text.docx,01 heading 1,03 subheading 1.2,05 subheading 1.2.2,,379


# Format the data for output

In [11]:
def collapse_numeric_depth(hdl:list) -> str:
    """ Collapse the heading level. From [1,1,0,0] to 1.1.

    Only trailing zeros are removed, so a skipped level stays visible:
    [1,0,2,0] becomes 1.0.2 and cannot collide with a real 1.2 section.

    Args:
        hdl (list): Heading level as a list or numpy array of integers.
            [1,1,0,0]

    Returns:
        str: 1.1. A top-level heading such as [1,0,0,0] is returned as 1.0;
        an empty input or one whose first position is 0 is returned as 0.0.
    """

    # plain ints work for both lists and numpy arrays; the original relied on
    # ndarray.tolist() and failed on the list the signature advertises
    levels = [int(hd) for hd in hdl]

    # no top-level heading has been seen yet
    if not levels or levels[0] == 0:
        return '0.0'

    # drop the unused deeper positions; levels[0] is non-zero, so this stops
    while levels[-1] == 0:
        levels.pop()

    # a bare chapter number is written as N.0
    if len(levels) == 1:
        levels.append(0)

    return '.'.join(str(hd) for hd in levels)

In [12]:
# replace each numeric position array with its dotted string form
hd_df['heading_level'] = hd_df['heading_level'].map(collapse_numeric_depth)

In [13]:
# sum the character counts over every column except the count itself,
# giving one row per unique combination of file name and headings
agg_col_names = df.columns.tolist()
group_keys = agg_col_names[:-1]
df_agg = (
    df.groupby(group_keys)
    .agg(char_count = ('char_count', 'sum'))
    .reset_index()
)
df_agg.shape

(30, 6)

In [14]:
# preview the aggregated character counts
df_agg.head()

Unnamed: 0,file_name,h1,h2,h3,h4,char_count
0,demo_text.docx,01 heading 1,,,,81
1,demo_text.docx,01 heading 1,02 subheading 1.1,,,230
2,demo_text.docx,01 heading 1,03 subheading 1.2,,,330
3,demo_text.docx,01 heading 1,03 subheading 1.2,04 subheading 1.2.1,,59
4,demo_text.docx,01 heading 1,03 subheading 1.2,05 subheading 1.2.2,,379


In [15]:
# preview the heading lookup table with the collapsed heading levels
hd_df.head()

Unnamed: 0,id_heading_text,heading_text,heading_level
0,01 heading 1,heading 1,1.0
1,02 subheading 1.1,subheading 1.1,1.1
2,03 subheading 1.2,subheading 1.2,1.2
3,04 subheading 1.2.1,subheading 1.2.1,1.2.1
4,05 subheading 1.2.2,subheading 1.2.2,1.2.2


In [16]:
def update_heading_level(ht:str) -> str:
    """ Swap a heading id for its numbered heading text.

    Looks the id up in the module-level ``hd_df`` dataframe and, when found,
    returns the heading level followed by the heading text.

    Args:
        ht (str): Heading id as stored in hd_df['id_heading_text'],
            or '' for an unused depth.

    Returns:
        str: '<heading_level> <heading_text>' for a known id; otherwise
        the input is returned unchanged.
    """

    # unused depths carry no heading
    if ht == '':
        return ht

    # rows of the lookup table that belong to this id
    matches = hd_df.loc[hd_df['id_heading_text'] == ht, :]
    if matches.shape[0] == 0:
        return ht

    # the first match supplies the level and the text
    level = matches['heading_level'].iloc[0]
    text = matches['heading_text'].iloc[0]
    return level + ' ' + text

In [17]:
# swap every heading id for its numbered heading text; missing values are
# first replaced by '' so the lookup leaves them untouched
for heading_col in heading_col_names:
    filled_col = df_agg[heading_col].fillna('')
    df_agg[heading_col] = filled_col.map(update_heading_level)

In [18]:
# preview the aggregated counts with the numbered headings
df_agg.head()

Unnamed: 0,file_name,h1,h2,h3,h4,char_count
0,demo_text.docx,1.0 heading 1,,,,81
1,demo_text.docx,1.0 heading 1,1.1 subheading 1.1,,,230
2,demo_text.docx,1.0 heading 1,1.2 subheading 1.2,,,330
3,demo_text.docx,1.0 heading 1,1.2 subheading 1.2,1.2.1 subheading 1.2.1,,59
4,demo_text.docx,1.0 heading 1,1.2 subheading 1.2,1.2.2 subheading 1.2.2,,379


In [19]:
# text that sits directly under a chapter heading has no sub-heading of its
# own; give those rows the chapter heading as their h2 value
sub_heading_cols = heading_col_names[1:]
output_list = []
for _, record in df_agg.iterrows():

    top_heading = record['h1']
    second_heading = record['h2']

    # every heading below h1 for this row
    sub_values = np.array(object=[record[col] for col in sub_heading_cols])

    if np.all(a = sub_values == ''):
        output_list.append(top_heading)
    else:
        output_list.append(second_heading)

In [20]:
# overwrite h2 with the back-filled values built in the previous cell
df_agg['h2'] = output_list

# save it!

In [21]:
# save the aggregated counts as a tab-delimited text file, without the index
df_agg.to_csv(path_or_buf=output_fpn, sep='\t', index=False)