In [None]:
###Import all necessary packages
from docx import Document
import docx
import os
from docx.document import Document as doctwo
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
import pandas as pd
import numpy as np

import time
from datetime import timedelta, datetime

### Load Data

In [None]:
# All locations used below live under the same OneDrive folder.
_ONEDRIVE_ROOT = 'C:\\Users\\User\\OneDrive - Queensland University of Technology\\'

# Base folder that the per-report .docx paths are resolved against.
BASE_Source = _ONEDRIVE_ROOT + 'converted_pdfs\\'
# Folder that is checked for existing '*_decom.csv' files and receives the timing log.
BASE_Distination = BASE_Source + 'DecomposedDocs\\CloudStore\\'

# Excel workbook listing the reports to process.
FileList = _ONEDRIVE_ROOT + 'JupyterPythonQUT\\DoR\\Stage2\\FromPrudence\\Coal or mineral reports metadata_sent.xlsx'

In [None]:
# Read the Excel workbook at FileList into a DataFrame and display it.
df_fl = pd.read_excel(FileList)
df_fl

In [None]:
# Number of rows per distinct value of the 'report_batch' column.
df_fl['report_batch'].value_counts()

In [None]:
# Keep only the rows whose 'report_batch' is 'CloudStor'.
# reset_index returns a new frame rather than modifying in place, so its result
# must be assigned; otherwise the filtered frame keeps the original row labels.
df_fl = df_fl[df_fl['report_batch']=='CloudStor'].reset_index(drop=True)
len(df_fl)

### Define the decomposition class and its functionality

In [None]:
class decompose_document:
    """Split a .docx document into one record per heading section.

    The document body is walked in order. Headings start a new record; body
    text, image locations/captions and table locations/captions met before the
    next heading are accumulated in the current record. Multiple values in the
    same field are joined with ``SEPARATOR``.
    """

    # Separator placed between values accumulated in the same field of a record.
    SEPARATOR = '|||'

    # Columns (and their order) of the DataFrame returned by fill_columns.
    OUTPUT_COLUMNS = ['Doc Location', 'Heading Seq', 'Heading', 'Texts',
                      'Image Descriptions', 'Image Locations',
                      'Table Descriptions', 'Table Locations']

    def __init__(self):
        pass

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    def _doc_stem(self, file_loc):
        """Return the last backslash-separated component of *file_loc*, cut at its first dot."""
        return file_loc.split('\\')[-1].split('.')[0]

    def _append_to_field(self, row, key, value):
        """Store *value* under *key*, appending with SEPARATOR if the key already exists."""
        if key in row:
            row[key] += self.SEPARATOR + value
        else:
            row[key] = value
        return row

    def _add_paragraph_text(self, row, block, image_active, table_active):
        """Route the text of a non-empty paragraph to the right field(s) of *row*.

        The paragraph is treated as a caption when it directly follows an image
        and/or a table (flags set by the caller); only when neither flag is set
        is it stored as ordinary body text.
        """
        if image_active:
            row = self.add_image_desc_to_row(row, block)
        if table_active:
            row = self.add_table_desc_to_row(row, block)
        if not (image_active or table_active):
            row = self.add_text_to_row(row, block)
        return row

    # ------------------------------------------------------------------
    # relationship / image lookup
    # ------------------------------------------------------------------
    # get rels (image id and image name pairs)
    def get_rId_image_name_pairs(self, document):
        """Return ``{rId: image file name}`` for every image part related to *document*."""
        return {
            rel.rId: os.path.basename(rel._target.partname)
            for rel in document.part.rels.values()
            if isinstance(rel._target, docx.parts.image.ImagePart)
        }

    # check if current block is image
    def check_for_image(self, block, rels):
        """Return the image name referenced by paragraph *block*, or None.

        *rels* is the mapping produced by get_rId_image_name_pairs. The rId is
        searched as a complete quoted attribute value so that e.g. ``rId1``
        does not match a paragraph that only references ``rId10``.
        """
        # Serialise the paragraph once instead of once per relationship id.
        xml = block._p.xml
        for rId, image_name in rels.items():
            if '"{}"'.format(rId) in xml:
                return image_name
        return None

    # ------------------------------------------------------------------
    # record field accumulators
    # ------------------------------------------------------------------
    # get image row to add into dataframe
    def add_image_to_row(self, row, file_loc, image_name):
        """Add ``<doc stem>/<image_name>`` to the 'Image Locations' field of *row*."""
        location = os.path.join(self._doc_stem(file_loc), image_name)
        return self._append_to_field(row, 'Image Locations', location)

    def add_image_desc_to_row(self, row, block):
        """Add the text of *block* to the 'Image Descriptions' field of *row*."""
        return self._append_to_field(row, 'Image Descriptions', block.text)

    def add_text_to_row(self, row, block):
        """Add the text of *block* to the 'Texts' field of *row*."""
        return self._append_to_field(row, 'Texts', block.text)

    def add_table_loc_to_row(self, row, file_loc, table_seq):
        """Add ``<doc stem>/table<table_seq>.csv`` to the 'Table Locations' field of *row*.

        Only the location string is recorded; this method does not write the CSV.
        """
        location = os.path.join(self._doc_stem(file_loc), 'table{}.csv'.format(table_seq))
        return self._append_to_field(row, 'Table Locations', location)

    def add_table_desc_to_row(self, row, block):
        """Add the text of *block* to the 'Table Descriptions' field of *row*."""
        return self._append_to_field(row, 'Table Descriptions', block.text)

    ##This function extracts the tables and paragraphs from the document object
    def iter_block_items(self, parent):
        """
        Yield each paragraph and table child within *parent*, in document order.
        Each returned value is an instance of either Table or Paragraph. *parent*
        would most commonly be a reference to a main Document object, but
        also works for a _Cell object, which itself can contain paragraphs and tables.

        Raises ValueError when *parent* is neither a Document nor a _Cell.
        """
        if isinstance(parent, doctwo):
            parent_elm = parent.element.body
        elif isinstance(parent, _Cell):
            parent_elm = parent._tc
        else:
            raise ValueError("something's not right")

        for child in parent_elm.iterchildren():
            if isinstance(child, CT_P):
                yield Paragraph(child, parent)
            elif isinstance(child, CT_Tbl):
                yield Table(child, parent)

    def fill_columns(self, df_adoc, doc_loc):
        """Return *df_adoc* restricted to OUTPUT_COLUMNS, adding any that are missing.

        A missing 'Doc Location' column is filled with *doc_loc*; every other
        missing column is filled with NaN. Missing columns are added to the
        passed frame itself before the ordered selection is returned.
        """
        if 'Doc Location' not in df_adoc.columns:
            df_adoc['Doc Location'] = doc_loc
        for column in self.OUTPUT_COLUMNS:
            if column not in df_adoc.columns:
                df_adoc[column] = np.nan

        return df_adoc[self.OUTPUT_COLUMNS]

    # This function splits each data block in a docx and tags them for decomposition
    def decompose_docx(self, base, file_loc):
        """Decompose the document at ``base + file_loc`` into a DataFrame.

        Parameters
        ----------
        base : str
            Prefix concatenated directly with *file_loc* to form the path opened.
        file_loc : str
            Backslash-separated path of the .docx relative to *base*; it is also
            stored in the 'Doc Location' column.

        Returns
        -------
        pandas.DataFrame
            One row per heading section with the columns in OUTPUT_COLUMNS.
            Content found before the first heading is stored in the same row as
            the first heading ('Heading Seq' 0).
        """
        document = Document(base+file_loc)
        rels = self.get_rId_image_name_pairs(document)
        heading_seq = 0
        image_active = False   # the previous block was an image: next text is its caption
        table_active = False   # a table was seen: next text is its caption
        table_seq = 0
        rows = []
        row = {'Heading Seq': heading_seq}
        for block in self.iter_block_items(document):

            if isinstance(block, Paragraph):  # heading, image holder or plain paragraph
                image_name = self.check_for_image(block, rels)
                if image_name:  # the paragraph holds an image
                    print('-[Image]', os.path.join(self._doc_stem(file_loc), image_name))
                    image_active = True
                    row = self.add_image_to_row(row, file_loc, image_name)
                elif block.style.name.startswith('Heading'):
                    if 'Heading' in row:
                        # The current record already has a heading: close it
                        # and open the next one.
                        rows.append(row)
                        heading_seq += 1
                        row = {'Heading Seq': heading_seq}
                    # The counter is advanced only when a new record is opened,
                    # so stored 'Heading Seq' values are consecutive and match
                    # the number printed below.
                    row['Heading'] = block.text
                    print('[Heading {}]'.format(heading_seq), block.text)
                elif block.text != '':
                    print('-', block.text)
                    # A caption is stored as a description only, not also as body text.
                    row = self._add_paragraph_text(row, block, image_active, table_active)
                    image_active = False
                    table_active = False
            else:  # the block is a table
                print('-[Table]')
                table_active = True
                row = self.add_table_loc_to_row(row, file_loc, table_seq)
                table_seq += 1
        rows.append(row)
        df_adoc = pd.DataFrame.from_records(rows)
        df_adoc = self.fill_columns(df_adoc, file_loc)

        return df_adoc


### Do decomposition on files

In [None]:
def check_exist(drive_path, file_name):
    """Tell whether an entry named exactly ``file_name`` sits directly inside ``drive_path``.

    The comparison is done against the names returned by ``os.listdir``, so it
    is an exact string match and the directory itself must exist.
    """
    for entry in os.listdir(drive_path):
        if entry == file_name:
            return True
    return False

def write_elapsed_time(drive_path, elapsed_time, task_name):
    """Append a timing entry to ``elapsed_times.txt`` inside ``drive_path``.

    Parameters
    ----------
    drive_path : str
        Directory holding the log file; a trailing path separator is optional.
    elapsed_time : str
        Already formatted duration to record.
    task_name : str
        Label identifying the task the duration belongs to.

    The entry also records the current date and time (``datetime.today()``).
    """
    entry = (
        '===================================\n'
        'Task Name: {}\n'
        'Elapsed Time: {}\n'
        'Date time at the end of task: {}\n'
    ).format(task_name, elapsed_time, datetime.today())
    # os.path.join keeps the file inside drive_path whether or not the caller
    # passed a trailing separator; plain concatenation would otherwise create
    # '<dirname>elapsed_times.txt' next to the directory.
    log_path = os.path.join(drive_path, 'elapsed_times.txt')
    with open(log_path, 'a') as log_file:
        log_file.write(entry)

In [None]:
# --- Find the batch to resume from ------------------------------------------
# Rows are handled in batches of 100. For each full batch, check whether its
# '<first>_<last>_decom.csv' already exists in BASE_Distination and stop at the
# first batch whose file is missing; range_end is then the last row number
# (1-based) of that batch.
# NOTE(review): int(len(df_fl)/100) ignores a trailing partial batch, and when
# every checked file exists range_end stays on the last existing batch. With
# fewer than 100 rows the loop body never runs and range_end stays 0.
range_end = 0
for i in range((int(len(df_fl)/100))):
    range_end = (i+1)*100
    if check_exist(BASE_Distination, str(range_end-100+1)+'_'+str(range_end)+'_decom.csv'):
        print('File {}_decom.csv Exists'.format(str(range_end-100+1)+'_'+str(range_end)))
    else:
        break
print('---Current file~'+str(range_end-100+1)+'_'+str(range_end)+'_decom.csv')

# --- Decompose the documents --------------------------------------------------
dc = decompose_document()
count = 0  # 1-based position of the current row within df_fl
for idx in df_fl.index:
    count+=1
    # Skip the rows that come before the first row of the batch found above.
    if count < range_end-100+1:
        continue
    
    # The timer is restarted for every row.
    start_time = time.time()
    print("Date time: {} at the start of file [{}]".format(datetime.today(), range_end))

    # Relative path of the document: <Datagroup>\<batch as int>\<'C' up to its first dot>.docx
    file_loc = df_fl.loc[idx, 'Datagroup']+'\\'+str(int(df_fl.loc[idx, 'batch']))+'\\'+df_fl.loc[idx, 'C'].split('.')[0]+'.docx'
    df_adoc = dc.decompose_docx(BASE_Source, file_loc)
    #df_adoc = dc.decompose_docx('C:\\Users\\User\\Downloads\\OneDrive_2022-12-13\\gsq identified reports\\', 'cr085525_cr_85525_1.docx')
    #df_adoc = dc.decompose_docx('C:\\Users\\User\\Downloads\\OneDrive_2022-12-13\\gsq identified reports\\', 'cr113547_cr_113547_1.docx')
    # TODO: unfinished. The unconditional break below stops after the first
    # processed document, so the timing code after it is never reached, and
    # nothing in this cell writes a '*_decom.csv' file.
    break

    # NOTE(review): unreachable because of the break above. As written it would
    # log the time of the current row only (start_time is reset per row) and
    # label the entry with the raw start timestamp - confirm the intent.
    if count%100==0:
        elapsed_time = str(timedelta(seconds=time.time() - start_time))
        print('Elapsed Time: {}'.format(elapsed_time))
        write_elapsed_time(BASE_Distination, elapsed_time, '{}_out.csv'.format(start_time))
        

In [None]:
# Display the frame returned by the last decompose_docx call in the loop above.
df_adoc