## Import DOCX file

In [1]:
from docx import Document

In [2]:
# Open the sample Word document; the bare name on the last line makes the
# notebook display the object's repr.
document = Document(r"../res/test-swiss.docx")
document

<docx.document.Document at 0x2581b21aea8>

In [3]:
# Flatten every row of every table in the document into one list, each row
# being the list of its cells' text.
list_table = []
for table in document.tables:
    list_table.extend([cell.text for cell in row.cells] for row in table.rows)

# Print rows 1 through 10 (row 0 is skipped) as a quick look at the raw text.
for row in list_table[1:11]:
    print(row)

['PROSPECTS', 'PROSPECTS', 'PROSPECTS', 'PROSPECTS', 'PROSPECTS', 'PROSPECTS']
['GROUP ', 'ACTIVITY', 'CLIENT TYPE', 'INTERACTION MODE', 'OFFSHORE SCENARIO', 'ONSHORE SCENARIO']
['Social Contact \n', 'Social interactions\n', 'Retail', 'Pro-actively', 'YES', 'YES']
['Social Contact \n', 'Social interactions\n', 'Retail', 'Upon RS', 'YES       ', 'YES']
['Social Contact \n', 'Social interactions\n', 'Professionals\n', 'Pro-actively', 'YES', 'YES']
['Social Contact \n', 'Social interactions\n', 'Professionals\n', 'Upon RS', 'YES', 'YES']
['Social Contact \n', 'Provide contact details', 'Retail', 'Pro-actively ', 'YES', 'YES']
['Social Contact \n', 'Provide contact details', 'Retail', 'Upon RS ', 'YES', 'YES']
['Social Contact \n', 'Provide contact details', 'Professionals\n', 'Pro-actively ', 'YES', 'YES']
['Social Contact \n', 'Provide contact details', 'Professionals\n', 'Upon RS ', 'YES', 'YES']


## Make dataframes from tables

In [4]:
import pandas as pd
import io
import csv
from docx import Document

def read_docx_tables(filename, tab_id=None, **kwargs):
    """
    Load the table(s) of a Word Document (.docx) as Pandas DataFrame(s)

    Every table is first written to an in-memory CSV buffer and then parsed
    back with `pd.read_csv()`, so header detection and type inference follow
    `pd.read_csv()` and can be tuned through `kwargs`.

    Parameters:
        filename:   file name of a Word Document

        tab_id:     index (counting from 0) of the only table to parse.
                    When [None] - every table in the document is parsed

        kwargs:     arguments forwarded to `pd.read_csv()`

    Return: one DataFrame when tab_id is given, otherwise a list of DataFrames

    Raises: IndexError (re-raised after an error message is printed) when
            the [tab_id] lookup fails
    """
    def table_as_frame(tab, **read_csv_kwargs):
        # Serialise the cell texts as CSV in memory, then let pandas parse it.
        buffer = io.StringIO()
        csv.writer(buffer).writerows(
            [cell.text for cell in row.cells] for row in tab.rows
        )
        buffer.seek(0)
        return pd.read_csv(buffer, **read_csv_kwargs)

    doc = Document(filename)

    if tab_id is not None:
        try:
            return table_as_frame(doc.tables[tab_id], **kwargs)
        except IndexError:
            # Report the bad index, then let the caller see the original error.
            print('Error: specified [tab_id]: {}  does not exist.'.format(tab_id))
            raise

    return [table_as_frame(tab, **kwargs) for tab in doc.tables]

In [5]:
# No tab_id is passed, so every table in the document is parsed and a list of
# DataFrames is returned.
dfs = read_docx_tables(r"../res/test-swiss.docx")

In [6]:
# Show the table at index 1. In the recorded output the repeated title row
# ('PROSPECTS') was taken as the header, so the real column names sit in row 0.
dfs[1]

Unnamed: 0,PROSPECTS,PROSPECTS.1,PROSPECTS.2,PROSPECTS.3,PROSPECTS.4,PROSPECTS.5
0,GROUP,ACTIVITY,CLIENT TYPE,INTERACTION MODE,OFFSHORE SCENARIO,ONSHORE SCENARIO
1,Social Contact \n,Social interactions\n,Retail,Pro-actively,YES,YES
2,Social Contact \n,Social interactions\n,Retail,Upon RS,YES,YES
3,Social Contact \n,Social interactions\n,Professionals\n,Pro-actively,YES,YES
4,Social Contact \n,Social interactions\n,Professionals\n,Upon RS,YES,YES
5,Social Contact \n,Provide contact details,Retail,Pro-actively,YES,YES
6,Social Contact \n,Provide contact details,Retail,Upon RS,YES,YES
7,Social Contact \n,Provide contact details,Professionals\n,Pro-actively,YES,YES
8,Social Contact \n,Provide contact details,Professionals\n,Upon RS,YES,YES


### Make single dataframe from tables

In [7]:
import io
import csv
import pandas as pd

from copy import copy
from docx import Document


def table_to_df(tab):
    """
    Convert one table of a Word Document (.docx) into a long-format Pandas DataFrame

    The first table row supplies the client-scenario title. A title longer
    than TABLE_NAME_MAX_LEN characters is treated as a pre-condition text
    instead, and the client scenario is then read from the following row.
    The next row is the column header; its last two columns are the scenario
    columns. Every data row is emitted once per scenario column, with the
    scenario name (first word of the column title) in 'SCENARIO' and the
    cell value in 'OUTCOME'.

    Parameters:
        tab:   single table object from 'document.tables'
    Return: DataFrame
    """
    CLIENT_SCENARIO_COL = 'CLIENT SCENARIO'
    PRE_CONDITION_COL = 'PRE-CONDITION'
    # Titles longer than this are assumed to be pre-condition text
    # (heuristic threshold, may require adjustment).
    TABLE_NAME_MAX_LEN = 30

    # Cell texts with surrounding spaces / newlines removed.
    rows = [[cell.text.strip(' \n') for cell in row.cells] for row in tab.rows]

    title = rows.pop(0)[0]
    if len(title) > TABLE_NAME_MAX_LEN:
        # Non-standard table: pre-condition row first, client scenario second.
        lead_header = [PRE_CONDITION_COL, CLIENT_SCENARIO_COL]
        lead_values = [title, rows.pop(0)[0]]
    else:
        lead_header = [CLIENT_SCENARIO_COL]
        lead_values = [title]

    header = lead_header + rows[0]
    body = [lead_values + row for row in rows[1:]]

    # e.g. ['OFFSHORE', 'ONSHORE']
    scenario_names = [label.split()[0] for label in header[-2:]]
    header = header[:-2] + ['SCENARIO', 'OUTCOME']

    # All rows for the first scenario come first, then all rows for the second.
    first_scenario = [
        row[:-2] + [scenario_names[0], row[-2:][0]] for row in body
    ]
    second_scenario = [
        row[:-2] + [scenario_names[1], row[-2:][1]] for row in body
    ]

    # Round-trip through an in-memory CSV so pandas parses header and values.
    buffer = io.StringIO()
    csv.writer(buffer).writerows([header] + first_scenario + second_scenario)
    buffer.seek(0)
    return pd.read_csv(buffer)


def read_docx_tables_v2(tables):
    """
    Convert several Word tables and stack them into a single DataFrame

    Parameters:
        tables: list of tables to parse 'document.tables'
    Return: DataFrame containing all tables
    """
    frames = []
    for tab in tables:
        frames.append(table_to_df(tab))

    # No ignore_index is passed, so each table keeps its own row labels.
    return pd.concat(frames)

In [8]:
doc = Document(r"../res/test-swiss.docx")

# Convert the last table of the document to long format and display it.
tab1 = table_to_df(doc.tables[-1])
tab1

Unnamed: 0,CLIENT SCENARIO,GROUP,ACTIVITY,CLIENT TYPE,INTERACTION MODE,SCENARIO,OUTCOME
0,EXISTING CLIENTS,Follow-up administrative tasks,Corporate actions\nAccount closure,Retail,Pro-actively,OFFSHORE,YES
1,EXISTING CLIENTS,Follow-up administrative tasks,Corporate actions\nAccount closure,Retail,Upon RS,OFFSHORE,YES
2,EXISTING CLIENTS,Follow-up administrative tasks,Corporate actions\nAccount closure,Professionals,Pro-actively,OFFSHORE,YES
3,EXISTING CLIENTS,Follow-up administrative tasks,Corporate actions\nAccount closure,Professionals,Upon RS,OFFSHORE,YES
4,EXISTING CLIENTS,Follow-up administrative tasks,"Provide account statements, investment reports...",Retail,Pro-actively,OFFSHORE,YES
5,EXISTING CLIENTS,Follow-up administrative tasks,"Provide account statements, investment reports...",Retail,Upon RS,OFFSHORE,YES
6,EXISTING CLIENTS,Follow-up administrative tasks,"Provide account statements, investment reports...",Professionals,Pro-actively,OFFSHORE,YES
7,EXISTING CLIENTS,Follow-up administrative tasks,"Provide account statements, investment reports...",Professionals,Upon RS,OFFSHORE,YES
8,EXISTING CLIENTS,Follow-up administrative tasks,Corporate actions\nAccount closure,Retail,Pro-actively,ONSHORE,YES
9,EXISTING CLIENTS,Follow-up administrative tasks,Corporate actions\nAccount closure,Retail,Upon RS,ONSHORE,YES


In [9]:
# Preview the fourth table from the end; its recorded output carries a
# 'PRE-CONDITION' column.
table_to_df(doc.tables[-4]).head()

Unnamed: 0,PRE-CONDITION,CLIENT SCENARIO,GROUP,ACTIVITY,SUBACTIVITY,CLIENT TYPE,INTERACTION MODE,SCENARIO,OUTCOME
0,PRE-CONDITION FOR RELYING ON PREPARATORY ADMIN...,EXISTING CLIENTS,Preparatory administrative activities,Provide forms and contractual documents,Account opening contracts including related forms,Retail,Pro-actively,OFFSHORE,YES
1,PRE-CONDITION FOR RELYING ON PREPARATORY ADMIN...,EXISTING CLIENTS,Preparatory administrative activities,Provide forms and contractual documents,Account opening contracts including related forms,Retail,Upon RS,OFFSHORE,YES
2,PRE-CONDITION FOR RELYING ON PREPARATORY ADMIN...,EXISTING CLIENTS,Preparatory administrative activities,Provide forms and contractual documents,Account opening contracts including related forms,Professionals,Pro-actively,OFFSHORE,YES
3,PRE-CONDITION FOR RELYING ON PREPARATORY ADMIN...,EXISTING CLIENTS,Preparatory administrative activities,Provide forms and contractual documents,Account opening contracts including related forms,Professionals,Upon RS,OFFSHORE,YES
4,PRE-CONDITION FOR RELYING ON PREPARATORY ADMIN...,EXISTING CLIENTS,Preparatory administrative activities,Provide forms and contractual documents,Advisory and discretionary mandates including ...,Retail,Pro-actively,OFFSHORE,YES


In [10]:
# Combine every table except the first (index 0) into one DataFrame and
# summarise it.
# NOTE(review): table 0 is presumably not a scenario table — confirm.
final_df = read_docx_tables_v2(doc.tables[1:])
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 560 entries, 0 to 15
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   CLIENT SCENARIO    560 non-null    object
 1   GROUP              560 non-null    object
 2   ACTIVITY           560 non-null    object
 3   CLIENT TYPE        560 non-null    object
 4   INTERACTION MODE   560 non-null    object
 5   SCENARIO           560 non-null    object
 6   OUTCOME            560 non-null    object
 7   PRE-CONDITION      128 non-null    object
 8   SUBACTIVITY        128 non-null    object
 9   CONTRACTUAL SETUP  136 non-null    object
dtypes: object(10)
memory usage: 48.1+ KB


In [11]:
# Positional slice of ten rows. In the recorded output the index labels restart
# at 0 inside this window because pd.concat kept each table's own index.
final_df.iloc[220:230]

Unnamed: 0,CLIENT SCENARIO,GROUP,ACTIVITY,CLIENT TYPE,INTERACTION MODE,SCENARIO,OUTCOME,PRE-CONDITION,SUBACTIVITY,CONTRACTUAL SETUP
60,EXISTING CLIENTS,Marketing of Specific Services,Wealth services,Retail,Pro-actively,ONSHORE,YES,,,
61,EXISTING CLIENTS,Marketing of Specific Services,Wealth services,Retail,Upon RS,ONSHORE,YES,,,
62,EXISTING CLIENTS,Marketing of Specific Services,Wealth services,Professionals,Pro-actively,ONSHORE,YES,,,
63,EXISTING CLIENTS,Marketing of Specific Services,Wealth services,Professionals,Upon RS,ONSHORE,YES,,,
0,PROSPECTS,Preparatory administrative activities,Provide forms and contractual documents,Retail,Pro-actively,OFFSHORE,NO,PRE-CONDITION FOR RELYING ON PREPARATORY ADMIN...,Account opening contracts including related forms,
1,PROSPECTS,Preparatory administrative activities,Provide forms and contractual documents,Retail,Upon RS,OFFSHORE,YES with a fiduciary company and subject to an...,PRE-CONDITION FOR RELYING ON PREPARATORY ADMIN...,Account opening contracts including related forms,
2,PROSPECTS,Preparatory administrative activities,Provide forms and contractual documents,Professionals,Pro-actively,OFFSHORE,NO,PRE-CONDITION FOR RELYING ON PREPARATORY ADMIN...,Account opening contracts including related forms,
3,PROSPECTS,Preparatory administrative activities,Provide forms and contractual documents,Professionals,Upon RS,OFFSHORE,YES with a fiduciary company and subject to an...,PRE-CONDITION FOR RELYING ON PREPARATORY ADMIN...,Account opening contracts including related forms,
4,PROSPECTS,Preparatory administrative activities,Provide forms and contractual documents,Retail,Pro-actively,OFFSHORE,NO,PRE-CONDITION FOR RELYING ON PREPARATORY ADMIN...,Advisory and discretionary mandates including ...,
5,PROSPECTS,Preparatory administrative activities,Provide forms and contractual documents,Retail,Upon RS,OFFSHORE,YES with a fiduciary company and subject to an...,PRE-CONDITION FOR RELYING ON PREPARATORY ADMIN...,Advisory and discretionary mandates including ...,


In [12]:
# Print the first non-null value stored in the 'PRE-CONDITION' column
print(final_df['PRE-CONDITION'].dropna().iloc[0])

PRE-CONDITION FOR RELYING ON PREPARATORY ADMINISTRATIVE ACTIVITIES
Any activity below subject to “YES” shall only be conducted:
1) if the condition(s) for the marketing of the relevant service(s) is (are) met OR
2) when the activity is necessary to update the documents and information relating to existing clients


## Assemble in app