In [1]:
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting lxml>=3.1.0 (from python-docx)
  Downloading lxml-5.3.1-cp311-cp311-win_amd64.whl.metadata (3.8 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
Downloading lxml-5.3.1-cp311-cp311-win_amd64.whl (3.8 MB)
   ---------------------------------------- 0.0/3.8 MB ? eta -:--:--
   ----- ---------------------------------- 0.5/3.8 MB 16.4 MB/s eta 0:00:01
   -------- ------------------------------- 0.8/3.8 MB 4.8 MB/s eta 0:00:01
   ------------------- -------------------- 1.8/3.8 MB 3.1 MB/s eta 0:00:01
   --------------------------- ------------ 2.6/3.8 MB 3.3 MB/s eta 0:00:01
   -------------------------------- ------- 3.1/3.8 MB 3.1 MB/s eta 0:00:01
   -------------------------------------- - 3.7/3.8 MB 3.0 MB/s eta 0:00:01
   ---------------------------------------- 3.8/3.8 MB 3.1 MB/s eta 0:00:00
Installing collected packages: lxml, python-docx
Successfully installed lxml-5.3.1 

In [2]:
from docx import Document
import pandas as pd

In [7]:
# 1. Extract all tables from a Word file
def extract_tables_from_doc(doc_path):
    """Read every table of a Word document into a pandas DataFrame.

    The first row of each table supplies the column labels and the remaining
    rows become the data. Cell text is whitespace-stripped and kept as
    strings. Progress messages and each resulting DataFrame are printed.

    Parameters
    ----------
    doc_path
        Location of the .docx file; passed unchanged to ``Document``.

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per table, in the order of ``doc.tables``, so list
        positions always match table positions. A table without any rows
        yields an empty DataFrame. The list is empty when the document
        contains no tables.
    """
    doc = Document(doc_path)
    tables = doc.tables
    dataframes = []

    if not tables:
        print("No tables found in the file.")
        return dataframes

    print(f"Total number of extracted tables: {len(tables)}")

    for i, table in enumerate(tables, start=1):
        print(f"\nConverting Table {i} to DataFrame...")

        table_data = [
            [cell.text.strip() for cell in row.cells]
            for row in table.rows
        ]

        if table_data:
            # First row is the header, everything after it is data.
            df = pd.DataFrame(table_data[1:], columns=table_data[0])
        else:
            # A table with no rows has no header to index; keep a placeholder
            # so later tables do not shift position in the returned list.
            df = pd.DataFrame()
        dataframes.append(df)

        print(f"DataFrame of Table {i}:")
        print(df)

    return dataframes

In [8]:
# 2. Preprocess data
def preprocess_data(df):
    """Drop empty rows/columns and normalize the text cells of ``df``.

    The frame is modified in place and also returned, so both
    ``preprocess_data(df)`` and ``df = preprocess_data(df)`` work.

    Steps:
      1. Remove rows, then columns, whose values are all missing (NaN/None).
      2. For every text-bearing column, lowercase each string, delete every
         character that is neither a word character nor whitespace, and
         strip leading/trailing whitespace.

    Values that are not strings (numbers, None, NaN, ...) are left untouched,
    even when they share a column with strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. Column labels may repeat.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Remove empty rows/columns
    df.dropna(how='all', inplace=True)
    df.dropna(axis=1, how='all', inplace=True)

    # Normalize text
    # Columns are visited by position rather than by label: with repeated
    # labels, ``df[label]`` returns a DataFrame, which has no ``.dtype``.
    for position in range(df.shape[1]):
        column = df.iloc[:, position]

        # Accept both object columns and pandas' dedicated string dtype.
        holds_text = (
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
        )
        if not holds_text:
            continue

        is_text = column.map(lambda value: isinstance(value, str)).astype(bool)
        if not is_text.any():
            # Nothing to normalize; the .str accessor would also reject an
            # object column that holds no strings at all.
            continue

        cleaned = column.str.lower()  # Convert to lowercase
        cleaned = cleaned.str.replace(r'[^\w\s]', '', regex=True)  # Remove special characters
        cleaned = cleaned.str.strip()  # Remove extra spaces

        # The .str methods produce NaN for non-string entries; restore the
        # original value wherever the cell was not a string.
        df.isetitem(position, cleaned.where(is_text, column))

    return df

In [None]:
# 4. File path for the Word document
# Relative path: resolved against the working directory of the running kernel.
doc_path = "Data/00002469135802469136ctcp-bcg-land03022025-000000bo-co-ti-chnh-qu-4-2024-ca-cng-ty-m.docx"

# Extract and convert tables to DataFrames
# `dataframes` holds one DataFrame per table; a later cell indexes into it.
# NOTE(review): this cell carries no execution count in the saved notebook even
# though `dataframes` is read afterwards - re-run top to bottom to confirm.
dataframes = extract_tables_from_doc(doc_path)

In [10]:
# Display the third extracted table (list index 2) as the cell's rich output.
dataframes[2]

Unnamed: 0,Items,Code,Note,Closing balance,Opening balance
0,A. SHORT-TERM ASSETS,100,,711271276203,485932979327
1,I. Cash and cash equivalents,110,V.01,6490181446,14178909455
2,1. Cash,111,,6490181446,14178909455
3,II. Short-term investments,120,V.04,4104000000,4560000000
4,I. Trading securities,121,,4560000000,4560000000
5,2. Provisions for devaluation of trading secur...,122,,"(456,000,000)",-
6,III. Short-term receivables,130,,687380288197,462524739791
7,1. Short-term trade receivables,131,V.02,16034000000,4268000000
8,2. Short-term repayments to suppliers,132,V.03,129902906000,109145210000
9,6. Other short-term receivables,136,V.06,"542,843,3 82,197",350511529791


In [11]:
# Function to extract and preprocess data from tables
def process_tables_to_text(doc_path):
    """Flatten a Word document into a list of text lines.

    All non-blank paragraphs come first, whitespace-stripped. They are
    followed by one entry per table row: the row's non-blank cell texts,
    stripped and joined with " | ", prefixed with "Table: ". Rows whose cells
    are all blank are skipped.

    Parameters
    ----------
    doc_path
        Location of the .docx file; passed unchanged to ``Document``.

    Returns
    -------
    list of str
        Paragraph lines followed by table-row lines.
    """
    document = Document(doc_path)

    # Paragraph text, trimmed, with blank paragraphs dropped.
    trimmed_paragraphs = (para.text.strip() for para in document.paragraphs)
    lines = [content for content in trimmed_paragraphs if content]

    # Table rows, each collapsed to a single pipe-separated line.
    for tbl in document.tables:
        for tbl_row in tbl.rows:
            cell_values = (entry.text.strip() for entry in tbl_row.cells)
            joined = " | ".join(value for value in cell_values if value)
            if not joined:
                continue
            # The prefix marks the line as coming from a table.
            lines.append(f"Table: {joined}")

    return lines

# File path
# Rebinds doc_path to the same literal used by the table-extraction cell, so
# this cell can run without that one having been executed.
doc_path = "Data/00002469135802469136ctcp-bcg-land03022025-000000bo-co-ti-chnh-qu-4-2024-ca-cng-ty-m.docx"

# Extract and convert data
# processed_texts: list of strings - paragraph lines first, then "Table: ..." rows.
processed_texts = process_tables_to_text(doc_path)


In [12]:
# Print every extracted entry on its own line (the whole document, not a sample).
print("\n".join(processed_texts))

CÔNG TY CỔ PHẦN BCG LAND
BCG LAND JOINT STOCK COMPANY
M^
LAND
CỘNG HÒA XÃ HỘI CHỦ NGHĨA VIỆT NAM
Độc lập - Tự do - Hạnh phúc
THE SOCIALIST REPUBLIC OF VIETNAM
Independence - Freedom - Happiness
TP. Hồ Chí Minh, ngày 01 tháng 24 năm 2025
Ho Chi Minh City, January 24, 2025
SỐ: 04/2025/CBTT-BCGL
CÔNG BỐ THÔNG TIN
DISCLOSURE OF INFORMATION
Kính gửi/To: - Uy ban Chửng khoán Nhà nước/ The State Security Commission
- Sở Giao Dịch Chửng khoán Hà Nội / Hanoi Stock Exchange
Tên tổ chức/ Organization name: Công ty cổ phần BCG LAND/ BCG LAND Joint Stock
Company
Mã chứng khoán/ Securities Symbol: BCR
Địa chỉ trụ sở chính/ Address: 22A Đường số 7, Phường An Phú, Tp. Thủ Đức, Tp. Hồ
Chí Minh, Việt Nam / No 22A, Street 7, An Phu Ward, Thu Due City, Ho Chi Minh City, Vietnam
Điện thoại/ Tel: 028 22216868
Người thực hiện công bố thông tin/ Submitted by: Ông (Mr.) Phạm Đại Nghĩa
Chức vụ/ Position: Thành viên HĐQT kiêm Phó Tổng Giám đốc/ Member of the BODs cum
Deputy CEO
Loại thông tin công bố:	0định kỳ □