In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import fitz
# import PyPDF2
# import jpype
# import pdfplumber
# from openpyxl import Workbook

# Put the parent of the current working directory at the front of sys.path
# (only once) so modules that live one level above the notebook can be imported.
parent_dir = os.path.abspath('../')
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

#### Features to use:
- Index of industrial production (%)
- Production of wood (Thous. m3)
- Poultry population (Mill. heads) (so suc vat trong nuoc suc vat)
- Livestock population (Thous. heads)
- Number of farms
- Production of cereals (Thous. tons)
- ENTERPRISE, COOPERATIVE AND NON-FARM INDIVIDUAL BUSINESS ESTABLISHMENT
- Area of floors of residential buildings constructed in the year by types of house (Thous. m2)
- Investment at current prices (Bill. dongs)
- Structure of investment at current prices (%)
- Expenditure on social and economic services
- STRUCTURE OF GRDP AT CURRENT PRICES (%)
- POPULATION DENSITY (Person/km2)
- POPULATION (Thous. pers.)
- AREA OF LAND (Thous. ha) 

In [2]:
# --- Paths ------------------------------------------------------------------
# All paths are relative to the notebook's working directory; adjust them to
# match your local data layout.
econ_input = "../data/socio-economic-data-of-63-provinces-and-centrally-run-cities-2015-2021.pdf"
econ_output = "../data/Vietcon-extracted-feature"
econ_output_2 = econ_output  # same destination folder as econ_output
output_csv = "../data/Viecon_data.csv"

# --- Feature titles ---------------------------------------------------------
# Table titles searched for in the PDF page text (matched case-insensitively
# by the extraction function).
features = [
    # industry
    'Index of industrial production (%)',
    # agriculture and forestry
    'Production of wood (Thous. m3)',
    'Poultry population (Mill. heads)',
    'Livestock population (Thous. heads)',
    'Number of farms',
    'Production of cereals (Thous. tons)',
    # construction and investment
    'Area of floors of residential buildings constructed in the year by types of house (Thous. m2)',
    'Investment at current prices (Bill. dongs)',
    'Structure of investment at current prices (%)',
    # public spending and GRDP
    'Expenditure on social and economic services',
    'STRUCTURE OF GRDP AT CURRENT PRICES (%)',
    # population and land
    'POPULATION DENSITY (Person/km2)',
    'POPULATION (Thous. pers.)',
    'AREA OF LAND (Thous. ha)',
]

# --- Province names ---------------------------------------------------------
# Names are written without diacritics and are searched for in the page text.
# NOTE(review): this list has 62 entries while the PDF file name mentions 63
# provinces/cities (Ho Chi Minh City appears to be absent) - confirm whether
# that omission is intentional.
provinces = [
    "An Giang",
    "Ba Ria-Vung Tau", "Bac Lieu", "Bac Kan", "Bac Giang", "Bac Ninh",
    "Ben Tre", "Binh Duong", "Binh Dinh", "Binh Phuoc", "Binh Thuan",
    "Ca Mau", "Can Tho", "Cao Bang",
    "Da Nang", "Dak Lak", "Dak Nong", "Dien Bien", "Dong Nai", "Dong Thap",
    "Gia Lai",
    "Ha Giang", "Ha Nam", "Ha Noi", "Ha Tinh", "Hai Duong", "Hai Phong",
    "Hau Giang", "Hoa Binh", "Hung Yen",
    "Khanh Hoa", "Kien Giang", "Kon Tum",
    "Lai Chau", "Lam Dong", "Lang Son", "Lao Cai", "Long An",
    "Nam Dinh", "Nghe An", "Ninh Binh", "Ninh Thuan",
    "Phu Tho", "Phu Yen",
    "Quang Binh", "Quang Nam", "Quang Ngai", "Quang Ninh", "Quang Tri",
    "Soc Trang", "Son La",
    "Tay Ninh", "Thai Binh", "Thai Nguyen", "Thanh Hoa", "Thua Thien - Hue",
    "Tien Giang", "Tra Vinh", "Tuyen Quang",
    "Vinh Long", "Vinh Phuc",
    "Yen Bai",
]


In [3]:
def extract_pages_by_features_and_provinces(input_folder, features, provinces, output_folder):
    """Split a PDF into single-page files, one per (feature, province) match.

    A page matches a (feature, province) pair when both strings occur in the
    page's text (case-insensitive substring test). Every match is saved as
    ``<output_folder>/<feature>/<province>/output_page_<n>.pdf`` where ``n`` is
    the 1-based page number in the source PDF. After processing, the number of
    saved files per feature and per province is printed.

    Args:
        input_folder: Path of the source PDF file (despite the parameter name,
            the value is passed straight to ``fitz.open``).
        features: Sequence of feature titles to look for.
        provinces: Sequence of province names to look for.
        output_folder: Root directory for the extracted pages; sub-directories
            are created on demand.

    Returns:
        None. Results are written to disk and the counts are printed.
    """
    # Number of saved files per feature and per province.
    feature_count = {feature: 0 for feature in features}
    province_count = {province: 0 for province in provinces}

    # The context manager closes the source document even if extraction or
    # saving raises part-way through.
    with fitz.open(input_folder) as pdf_document:
        # Extract and lowercase every page's text exactly once instead of
        # repeating the work for each (feature, province) combination.
        page_texts = [
            pdf_document[page_index].get_text("text").lower()
            for page_index in range(pdf_document.page_count)
        ]

        for feature in features:
            feature_key = feature.lower()
            # Pages mentioning the feature; provinces are only checked on these.
            feature_pages = [
                page_index
                for page_index, page_text in enumerate(page_texts)
                if feature_key in page_text
            ]

            for province in provinces:
                province_key = province.lower()

                for page_index in feature_pages:
                    if province_key not in page_texts[page_index]:
                        continue

                    feature_count[feature] += 1
                    province_count[province] += 1

                    # NOTE: a feature title containing a path separator (e.g.
                    # "... (Person/km2)") produces an extra nested directory
                    # level here; left unchanged so existing output paths stay
                    # the same.
                    output_folder_feature = os.path.join(output_folder, feature, province)
                    os.makedirs(output_folder_feature, exist_ok=True)
                    output_path = os.path.join(
                        output_folder_feature, f"output_page_{page_index + 1}.pdf"
                    )

                    with fitz.open() as new_pdf_document:
                        # insert_pdf takes 0-based page numbers and to_page is
                        # INCLUSIVE, so from_page == to_page copies exactly the
                        # matching page (the previous to_page=page_num also
                        # pulled in the following page).
                        new_pdf_document.insert_pdf(
                            pdf_document, from_page=page_index, to_page=page_index
                        )
                        new_pdf_document.save(output_path)

    # Print the count for each feature and each province
    for feature, count in feature_count.items():
        print(f"Number of files saved for {feature}: {count}")

    for province, count in province_count.items():
        print(f"Number of files saved for {province}: {count}")
# Run the extraction on the source PDF (econ_input); matching pages are saved
# under econ_output_2 and the per-feature / per-province counts are printed.
extract_pages_by_features_and_provinces(econ_input, features, provinces, econ_output_2)


Number of files saved for Index of industrial production (%): 62
Number of files saved for Production of wood (Thous. m3): 62
Number of files saved for Poultry population (Mill. heads): 20
Number of files saved for Livestock population (Thous. heads): 61
Number of files saved for Number of farms: 62
Number of files saved for Production of cereals (Thous. tons): 62
Number of files saved for ENTERPRISE, COOPERATIVE AND NON-FARM INDIVIDUAL BUSINESS ESTABLISHMENT: 0
Number of files saved for Area of floors of residential buildings constructed in the year by types of house (Thous. m2): 0
Number of files saved for Investment at current prices (Bill. dongs): 62
Number of files saved for Structure of investment at current prices (%): 62
Number of files saved for Expenditure on social and economic services: 3
Number of files saved for STRUCTURE OF GRDP AT CURRENT PRICES (%): 62
Number of files saved for POPULATION DENSITY (Person/km2): 62
Number of files saved for POPULATION (Thous. pers.): 62


### Selected features:
- Number of files saved for Index of industrial production (%): 62
- Number of files saved for Production of wood (Thous. m3): 62
- Number of files saved for Livestock population (Thous. heads): 61
- Number of files saved for Number of farms: 62
- Number of files saved for Production of cereals (Thous. tons): 62
- Number of files saved for Investment at current prices (Bill. dongs): 62
- Number of files saved for Structure of investment at current prices (%): 62
- Number of files saved for STRUCTURE OF GRDP AT CURRENT PRICES (%): 62
- Number of files saved for POPULATION DENSITY (Person/km2): 62
- Number of files saved for POPULATION (Thous. pers.): 62
- Number of files saved for AREA OF LAND (Thous. ha): 62
