In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import fitz
import PyPDF2
import jpype

# Put the parent directory at the front of the import search path (once), so
# modules living one level above this notebook can be imported.
_parent_dir = os.path.abspath('../')
if _parent_dir not in sys.path:
    sys.path.insert(0, _parent_dir)

#### Features to use:
- Index of industrial production (%)
- Production of wood (Thous. m3)
- Poultry population (Mill. heads) (so suc vat trong nuoc suc vat)
- Livestock population (Thous. heads)
- Number of farms
- Production of cereals (Thous. tons)
- ENTERPRISE, COOPERATIVE AND NON-FARM INDIVIDUAL BUSINESS ESTABLISHMENT
- Area of floors of residential buildings constructed in the year by types of house (Thous. m2)
- Investment at current prices (Bill. dongs)
- Structure of investment at current prices (%)
- Expenditure on social and economic services
- STRUCTURE OF GRDP AT CURRENT PRICES (%)
- POPULATION DENSITY (Person/km2)
- POPULATION (Thous. pers.)
- AREA OF LAND (Thous. ha) 

In [2]:
# Input/output locations, relative to the notebook's working directory.
#   econ_input  - PDF that is scanned for the indicators listed in `features`.
#   econ_output - folder receiving the text of every page that mentions one.
#   output_csv  - CSV file the extracted rows are finally written to.
econ_input = '../data/socio-economic-data-of-63-provinces-and-centrally-run-cities-2015-2021.pdf'
econ_output = '../data/Vietcon-extracted-pages'
output_csv = '../data/Viecon_data.csv'

# Indicator titles to look for. They are used as case-insensitive substrings,
# so each entry must contain only text that literally appears in the PDF.
features = [
    "Index of industrial production (%)",
    "Production of wood (Thous. m3)",
    # The personal note "(so suc vat trong nuoc suc vat)" was removed from this
    # entry: as part of the search phrase it would have to occur verbatim in
    # the PDF for the poultry indicator to be found.
    "Poultry population (Mill. heads)",
    "Livestock population (Thous. heads)",
    "Number of farms",
    "Production of cereals (Thous. tons)",
    "ENTERPRISE, COOPERATIVE AND NON-FARM INDIVIDUAL BUSINESS ESTABLISHMENT",
    "Area of floors of residential buildings constructed in the year by types of house (Thous. m2)",
    "Investment at current prices (Bill. dongs)",
    "Structure of investment at current prices (%)",
    "Expenditure on social and economic services",
    "STRUCTURE OF GRDP AT CURRENT PRICES (%)",
    "POPULATION DENSITY (Person/km2)",
    "POPULATION (Thous. pers.)",
    "AREA OF LAND (Thous. ha)"
]

In [3]:
def extract_pages_with_features(econ_input, features, econ_output):
    """Save the text of every PDF page that mentions at least one feature.

    A page matches when any entry of ``features`` occurs in the page's plain
    text, compared case-insensitively. The text of each matching page is
    written (UTF-8) to ``<econ_output>/output_page_<N>/output.txt``, where
    ``N`` is the 1-based page number; missing folders are created.

    Args:
        econ_input: Path of the PDF file to scan.
        features: Iterable of phrases to search for.
        econ_output: Folder under which the per-page sub-folders are created.

    Returns:
        None. The function only writes files.
    """
    # Lower-case the phrases once instead of once per page; materialising the
    # list also keeps a one-shot iterable usable for every page.
    wanted = [feature.lower() for feature in features]

    document = fitz.open(econ_input)
    try:
        for page_index in range(document.page_count):
            page_text = document[page_index].get_text("text")
            searchable = page_text.lower()
            if not any(phrase in searchable for phrase in wanted):
                continue

            page_dir = os.path.join(econ_output, f"output_page_{page_index + 1}")
            os.makedirs(page_dir, exist_ok=True)
            with open(os.path.join(page_dir, "output.txt"), "w", encoding="utf-8") as handle:
                handle.write(page_text)
    finally:
        # Release the PDF even when reading a page or writing a file fails.
        document.close()
    
    
def extract_rows(econ_output, features):
    """Collect every line that mentions a feature from the ``.txt`` files under a folder.

    The folder is searched recursively, so both ``.txt`` files placed directly
    in ``econ_output`` and the ``output_page_<N>/output.txt`` files written by
    ``extract_pages_with_features`` are read. (Listing only the top level
    would see nothing but the ``output_page_<N>`` directories and return an
    empty result.)

    Args:
        econ_output: Folder containing the extracted page texts.
        features: Iterable of phrases; a line is kept when it contains any of
            them, compared case-insensitively.

    Returns:
        A ``pandas.DataFrame`` with one row per kept line and one column per
        whitespace-separated token; shorter rows are padded with missing
        values. The frame is empty when nothing matches.

    Raises:
        OSError: If ``econ_output`` (or a sub-folder) cannot be listed, e.g.
            ``FileNotFoundError`` when the folder does not exist.
    """
    wanted = [feature.lower() for feature in features]

    def _raise(error):
        # os.walk ignores listing errors unless told otherwise; re-raise them
        # so a missing or unreadable folder is not mistaken for "no data".
        raise error

    token_rows = []
    for folder, subfolders, filenames in os.walk(econ_output, onerror=_raise):
        # Sorting in place fixes the order in which os.walk descends, making
        # the row order reproducible (lexicographic, not numeric, by name).
        subfolders.sort()
        for filename in sorted(filenames):
            if not filename.endswith(".txt"):
                continue
            with open(os.path.join(folder, filename), "r", encoding="utf-8") as handle:
                text = handle.read()

            for line in text.split('\n'):
                lowered = line.lower()
                if any(phrase in lowered for phrase in wanted):
                    # NOTE(review): splitting on whitespace also breaks the
                    # feature title itself into several columns.
                    token_rows.append(line.split())

    # Build the frame once; concatenating inside the loop is quadratic.
    return pd.DataFrame(token_rows)


In [5]:
# Stage 1: write the text of every PDF page that mentions a feature to disk.
extract_pages_with_features(
    econ_input=econ_input,
    features=features,
    econ_output=econ_output,
)

# Stage 2: gather the matching lines from those text files and save them as CSV.
extracted_data = extract_rows(econ_output=econ_output, features=features)
extracted_data.to_csv(path_or_buf=output_csv, index=False)


In [None]:

# def read_large_pdf(file_path):
#     pdf_document = fitz.open(file_path)

#     for page_number in range(pdf_document.page_count):
#         page = pdf_document[page_number]
#         text = page.get_text()
#         # Process the text or perform other operations as needed
#         print(f"Page {page_number + 1}: {text[:100]}...")  # Print the first 100 characters of each page

#     pdf_document.close()

# # open the pdf file
# econ_pdf = "../data/socio-economic-data-of-63-provinces-and-centrally-run-cities-2015-2021.pdf"
# read_large_pdf(econ_pdf)
