In [1]:
import pdfplumber
import csv
import re

In [None]:
# Source PDF that the later cells open with pdfplumber (path is relative to
# the notebook's working directory).
input_file_path = "../data/raw_data/vcb_data.pdf"
# Destination CSV: the header row is written once, then parsed rows are appended.
output_file_path = "../data/processed_data/transactions.csv"

In [3]:
# Open the source PDF only to report how many pages it contains.
with pdfplumber.open(input_file_path) as pdf:
    num_pages = len(pdf.pages)
    print(f"The PDF file contains {num_pages} pages.")

The PDF file contains 12028 pages.


In [4]:
# Column order of the output CSV; also the keys of every transaction dict.
header = ["date", "credit", "transaction_detail", "transaction_id"]

# Create the output file and write the header row.
# WARNING: mode="w" truncates an existing file, so re-running this cell
# discards every row that was appended to it earlier.
with open(output_file_path, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=header)
    writer.writeheader()

In [5]:
def parse_transactions(page_text):
    """Extract transaction records from the text of one page.

    A record starts with a ``dd/mm/yyyy`` date followed by an amount made of
    digits, dots and commas. Everything after the amount, up to the next
    date-like token or the end of the text, is the record's free text. Note
    that a date-like token inside the free text therefore also ends the record.

    When the free text spans several lines, the second line is used as the
    transaction id and the first line plus any remaining lines form the
    detail. Single-line records get an empty transaction id.

    Args:
        page_text: Text of a single page. ``None`` or an empty string is
            accepted and yields no records.

    Returns:
        list[dict]: One dict per record with the string-valued keys
        ``date``, ``credit``, ``transaction_detail`` and ``transaction_id``.
    """
    # A page without any text has no records; guard before calling .strip()
    # so a None input does not raise AttributeError.
    if not page_text:
        return []

    pattern = r"(\d{2}/\d{2}/\d{4})\s+([\d.,]+)\s+([\s\S]+?)(?=(\d{2}/\d{2}/\d{4})|$)"

    transactions = []
    for match in re.finditer(pattern, page_text.strip()):
        date = match.group(1).strip()
        credit = match.group(2).strip()
        transaction_detail = match.group(3).strip()

        lines = transaction_detail.split("\n")
        if len(lines) > 1:
            transaction_id = lines[1].strip()
            # NOTE(review): lines after the id are concatenated with no
            # separator; confirm that wrapped lines should not be joined
            # with a space.
            transaction_detail = f"{lines[0]} {''.join(lines[2:])}".strip()
        else:
            transaction_id = ""

        transactions.append({
            "date": date,
            "credit": credit,
            "transaction_detail": transaction_detail,
            "transaction_id": transaction_id,
        })
    return transactions

In [6]:
def clean_transaction_detail(transaction):
    """Cut footer text off a transaction's detail string.

    The detail is truncated at the earliest occurrence of any footer keyword
    and the remainder is stripped. Using the earliest position, rather than
    the first keyword in list order that happens to be present, ensures no
    footer fragment survives when the keywords appear in a different order
    in the text than in the list.

    Args:
        transaction: Dict with a string under ``"transaction_detail"``.
            It is modified in place.

    Returns:
        dict: The same ``transaction`` object. The detail is left untouched
        when it contains none of the keywords.
    """
    keywords = ["Postal address", "Telex", "Page", "Website", "Contact center", "Swift"]

    detail = transaction["transaction_detail"]
    # Positions of every keyword that actually occurs in the detail.
    hits = [pos for pos in (detail.find(keyword) for keyword in keywords) if pos != -1]
    if hits:
        transaction["transaction_detail"] = detail[:min(hits)].strip()

    return transaction

In [7]:
def write_to_csv(transactions):
    """Append transaction dicts as rows to the output CSV.

    The file at ``output_file_path`` is opened in append mode and each dict
    is written using the column order given by ``header``. No header row is
    written here.

    Args:
        transactions: Iterable of dicts keyed by the names in ``header``.
    """
    with open(output_file_path, mode="a", newline="", encoding="utf-8") as csv_file:
        csv.DictWriter(csv_file, fieldnames=header).writerows(transactions)

In [21]:
# Inclusive, 1-based page range to process in this run. Values outside the
# document are clamped to the real page count below.
start_page = 11001
end_page = 13000

with pdfplumber.open(input_file_path) as pdf:
    total_pages = len(pdf.pages)

    start_page = max(1, start_page)
    end_page = min(total_pages, end_page)

    # pdf.pages is 0-based, hence the shift by one.
    for i in range(start_page - 1, end_page):
        print(f"Processing page {i + 1}")
        # Treat a page with no extractable text as empty instead of handing
        # None to the parser.
        text = pdf.pages[i].extract_text() or ""
        transactions = parse_transactions(text)
        cleaned_transactions = [clean_transaction_detail(t) for t in transactions]

        # Rows are appended page by page, so nothing is accumulated in memory
        # and rows from completed pages are already on disk if the run stops.
        write_to_csv(cleaned_transactions)

print(f"Successfully processed and saved all transactions to the file {output_file_path}")

Processing page 11001
Processing page 11002
Processing page 11003
Processing page 11004
Processing page 11005
Processing page 11006
Processing page 11007
Processing page 11008
Processing page 11009
Processing page 11010
Processing page 11011
Processing page 11012
Processing page 11013
Processing page 11014
Processing page 11015
Processing page 11016
Processing page 11017
Processing page 11018
Processing page 11019
Processing page 11020
Processing page 11021
Processing page 11022
Processing page 11023
Processing page 11024
Processing page 11025
Processing page 11026
Processing page 11027
Processing page 11028
Processing page 11029
Processing page 11030
Processing page 11031
Processing page 11032
Processing page 11033
Processing page 11034
Processing page 11035
Processing page 11036
Processing page 11037
Processing page 11038
Processing page 11039
Processing page 11040
Processing page 11041
Processing page 11042
Processing page 11043
Processing page 11044
Processing page 11045
Processing