In [1]:
import pdfplumber
import csv
import re

In [2]:
# Paths are relative, so they resolve against the notebook's working directory.
# PDF that is opened with pdfplumber.
input_file_path = "../data/raw_data/vcb_data.pdf"
# CSV file the parsed transactions are written to.
output_file_path = "../data/processed_data/transactions.csv"

In [3]:
# Open the PDF only long enough to count its pages; the count is kept in the
# module-level variable `num_pages`.
with pdfplumber.open(input_file_path) as pdf:
    num_pages = len(pdf.pages)

print(f"The PDF file contains {num_pages} pages.")

The PDF file contains 12028 pages.


In [4]:
# Column names of the output CSV, in the order they are written.
header = [
    "date",
    "credit",
    "transaction_detail",
    "transaction_id",
]

# Opening in "w" mode truncates any existing file, so after this cell the
# output holds nothing but the header row.
with open(output_file_path, "w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, header)
    writer.writeheader()

In [5]:
def parse_transactions(page_text):
    """Extract transaction records from the text of a single page.

    A record starts with a ``dd/mm/yyyy`` date followed by an amount made of
    digits, dots and commas. Everything after the amount, up to the next date
    or the end of the text, is the record's free-text body. When that body
    spans more than one line, its second line is stored as ``transaction_id``
    and the remaining lines make up ``transaction_detail``; otherwise
    ``transaction_id`` is an empty string.

    Args:
        page_text: Text of one page. ``None`` or an empty string (a page with
            no extractable text) yields no records instead of raising.

    Returns:
        list[dict]: One dict per record, in order of appearance, with the
        string-valued keys ``date``, ``credit``, ``transaction_detail`` and
        ``transaction_id``.
    """
    # A page without text must not crash the whole run on `.strip()`.
    if not page_text:
        return []

    # group 1: date, group 2: amount, group 3: lazy body that stops right
    # before the next date (look-ahead) or at the end of the text.
    pattern = r"(\d{2}/\d{2}/\d{4})\s+([\d.,]+)\s+([\s\S]+?)(?=(\d{2}/\d{2}/\d{4})|$)"
    matches = re.finditer(pattern, page_text.strip())

    transactions = []
    for match in matches:
        date = match.group(1).strip()
        credit = match.group(2).strip()
        transaction_detail = match.group(3).strip()

        lines = transaction_detail.split("\n")
        if len(lines) > 1:
            # Second line of the body is taken as the transaction id.
            transaction_id = lines[1].strip()
            # First line plus all lines after the id; the trailing lines are
            # concatenated without a separator between them.
            transaction_detail = f"{lines[0]} {''.join(lines[2:])}".strip()
        else:
            transaction_id = ""

        transactions.append({
            "date": date,
            "credit": credit,
            "transaction_detail": transaction_detail,
            "transaction_id": transaction_id,
        })
    return transactions

In [6]:
def clean_transaction_detail(transaction):
    """Truncate ``transaction_detail`` at the first boilerplate keyword.

    The detail text is cut at the earliest position where any of the keywords
    occurs, and the kept part is stripped of surrounding whitespace. When no
    keyword is present the field is left untouched.

    Args:
        transaction: Dict with a string ``transaction_detail`` entry. The dict
            is modified in place.

    Returns:
        dict: The same ``transaction`` object that was passed in.
    """
    # NOTE(review): these look like page header/footer labels picked up by the
    # text extraction -- confirm against the source document.
    keywords = ["Postal address", "Telex", "Page", "Website", "Contact center", "Swift"]

    detail = transaction["transaction_detail"]
    # Cut at the keyword that appears first in the text, not the one that
    # happens to come first in the list; otherwise an earlier keyword and the
    # text after it would survive the truncation.
    positions = [pos for pos in (detail.find(keyword) for keyword in keywords) if pos != -1]
    if positions:
        transaction["transaction_detail"] = detail[:min(positions)].strip()

    return transaction

In [7]:
def write_to_csv(transactions):
    """Append transaction dicts as rows to the CSV at ``output_file_path``.

    The file is opened in append mode, so existing content is kept. Columns
    follow the module-level ``header`` list; no header row is written here.

    Args:
        transactions: Iterable of dicts keyed by the names in ``header``.
    """
    with open(output_file_path, mode="a", newline="", encoding="utf-8") as csv_file:
        csv.DictWriter(csv_file, fieldnames=header).writerows(transactions)

In [9]:
# Number of pages parsed between two appends to the CSV; bounds how many
# transactions are buffered in memory at once.
pages_per_flush = 100
all_transactions = []

with pdfplumber.open(input_file_path) as pdf:
    for i, page in enumerate(pdf.pages):
        print(f"Processing page {i + 1}")
        # NOTE(review): extract_text() is believed to return None for pages
        # without text in some pdfplumber releases -- normalise to "".
        text = page.extract_text() or ""
        transactions = parse_transactions(text)
        cleaned_transactions = [clean_transaction_detail(t) for t in transactions]
        all_transactions.extend(cleaned_transactions)

        # Flush after every full batch of pages.
        if (i + 1) % pages_per_flush == 0:
            write_to_csv(all_transactions)
            all_transactions = []

# Write the last, possibly partial, batch. Doing this after the loop means the
# final flush no longer depends on a page count computed in another cell.
if all_transactions:
    write_to_csv(all_transactions)
    all_transactions = []

print(f"Successfully processed and saved all transactions to the file {output_file_path}")

Processing page 1
Processing page 2
Processing page 3
Processing page 4
Processing page 5
Processing page 6
Processing page 7
Processing page 8
Processing page 9
Processing page 10
Processing page 11
Processing page 12
Processing page 13
Processing page 14
Processing page 15
Processing page 16
Processing page 17
Processing page 18
Processing page 19
Processing page 20
Processing page 21
Processing page 22
Processing page 23
Processing page 24
Processing page 25
Processing page 26
Processing page 27
Processing page 28
Processing page 29
Processing page 30
Processing page 31
Processing page 32
Processing page 33
Processing page 34
Processing page 35
Processing page 36
Processing page 37
Processing page 38
Processing page 39
Processing page 40
Processing page 41
Processing page 42
Processing page 43
Processing page 44
Processing page 45
Processing page 46
Processing page 47
Processing page 48
Processing page 49
Processing page 50
Processing page 51
Processing page 52
Processing page 53
Pr