# PDF Token Estimation

In [1]:
import os
import fitz 
# use pymuPDF-1.24.10
from tqdm import tqdm

In [2]:
def count_tokens_in_pdf(pdf_path):
    """Count tokens in the given PDF file where 4 characters equal 1 token."""
    try:
        pdf_document = fitz.open(pdf_path)
        total_characters = 0
        
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            text = page.get_text()
            total_characters += len(text) 
        
        pdf_document.close()
        
        tokens = total_characters / 4
        return total_characters, tokens
    except Exception as e:
        raise e

def count_tokens_in_folder(folder_path):
    total_tokens = 0
    error_count = 0
    pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
    print(f'Found {len(pdf_files)} PDF files in {folder_path}')

    for pdf_file in tqdm(pdf_files, desc="Processing PDFs", leave=True):
        file_path = os.path.join(folder_path, pdf_file)
        try:
            _, token_count = count_tokens_in_pdf(file_path)
            total_tokens += token_count
        except Exception:
            error_count += 1

    print(f"Total tokens in {folder_path.split('/')[-1]}:\t{total_tokens:.2f}")
    print(f"Total errors encountered: {error_count}")

    return total_tokens

In [4]:
base_path = '/Users/muhammad.galih/Documents/web-scrape/data/wip/28-bldk-ma/warc-output'
category = os.listdir(base_path)
# category.remove('.DS_Store')
print(category)

['id_sekretariat-id_dok-keg-sekretariat-id_1381-balitbangdiklatkumdil-mari-mengadakan-penyembelihan-hewan-qurban.warc', 'id_pusdiklat-menpim-id_dok-keg-menpim-id_1834-pembelajaran-agenda-iii-pelatihan-dasar-cpns-gel-i.warc', 'id_pustrajak-id_dok-keg-pustrajak-id_625-pengelolaan-jurnal-penelitian-pada-puslitbang-hukum-dan-peradilan.warc', 'id_pustrajak-id_dok-keg-pustrajak-id_1444-efektifitas-rehabilitasi-sebagai-pemidanaan-terhadap-penyalah-guna-narkotika.warc', 'id_pusdiklat-menpim-id_dok-keg-menpim-id_1957-calon-pns-tidak-sempat-mengikuti-pelatihan.warc', 'id_pusdiklat-menpim-id_dok-keg-menpim-id_2079-rapat-persiapan-penyelenggaraan-pelatihan-tahun-2022-pada-pusdiklat-manajemen-dan-kepemimpinan-bldk-ma-ri.warc', 'id_pusdiklat-menpim-id_dok-keg-menpim-id_571-penutupan-diklat-sertifikasi-pengadaan-barang-dan-jasa.warc', 'id_pusdiklat-teknis-id_dok-keg-teknis-id_1388-pendalaman-materi-tindak-pidana-pemilu-wilayah-hukum-pengadilan-jawa-tengah.warc', 'id_pusdiklat-menpim-id_dok-keg-menpim

In [4]:
totals = []
num_files = []
size = []
for cat in category:
    cat_path = os.path.join(base_path, cat)
    total_tokens = count_tokens_in_folder(cat_path)
    totals.append(total_tokens)
    num_files.append(len([f for f in os.listdir(cat_path) if f.endswith('.pdf')]))
    size.append(sum([os.path.getsize(os.path.join(cat_path, f)) for f in os.listdir(cat_path) if f.endswith('.pdf')]))
    print(f"Total tokens in {cat}:\t{total_tokens:.2f}")
    print('=====================================================================================================')

Found 75 PDF files in /Users/muhammad.galih/Documents/web-scrape/data/wip/25-pmk/pdf-output/manual


Processing PDFs: 100%|██████████| 75/75 [00:05<00:00, 13.37it/s]


Total tokens in manual:	1011725.50
Total errors encountered: 0
Total tokens in manual:	1011725.50
Found 299 PDF files in /Users/muhammad.galih/Documents/web-scrape/data/wip/25-pmk/pdf-output/pengumuman


Processing PDFs: 100%|██████████| 299/299 [00:08<00:00, 35.01it/s]

Total tokens in pengumuman:	1394570.25
Total errors encountered: 1
Total tokens in pengumuman:	1394570.25





In [5]:
import csv

with open(os.path.join(base_path, 'total_tokens.csv'), mode='w') as file:
    writer = csv.writer(file)
    writer.writerow(['Category', 'Number of Files', 'Total Tokens', 'Size'])
    for i, cat in enumerate(category):
        writer.writerow([cat, num_files[i], totals[i], size[i]])

# WARC Token Estimation

In [6]:
from warcio.archiveiterator import ArchiveIterator
from bs4 import BeautifulSoup
from langdetect import detect
import re
import os
import time
import csv

def count_tokens(text):
    cleaned_text = re.sub(r'\s+', '', text)
    token_count = len(cleaned_text) // 4
    return token_count

def extract_text_from_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    text = soup.get_text(separator=' ')
    return text

warc_folder = r'/Users/muhammad.galih/Documents/web-scrape/data/wip/28-bldk-ma/warc-output'

total_token_count = 0
start_time = time.time()

warc_files = [file for file in os.listdir(warc_folder) if file.endswith('.warc')]
print(f'Found {len(warc_files)} WARC files in the folder.')

for processed_files, warc_file in enumerate(warc_files, start=1):
    warc_file_path = os.path.join(warc_folder, warc_file)

    with open(warc_file_path, 'rb') as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type == 'response':
                content = record.content_stream().read().decode('utf-8', errors='ignore')

                text_content = extract_text_from_html(content)

                try:
                    language = detect(text_content)
                except:
                    language = 'unknown'

                if language == 'id':
                    token_count = count_tokens(text_content)
                    total_token_count += token_count

    print(f'Processed {processed_files}/{len(warc_files)} files. Token count in {warc_file}: {token_count}')

print(f'\nTotal token count for all Indonesian content: {total_token_count}')
print(f'Total processing time: {time.time() - start_time:.2f} seconds.')

Found 1266 WARC files in the folder.
Processed 1/1266 files. Token count in id_sekretariat-id_dok-keg-sekretariat-id_1381-balitbangdiklatkumdil-mari-mengadakan-penyembelihan-hewan-qurban.warc: 940
Processed 2/1266 files. Token count in id_pusdiklat-menpim-id_dok-keg-menpim-id_1834-pembelajaran-agenda-iii-pelatihan-dasar-cpns-gel-i.warc: 742
Processed 3/1266 files. Token count in id_pustrajak-id_dok-keg-pustrajak-id_625-pengelolaan-jurnal-penelitian-pada-puslitbang-hukum-dan-peradilan.warc: 892
Processed 4/1266 files. Token count in id_pustrajak-id_dok-keg-pustrajak-id_1444-efektifitas-rehabilitasi-sebagai-pemidanaan-terhadap-penyalah-guna-narkotika.warc: 1772
Processed 5/1266 files. Token count in id_pusdiklat-menpim-id_dok-keg-menpim-id_1957-calon-pns-tidak-sempat-mengikuti-pelatihan.warc: 694
Processed 6/1266 files. Token count in id_pusdiklat-menpim-id_dok-keg-menpim-id_2079-rapat-persiapan-penyelenggaraan-pelatihan-tahun-2022-pada-pusdiklat-manajemen-dan-kepemimpinan-bldk-ma-ri.war

In [8]:
warc_size = os.path.getsize(warc_folder)
n_warc_files = len(warc_files)
print(f'Total size of WARC files: {warc_size} bytes')
print(f'Number of WARC files: {n_warc_files}')

Total size of WARC files: 40576 bytes
Number of WARC files: 1266


In [None]:
import csv

with open(os.path.join(warc_folder, 'total_tokens.csv'), mode='w') as file:
    writer = csv.writer(file)
    writer.writerow(['Category', 'Number of Files', 'Total Tokens', 'Size'])
    writer.writerow([cat, num_files[i], totals[i], size[i]])