In [11]:
import csv
import os
import re
from collections import Counter
from datetime import datetime

import fitz
import PyPDF2
# Shared accumulator filled by count(): maps "<ticker> <year>" to that file's matching-sentence count.
wordPerFile = dict()

In [2]:
def extract_ticker_and_year(filename):
    """
    Split a report file name (without extension) into its ticker and year.

    Recognised layouts are ``<ticker>_Annual Report_<year>`` and the
    all-underscore variant ``<ticker>_Annual_Report_<year>``.

    :param filename: File name stem, e.g. ``"AALI_Annual Report_2023"``.
    :return: ``(ticker, year)`` for a recognised layout, otherwise
             ``(filename, "-")`` so the caller can still label the row.
    """
    parts = filename.split('_')
    # Re-joining the middle parts with a space makes "Annual_Report" (two parts)
    # compare equal to "Annual Report" (one part).
    if len(parts) >= 3 and " ".join(parts[1:-1]) == "Annual Report":
        return parts[0], parts[-1]
    return filename, "-"

In [16]:
def _collapse_whitespace(value):
    """Return ``value`` with every run of whitespace (newlines included) replaced by one space."""
    return " ".join(value.split())


def _safe_filename_part(value):
    """Replace characters that cannot appear in a file name with ``_``."""
    return re.sub(r'[\\/:*?"<>|\r\n\t]', "_", value)


def _write_csv_rows(csv_path, rows, mode="a"):
    """Write ``rows`` to ``csv_path`` with proper CSV quoting (append by default)."""
    # newline="" hands line-ending control to the csv module, as its docs require.
    with open(csv_path, mode, encoding="utf-8", errors="replace", newline="") as handle:
        csv.writer(handle, lineterminator="\n").writerows(rows)


def count(paths, keywords, output_folder="output"):
    """
    Count, for each PDF, the sentences that mention any of the keywords.

    Two CSV files are created in ``output_folder``: ``sentences-...csv`` with one
    row per matching sentence (or a single empty-sentence row when a file has
    none) and ``summary-...csv`` with one count row per file. Rows are appended
    right after each PDF is processed, so an interrupted run keeps the results
    gathered so far.

    Extracted text is whitespace-collapsed before matching, so a multi-word
    keyword is still found when the PDF breaks the line between its words, and
    every sentence is written as a single CSV field. Matching is
    case-insensitive and anchored on word boundaries.

    A PDF that cannot be read is reported on stdout and recorded with a count
    of 0.

    :param paths: List of file paths to PDF files.
    :param keywords: List of keywords to look for in the text.
    :param output_folder: Directory to save the output files.
    :return: The module-level ``wordPerFile`` dict, updated with
             ``"<ticker> <year>" -> matching sentence count`` for each path.
    """
    os.makedirs(output_folder, exist_ok=True)

    keyword_label = '&'.join(keywords)
    # Keywords are free text; strip characters that are illegal in file names.
    file_label = _safe_filename_part(keyword_label)
    dateStr = datetime.now().strftime("%m-%d %H-%M-%S")
    sentencesFileName = os.path.join(output_folder, f"sentences-{file_label}-{dateStr}.csv")
    summaryFileName = os.path.join(output_folder, f"summary-{file_label}-{dateStr}.csv")

    # Write headers to the output files
    _write_csv_rows(sentencesFileName, [("Ticker", "Year", "Sentence")], mode="w")
    _write_csv_rows(summaryFileName, [("Ticker", "Year", keyword_label)], mode="w")

    # Loop-invariant search helpers, built once for all files.
    search_terms = [_collapse_whitespace(keyword) for keyword in keywords]
    lowered_terms = [term.lower() for term in search_terms]
    keyword_regex = re.compile(
        r'\b(?:' + '|'.join(map(re.escape, search_terms)) + r')\b',
        re.IGNORECASE,
    )

    for path in paths:
        print(f"Processing {path}")
        try:
            with fitz.open(path) as doc:
                text = "".join(page.get_text() for page in doc)
        except Exception as e:
            # Best effort: an unreadable file is treated as empty text so the run continues.
            text = ""
            print(f"Error processing {path}: {e}")
        print("process done. searching")

        # splitext keeps dots that are part of the name itself (e.g. "PT.ABC_...").
        stem = os.path.splitext(os.path.basename(path))[0]
        ticker, year = extract_ticker_and_year(stem)

        text = _collapse_whitespace(text)
        lowered_text = text.lower()

        # Cheap substring pre-check before the sentence split and regex search.
        if any(term in lowered_text for term in lowered_terms):
            sentences = [
                sentence
                for sentence in re.split(r'(?<=[.!?])\s+', text)
                if keyword_regex.search(sentence)
            ]
        else:
            print(f"No keywords found in {path}. Writing empty entries...")
            sentences = []

        # A file without matches still gets one row with an empty sentence.
        sentence_rows = [(ticker, year, sentence) for sentence in sentences] or [(ticker, year, "")]
        _write_csv_rows(sentencesFileName, sentence_rows)
        _write_csv_rows(summaryFileName, [(ticker, year, len(sentences))])

        wordPerFile[f"{ticker} {year}"] = len(sentences)

    print(f"Output written to:\n- {sentencesFileName}\n- {summaryFileName}")
    return wordPerFile


In [4]:
# Folder containing the annual-report PDFs.
# Written as a raw string: the previous literal relied on unrecognised escapes
# such as "\H" and "\O", which Python reports as invalid escape sequences.
# The resulting path is unchanged.
folder_path = r"D:\Hp\OneDrive\shared\annual report renamed\2023"

# Full paths of every PDF found in the folder
file_paths = []

# os.listdir returns entries in arbitrary order; sort so runs are reproducible.
for filename in sorted(os.listdir(folder_path)):
    # Case-insensitive check so files named "*.PDF" are not skipped.
    if filename.lower().endswith(".pdf"):
        full_path = os.path.join(folder_path, filename)
        file_paths.append(full_path)
        print(f"Added: {full_path}")  # Print the file path

# Print the final list of files
print("\nList of PDF files included:")
for path in file_paths:
    print(path)


Added: D:\Hp\OneDrive\shared\annual report renamed\2023\AALI_Annual Report_2023.pdf
Added: D:\Hp\OneDrive\shared\annual report renamed\2023\ABBA_Annual Report_2023.pdf
Added: D:\Hp\OneDrive\shared\annual report renamed\2023\ABMM_Annual Report_2023.pdf
Added: D:\Hp\OneDrive\shared\annual report renamed\2023\ACES_Annual Report_2023.pdf
Added: D:\Hp\OneDrive\shared\annual report renamed\2023\ACSET_Annual Report_2023.pdf
Added: D:\Hp\OneDrive\shared\annual report renamed\2023\ACST_Annual_Report_2023.pdf
Added: D:\Hp\OneDrive\shared\annual report renamed\2023\ADCP_Annual Report_2023.pdf
Added: D:\Hp\OneDrive\shared\annual report renamed\2023\ADES_Annual Report_2023.pdf
Added: D:\Hp\OneDrive\shared\annual report renamed\2023\ADHI_Annual Report_2023.pdf
Added: D:\Hp\OneDrive\shared\annual report renamed\2023\ADMF_Annual Report_2023.pdf
Added: D:\Hp\OneDrive\shared\annual report renamed\2023\ADMG_Annual Report_2023.pdf
Added: D:\Hp\OneDrive\shared\annual report renamed\2023\ADMR_Annual Report_

In [19]:
# Phrases to search for; edit this list to look for other terms.
# To run on a hand-picked set of PDFs instead of the folder listing, assign e.g.
# file_paths = ["example1.pdf", "example2.pdf"]
keywords = [
    "direktur utama",
]

In [20]:
# Run the keyword search over the collected PDFs and show the per-file counts.
result = count(paths=file_paths, keywords=keywords)
print("Word Count Summary:", result)

Processing D:\Hp\OneDrive\shared\annual report renamed\2023\AALI_Annual Report_2023.pdf
process done. searching
No keywords found in D:\Hp\OneDrive\shared\annual report renamed\2023\AALI_Annual Report_2023.pdf. Writing empty entries...
Processing D:\Hp\OneDrive\shared\annual report renamed\2023\ABBA_Annual Report_2023.pdf
process done. searching
Processing D:\Hp\OneDrive\shared\annual report renamed\2023\ABMM_Annual Report_2023.pdf
process done. searching
Processing D:\Hp\OneDrive\shared\annual report renamed\2023\ACES_Annual Report_2023.pdf
process done. searching
Processing D:\Hp\OneDrive\shared\annual report renamed\2023\ACSET_Annual Report_2023.pdf
process done. searching
No keywords found in D:\Hp\OneDrive\shared\annual report renamed\2023\ACSET_Annual Report_2023.pdf. Writing empty entries...
Processing D:\Hp\OneDrive\shared\annual report renamed\2023\ACST_Annual_Report_2023.pdf
process done. searching
No keywords found in D:\Hp\OneDrive\shared\annual report renamed\2023\ACST_Ann

KeyboardInterrupt: 