In [None]:
!pip3 install openai

In [None]:
import pandas as pd
from openai import OpenAI

# 1. Text Extraction

#### Using PyMuPDF

In [None]:
import fitz  # Import the PyMuPDF library

def extract_text_pymupdf(pdf_path):
    """Extract the plain text of every page of a PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        str: The text of all pages concatenated in page order, with no
        separator added between pages.
    """
    doc = fitz.open(pdf_path)  # Open the PDF file
    try:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in doc)
    finally:
        # Release the document even when extracting a page raises.
        doc.close()

# Example usage
# NOTE(review): absolute, machine-specific path; adjust it before running elsewhere.
pdf_path = '/Users/juliamarkusiewicz/Downloads/PDF_Mining_Julia/Jaarrekening-Momo-Medical-Holding-B.V.-2022.pdf'
text = extract_text_pymupdf(pdf_path)
# Reuse the extracted string instead of parsing the same PDF a second time;
# `text_data` keeps the PyMuPDF result even if `text` is rebound later.
text_data = text
print(text)

#### Using PyPDF2

In [None]:
import PyPDF2

def extract_text_pypdf2(pdf_path):
    """Extract the text of every page of a PDF using PyPDF2.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        str: Each page's extracted text followed by a newline, in page order.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pages = PyPDF2.PdfReader(pdf_file).pages
        # Build the result while the underlying file is still open.
        return "".join(page.extract_text() + "\n" for page in pages)

# Example usage
# Same PDF path as the one used in the PyMuPDF example.
pdf_path = '/Users/juliamarkusiewicz/Downloads/PDF_Mining_Julia/Jaarrekening-Momo-Medical-Holding-B.V.-2022.pdf'
# NOTE: this rebinds `text`, replacing the PyMuPDF result with the PyPDF2 one;
# later cells that read `text` therefore see the PyPDF2 extraction.
text = extract_text_pypdf2(pdf_path)
print(text)

### Table Extraction

In [None]:
from tabula import read_pdf

def extract_tables_tabula(pdf_path):
    """Extract the tables tabula finds in a PDF, print them, and return them.

    Args:
        pdf_path: Path of the PDF file handed to ``read_pdf``.

    Returns:
        list: The DataFrames produced by ``read_pdf`` for all pages of the
        document, in the order tabula reports them.
    """
    # With multiple_tables=True, read_pdf is documented by tabula-py to
    # return a list of DataFrames.
    dfs = read_pdf(pdf_path, pages='all', multiple_tables=True)

    for i, df in enumerate(dfs, start=1):
        print(f"Table {i}:")
        print(df)  # Print each table
        # You can also save the DataFrame to a CSV or Excel
        # df.to_csv(f'table_{i}.csv') # Uncomment to save to CSV

    # Hand the tables back so later cells can work with them.
    return dfs

# Example usage
pdf_path = '/Users/juliamarkusiewicz/Downloads/PDF_Mining_Julia/Jaarrekening-Momo-Medical-Holding-B.V.-2022.pdf'
# Bind the result: a bare call on the last line of the cell would also echo the whole list.
tables = extract_tables_tabula(pdf_path)

## Extracting and Categorising data 

In [None]:
# Description-length threshold, in characters: parsed descriptions longer than
# this are passed to the model for a short category name later in the notebook.
MAX_DESCRIPTION_LENGTH = 120

def is_number(s: str) -> tuple:
    """Tell whether a line of statement text is a numeric value.

    Surrounding whitespace is ignored, and commas and dots are dropped as
    separators. A value wrapped in parentheses or prefixed with a minus sign
    is reported as negative, and a lone "-" counts as zero.

    Args:
        s: One line of extracted text.

    Returns:
        tuple: ``(value, True)`` when ``s`` is a number, where ``value`` is the
        digit string (prefixed with "-" when negative) or the int ``0`` for a
        lone dash; both are accepted by ``int()``. ``(s, False)`` with the
        untouched input when ``s`` is not a number.
    """
    token = s.strip()
    if token == "-":
        # A lone dash stands for an empty amount.
        return 0, True

    negative = False
    if token.startswith("(") and token.endswith(")"):
        # Accounting notation: (1,234) means -1,234, so keep the sign.
        token = token[1:-1].strip()
        negative = True
    elif token.startswith("-"):
        token = token[1:].strip()
        negative = True
    # NOTE: more cases here

    # NOTE(review): dots are removed like thousands separators, so a decimal
    # such as "12.5" would read as 125 — confirm against the source documents.
    digits = token.replace(",", "").replace(".", "")
    # isdecimal(), unlike isdigit(), rejects characters such as "²" that
    # int() cannot parse, so callers can convert the value safely.
    if digits.isdecimal():
        return ("-" + digits if negative else digits), True
    return s, False

def is_page_number(s: str) -> bool:
    """Return True when the line, ignoring surrounding whitespace, begins with "Page "."""
    trimmed = s.strip()
    return trimmed[:5] == "Page "

def parse_adjusted_financial_text(text):
    """Turn extracted statement text into (description, 2022, 2021) rows.

    The text is read line by line. Non-numeric lines accumulate into the
    current description; numeric lines are paired in order of appearance, the
    first becoming the 2022 value and the second the 2021 value, at which
    point a row is emitted and the description is reset. A trailing value
    without a partner is discarded.

    Lines recognised by ``is_page_number`` mark a page boundary and are never
    part of a description. If no row was emitted on the page that just ended,
    the pending description is moved to the miscellaneous text list.

    Args:
        text: Raw text of the statement, one item per line.

    Returns:
        tuple: ``(df, misc_text_data)`` where ``df`` is a DataFrame with the
        columns 'Description', '2022 EUR' and '2021 EUR', and
        ``misc_text_data`` is a list of description strings collected from
        pages without numeric rows.
    """
    parsed_data = []
    misc_text_data = []

    # Tracks whether any row has been emitted on the current page.
    no_numeric_data_on_page = True

    current_description = ""
    value_2022 = ""

    for line in text.strip().split("\n"):
        maybe_num, is_num = is_number(line)
        if is_num:
            if value_2022 == "":
                # First value of the pair: remember it as the 2022 figure.
                value_2022 = maybe_num
            else:
                # Second value of the pair: emit the row and start over.
                parsed_data.append([current_description.strip().lower(), int(value_2022), int(maybe_num)])
                current_description = ""
                value_2022 = ""
                no_numeric_data_on_page = False
        elif is_page_number(line):
            # Handle page markers before anything else so that a marker
            # arriving with no pending description is not mistaken for the
            # start of the next description.
            if current_description and no_numeric_data_on_page:
                misc_text_data.append(current_description)
                current_description = ""
            # Reset the tracker at every new page.
            no_numeric_data_on_page = True
        elif current_description:
            # Long stretches of text unrelated to data labelling may pile up
            # here; capping the description length is one option (not applied).
            current_description += " " + line.strip()
        else:
            current_description = line

    # Convert the parsed data into a DataFrame
    return pd.DataFrame(parsed_data, columns=['Description', '2022 EUR', '2021 EUR']), misc_text_data

In [None]:
# Parse the adjusted text data
# `text_data` is the PyMuPDF extraction assigned in the text-extraction section.
df_parsed, misc_text_data = parse_adjusted_financial_text(text_data)

# Display the parsed DataFrame
display(df_parsed)

In [None]:
# extracting company information
 
# OpenAI-compatible client pointed at a server on localhost:11434.
# NOTE(review): presumably a local Ollama instance, with 'ollama' as a
# placeholder key rather than a real secret — confirm.
client = OpenAI(base_url = 'http://localhost:11434/v1',api_key='ollama')

def prepare_prompt(text: str) -> list:
    """Build the chat messages asking for the statement's header facts.

    The system message asks for the company name, the two reporting years,
    the currency and the type of financial statement as comma-separated
    values; the user message carries ``text`` unchanged.

    Args:
        text: Excerpt of the statement to extract the facts from.

    Returns:
        list: Message dicts with "role" and "content" keys, system first.
    """
    return [
        # prompt not final, to be adjusted
        # No trailing assistant message here: a "category name:" lead-in belongs
        # to the category prompt and contradicts the comma-separated output
        # requested by this one.
        {"role": "system", "content": "Please extract the name of the company, the two years that the financial statement is about (i.e 2021, 2022), in the order they appear, the currency, and the type of financial statement. Output only those values, separated by commas."},
        {"role": "user", "content": text},
    ]

def output_company_information(text: str) -> str:
    """Ask the model for the company facts found in ``text``.

    Builds the messages with ``prepare_prompt``, sends them through the
    module-level ``client`` to the "gemma:7b" model, and returns the content
    of the first choice as received.
    """
    chat_messages = prepare_prompt(text)
    completion = client.chat.completions.create(
        # TODO: try on smaller models
        model="gemma:7b",
        messages=chat_messages,
    )
    # TODO: other text manipulation like converting to lowercase etc.
    return completion.choices[0].message.content

# From the first 500 characters of `text` (the raw extracted text, not df_parsed),
# extract company name, years, currency, and the type of financial statement.
company_information = output_company_information(text[:500])
print(company_information)


In [None]:
# now replace all descriptions longer than MAX_DESCRIPTION_LENGTH characters with a category name extracted by gemma

def prepare_messages(text: str) -> list:
    """Build the chat messages asking for a category name for ``text``.

    Returns a list of three message dicts ("role" and "content" keys): the
    system instruction, the user message carrying ``text`` unchanged, and an
    assistant message holding the "category name:" lead-in.
    """
    # prompt not final, to be adjusted
    system_prompt = "This is a fragment of description from a financial statement. Please extract a category name for financial analysis from the following paragraph, output only the category name and no other information:"
    turns = (
        ("system", system_prompt),
        ("user", text),
        ("assistant", "category name:"),
    )
    return [{"role": role, "content": content} for role, content in turns]

def output_category_name(text: str) -> str:
    """Ask the model for a category name describing ``text``.

    Builds the messages with ``prepare_messages``, sends them through the
    module-level ``client`` to the "gemma:7b" model, and returns the content
    of the first choice as received.
    """
    chat_messages = prepare_messages(text)
    completion = client.chat.completions.create(
        # TODO: try on smaller models
        model="gemma:7b",
        messages=chat_messages,
    )
    # TODO: other text manipulation like converting to lowercase etc.
    return completion.choices[0].message.content


In [None]:
#a = output_category_name("current receivables from other legal entities and companies with a participating interest in the legal entity or from participating interests of the legal entity")
#print(a)

# Trial run: ask the model for a category name for descriptions that exceed
# MAX_DESCRIPTION_LENGTH, stopping once six of them have been processed.
i = 0  # number of long descriptions sent to the model so far
for row in df_parsed['Description']:
    if len(row) <= MAX_DESCRIPTION_LENGTH:
        # Short enough to keep as is.
        continue
    category_name = output_category_name(row)
    # TODO: overwrite the 'Description' column with the new category name
    print(row, "->", category_name)
    i += 1
    if i > 5:
        break


# 2. Data Cleaning and Classification

### Classifying Data 


In [None]:
def classify_table(df):
    """Classify a table by the presence of a known column name.

    Example of simple keyword-based classification: the first known column
    found in ``df.columns`` decides the label.

    Args:
        df: Table whose ``columns`` are checked for the known names.

    Returns:
        str: 'Inventory', 'Non-Current Assets', or 'Unknown' when none of the
        known columns is present.
    """
    # Checked in order; the first match wins.
    label_by_column = (
        ('Total of inventories', 'Inventory'),
        ('Total of non-current assets', 'Non-Current Assets'),
    )
    for column_name, label in label_by_column:
        if column_name in df.columns:
            return label
    return 'Unknown'

# 3. Data Validation


In [None]:
# Consistency check ...

# Balance sheet validation rule: Assets = Liabilities + Equity
# NOTE(review): total_assets, total_liabilities and equity are not assigned in
# any cell shown in this notebook; they must be defined before this cell runs.
print(
    "The balance sheet balances."
    if total_assets == total_liabilities + equity
    else "There is a discrepancy in the balance sheet."
)
    